remove redundant imports (#426)

* remove redundant imports * getting rid of hf dataset
2025-12-28 23:49:13 +00:00 · 2022-01-24 17:24:14 -05:00 · 2022-01-24 17:24:14 -05:00 · 4814091d87
commit 4814091d87
parent 6a7caa6a3d
6 changed files with 174 additions and 53 deletions
--- a/flaml/nlp/huggingface/switch_head_auto.py
+++ b/flaml/nlp/huggingface/switch_head_auto.py
@ -5,14 +5,10 @@ import transformers
 if transformers.__version__.startswith("3"):
    from transformers.modeling_electra import ElectraClassificationHead
    from transformers.modeling_roberta import RobertaClassificationHead
-    from transformers.models.electra.modeling_electra import ElectraForTokenClassification
-    from transformers.models.roberta.modeling_roberta import RobertaForTokenClassification

 else:
    from transformers.models.electra.modeling_electra import ElectraClassificationHead
    from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
-    from transformers.models.electra.modeling_electra import ElectraForTokenClassification
-    from transformers.models.roberta.modeling_roberta import RobertaForTokenClassification

 MODEL_CLASSIFICATION_HEAD_MAPPING = OrderedDict(
    [
--- a/test/nlp/test_autohf.py
+++ b/test/nlp/test_autohf.py
@ -7,19 +7,61 @@ import shutil
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_hf_data():
    from flaml import AutoML
-    import requests
-    from datasets import load_dataset
+    import pandas as pd

-    try:
-        train_dataset = load_dataset("glue", "mrpc", split="train[:1%]").to_pandas()
-        dev_dataset = (
-            load_dataset("glue", "mrpc", split="train[1%:2%]").to_pandas().iloc[:4]
-        )
-        test_dataset = (
-            load_dataset("glue", "mrpc", split="test[2%:3%]").to_pandas().iloc[:4]
-        )
-    except requests.exceptions.ConnectionError:
-        return
+    train_data = {
+        "sentence1": [
+            'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
+            "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
+            "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
+            "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
+        ],
+        "sentence2": [
+            'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
+            "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
+            "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
+            "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
+        ],
+        "label": [1, 0, 1, 0],
+        "idx": [0, 1, 2, 3],
+    }
+    train_dataset = pd.DataFrame(train_data)
+
+    dev_data = {
+        "sentence1": [
+            "The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .",
+            "Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .",
+            "The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .",
+            "The DVD-CCA then appealed to the state Supreme Court .",
+        ],
+        "sentence2": [
+            "PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .",
+            "With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
+            "The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .",
+            "The DVD CCA appealed that decision to the U.S. Supreme Court .",
+        ],
+        "label": [1, 1, 0, 1],
+        "idx": [4, 5, 6, 7],
+    }
+    dev_dataset = pd.DataFrame(dev_data)
+
+    test_data = {
+        "sentence1": [
+            "That compared with $ 35.18 million , or 24 cents per share , in the year-ago period .",
+            "Shares of Genentech , a much larger company with several products on the market , rose more than 2 percent .",
+            "Legislation making it harder for consumers to erase their debts in bankruptcy court won overwhelming House approval in March .",
+            "The Nasdaq composite index increased 10.73 , or 0.7 percent , to 1,514.77 .",
+        ],
+        "sentence2": [
+            "Earnings were affected by a non-recurring $ 8 million tax benefit in the year-ago period .",
+            "Shares of Xoma fell 16 percent in early trade , while shares of Genentech , a much larger company with several products on the market , were up 2 percent .",
+            "Legislation making it harder for consumers to erase their debts in bankruptcy court won speedy , House approval in March and was endorsed by the White House .",
+            "The Nasdaq Composite index , full of technology stocks , was lately up around 18 points .",
+        ],
+        "label": [0, 0, 0, 0],
+        "idx": [8, 10, 11, 12],
+    }
+    test_dataset = pd.DataFrame(test_data)

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"
--- a/test/nlp/test_autohf_classificationhead.py
+++ b/test/nlp/test_autohf_classificationhead.py
@ -1,17 +1,36 @@
 def test_classification_head():
    from flaml import AutoML
-    import requests
-    from datasets import load_dataset
+    import pandas as pd

-    try:
-        train_dataset = (
-            load_dataset("emotion", split="train[:1%]").to_pandas().iloc[0:10]
-        )
-        dev_dataset = (
-            load_dataset("emotion", split="train[1%:2%]").to_pandas().iloc[0:10]
-        )
-    except requests.exceptions.ConnectionError:
-        return
+    train_data = {
+        "text": [
+            "i didnt feel humiliated",
+            "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake",
+            "im grabbing a minute to post i feel greedy wrong",
+            "i am ever feeling nostalgic about the fireplace i will know that it is still on the property",
+            "i am feeling grouchy",
+            "ive been feeling a little burdened lately wasnt sure why that was",
+            "ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny",
+            "i feel as confused about life as a teenager or as jaded as a year old man",
+            "i have been with petronas for years i feel that petronas has performed well and made a huge profit",
+            "i feel romantic too",
+            "i feel like i have to make the suffering i m seeing mean something",
+            "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter",
+        ],
+        "label": [0, 0, 3, 2, 3, 0, 5, 4, 1, 2, 0, 1],
+    }
+    train_dataset = pd.DataFrame(train_data)
+
+    dev_data = {
+        "text": [
+            "i think it s the easiest time of year to feel dissatisfied",
+            "i feel low energy i m just thirsty",
+            "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious",
+            "i do not feel reassured anxiety is on each side",
+        ],
+        "label": [3, 0, 1, 1],
+    }
+    dev_dataset = pd.DataFrame(dev_data)

    custom_sent_keys = ["text"]
    label_key = "label"
--- a/test/nlp/test_autohf_custom_metric.py
+++ b/test/nlp/test_autohf_custom_metric.py
@ -42,16 +42,43 @@ def custom_metric(
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_custom_metric():
    from flaml import AutoML
-    import requests
-    from datasets import load_dataset
+    import pandas as pd

-    try:
-        train_dataset = (
-            load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
-        )
-        dev_dataset = load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
-    except requests.exceptions.ConnectionError:
-        return
+    train_data = {
+        "sentence1": [
+            'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
+            "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
+            "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
+            "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
+        ],
+        "sentence2": [
+            'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
+            "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
+            "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
+            "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
+        ],
+        "label": [1, 0, 1, 0],
+        "idx": [0, 1, 2, 3],
+    }
+    train_dataset = pd.DataFrame(train_data)
+
+    dev_data = {
+        "sentence1": [
+            "The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .",
+            "Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .",
+            "The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .",
+            "The DVD-CCA then appealed to the state Supreme Court .",
+        ],
+        "sentence2": [
+            "PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .",
+            "With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
+            "The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .",
+            "The DVD CCA appealed that decision to the U.S. Supreme Court .",
+        ],
+        "label": [1, 1, 0, 1],
+        "idx": [4, 5, 6, 7],
+    }
+    dev_dataset = pd.DataFrame(dev_data)

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"
--- a/test/nlp/test_autohf_cv.py
+++ b/test/nlp/test_autohf_cv.py
@ -5,15 +5,25 @@ import pytest
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_cv():
    from flaml import AutoML
-    import requests
-    from datasets import load_dataset
+    import pandas as pd

-    try:
-        train_dataset = (
-            load_dataset("glue", "mrpc", split="train[:1%]").to_pandas().iloc[0:4]
-        )
-    except requests.exceptions.ConnectionError:
-        return
+    train_data = {
+        "sentence1": [
+            'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
+            "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
+            "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
+            "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
+        ],
+        "sentence2": [
+            'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
+            "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
+            "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
+            "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
+        ],
+        "label": [1, 0, 1, 0],
+        "idx": [0, 1, 2, 3],
+    }
+    train_dataset = pd.DataFrame(train_data)

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"
--- a/test/nlp/test_autohf_regression.py
+++ b/test/nlp/test_autohf_regression.py
@ -9,16 +9,43 @@ def test_regression():
    except ImportError:
        return
    from flaml import AutoML
-    import requests
-    from datasets import load_dataset
+    import pandas as pd

-    try:
-        train_dataset = load_dataset("glue", "stsb", split="train[:2%]").to_pandas()
-        dev_dataset = (
-            load_dataset("glue", "stsb", split="train[2%:3%]").to_pandas().iloc[:32]
-        )
-    except requests.exceptions.ConnectionError:
-        return
+    train_data = {
+        "sentence1": [
+            "A plane is taking off.",
+            "A man is playing a large flute.",
+            "A man is spreading shreded cheese on a pizza.",
+            "Three men are playing chess.",
+        ],
+        "sentence2": [
+            "An air plane is taking off.",
+            "A man is playing a flute.",
+            "A man is spreading shredded cheese on an uncooked pizza.",
+            "Two men are playing chess.",
+        ],
+        "label": [5.0, 3.799999952316284, 3.799999952316284, 2.5999999046325684],
+        "idx": [0, 1, 2, 3],
+    }
+    train_dataset = pd.DataFrame(train_data)
+
+    dev_data = {
+        "sentence1": [
+            "A man is playing the cello.",
+            "Some men are fighting.",
+            "A man is smoking.",
+            "The man is playing the piano.",
+        ],
+        "sentence2": [
+            "A man seated is playing the cello.",
+            "Two men are fighting.",
+            "A man is skating.",
+            "The man is playing the guitar.",
+        ],
+        "label": [4.25, 4.25, 0.5, 1.600000023841858],
+        "idx": [4, 5, 6, 7],
+    }
+    dev_dataset = pd.DataFrame(dev_data)

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"