autogen/test/nlp/test_autohf.py
2022-01-29 22:53:32 -08:00

189 lines
6.6 KiB
Python

import sys
import pytest
import pickle
import shutil
import requests
@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
def test_hf_data():
    """Smoke-test AutoML sequence classification on tiny in-memory data.

    Steps, in order:

    1. Build four-row train / dev / test DataFrames of sentence pairs.
    2. Fit a ``seq-classification`` task with an explicit validation set.
    3. Retrain a fresh ``AutoML`` object from the log using the same settings.
    4. Round-trip the retrained object through ``automl.pkl`` with pickle.
    5. Remove the output directory (if it exists) and call ``predict`` /
       ``predict_proba`` on a DataFrame, a flat list of strings and a list
       of string pairs.

    The test returns early, and therefore passes, when ``fit`` raises
    ``requests.exceptions.HTTPError``.
    """
    import os

    from flaml import AutoML
    import pandas as pd

    train_data = {
        "sentence1": [
            'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
            "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
            "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
            "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
        ],
        "sentence2": [
            'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
            "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
            "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
            "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
        ],
        "label": [1, 0, 1, 0],
        "idx": [0, 1, 2, 3],
    }
    train_dataset = pd.DataFrame(train_data)

    dev_data = {
        "sentence1": [
            "The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .",
            "Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .",
            "The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .",
            "The DVD-CCA then appealed to the state Supreme Court .",
        ],
        "sentence2": [
            "PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .",
            "With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
            "The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .",
            "The DVD CCA appealed that decision to the U.S. Supreme Court .",
        ],
        "label": [1, 1, 0, 1],
        "idx": [4, 5, 6, 7],
    }
    dev_dataset = pd.DataFrame(dev_data)

    test_data = {
        "sentence1": [
            "That compared with $ 35.18 million , or 24 cents per share , in the year-ago period .",
            "Shares of Genentech , a much larger company with several products on the market , rose more than 2 percent .",
            "Legislation making it harder for consumers to erase their debts in bankruptcy court won overwhelming House approval in March .",
            "The Nasdaq composite index increased 10.73 , or 0.7 percent , to 1,514.77 .",
        ],
        "sentence2": [
            "Earnings were affected by a non-recurring $ 8 million tax benefit in the year-ago period .",
            "Shares of Xoma fell 16 percent in early trade , while shares of Genentech , a much larger company with several products on the market , were up 2 percent .",
            "Legislation making it harder for consumers to erase their debts in bankruptcy court won speedy , House approval in March and was endorsed by the White House .",
            "The Nasdaq Composite index , full of technology stocks , was lately up around 18 points .",
        ],
        "label": [0, 0, 0, 0],
        "idx": [8, 10, 11, 12],
    }
    test_dataset = pd.DataFrame(test_data)

    # Only the two sentence columns are used as features; "idx" is ignored.
    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"

    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    X_val = dev_dataset[custom_sent_keys]
    y_val = dev_dataset[label_key]

    X_test = test_dataset[custom_sent_keys]

    # Single source of truth for the directory that is configured below and
    # cleaned up before prediction.
    output_dir = "test/data/output/"

    automl = AutoML()

    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 10,
        "task": "seq-classification",
        "metric": "accuracy",
        "log_file_name": "seqclass.log",
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": output_dir,
        "ckpt_per_epoch": 5,
        "fp16": False,
    }

    try:
        automl.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            **automl_settings
        )
    except requests.exceptions.HTTPError:
        # Deliberate best-effort: an HTTP failure during fit (presumably while
        # fetching the pretrained model) is not treated as a test failure.
        return

    # Start from a fresh object and rebuild the model from the log; the same
    # settings, including the log file name, are passed through.
    automl = AutoML()
    automl.retrain_from_log(
        X_train=X_train,
        y_train=y_train,
        train_full=True,
        record_id=0,
        **automl_settings
    )

    # Pickle round-trip: everything below runs on the unpickled object.
    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    with open("automl.pkl", "rb") as f:
        automl = pickle.load(f)

    # Remove the output directory before predicting. The directory may not
    # exist, and an unconditional rmtree would then raise FileNotFoundError
    # and fail the test for an unrelated reason, so only remove it if present.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)

    automl.predict(X_test)
    automl.predict(["test test", "test test"])
    automl.predict(
        [
            ["test test", "test test"],
            ["test test", "test test"],
            ["test test", "test test"],
        ]
    )

    automl.predict_proba(X_test)
    print(automl.classes_)
def _test_custom_data():
    """Fit and predict on sentence-pair TSV files read from ``data/input/``.

    The leading underscore keeps pytest from collecting this function, so it
    only runs when called explicitly. It reads ``train.tsv``, ``dev.tsv`` and
    ``test.tsv``, fits a ``seq-classification`` task on the ``"#1 String"`` /
    ``"#2 String"`` columns with ``"Quality"`` as the label, and then calls
    ``predict`` on the test frame and on hand-written inputs.
    """
    from flaml import AutoML
    import requests
    import pandas as pd

    tsv_paths = (
        "data/input/train.tsv",
        "data/input/dev.tsv",
        "data/input/test.tsv",
    )
    try:
        # quoting=3 is the value of csv.QUOTE_NONE.
        train_df, dev_df, test_df = [
            pd.read_csv(tsv_path, delimiter="\t", quoting=3)
            for tsv_path in tsv_paths
        ]
    except requests.exceptions.HTTPError:
        # NOTE(review): these are local paths, so it is unclear when reading
        # them would raise HTTPError - confirm whether this guard is needed.
        return

    feature_columns = ["#1 String", "#2 String"]
    target_column = "Quality"

    train_features = train_df[feature_columns]
    train_labels = train_df[target_column]
    val_features = dev_df[feature_columns]
    val_labels = dev_df[target_column]
    test_features = test_df[feature_columns]

    settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "data/output/",
            "ckpt_per_epoch": 1,
        },
    }

    model = AutoML()
    model.fit(
        X_train=train_features,
        y_train=train_labels,
        X_val=val_features,
        y_val=val_labels,
        **settings
    )

    # Exercise the three input forms: DataFrame, flat list, list of pairs.
    model.predict(test_features)
    model.predict(["test test"])
    model.predict([["test test", "test test"] for _ in range(3)])
if __name__ == "__main__":
    # Script entry point: run the in-memory Hugging Face test directly,
    # without going through the pytest runner.
    test_hf_data()