autogen/test/automl/test_classification.py
Xueqing Liu 2a8decdc50
fix the post-processing bug in NER (#534)
* fix conll bug

* update DataCollatorForAuto

* adding label_list comments
2022-05-10 17:22:57 -04:00

351 lines
11 KiB
Python

import unittest
import numpy as np
import scipy.sparse
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd
from datetime import datetime
from flaml import AutoML
from flaml.model import LGBMEstimator
from flaml import tune
class MyLargeLGBM(LGBMEstimator):
    """LGBM estimator whose search space reaches very large models.

    Both tuned hyperparameters range over ``[4, 32768]`` and the search is
    initialised at the upper bound.  The class is registered under the name
    ``"large_lgbm"`` by ``test_random_skip_oom`` — presumably so that the
    first trials are expensive enough to exhaust memory (TODO confirm).
    """

    @classmethod
    def search_space(cls, **params):
        """Return the search space for ``n_estimators`` and ``num_leaves``.

        Args:
            **params: Accepted for interface compatibility; not used here.

        Returns:
            A dict mapping each hyperparameter name to its ``domain``,
            ``init_value`` and ``low_cost_init_value``.
        """

        def _huge_range():
            # A fresh domain object per hyperparameter, as in a literal dict.
            return {
                "domain": tune.lograndint(lower=4, upper=32768),
                "init_value": 32768,
                "low_cost_init_value": 4,
            }

        return {name: _huge_range() for name in ("n_estimators", "num_leaves")}
class TestClassification(unittest.TestCase):
    """Smoke tests that run ``AutoML.fit`` on classification problems.

    The tests cover a mixed-type DataFrame, datetime columns, scipy sparse
    matrices, custom learners and the concurrent-trial code path.  Tests on
    the concurrent path are reported as skipped when an ``ImportError`` is
    raised (the message in this file attributes that to ray being absent).
    """

    # Reason reported by every test that is skipped on ImportError.
    _RAY_MISSING = "skipping concurrency test as ray is not installed"

    @staticmethod
    def _print_results(automl, X, estimator_name):
        """Print predictions and the search summary of a fitted ``automl``.

        Args:
            automl: A fitted ``AutoML`` instance.
            X: Data passed to ``automl.predict``.
            estimator_name: Name given to ``best_model_for_estimator``.
        """
        print(automl.predict(X))
        print(automl.model)
        print(automl.config_history)
        print(automl.best_model_for_estimator(estimator_name))
        print(automl.best_iteration)
        print(automl.best_estimator)

    def test_preprocess(self):
        """Fit several estimator lists on a tiny mixed-type DataFrame."""
        X = pd.DataFrame(
            {
                "f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
                "f2": [
                    3.0, 16.0, 10.0, 12.0, 3.0, 14.0, 11.0,
                    12.0, 5.0, 14.0, 20.0, 16.0, 15.0, 11.0,
                ],
                # Deliberately mixes strings and floats in one column.
                "f3": [
                    "a", "b", "a", "c", "c", "b", "b",
                    "b", "b", "a", "b", 1.0, 1.0, "a",
                ],
                "f4": [
                    True, True, False, True, True, False, False,
                    False, True, True, False, False, True, True,
                ],
            }
        )
        y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])

        # Settings common to every run below.
        shared_settings = {
            "task": "classification",
            "n_jobs": 1,
            "eval_method": "cv",
            "n_splits": 3,
            "metric": "accuracy",
            "log_training_metric": True,
            "ensemble": True,
        }

        def fit_with(time_budget, estimator_list, **extra):
            # Each run uses a fresh AutoML instance, as the original did.
            automl = AutoML()
            automl.fit(
                X,
                y,
                time_budget=time_budget,
                estimator_list=estimator_list,
                **shared_settings,
                **extra,
            )
            return automl

        first = fit_with(6, ["catboost", "lrl2"])
        # Only the first run is checked for a resulting model.
        self.assertIsNotNone(first.model)
        fit_with(2, ["lrl2", "kneighbor"], verbose=4)
        fit_with(3, ["xgboost", "catboost", "kneighbor"])
        fit_with(3, ["lgbm", "catboost", "kneighbor"])

    def test_binary(self):
        """Fit and predict on the breast-cancer data with ``task="binary"``."""
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 1,
            "task": "binary",
            "log_file_name": "test/breast_cancer.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_breast_cancer(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        automl_experiment.predict(X_train)

    def test_datetime_columns(self):
        """Fit and predict on a DataFrame made only of datetime columns."""
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "log_file_name": "test/datetime_columns.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        fake_df = pd.DataFrame(
            {
                "A": [
                    datetime(1900, 2, 3),
                    datetime(1900, 3, 4),
                    datetime(1900, 3, 4),
                    datetime(1900, 3, 4),
                    datetime(1900, 7, 2),
                    datetime(1900, 8, 9),
                ],
                # Constant column: every row holds the same timestamp.
                "B": [datetime(1900, 1, 1)] * 6,
                # NOTE(review): the name presumably collides with a feature
                # derived from column "A" — confirm against the preprocessor.
                "year_A": [
                    datetime(1900, 1, 2),
                    datetime(1900, 8, 1),
                    datetime(1900, 1, 4),
                    datetime(1900, 6, 1),
                    datetime(1900, 1, 5),
                    datetime(1900, 4, 1),
                ],
            }
        )
        y = np.array([0, 1, 0, 1, 0, 0])
        automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings)
        automl_experiment.predict(fake_df)

    def test_sparse_matrix_xgboost(self):
        """Fit xgboost on a large sparse identity matrix with a callback."""
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        # Imported here so the module can be loaded without xgboost.
        import xgboost as xgb

        callback = xgb.callback.TrainingCallback()
        automl_experiment.fit(
            X_train=X_train, y_train=y_train, callbacks=[callback], **automl_settings
        )
        self._print_results(automl_experiment, X_train, "xgboost")

    def test_ray_classification(self):
        """Fit with ``use_ray=True`` and then with two concurrent trials.

        The test is skipped when either fit raises ``ImportError``.
        """
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
        automl = AutoML()
        fit_kwargs = {
            "X_val": X_test,
            "y_val": y_test,
            "time_budget": 10,
            "task": "classification",
        }
        try:
            automl.fit(X_train, y_train, use_ray=True, **fit_kwargs)
            automl.fit(X_train, y_train, n_concurrent_trials=2, **fit_kwargs)
        except ImportError:
            # Report a skip instead of a silent pass so the gap is visible.
            self.skipTest(self._RAY_MISSING)

    def test_parallel_xgboost(self, hpo_method=None):
        """Fit xgboost with two concurrent trials on data stored via ray.

        Args:
            hpo_method: Value forwarded to ``AutoML.fit`` as ``hpo_method``.

        The test is skipped when ``ImportError`` is raised.
        """
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 2,
            "hpo_method": hpo_method,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        try:
            import ray

            # The training matrix is handed to fit as a ray object reference.
            X_train_ref = ray.put(X_train)
            automl_experiment.fit(
                X_train=X_train_ref, y_train=y_train, **automl_settings
            )
            self._print_results(automl_experiment, X_train, "xgboost")
        except ImportError:
            self.skipTest(self._RAY_MISSING)

    def test_parallel_xgboost_others(self):
        """Repeat ``test_parallel_xgboost`` with random search."""
        # use random search as the hpo_method
        self.test_parallel_xgboost(hpo_method="random")

    def test_random_skip_oom(self):
        """Run random search with the custom ``large_lgbm`` learner.

        The test is skipped when ``ImportError`` is raised.
        """
        automl_experiment = AutoML()
        automl_experiment.add_learner(
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        automl_settings = {
            "time_budget": 2,
            "task": "classification",
            "log_file_name": "test/sparse_classification_oom.log",
            "estimator_list": ["large_lgbm"],
            "log_type": "all",
            "n_jobs": 1,
            "hpo_method": "random",
            "n_concurrent_trials": 2,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            self._print_results(automl_experiment, X_train, "large_lgbm")
        except ImportError:
            self.skipTest(self._RAY_MISSING)

    def test_sparse_matrix_lr(self):
        """Fit the two logistic-regression learners on a random sparse matrix.

        ``fit`` is called twice: first with ``train_time_limit=1`` and a
        3-second budget, then again with the budget raised to 5 seconds.
        """
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "f1",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["lrl1", "lrl2"],
            "log_type": "all",
            "n_jobs": 1,
        }
        X_train = scipy.sparse.random(3000, 3000, density=0.1)
        y_train = np.random.randint(2, size=3000)
        automl_experiment.fit(
            X_train=X_train, y_train=y_train, train_time_limit=1, **automl_settings
        )
        automl_settings["time_budget"] = 5
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        self._print_results(automl_experiment, X_train, "lrl2")
# Script entry point: run this module's tests when the file is executed
# directly rather than imported.
if __name__ == "__main__":
    # NOTE(review): unittest.main() exits the interpreter by default
    # (exit=True), so the assignment below is presumably never reached when
    # the file runs as a script — confirm whether it is still wanted.
    unittest.main()
    test = TestClassification()