mirror of
https://github.com/microsoft/autogen.git
synced 2025-07-25 01:41:01 +00:00
351 lines
11 KiB
Python
351 lines
11 KiB
Python
import unittest
|
|
import numpy as np
|
|
import scipy.sparse
|
|
from sklearn.datasets import load_breast_cancer
|
|
from sklearn.model_selection import train_test_split
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from flaml import AutoML
|
|
from flaml.model import LGBMEstimator
|
|
from flaml import tune
|
|
|
|
|
|
class MyLargeLGBM(LGBMEstimator):
    """LGBM learner whose search space starts from very large models.

    Registered under the name ``"large_lgbm"`` by
    ``TestClassification.test_random_skip_oom``.
    """

    @classmethod
    def search_space(cls, **params):
        """Return the hyperparameter search space for this learner.

        ``n_estimators`` and ``num_leaves`` share the same specification:
        a ``tune.lograndint`` domain with ``lower=4`` and ``upper=32768``,
        an initial value of 32768 and a low-cost initial value of 4.
        ``params`` is accepted for interface compatibility and is not used.
        """

        def _wide_int_spec():
            # Build a fresh domain object per hyperparameter so the two
            # entries never share state.
            return {
                "domain": tune.lograndint(lower=4, upper=32768),
                "init_value": 32768,
                "low_cost_init_value": 4,
            }

        return {
            hp_name: _wide_int_spec()
            for hp_name in ("n_estimators", "num_leaves")
        }
|
|
|
|
|
|
class TestClassification(unittest.TestCase):
    """Smoke tests that run ``AutoML.fit`` on small classification problems.

    Most tests only check that fitting and predicting complete without
    raising; results are printed rather than asserted. Tests that hit an
    ``ImportError`` while using ray-backed options are reported as skipped.
    """

    def _print_results(self, automl, X, estimator_name):
        """Print predictions and the main result attributes of ``automl``.

        Args:
            automl: A fitted ``AutoML`` instance.
            X: Data passed to ``automl.predict``.
            estimator_name: Name passed to ``best_model_for_estimator``.
        """
        print(automl.predict(X))
        print(automl.model)
        print(automl.config_history)
        print(automl.best_model_for_estimator(estimator_name))
        print(automl.best_iteration)
        print(automl.best_estimator)

    def test_preprocess(self):
        """Fit several estimator lists on a frame with mixed column types.

        The frame has an integer column, a float column, a column mixing
        strings with floats, and a boolean column.
        """
        X = pd.DataFrame(
            {
                "f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
                "f2": [
                    3.0,
                    16.0,
                    10.0,
                    12.0,
                    3.0,
                    14.0,
                    11.0,
                    12.0,
                    5.0,
                    14.0,
                    20.0,
                    16.0,
                    15.0,
                    11.0,
                ],
                "f3": [
                    "a",
                    "b",
                    "a",
                    "c",
                    "c",
                    "b",
                    "b",
                    "b",
                    "b",
                    "a",
                    "b",
                    1.0,
                    1.0,
                    "a",
                ],
                "f4": [
                    True,
                    True,
                    False,
                    True,
                    True,
                    False,
                    False,
                    False,
                    True,
                    True,
                    False,
                    False,
                    True,
                    True,
                ],
            }
        )
        y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])

        # Settings common to every run below.
        shared_settings = {
            "task": "classification",
            "n_jobs": 1,
            "eval_method": "cv",
            "n_splits": 3,
            "metric": "accuracy",
            "log_training_metric": True,
            "ensemble": True,
        }
        # (per-run overrides, whether a fitted model is asserted afterwards)
        runs = [
            ({"time_budget": 6, "estimator_list": ["catboost", "lrl2"]}, True),
            (
                {
                    "time_budget": 2,
                    "estimator_list": ["lrl2", "kneighbor"],
                    "verbose": 4,
                },
                False,
            ),
            (
                {
                    "time_budget": 3,
                    "estimator_list": ["xgboost", "catboost", "kneighbor"],
                },
                False,
            ),
            (
                {
                    "time_budget": 3,
                    "estimator_list": ["lgbm", "catboost", "kneighbor"],
                },
                False,
            ),
        ]
        for overrides, expect_model in runs:
            # A fresh AutoML per run so no state carries over between fits.
            automl = AutoML()
            automl.fit(X, y, **shared_settings, **overrides)
            if expect_model:
                # assertIsNotNone is not stripped under ``python -O``.
                self.assertIsNotNone(automl.model)

    def test_binary(self):
        """Fit with ``task="binary"`` on the breast-cancer data and predict."""
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 1,
            "task": "binary",
            "log_file_name": "test/breast_cancer.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_breast_cancer(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        _ = automl_experiment.predict(X_train)

    def test_datetime_columns(self):
        """Fit and predict on a frame whose columns are all datetimes.

        Column ``B`` is constant, and one column is named ``year_A``.
        """
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "log_file_name": "test/datetime_columns.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        fake_df = pd.DataFrame(
            {
                "A": [
                    datetime(1900, 2, 3),
                    datetime(1900, 3, 4),
                    datetime(1900, 3, 4),
                    datetime(1900, 3, 4),
                    datetime(1900, 7, 2),
                    datetime(1900, 8, 9),
                ],
                "B": [
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                ],
                "year_A": [
                    datetime(1900, 1, 2),
                    datetime(1900, 8, 1),
                    datetime(1900, 1, 4),
                    datetime(1900, 6, 1),
                    datetime(1900, 1, 5),
                    datetime(1900, 4, 1),
                ],
            }
        )
        y = np.array([0, 1, 0, 1, 0, 0])
        automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings)
        _ = automl_experiment.predict(fake_df)

    def test_sparse_matrix_xgboost(self):
        """Fit xgboost on a sparse identity matrix, forwarding a callback."""
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        import xgboost as xgb

        callback = xgb.callback.TrainingCallback()
        automl_experiment.fit(
            X_train=X_train, y_train=y_train, callbacks=[callback], **automl_settings
        )
        self._print_results(automl_experiment, X_train, "xgboost")

    def test_ray_classification(self):
        """Fit with ``use_ray=True`` and then with ``n_concurrent_trials=2``.

        The test is skipped if either fit raises ``ImportError``.
        """
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

        automl = AutoML()
        try:
            automl.fit(
                X_train,
                y_train,
                X_val=X_test,
                y_val=y_test,
                time_budget=10,
                task="classification",
                use_ray=True,
            )
            automl.fit(
                X_train,
                y_train,
                X_val=X_test,
                y_val=y_test,
                time_budget=10,
                task="classification",
                n_concurrent_trials=2,
            )
        except ImportError as err:
            # Report a skip instead of a silent pass so the missing
            # dependency is visible in the test report.
            self.skipTest(f"optional dependency is not installed: {err}")

    def test_parallel_xgboost(self, hpo_method=None):
        """Fit xgboost with two concurrent trials on data stored via ray.

        Args:
            hpo_method: Forwarded to ``AutoML.fit`` as ``hpo_method``.

        The test is skipped if ``ray`` cannot be imported or the run raises
        ``ImportError``.
        """
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 2,
            "hpo_method": hpo_method,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        try:
            import ray

            X_train_ref = ray.put(X_train)
            automl_experiment.fit(
                X_train=X_train_ref, y_train=y_train, **automl_settings
            )
            self._print_results(automl_experiment, X_train, "xgboost")
        except ImportError as err:
            # Report a skip instead of a silent pass so the missing
            # dependency is visible in the test report.
            self.skipTest(f"optional dependency is not installed: {err}")

    def test_parallel_xgboost_others(self):
        """Run ``test_parallel_xgboost`` with random search as hpo_method."""
        self.test_parallel_xgboost(hpo_method="random")

    def test_random_skip_oom(self):
        """Fit the custom ``large_lgbm`` learner with random search.

        Uses two concurrent trials; the test is skipped if the run raises
        ``ImportError``.
        """
        automl_experiment = AutoML()
        automl_experiment.add_learner(
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        automl_settings = {
            "time_budget": 2,
            "task": "classification",
            "log_file_name": "test/sparse_classification_oom.log",
            "estimator_list": ["large_lgbm"],
            "log_type": "all",
            "n_jobs": 1,
            "hpo_method": "random",
            "n_concurrent_trials": 2,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)

        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            self._print_results(automl_experiment, X_train, "large_lgbm")
        except ImportError:
            # Same message the test used to print, now surfaced as a skip.
            self.skipTest("skipping concurrency test as ray is not installed")

    def test_sparse_matrix_lr(self):
        """Fit ``lrl1``/``lrl2`` on a random sparse matrix, twice.

        The first fit passes ``train_time_limit=1``; the second reuses the
        same ``AutoML`` instance with ``time_budget`` raised to 5.
        """
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "f1",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["lrl1", "lrl2"],
            "log_type": "all",
            "n_jobs": 1,
        }
        X_train = scipy.sparse.random(3000, 3000, density=0.1)
        y_train = np.random.randint(2, size=3000)
        automl_experiment.fit(
            X_train=X_train, y_train=y_train, train_time_limit=1, **automl_settings
        )
        automl_settings["time_budget"] = 5
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        self._print_results(automl_experiment, X_train, "lrl2")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|
|
test = TestClassification()
|