import unittest
from datetime import datetime

import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.datasets import load_boston, load_iris, load_wine, load_breast_cancer
from rgf.sklearn import RGFClassifier, RGFRegressor

from flaml import AutoML, tune
from flaml.data import get_output_from_log
from flaml.model import LGBMEstimator, SKLearnEstimator, XGBoostEstimator
from flaml.training_log import training_log_reader


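# The classes below plug custom learners into FLAML via AutoML.add_learner():
# a learner subclasses an estimator base from flaml.model, sets
# self.estimator_class, and describes its tuning space in a `search_space`
# classmethod. The rgf.sklearn import above comes from the third-party
# rgf_python package, which these tests assume is installed.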
class MyRegularizedGreedyForest(SKLearnEstimator):
    def __init__(
        self,
        task="binary",
        n_jobs=1,
        max_leaf=4,
        n_iter=1,
        n_tree_search=1,
        opt_interval=1,
        learning_rate=1.0,
        min_samples_leaf=1,
        **params
    ):
        super().__init__(task, **params)

        if "regression" in task:
            self.estimator_class = RGFRegressor
        else:
            self.estimator_class = RGFClassifier

        # round integer hyperparameters
        self.params = {
            "n_jobs": n_jobs,
            "max_leaf": int(round(max_leaf)),
            "n_iter": int(round(n_iter)),
            "n_tree_search": int(round(n_tree_search)),
            "opt_interval": int(round(opt_interval)),
            "learning_rate": learning_rate,
            "min_samples_leaf": int(round(min_samples_leaf)),
        }

    @classmethod
    def search_space(cls, data_size, task):
        space = {
            "max_leaf": {
                "domain": tune.qloguniform(lower=4, upper=data_size, q=1),
                "init_value": 4,
            },
            "n_iter": {
                "domain": tune.qloguniform(lower=1, upper=data_size, q=1),
                "init_value": 1,
            },
            "n_tree_search": {
                "domain": tune.qloguniform(lower=1, upper=32768, q=1),
                "init_value": 1,
            },
            "opt_interval": {
                "domain": tune.qloguniform(lower=1, upper=10000, q=1),
                "init_value": 100,
            },
            "learning_rate": {"domain": tune.loguniform(lower=0.01, upper=20.0)},
            "min_samples_leaf": {
                "domain": tune.qloguniform(lower=1, upper=20, q=1),
                "init_value": 20,
            },
        }
        return space

    @classmethod
    def size(cls, config):
        # rough size of the fitted model in bytes, which FLAML can use to
        # screen out configurations that would exceed a memory threshold
        max_leaves = int(round(config["max_leaf"]))
        n_estimators = int(round(config["n_iter"]))
        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

    @classmethod
    def cost_relative2lgbm(cls):
        # cost of fitting this learner relative to LightGBM (1.0 means comparable)
        return 1.0


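# XGBoost custom-objective contract: the function receives the raw (pre-sigmoid)
# predictions and the training DMatrix, and returns the gradient and hessian of
# the loss with respect to the predictions (here, binary logistic loss).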
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # transform raw leaf weight
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess


class MyXGB1(XGBoostEstimator):
    """XGBoostEstimator with logregobj as the objective function"""

    def __init__(self, **params):
        super().__init__(objective=logregobj, **params)


class MyXGB2(XGBoostEstimator):
    """XGBoostEstimator with 'reg:squarederror' as the objective function"""

    def __init__(self, **params):
        super().__init__(objective="reg:squarederror", **params)


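# A deliberately oversized LightGBM search space: the initial values sit at the
# upper bounds, so early trials train very large models. It is used by
# test_random_out_of_memory below to exercise how the search copes with
# configurations that can exhaust memory.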
class MyLargeLGBM(LGBMEstimator):
    @classmethod
    def search_space(cls, **params):
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=32768),
                "init_value": 32768,
                "low_cost_init_value": 4,
            },
            "num_leaves": {
                "domain": tune.lograndint(lower=4, upper=32768),
                "init_value": 32768,
                "low_cost_init_value": 4,
            },
        }


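# Contract for a user-defined metric passed as AutoML.fit(metric=...): given the
# validation and training splits plus the fitted estimator, return a scalar loss
# to minimize and a dict of auxiliary metrics to record in the log.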
def custom_metric(
    X_test,
    y_test,
    estimator,
    labels,
    X_train,
    y_train,
    weight_test=None,
    weight_train=None,
    config=None,
    groups_test=None,
    groups_train=None,
):
    from sklearn.metrics import log_loss
    import time

    start = time.time()
    y_pred = estimator.predict_proba(X_test)
    pred_time = (time.time() - start) / len(X_test)
    test_loss = log_loss(y_test, y_pred, labels=labels, sample_weight=weight_test)
    y_pred = estimator.predict_proba(X_train)
    train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
    alpha = 0.5
    # penalize the gap between test and train loss to discourage overfitting
    return test_loss * (1 + alpha) - alpha * train_loss, {
        "test_loss": test_loss,
        "train_loss": train_loss,
        "pred_time": pred_time,
    }


class TestAutoML(unittest.TestCase):
    def test_custom_learner(self):
        automl = AutoML()
        automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
        X_train, y_train = load_wine(return_X_y=True)
        settings = {
            "time_budget": 8,  # total running time in seconds
            "estimator_list": ["RGF", "lgbm", "rf", "xgboost"],
            "task": "classification",  # task type
            "sample": True,  # whether to subsample training data
            "log_file_name": "test/wine.log",
            "log_training_metric": True,  # whether to log training metric
            "n_jobs": 1,
        }

        # the main flaml automl API
        automl.fit(X_train=X_train, y_train=y_train, **settings)
        # print the best model found for RGF
        print(automl.best_model_for_estimator("RGF"))

        # with an empty search space, the custom learner is tried with its
        # default configuration only
        MyRegularizedGreedyForest.search_space = lambda data_size, task: {}
        automl.fit(X_train=X_train, y_train=y_train, **settings)

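    # The "ensemble" setting below configures a stacked ensemble; to my
    # understanding "final_estimator" and "passthrough" mirror scikit-learn's
    # stacking parameters, with the custom learner instance acting as the
    # final-stage estimator.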
    def test_ensemble(self):
        automl = AutoML()
        automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
        X_train, y_train = load_wine(return_X_y=True)
        settings = {
            "time_budget": 5,  # total running time in seconds
            "estimator_list": ["rf", "xgboost", "catboost"],
            "task": "classification",  # task type
            "sample": True,  # whether to subsample training data
            "log_file_name": "test/wine.log",
            "log_training_metric": True,  # whether to log training metric
            "ensemble": {
                "final_estimator": MyRegularizedGreedyForest(),
                "passthrough": False,
            },
            "n_jobs": 1,
        }

        # the main flaml automl API
        automl.fit(X_train=X_train, y_train=y_train, **settings)

    def test_preprocess(self):
        automl = AutoML()
        X = pd.DataFrame(
            {
                "f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
                "f2": [
                    3.0, 16.0, 10.0, 12.0, 3.0, 14.0, 11.0,
                    12.0, 5.0, 14.0, 20.0, 16.0, 15.0, 11.0,
                ],
                # mixed string/float values are intentional, to exercise
                # preprocessing of messy categorical columns
                "f3": [
                    "a", "b", "a", "c", "c", "b", "b",
                    "b", "b", "a", "b", 1.0, 1.0, "a",
                ],
                "f4": [
                    True, True, False, True, True, False, False,
                    False, True, True, False, False, True, True,
                ],
            }
        )
        y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])

        automl = AutoML()
        automl_settings = {
            "time_budget": 6,
            "task": "classification",
            "n_jobs": 1,
            "estimator_list": ["catboost", "lrl2"],
            "eval_method": "cv",
            "n_splits": 3,
            "metric": "accuracy",
            "log_training_metric": True,
            "verbose": 1,
            "ensemble": True,
        }
        automl.fit(X, y, **automl_settings)

        automl = AutoML()
        automl_settings = {
            "time_budget": 2,
            "task": "classification",
            "n_jobs": 1,
            "estimator_list": ["lrl2", "kneighbor"],
            "eval_method": "cv",
            "n_splits": 3,
            "metric": "accuracy",
            "log_training_metric": True,
            "verbose": 1,
            "ensemble": True,
        }
        automl.fit(X, y, **automl_settings)

        automl = AutoML()
        automl_settings = {
            "time_budget": 3,
            "task": "classification",
            "n_jobs": 1,
            "estimator_list": ["xgboost", "catboost", "kneighbor"],
            "eval_method": "cv",
            "n_splits": 3,
            "metric": "accuracy",
            "log_training_metric": True,
            "verbose": 1,
            "ensemble": True,
        }
        automl.fit(X, y, **automl_settings)

        automl = AutoML()
        automl_settings = {
            "time_budget": 3,
            "task": "classification",
            "n_jobs": 1,
            "estimator_list": ["lgbm", "catboost", "kneighbor"],
            "eval_method": "cv",
            "n_splits": 3,
            "metric": "accuracy",
            "log_training_metric": True,
            "verbose": 1,
            "ensemble": True,
        }
        automl.fit(X, y, **automl_settings)

    def test_dataframe(self):
        self.test_classification(True)

    def test_custom_metric(self):
        df, y = load_iris(return_X_y=True, as_frame=True)
        df["label"] = y
        automl_experiment = AutoML()
        automl_settings = {
            "dataframe": df,
            "label": "label",
            "time_budget": 5,
            "eval_method": "cv",
            "metric": custom_metric,
            "task": "classification",
            "log_file_name": "test/iris_custom.log",
            "log_training_metric": True,
            "log_type": "all",
            "n_jobs": 1,
            "model_history": True,
            "sample_weight": np.ones(len(y)),
            "pred_time_limit": 1e-5,
            "ensemble": True,
        }
        automl_experiment.fit(**automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        automl_experiment = AutoML()
        estimator = automl_experiment.get_estimator_from_log(
            automl_settings["log_file_name"], record_id=0, task="multi"
        )
        print(estimator)
        (
            time_history,
            best_valid_loss_history,
            valid_loss_history,
            config_history,
            metric_history,
        ) = get_output_from_log(
            filename=automl_settings["log_file_name"], time_budget=6
        )
        print(metric_history)

    def test_binary(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 1,
            "task": "binary",
            "log_file_name": "test/breast_cancer.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_breast_cancer(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        _ = automl_experiment.predict(X_train)

    def test_classification(self, as_frame=False):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 4,
            "metric": "accuracy",
            "task": "classification",
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        if as_frame:
            # test drop column
            X_train.columns = range(X_train.shape[1])
            X_train[X_train.shape[1]] = np.zeros(len(y_train))
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
        automl_experiment = AutoML()
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            record_id=0,
        )
        print(duration)
        print(automl_experiment.model)
        print(automl_experiment.predict_proba(X_train)[:5])

    def test_datetime_columns(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "log_file_name": "test/datetime_columns.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        fake_df = pd.DataFrame(
            {
                "A": [
                    datetime(1900, 2, 3),
                    datetime(1900, 3, 4),
                    datetime(1900, 3, 4),
                    datetime(1900, 3, 4),
                    datetime(1900, 7, 2),
                    datetime(1900, 8, 9),
                ],
                "B": [
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                    datetime(1900, 1, 1),
                ],
                "year_A": [
                    datetime(1900, 1, 2),
                    datetime(1900, 8, 1),
                    datetime(1900, 1, 4),
                    datetime(1900, 6, 1),
                    datetime(1900, 1, 5),
                    datetime(1900, 4, 1),
                ],
            }
        )
        y = np.array([0, 1, 0, 1, 0, 0])
        automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings)
        _ = automl_experiment.predict(fake_df)

    def test_micro_macro_f1(self):
        automl_experiment_micro = AutoML()
        automl_experiment_macro = AutoML()
        automl_settings = {
            "time_budget": 2,
            "task": "classification",
            "log_file_name": "test/micro_macro_f1.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment_micro.fit(
            X_train=X_train, y_train=y_train, metric="micro_f1", **automl_settings
        )
        automl_experiment_macro.fit(
            X_train=X_train, y_train=y_train, metric="macro_f1", **automl_settings
        )
        estimator = automl_experiment_macro.model
        y_pred = estimator.predict(X_train)
        y_pred_proba = estimator.predict_proba(X_train)
        from flaml.ml import norm_confusion_matrix, multi_class_curves

        print(norm_confusion_matrix(y_train, y_pred))
        from sklearn.metrics import roc_curve, precision_recall_curve

        print(multi_class_curves(y_train, y_pred_proba, roc_curve))
        print(multi_class_curves(y_train, y_pred_proba, precision_recall_curve))

    def test_roc_auc_ovr(self):
        automl_experiment = AutoML()
        X_train, y_train = load_iris(return_X_y=True)
        automl_settings = {
            "time_budget": 1,
            "metric": "roc_auc_ovr",
            "task": "classification",
            "log_file_name": "test/roc_auc_ovr.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "sample_weight": np.ones(len(y_train)),
            "eval_method": "holdout",
            "model_history": True,
        }
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)

    def test_roc_auc_ovo(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 1,
            "metric": "roc_auc_ovo",
            "task": "classification",
            "log_file_name": "test/roc_auc_ovo.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)

    def test_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "task": "regression",
            "log_file_name": "test/boston.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_boston(return_X_y=True)
        n = int(len(y_train) * 9 // 10)
        automl_experiment.fit(
            X_train=X_train[:n],
            y_train=y_train[:n],
            X_val=X_train[n:],
            y_val=y_train[n:],
            **automl_settings
        )
        # providing explicit validation data implies holdout evaluation
        assert automl_experiment._state.eval_method == "holdout"
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
        automl_experiment.retrain_from_log(
            task="regression",
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=1,
        )
        automl_experiment.retrain_from_log(
            task="regression",
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=0,
        )

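    # The sparse-matrix tests below pass scipy.sparse inputs directly to
    # AutoML.fit() to verify that search, training, and prediction all accept
    # sparse data.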
    def test_sparse_matrix_classification(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": "auto",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "split_type": "uniform",
            "n_jobs": 1,
            "model_history": True,
        }
        X_train = scipy.sparse.random(1554, 21, dtype=int)
        y_train = np.random.randint(3, size=1554)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression(self):
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": "mae",
            "task": "regression",
            "log_file_name": "test/sparse_regression.log",
            "n_jobs": 1,
            "model_history": True,
            "keep_search_state": True,
            "verbose": 0,
            "early_stop": True,
        }
        automl_experiment.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            **automl_settings
        )
        assert automl_experiment._state.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

    def test_sparse_matrix_xgboost(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

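    # Parallel tuning (n_concurrent_trials > 1) depends on ray, an optional
    # dependency; the try/except ImportError in the tests below skips them
    # when ray is not installed.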
    def test_parallel(self, hpo_method=None):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "task": "regression",
            "log_file_name": "test/boston.log",
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 2,
            "hpo_method": hpo_method,
        }
        X_train, y_train = load_boston(return_X_y=True)
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.model_history)
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            return

    def test_parallel_xgboost(self, hpo_method=None):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 2,
            "hpo_method": hpo_method,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.model_history)
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            return

    def test_parallel_xgboost_others(self):
        # use random search as the hpo_method
        self.test_parallel_xgboost(hpo_method="random")

    def test_random_out_of_memory(self):
        automl_experiment = AutoML()
        automl_experiment.add_learner(
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        automl_settings = {
            "time_budget": 2,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification_oom.log",
            "estimator_list": ["large_lgbm"],
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 2,
            "hpo_method": "random",
        }

        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.model_history)
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            return

    def test_sparse_matrix_lr(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": "f1",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["lrl1", "lrl2"],
            "log_type": "all",
            "n_jobs": 1,
        }
        X_train = scipy.sparse.random(3000, 900, density=0.1)
        y_train = np.random.randint(2, size=3000)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression_holdout(self):
        X_train = scipy.sparse.random(8, 100)
        y_train = np.random.uniform(size=8)
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 1,
            "eval_method": "holdout",
            "task": "regression",
            "log_file_name": "test/sparse_regression.log",
            "n_jobs": 1,
            "model_history": True,
            "metric": "mse",
            "sample_weight": np.ones(len(y_train)),
            "early_stop": True,
        }
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_regression_xgboost(self):
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment = AutoML()
        automl_experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1)
        automl_experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2)
        automl_settings = {
            "time_budget": 2,
            "estimator_list": ["my_xgb1", "my_xgb2"],
            "task": "regression",
            "log_file_name": "test/regression_xgboost.log",
            "n_jobs": 1,
            "model_history": True,
            "keep_search_state": True,
            "early_stop": True,
        }
        automl_experiment.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            **automl_settings
        )
        assert automl_experiment._state.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

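    # The two warm-start tests below resume a search from earlier results:
    # best_config_per_estimator supplies one starting configuration per
    # learner, while a dict mapping each learner to a list of logged configs
    # (replayed with a matching max_iter and append_log=True) resumes from
    # every previously recorded trial.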
    def test_fit_w_starting_point(self, as_frame=True):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "accuracy",
            "task": "classification",
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        if as_frame:
            # test drop column
            X_train.columns = range(X_train.shape[1])
            X_train[X_train.shape[1]] = np.zeros(len(y_train))
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        automl_val_accuracy = 1.0 - automl_experiment.best_loss
        print("Best ML learner:", automl_experiment.best_estimator)
        print("Best hyperparameter config:", automl_experiment.best_config)
        print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
        print(
            "Training duration of best run: {0:.4g} s".format(
                automl_experiment.best_config_train_time
            )
        )

        starting_points = automl_experiment.best_config_per_estimator
        print("starting_points", starting_points)
        automl_settings_resume = {
            "time_budget": 2,
            "metric": "accuracy",
            "task": "classification",
            "log_file_name": "test/iris_resume.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
            "log_type": "all",
            "starting_points": starting_points,
        }
        new_automl_experiment = AutoML()
        new_automl_experiment.fit(
            X_train=X_train, y_train=y_train, **automl_settings_resume
        )

        new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss
        print("Best ML learner:", new_automl_experiment.best_estimator)
        print("Best hyperparameter config:", new_automl_experiment.best_config)
        print(
            "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
        )
        print(
            "Training duration of best run: {0:.4g} s".format(
                new_automl_experiment.best_config_train_time
            )
        )

    def test_fit_w_starting_points_list(self, as_frame=True):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "accuracy",
            "task": "classification",
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        if as_frame:
            # test drop column
            X_train.columns = range(X_train.shape[1])
            X_train[X_train.shape[1]] = np.zeros(len(y_train))
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        automl_val_accuracy = 1.0 - automl_experiment.best_loss
        print("Best ML learner:", automl_experiment.best_estimator)
        print("Best hyperparameter config:", automl_experiment.best_config)
        print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
        print(
            "Training duration of best run: {0:.4g} s".format(
                automl_experiment.best_config_train_time
            )
        )

        # collect every logged config per learner as starting points
        starting_points = {}
        log_file_name = automl_settings["log_file_name"]
        with training_log_reader(log_file_name) as reader:
            for record in reader.records():
                config = record.config
                learner = record.learner
                if learner not in starting_points:
                    starting_points[learner] = []
                starting_points[learner].append(config)
        max_iter = sum([len(s) for k, s in starting_points.items()])
        automl_settings_resume = {
            "time_budget": 2,
            "metric": "accuracy",
            "task": "classification",
            "log_file_name": "test/iris_resume_all.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "max_iter": max_iter,
            "model_history": True,
            "log_type": "all",
            "starting_points": starting_points,
            "append_log": True,
        }
        new_automl_experiment = AutoML()
        new_automl_experiment.fit(
            X_train=X_train, y_train=y_train, **automl_settings_resume
        )

        new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss
        # print("Best ML learner:", new_automl_experiment.best_estimator)
        # print("Best hyperparameter config:", new_automl_experiment.best_config)
        print(
            "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
        )
        # print(
        #     "Training duration of best run: {0:.4g} s".format(
        #         new_automl_experiment.best_config_train_time
        #     )
        # )


if __name__ == "__main__":
    unittest.main()