import unittest

import numpy as np
import scipy.sparse
from sklearn.datasets import load_boston, load_iris, load_wine

from flaml import AutoML
from flaml.data import get_output_from_log
from flaml.model import BaseEstimator
from flaml.space import ConfigSearchInfo
from rgf.sklearn import RGFClassifier, RGFRegressor
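

# Custom learner: in this version of FLAML, a learner subclasses
# flaml.model.BaseEstimator, declares its search space in
# `params_configsearch_info`, and sets `self.estimator_class` and
# `self.params` in `__init__` (see MyRegularizedGreedyForest below).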
class MyRegularizedGreedyForest(BaseEstimator):

    # search space
    params_configsearch_info = {
        'max_leaf': ConfigSearchInfo(name='max_leaf',
                                     type=int, lower=4, init=4, upper=10000),
        'n_iter': ConfigSearchInfo(name='n_iter', type=int, lower=1,
                                   init=1, upper=32768),
        'n_tree_search': ConfigSearchInfo(name='n_tree_search', type=int,
                                          lower=1, init=1, upper=32768),
        'opt_interval': ConfigSearchInfo(name='opt_interval', type=int,
                                         lower=1, init=100, upper=10000),
        'learning_rate': ConfigSearchInfo(name='learning_rate', type=float,
                                          lower=0.01, init=1.0, upper=20.0),
        'min_samples_leaf': ConfigSearchInfo(name='min_samples_leaf',
                                             type=int, lower=1, init=20,
                                             upper=20),
    }

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 max_leaf=1000, n_iter=1, n_tree_search=1, opt_interval=1,
                 learning_rate=1.0, min_samples_leaf=1):
        self.objective_name = objective_name
        # pick the sklearn-compatible RGF estimator that matches the task
        if 'regression' in objective_name:
            self.estimator_class = RGFRegressor
        else:
            self.estimator_class = RGFClassifier

        # round integer hyperparameters, since the searcher may propose floats
        self.params = {
            'max_leaf': int(round(max_leaf)),
            'n_iter': int(round(n_iter)),
            'n_tree_search': int(round(n_tree_search)),
            'opt_interval': int(round(opt_interval)),
            'learning_rate': learning_rate,
            'min_samples_leaf': int(round(min_samples_leaf)),
            'n_jobs': n_jobs,
        }
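

# Custom metric contract (as used by test_custom_metric below): FLAML calls
# this with the fitted estimator and the train/validation splits, and expects
# a tuple of (objective to minimize, list of metrics to log).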
def custom_metric(X_test, y_test, estimator, labels, X_train, y_train):
    from sklearn.metrics import log_loss
    y_pred = estimator.predict_proba(X_test)
    test_loss = log_loss(y_test, y_pred, labels=labels)
    y_pred = estimator.predict_proba(X_train)
    train_loss = log_loss(y_train, y_pred, labels=labels)
    alpha = 0.5
    # equivalent to test_loss + alpha * (test_loss - train_loss):
    # the objective penalizes a large train/test generalization gap
    return test_loss * (1 + alpha) - alpha * train_loss, [test_loss, train_loss]


class TestAutoML(unittest.TestCase):
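    # add_learner registers the custom estimator under the name 'RGF' so it
    # can appear in estimator_list next to the built-in learners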
    def test_custom_learner(self):
        automl = AutoML()
        automl.add_learner(learner_name='RGF',
                           learner_class=MyRegularizedGreedyForest)
        X_train, y_train = load_wine(return_X_y=True)
        settings = {
            "time_budget": 10,  # total running time in seconds
            "estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'],
            "task": 'classification',  # task type
            "sample": True,  # whether to subsample training data
            "log_file_name": "test/wine.log",
            "log_training_metric": True,  # whether to log training metric
        }
        # the main flaml automl API
        automl.fit(X_train=X_train, y_train=y_train, **settings)

    def test_dataframe(self):
        # same as test_classification, but with pandas DataFrame input
        self.test_classification(True)
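
    # "metric" below is the callable custom_metric defined above; FLAML
    # minimizes its first return value and logs the second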
    def test_custom_metric(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "eval_method": 'holdout',
            "metric": custom_metric,
            "task": 'classification',
            "log_file_name": "test/iris_custom.log",
            "log_training_metric": True,
            "log_type": 'all',
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        automl_experiment = AutoML()
        estimator = automl_experiment.get_estimator_from_log(
            automl_settings["log_file_name"], record_id=0,
            objective='multi')
        print(estimator)
        time_history, best_valid_loss_history, valid_loss_history, \
            config_history, train_loss_history = get_output_from_log(
                filename=automl_settings['log_file_name'], time_budget=6)
        print(train_loss_history)
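
    # also exercises retrain_from_log, which (as we understand the API)
    # replays a logged config; with train_full=True it retrains on all data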
    def test_classification(self, as_frame=False):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 4,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
        automl_experiment = AutoML()
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train, y_train=y_train,
            train_full=True, record_id=0)
        print(duration)
        print(automl_experiment.model)
        print(automl_experiment.predict_proba(X_train)[:5])
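
    # passing X_val/y_val supplies an explicit holdout set, hence the
    # eval_method assertion below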
    def test_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mse',
            "task": 'regression',
            "log_file_name": "test/boston.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_boston(return_X_y=True)
        n = len(y_train) * 9 // 10
        automl_experiment.fit(X_train=X_train[:n], y_train=y_train[:n],
                              X_val=X_train[n:], y_val=y_train[n:],
                              **automl_settings)
        assert automl_experiment.eval_method == 'holdout'
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
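
    # the sparse-matrix tests feed scipy.sparse inputs end to end with random
    # labels, so they exercise fit/predict rather than model quality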
    def test_sparse_matrix_classification(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'auto',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "split_type": "uniform",
            "model_history": True
        }
        # random sparse features with integer dtype; three random classes
        X_train = scipy.sparse.random(1554, 21, dtype=int)
        y_train = np.random.randint(3, size=1554)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mae',
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              X_val=X_val, y_val=y_val,
                              **automl_settings)
        assert automl_experiment.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

    def test_sparse_matrix_xgboost(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'ap',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
        }
        # a 900000 x 900000 identity matrix: a huge shape, but only one
        # nonzero per row, so it stays cheap as a sparse input
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_lr(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'f1',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["lrl1", "lrl2"],
            "log_type": "all",
        }
        X_train = scipy.sparse.random(3000, 900, density=0.1)
        y_train = np.random.randint(2, size=3000)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression_cv(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "eval_method": 'cv',  # cross-validation instead of holdout
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(100, 100)
        y_train = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)


if __name__ == "__main__":
    unittest.main()