import unittest

import numpy as np
import scipy.sparse
from sklearn.datasets import load_boston, load_iris, load_wine

from flaml import AutoML
from flaml.data import get_output_from_log
from flaml.model import BaseEstimator
from flaml.space import ConfigSearchInfo
from rgf.sklearn import RGFClassifier, RGFRegressor
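

# Custom learner: in this version of FLAML, a learner subclasses
# flaml.model.BaseEstimator, declares its search space in
# `params_configsearch_info`, and sets `self.estimator_class` and
# `self.params` in `__init__` (see MyRegularizedGreedyForest below).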
class MyRegularizedGreedyForest(BaseEstimator):

    # search space
    params_configsearch_info = {
        'max_leaf': ConfigSearchInfo(name='max_leaf',
                                     type=int, lower=4, init=4, upper=10000),
        'n_iter': ConfigSearchInfo(name='n_iter', type=int, lower=1,
                                   init=1, upper=32768),
        'n_tree_search': ConfigSearchInfo(name='n_tree_search', type=int,
                                          lower=1, init=1, upper=32768),
        'opt_interval': ConfigSearchInfo(name='opt_interval', type=int,
                                         lower=1, init=100, upper=10000),
        'learning_rate': ConfigSearchInfo(name='learning_rate', type=float,
                                          lower=0.01, init=1.0, upper=20.0),
        'min_samples_leaf': ConfigSearchInfo(name='min_samples_leaf',
                                             type=int, lower=1, init=20,
                                             upper=20),
    }

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 max_leaf=1000, n_iter=1, n_tree_search=1, opt_interval=1,
                 learning_rate=1.0, min_samples_leaf=1):
        self.objective_name = objective_name
        # pick the sklearn-compatible RGF estimator that matches the task
        if 'regression' in objective_name:
            self.estimator_class = RGFRegressor
        else:
            self.estimator_class = RGFClassifier

        # round integer hyperparameters, since the searcher may propose floats
        self.params = {
            'max_leaf': int(round(max_leaf)),
            'n_iter': int(round(n_iter)),
            'n_tree_search': int(round(n_tree_search)),
            'opt_interval': int(round(opt_interval)),
            'learning_rate': learning_rate,
            'min_samples_leaf': int(round(min_samples_leaf)),
            'n_jobs': n_jobs,
        }
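

# Custom metric contract (as used by test_custom_metric below): FLAML calls
# this with the fitted estimator and the train/validation splits, and expects
# a tuple of (objective to minimize, list of metrics to log).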
def custom_metric(X_test, y_test, estimator, labels, X_train, y_train):
    from sklearn.metrics import log_loss
    y_pred = estimator.predict_proba(X_test)
    test_loss = log_loss(y_test, y_pred, labels=labels)
    y_pred = estimator.predict_proba(X_train)
    train_loss = log_loss(y_train, y_pred, labels=labels)
    alpha = 0.5
    # equivalent to test_loss + alpha * (test_loss - train_loss):
    # the objective penalizes a large train/test generalization gap
    return test_loss * (1 + alpha) - alpha * train_loss, [test_loss, train_loss]


class TestAutoML(unittest.TestCase):
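    # add_learner registers the custom estimator under the name 'RGF' so it
    # can appear in estimator_list next to the built-in learners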
    def test_custom_learner(self):
        automl = AutoML()
        automl.add_learner(learner_name='RGF',
                           learner_class=MyRegularizedGreedyForest)
        X_train, y_train = load_wine(return_X_y=True)
        settings = {
            "time_budget": 10,  # total running time in seconds
            "estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'],
            "task": 'classification',  # task type
            "sample": True,  # whether to subsample training data
            "log_file_name": "test/wine.log",
            "log_training_metric": True,  # whether to log training metric
        }
        # the main flaml automl API
        automl.fit(X_train=X_train, y_train=y_train, **settings)

    def test_dataframe(self):
        # same as test_classification, but with pandas DataFrame input
        self.test_classification(True)
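
    # "metric" below is the callable custom_metric defined above; FLAML
    # minimizes its first return value and logs the second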
    def test_custom_metric(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "eval_method": 'holdout',
            "metric": custom_metric,
            "task": 'classification',
            "log_file_name": "test/iris_custom.log",
            "log_training_metric": True,
            "log_type": 'all',
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        automl_experiment = AutoML()
        estimator = automl_experiment.get_estimator_from_log(
            automl_settings["log_file_name"], record_id=0,
            objective='multi')
        print(estimator)
        time_history, best_valid_loss_history, valid_loss_history, \
            config_history, train_loss_history = get_output_from_log(
                filename=automl_settings['log_file_name'], time_budget=6)
        print(train_loss_history)
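
    # also exercises retrain_from_log, which (as we understand the API)
    # replays a logged config; with train_full=True it retrains on all data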
    def test_classification(self, as_frame=False):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 4,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
        automl_experiment = AutoML()
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train, y_train=y_train,
            train_full=True, record_id=0)
        print(duration)
        print(automl_experiment.model)
        print(automl_experiment.predict_proba(X_train)[:5])
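
    # passing X_val/y_val supplies an explicit holdout set, hence the
    # eval_method assertion below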
    def test_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mse',
            "task": 'regression',
            "log_file_name": "test/boston.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_boston(return_X_y=True)
        n = len(y_train) * 9 // 10
        automl_experiment.fit(X_train=X_train[:n], y_train=y_train[:n],
                              X_val=X_train[n:], y_val=y_train[n:],
                              **automl_settings)
        assert automl_experiment.eval_method == 'holdout'
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
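
    # the sparse-matrix tests feed scipy.sparse inputs end to end with random
    # labels, so they exercise fit/predict rather than model quality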
    def test_sparse_matrix_classification(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'auto',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "split_type": "uniform",
            "model_history": True
        }
        # random sparse features with integer dtype; three random classes
        X_train = scipy.sparse.random(1554, 21, dtype=int)
        y_train = np.random.randint(3, size=1554)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mae',
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              X_val=X_val, y_val=y_val,
                              **automl_settings)
        assert automl_experiment.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

    def test_sparse_matrix_xgboost(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'ap',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
        }
        # a 900000 x 900000 identity matrix: a huge shape, but only one
        # nonzero per row, so it stays cheap as a sparse input
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_lr(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'f1',
            "task": 'classification',
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["lrl1", "lrl2"],
            "log_type": "all",
        }
        X_train = scipy.sparse.random(3000, 900, density=0.1)
        y_train = np.random.randint(2, size=3000)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression_cv(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "eval_method": 'cv',  # cross-validation instead of holdout
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(100, 100)
        y_train = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train, y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)


if __name__ == "__main__":
    unittest.main()