# autogen/test/test_notebook_example.py

from openml.exceptions import OpenMLServerException


def test_automl(budget=5, dataset_format='dataframe'):
    """Run a short AutoML classification experiment end to end.

    Loads OpenML dataset 1169, fits ``AutoML`` on the training split,
    prints the best result, pickles the fitted object to ``automl.pkl``,
    scores the test split and finally prints the configs read back from
    the search log.

    Args:
        budget: Passed to ``AutoML.fit`` as ``time_budget`` (seconds).
        dataset_format: Forwarded unchanged to ``load_openml_dataset``.

    The function prints a message and returns early when loading the
    dataset raises ``OpenMLServerException``.
    """
    import pickle

    from flaml import AutoML
    from flaml.data import get_output_from_log, load_openml_dataset
    from flaml.ml import sklearn_metric_loss_score

    try:
        split = load_openml_dataset(
            dataset_id=1169, data_dir='test/', dataset_format=dataset_format)
    except OpenMLServerException:
        print("OpenMLServerException raised")
        return
    X_train, X_test, y_train, y_test = split

    log_file = 'airlines_experiment.log'  # flaml log file
    fit_kwargs = dict(
        time_budget=budget,  # total running time in seconds
        # primary metrics can be chosen from: ['accuracy','roc_auc',
        # 'roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
        metric='accuracy',
        task='classification',  # task type
        log_file_name=log_file,
        seed=7654321,  # random seed
    )

    # The main flaml automl API.
    learner = AutoML()
    learner.fit(X_train=X_train, y_train=y_train, **fit_kwargs)

    # Retrieve best config and best learner.
    print('Best ML leaner:', learner.best_estimator)
    print('Best hyperparmeter config:', learner.best_config)
    best_accuracy = 1 - learner.best_loss
    print('Best accuracy on validation data: {0:.4g}'.format(best_accuracy))
    best_train_time = learner.best_config_train_time
    print('Training duration of best run: {0:.4g} s'.format(best_train_time))
    print(learner.model.estimator)

    # Pickle and save the automl object.
    with open('automl.pkl', 'wb') as pkl_file:
        pickle.dump(learner, pkl_file, pickle.HIGHEST_PROTOCOL)

    # Compute predictions on the testing dataset.
    predicted = learner.predict(X_test)
    print('Predicted labels', predicted)
    print('True labels', y_test)
    y_proba = learner.predict_proba(X_test)[:, 1]

    # Compute different metric values on the testing dataset; accuracy and
    # roc_auc come back as losses, so they are reported as 1 - loss.
    accuracy_loss = sklearn_metric_loss_score('accuracy', predicted, y_test)
    print('accuracy', '=', 1 - accuracy_loss)
    roc_auc_loss = sklearn_metric_loss_score('roc_auc', y_proba, y_test)
    print('roc_auc', '=', 1 - roc_auc_loss)
    log_loss_value = sklearn_metric_loss_score('log_loss', y_proba, y_test)
    print('log_loss', '=', log_loss_value)

    # Read the search history back from the log; only the configs are printed.
    log_output = get_output_from_log(filename=log_file, time_budget=60)
    (_time_history, _best_valid_loss_history, _valid_loss_history,
     config_history, _metric_history) = log_output
    for config in config_history:
        print(config)

    for attr_name in ('prune_attr', 'max_resource', 'min_resource'):
        print(getattr(learner, attr_name))


def test_automl_array():
    """Repeat ``test_automl`` with a 5-second budget and the 'array' dataset format."""
    test_automl(budget=5, dataset_format='array')


def test_mlflow():
    """Fit AutoML on OpenML task 7592 inside an active mlflow run.

    mlflow is pip-installed into the running interpreter first, then
    imported. The function prints a message and returns early when
    loading the task raises ``OpenMLServerException``.
    """
    import subprocess
    import sys

    # Install before importing anything that may look for mlflow.
    pip_install = [sys.executable, "-m", "pip", "install", "mlflow"]
    subprocess.check_call(pip_install)

    import mlflow
    from flaml.data import load_openml_task
    from flaml import AutoML

    try:
        split = load_openml_task(task_id=7592, data_dir='test/')
    except OpenMLServerException:
        print("OpenMLServerException raised")
        return
    X_train, X_test, y_train, y_test = split

    fit_kwargs = dict(
        time_budget=5,  # total running time in seconds
        # primary metrics can be chosen from: ['accuracy','roc_auc',
        # 'roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
        metric='accuracy',
        estimator_list=['lgbm', 'rf', 'xgboost'],  # list of ML learners
        task='classification',  # task type
        sample=False,  # whether to subsample training data
        log_file_name='adult.log',  # flaml log file
    )
    learner = AutoML()

    mlflow.set_experiment("flaml")
    with mlflow.start_run():
        # The main flaml automl API.
        learner.fit(X_train=X_train, y_train=y_train, **fit_kwargs)
    # subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"])


if __name__ == "__main__":
    # Direct execution uses a 300-second budget instead of the 5-second default.
    test_automl(budget=300)
v0.5.12 (#150) * remove extra comma * exclusive bound * log file name * add cost to space * dataset_format * add load_openml_dataset test * docstr * revise test format * simplify restore * order categories * openml server exception in test * process space * add warning * log format * reduce n_cpu * nested space * hierarchical search space for CFO * non hierarchical for bs * unflatten hierarchical config * connection error * random sample * config signature * check ray version * preprocess numpy array * catboost preprocess * time budget * seed, verbose, hpo_method * test cfocat * shallow copy in flatten_dict prevent lgbm model duplication * match estimator name * quantize and log * test qloguniform and qrandint * test qlograndint * thread.running Co-authored-by: Chi Wang <wang.chi@microsoft.com> Co-authored-by: Qingyun Wu <qingyunwu@Qingyuns-MacBook-Pro-2.local> 2021-08-12 02:02:22 -04:00			`from openml.exceptions import OpenMLServerException`


			`def test_automl(budget=5, dataset_format='dataframe'):`
coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00			`from flaml.data import load_openml_dataset`
v0.5.12 (#150) * remove extra comma * exclusive bound * log file name * add cost to space * dataset_format * add load_openml_dataset test * docstr * revise test format * simplify restore * order categories * openml server exception in test * process space * add warning * log format * reduce n_cpu * nested space * hierarchical search space for CFO * non hierarchical for bs * unflatten hierarchical config * connection error * random sample * config signature * check ray version * preprocess numpy array * catboost preprocess * time budget * seed, verbose, hpo_method * test cfocat * shallow copy in flatten_dict prevent lgbm model duplication * match estimator name * quantize and log * test qloguniform and qrandint * test qlograndint * thread.running Co-authored-by: Chi Wang <wang.chi@microsoft.com> Co-authored-by: Qingyun Wu <qingyunwu@Qingyuns-MacBook-Pro-2.local> 2021-08-12 02:02:22 -04:00			`try:`
			`X_train, X_test, y_train, y_test = load_openml_dataset(`
			`dataset_id=1169, data_dir='test/', dataset_format=dataset_format)`
			`except OpenMLServerException:`
			`print("OpenMLServerException raised")`
			`return`
coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00			`''' import AutoML class from flaml package '''`
			`from flaml import AutoML`
			`automl = AutoML()`
			`settings = {`
			`"time_budget": budget, # total running time in seconds`
support ROC and AUC for multi-class classification (#170) * support ROC and AUC for multi-class classification * add a test case to cover ROC and AUC for multi-class classification 2021-08-23 07:16:10 +09:00			`"metric": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']`
coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00			`"task": 'classification', # task type`
			`"log_file_name": 'airlines_experiment.log', # flaml log file`
Support parallel and add random search (#167) * non hashable value out of signature * parallel trials * add random in _search_parallel * fix bug in retraining * check memory constraint before training * retrain_full * log custom metric * retraining budget check * sample size check before retrain * remove 'time2eval' from result * report 'total_search_time' in result * rename total_search_time to wall_clock_time * rename train_loss boolean to log_training_metric * set default train_loss to None * exclude oom result * log retrained model * no subsample * doc str * notebook * predicted value is NaN for sarimax * version Co-authored-by: Chi Wang <wang.chi@microsoft.com> Co-authored-by: Qingyun Wu <qxw5138@psu.edu> 2021-08-23 19:36:51 -04:00			`"seed": 7654321, # random seed`
coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00			`}`
			`'''The main flaml automl API'''`
			`automl.fit(X_train=X_train, y_train=y_train, **settings)`
			`''' retrieve best config and best learner'''`
			`print('Best ML leaner:', automl.best_estimator)`
			`print('Best hyperparmeter config:', automl.best_config)`
			`print('Best accuracy on validation data: {0:.4g}'.format(1 - automl.best_loss))`
			`print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))`
			`print(automl.model.estimator)`
			`''' pickle and save the automl object '''`
			`import pickle`
			`with open('automl.pkl', 'wb') as f:`
			`pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)`
			`''' compute predictions of testing dataset '''`
			`y_pred = automl.predict(X_test)`
			`print('Predicted labels', y_pred)`
			`print('True labels', y_test)`
			`y_pred_proba = automl.predict_proba(X_test)[:, 1]`
			`''' compute different metric values on testing dataset'''`
			`from flaml.ml import sklearn_metric_loss_score`
			`print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))`
			`print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))`
			`print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))`
			`from flaml.data import get_output_from_log`
remove catboost training dir; ensemble api; blendsearch for hierarchical space; ranking task; forecast improvement (#178) * remove catboost training dir * close #48 * bs for hierarchical space. close #85 * retrain for hierarchical space * clean ml (#180) Co-authored-by: Qingyun Wu <qxw5138@psu.edu> * support ranking task * examples * cv shuffle * forecast api and implementation cleaner * period constraints * delete groups after fit 2021-09-01 16:25:04 -07:00			`time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \`
coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00			`get_output_from_log(filename=settings['log_file_name'], time_budget=60)`
			`for config in config_history:`
			`print(config)`
space -> main (#148) * subspace in flow2 * search space and trainable from AutoML * experimental features: multivariate TPE, grouping, add_evaluated_points * test experimental features * readme * define by run * set time_budget_s for bs Co-authored-by: liususan091219 <Xqq630517> * version * acl * test define_by_run_func * size * constraints Co-authored-by: Chi Wang <wang.chi@microsoft.com> 2021-08-02 19:10:26 -04:00			`print(automl.prune_attr)`
			`print(automl.max_resource)`
			`print(automl.min_resource)`
coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00

v0.5.12 (#150) * remove extra comma * exclusive bound * log file name * add cost to space * dataset_format * add load_openml_dataset test * docstr * revise test format * simplify restore * order categories * openml server exception in test * process space * add warning * log format * reduce n_cpu * nested space * hierarchical search space for CFO * non hierarchical for bs * unflatten hierarchical config * connection error * random sample * config signature * check ray version * preprocess numpy array * catboost preprocess * time budget * seed, verbose, hpo_method * test cfocat * shallow copy in flatten_dict prevent lgbm model duplication * match estimator name * quantize and log * test qloguniform and qrandint * test qlograndint * thread.running Co-authored-by: Chi Wang <wang.chi@microsoft.com> Co-authored-by: Qingyun Wu <qingyunwu@Qingyuns-MacBook-Pro-2.local> 2021-08-12 02:02:22 -04:00			`def test_automl_array():`
			`test_automl(5, 'array')`


coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00			`def test_mlflow():`
			`import subprocess`
			`import sys`
			`subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow"])`
			`import mlflow`
			`from flaml.data import load_openml_task`
v0.5.12 (#150) * remove extra comma * exclusive bound * log file name * add cost to space * dataset_format * add load_openml_dataset test * docstr * revise test format * simplify restore * order categories * openml server exception in test * process space * add warning * log format * reduce n_cpu * nested space * hierarchical search space for CFO * non hierarchical for bs * unflatten hierarchical config * connection error * random sample * config signature * check ray version * preprocess numpy array * catboost preprocess * time budget * seed, verbose, hpo_method * test cfocat * shallow copy in flatten_dict prevent lgbm model duplication * match estimator name * quantize and log * test qloguniform and qrandint * test qlograndint * thread.running Co-authored-by: Chi Wang <wang.chi@microsoft.com> Co-authored-by: Qingyun Wu <qingyunwu@Qingyuns-MacBook-Pro-2.local> 2021-08-12 02:02:22 -04:00			`try:`
			`X_train, X_test, y_train, y_test = load_openml_task(`
			`task_id=7592, data_dir='test/')`
			`except OpenMLServerException:`
			`print("OpenMLServerException raised")`
			`return`
coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00			`''' import AutoML class from flaml package '''`
			`from flaml import AutoML`
			`automl = AutoML()`
			`settings = {`
			`"time_budget": 5, # total running time in seconds`
support ROC and AUC for multi-class classification (#170) * support ROC and AUC for multi-class classification * add a test case to cover ROC and AUC for multi-class classification 2021-08-23 07:16:10 +09:00			`"metric": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']`
coverage (#135) * coverage * readme * timeout 2021-07-20 17:00:44 -07:00			`"estimator_list": ['lgbm', 'rf', 'xgboost'], # list of ML learners`
			`"task": 'classification', # task type`
			`"sample": False, # whether to subsample training data`
			`"log_file_name": 'adult.log', # flaml log file`
			`}`
			`mlflow.set_experiment("flaml")`
			`with mlflow.start_run():`
			`'''The main flaml automl API'''`
			`automl.fit(X_train=X_train, y_train=y_train, **settings)`
			`# subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"])`


			`if __name__ == "__main__":`
			`test_automl(300)`