# autogen/test/automl/test_warmstart.py

import unittest
import numpy as np
from sklearn.datasets import load_iris
from flaml import AutoML
from flaml.model import LGBMEstimator
from flaml import tune


class TestWarmStart(unittest.TestCase):
    """Tests that exercise ``AutoML.fit`` with the ``starting_points`` setting."""

    @staticmethod
    def _print_best(automl):
        """Print the best learner, its config, accuracy and training time.

        Args:
            automl: A fitted ``AutoML`` instance. Accuracy is reported as
                ``1.0 - automl.best_loss``.
        """
        print("Best ML leaner:", automl.best_estimator)
        print("Best hyperparmeter config:", automl.best_config)
        print(
            "Best accuracy on validation data: {0:.4g}".format(
                1.0 - automl.best_loss
            )
        )
        print(
            "Training duration of best run: {0:.4g} s".format(
                automl.best_config_train_time
            )
        )

    @staticmethod
    def _xgboost_starting_point():
        """Return a new xgboost starting-point config with ``FLAML_sample_size``.

        A fresh dict is built on every call so separate ``fit`` calls never
        share the same config object.
        """
        return {
            "n_estimators": 4,
            "max_leaves": 4,
            "min_child_weight": 0.26208115308159446,
            "learning_rate": 0.25912534572860507,
            "subsample": 0.9266743941610592,
            "colsample_bylevel": 1.0,
            "colsample_bytree": 1.0,
            "reg_alpha": 0.0013933617380144255,
            "reg_lambda": 0.18096917948292954,
            "FLAML_sample_size": 20000,
        }

    def test_fit_w_freezinghp_starting_point(self, as_frame=True):
        """Resume a search with a custom learner that freezes some hps.

        A first run tunes ``lgbm`` on iris. Its best config is then used both
        as the starting point and as the source of constant values for the
        frozen hyperparameters of a custom ``LGBMEstimator`` subclass, which
        is fitted in a second run.

        Args:
            as_frame: Whether to load iris as a DataFrame; when true, an extra
                all-zero column is appended to the features.
        """
        automl = AutoML()
        automl_settings = {
            "time_budget": 1,
            "metric": "accuracy",
            "task": "classification",
            "estimator_list": ["lgbm"],
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        if as_frame:
            # test drop column
            X_train.columns = range(X_train.shape[1])
            X_train[X_train.shape[1]] = np.zeros(len(y_train))
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        self._print_best(automl)

        # 1. Get starting points from previous experiments.
        starting_points = automl.best_config_per_estimator
        print("starting_points", starting_points)
        print("loss of the starting_points", automl.best_loss_per_estimator)
        starting_point = starting_points["lgbm"]
        hps_to_freeze = ["colsample_bytree", "reg_alpha", "reg_lambda", "log_max_bin"]

        # 2. Construct a new class:
        # a. write the hps you want to freeze as hps with constant 'domain';
        # b. specify the new search space of the other hps accordingly.

        class MyPartiallyFreezedLargeLGBM(LGBMEstimator):
            """LGBM learner with some hps frozen and a revised search space."""

            @classmethod
            def search_space(cls, **params):
                """Return the LGBM search space with frozen and revised hps."""
                # (1) Get the hps in the original search space
                space = LGBMEstimator.search_space(**params)
                # (2) Set up the fixed value for hps from the starting point
                for hp_name in hps_to_freeze:
                    # if an hp is specified to be frozen, use the value provided
                    # in the starting_point; otherwise keep the setting from
                    # the original search space
                    if hp_name in starting_point:
                        space[hp_name] = {"domain": starting_point[hp_name]}
                # (3.1) Configure the search space for hps that are in the
                #  original search space but you want to change something,
                #  for example the range.
                revised_hps_to_search = {
                    "n_estimators": {
                        "domain": tune.lograndint(lower=10, upper=32768),
                        "init_value": starting_point.get("n_estimators")
                        or space["n_estimators"].get("init_value", 10),
                        "low_cost_init_value": space["n_estimators"].get(
                            "low_cost_init_value", 10
                        ),
                    },
                    "num_leaves": {
                        "domain": tune.lograndint(lower=10, upper=3276),
                        "init_value": starting_point.get("num_leaves")
                        or space["num_leaves"].get("init_value", 10),
                        "low_cost_init_value": space["num_leaves"].get(
                            "low_cost_init_value", 10
                        ),
                    },
                    # (3.2) Add a new hp which is not in the original search space
                    "subsample": {
                        "domain": tune.uniform(lower=0.1, upper=1.0),
                        "init_value": 0.1,
                    },
                }
                space.update(revised_hps_to_search)
                return space

        new_estimator_name = "large_lgbm"
        new_automl = AutoML()
        new_automl.add_learner(
            learner_name=new_estimator_name, learner_class=MyPartiallyFreezedLargeLGBM
        )

        automl_settings_resume = {
            "time_budget": 3,
            "metric": "accuracy",
            "task": "classification",
            "estimator_list": [new_estimator_name],
            "log_file_name": "test/iris_resume.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
            "log_type": "all",
            "starting_points": {new_estimator_name: starting_point},
        }

        new_automl.fit(X_train=X_train, y_train=y_train, **automl_settings_resume)
        self._print_best(new_automl)

    def test_nobudget(self):
        """Fit on iris with default settings and print the best configs."""
        automl = AutoML()
        X_train, y_train = load_iris(return_X_y=True)
        automl.fit(X_train, y_train)
        print(automl.best_config_per_estimator)

    def test_FLAML_sample_size_in_starting_points(self):
        """Fit repeatedly with starting points that carry ``FLAML_sample_size``.

        The same settings dict is extended step by step: starting points taken
        from a previous run, hand-written starting points plus ``custom_hp``,
        differing sample sizes across estimators, and finally ``sample=False``,
        for which ``fit`` is expected to raise ``AssertionError``.
        """
        from flaml.data import load_openml_dataset

        # Only the training split is used by this test.
        X_train, _, y_train, _ = load_openml_dataset(dataset_id=1169, data_dir="./")

        automl_settings = {
            "time_budget": 3,
            "task": "classification",
        }

        automl1 = AutoML()
        print(len(y_train))
        automl1.fit(X_train, y_train, **automl_settings)
        print("automl1.best_config_per_estimator", automl1.best_config_per_estimator)

        # Warm-start from the configs found by the first run.
        automl_settings["starting_points"] = automl1.best_config_per_estimator
        automl2 = AutoML()
        automl2.fit(X_train, y_train, **automl_settings)

        # Hand-written starting points combined with a custom search space.
        automl_settings["starting_points"] = {
            "xgboost": self._xgboost_starting_point(),
            "xgb_limitdepth": None,
            "lrl1": None,
        }
        automl_settings["custom_hp"] = {
            "xgboost": {
                "n_estimators": {
                    "domain": tune.choice([10, 20]),
                },
            }
        }
        automl2 = AutoML()
        automl2.fit(X_train, y_train, **automl_settings)

        try:
            # Imported only to probe availability; the name itself is unused.
            import ray  # noqa: F401

            automl_settings["n_concurrent_trials"] = 2
        except ImportError:
            automl_settings["n_concurrent_trials"] = 1
        # setting different FLAML_sample_size
        automl_settings["starting_points"] = {
            "catboost": {
                "early_stopping_rounds": 10,
                "learning_rate": 0.09999999999999996,
                "n_estimators": 1,
                "FLAML_sample_size": 10000,
            },
            "xgboost": self._xgboost_starting_point(),
            "xgb_limitdepth": None,
            "lrl1": None,
        }
        automl3 = AutoML()
        automl3.fit(X_train, y_train, **automl_settings)

        automl_settings["sample"] = False
        automl4 = AutoML()
        with self.assertRaises(
            AssertionError,
            msg="When sample=False and starting_points contain FLAML_sample_size, AssertionError is expected but not raised.",
        ):
            automl4.fit(
                X_train,
                y_train,
                **automl_settings,
            )


# Run the tests through unittest's command-line runner when this file is
# executed directly as a script.
if __name__ == "__main__":
    unittest.main()