import unittest

import numpy as np
from sklearn.datasets import load_iris

from flaml import AutoML
from flaml.model import LGBMEstimator
from flaml import tune


class TestWarmStart(unittest.TestCase):
    def test_fit_w_freezinghp_starting_point(self, as_frame=True):
        automl = AutoML()
        automl_settings = {
            "time_budget": 1,
            "metric": "accuracy",
            "task": "classification",
            "estimator_list": ["lgbm"],
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        if as_frame:
            # test drop column
            X_train.columns = range(X_train.shape[1])
            X_train[X_train.shape[1]] = np.zeros(len(y_train))
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
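        # FLAML minimizes a loss; for the "accuracy" metric the loss is 1 - accuracy,
        # so the validation accuracy can be recovered from best_loss.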
        automl_val_accuracy = 1.0 - automl.best_loss
        print("Best ML learner:", automl.best_estimator)
        print("Best hyperparameter config:", automl.best_config)
        print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
        print(
            "Training duration of best run: {0:.4g} s".format(
                automl.best_config_train_time
            )
        )
        # 1. Get starting points from previous experiments.
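        # best_config_per_estimator maps each estimator name to the best config found for
        # it (or None if it was never tried), and can be passed back to a later fit() call
        # via the starting_points argument.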
        starting_points = automl.best_config_per_estimator
        print("starting_points", starting_points)
        print("loss of the starting_points", automl.best_loss_per_estimator)
        starting_point = starting_points["lgbm"]
        hps_to_freeze = ["colsample_bytree", "reg_alpha", "reg_lambda", "log_max_bin"]

        # 2. Construct a new class:
        # a. write the hps you want to freeze as hps with a constant 'domain';
        # b. specify the new search space of the other hps accordingly.

        class MyPartiallyFreezedLargeLGBM(LGBMEstimator):
            @classmethod
            def search_space(cls, **params):
                # (1) Get the hps in the original search space
                space = LGBMEstimator.search_space(**params)
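                # each entry in the space is a dict with a 'domain' (a tune sampling spec,
                # or a constant value to pin the hp) plus optional 'init_value' and
                # 'low_cost_init_value' keys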
                # (2) Fix the values of the hps to freeze, taken from the starting point
                for hp_name in hps_to_freeze:
                    # if an hp is specified to be frozen, use the value provided in the
                    # starting_point; otherwise keep the setting from the original search space
                    if hp_name in starting_point:
                        space[hp_name] = {"domain": starting_point[hp_name]}
                # (3.1) Configure the search space for hps that are in the original search space
                # but for which you want to change something, for example the range.
                revised_hps_to_search = {
                    "n_estimators": {
                        "domain": tune.lograndint(lower=10, upper=32768),
                        "init_value": starting_point.get("n_estimators")
                        or space["n_estimators"].get("init_value", 10),
                        "low_cost_init_value": space["n_estimators"].get(
                            "low_cost_init_value", 10
                        ),
                    },
                    "num_leaves": {
                        "domain": tune.lograndint(lower=10, upper=3276),
                        "init_value": starting_point.get("num_leaves")
                        or space["num_leaves"].get("init_value", 10),
                        "low_cost_init_value": space["num_leaves"].get(
                            "low_cost_init_value", 10
                        ),
                    },
                    # (3.2) Add a new hp which is not in the original search space
                    "subsample": {
                        "domain": tune.uniform(lower=0.1, upper=1.0),
                        "init_value": 0.1,
                    },
                }
                space.update(revised_hps_to_search)
                return space

        new_estimator_name = "large_lgbm"
        new_automl = AutoML()
        new_automl.add_learner(
            learner_name=new_estimator_name, learner_class=MyPartiallyFreezedLargeLGBM
        )
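        # add_learner registers the custom estimator under "large_lgbm" so it can be
        # referenced by name in estimator_list and starting_points below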

        automl_settings_resume = {
            "time_budget": 3,
            "metric": "accuracy",
            "task": "classification",
            "estimator_list": [new_estimator_name],
            "log_file_name": "test/iris_resume.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
            "log_type": "all",
            "starting_points": {new_estimator_name: starting_point},
        }

        new_automl.fit(X_train=X_train, y_train=y_train, **automl_settings_resume)

        new_automl_val_accuracy = 1.0 - new_automl.best_loss
        print("Best ML learner:", new_automl.best_estimator)
        print("Best hyperparameter config:", new_automl.best_config)
        print(
            "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
        )
        print(
            "Training duration of best run: {0:.4g} s".format(
                new_automl.best_config_train_time
            )
        )

    def test_nobudget(self):
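        # no time_budget or max_iter is given; this only checks that fit() runs with its
        # defaults and that best_config_per_estimator can still be queried afterwards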
        automl = AutoML()
        X_train, y_train = load_iris(return_X_y=True)
        automl.fit(X_train, y_train)
        print(automl.best_config_per_estimator)

    def test_FLAML_sample_size_in_starting_points(self):
        from flaml.data import load_openml_dataset
        from flaml import AutoML

        X_train, X_test, y_train, y_test = load_openml_dataset(
            dataset_id=1169, data_dir="./"
        )
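        # dataset 1169 (OpenML "airlines") is large enough that FLAML subsamples the
        # training data, so the best configs may include a FLAML_sample_size entry
        # recording the sample size used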

        automl_settings = {
            "time_budget": 3,
            "task": "classification",
        }

        automl1 = AutoML()
        print(len(y_train))
        automl1.fit(X_train, y_train, **automl_settings)
        print("automl1.best_config_per_estimator", automl1.best_config_per_estimator)
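
        # reuse the best configs found by automl1 (which may contain FLAML_sample_size)
        # as starting points for a fresh AutoML run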
automl_settings["starting_points"] = automl1.best_config_per_estimator
|
|
automl2 = AutoML()
|
|
automl2.fit(X_train, y_train, **automl_settings)
|
|
|
|
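
        # starting points can also be written out explicitly as a dict mapping estimator
        # name to config; None means no starting point for that estimator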
automl_settings["starting_points"] = {
|
|
"xgboost": {
|
|
"n_estimators": 4,
|
|
"max_leaves": 4,
|
|
"min_child_weight": 0.26208115308159446,
|
|
"learning_rate": 0.25912534572860507,
|
|
"subsample": 0.9266743941610592,
|
|
"colsample_bylevel": 1.0,
|
|
"colsample_bytree": 1.0,
|
|
"reg_alpha": 0.0013933617380144255,
|
|
"reg_lambda": 0.18096917948292954,
|
|
"FLAML_sample_size": 20000,
|
|
},
|
|
"xgb_limitdepth": None,
|
|
"lrl1": None,
|
|
}

        from flaml import tune
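
        # custom_hp overrides part of the default search space of a named estimator;
        # here xgboost's n_estimators is restricted to the choice {10, 20}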
automl_settings["custom_hp"] = {
|
|
"xgboost": {
|
|
"n_estimators": {
|
|
"domain": tune.choice([10, 20]),
|
|
},
|
|
}
|
|
}
|
|
automl2 = AutoML()
|
|
automl2.fit(X_train, y_train, **automl_settings)
|
|
|
|
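
        # run two trials concurrently when ray is installed; otherwise fall back to a
        # sequential search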
        try:
            import ray

            automl_settings["n_concurrent_trials"] = 2
        except ImportError:
            automl_settings["n_concurrent_trials"] = 1
        # set a different FLAML_sample_size for each estimator
        automl_settings["starting_points"] = {
            "catboost": {
                "early_stopping_rounds": 10,
                "learning_rate": 0.09999999999999996,
                "n_estimators": 1,
                "FLAML_sample_size": 10000,
            },
            "xgboost": {
                "n_estimators": 4,
                "max_leaves": 4,
                "min_child_weight": 0.26208115308159446,
                "learning_rate": 0.25912534572860507,
                "subsample": 0.9266743941610592,
                "colsample_bylevel": 1.0,
                "colsample_bytree": 1.0,
                "reg_alpha": 0.0013933617380144255,
                "reg_lambda": 0.18096917948292954,
                "FLAML_sample_size": 20000,
            },
            "xgb_limitdepth": None,
            "lrl1": None,
        }
|
|
automl3 = AutoML()
|
|
automl3.fit(X_train, y_train, **automl_settings)
|
|
|
|
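
        # with sample=False FLAML trains on the full data, so starting points that carry
        # FLAML_sample_size are inconsistent and fit() is expected to raise an AssertionError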
automl_settings["sample"] = False
|
|
automl4 = AutoML()
|
|
try:
|
|
automl4.fit(
|
|
X_train,
|
|
y_train,
|
|
**automl_settings,
|
|
)
|
|
raise RuntimeError(
|
|
"When sample=False and starting_points contain FLAML_sample_size, AssertionError is expected but not raised."
|
|
)
|
|
except AssertionError:
|
|
pass


if __name__ == "__main__":
    unittest.main()