diff --git a/flaml/automl.py b/flaml/automl.py
index 08cd66519..bef754535 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -89,6 +89,7 @@ class SearchState:
                 and starting_point.get(name) is not None
             ):
                 self.init_config[name] = starting_point[name]
+
         if isinstance(starting_point, list):
             self.init_config = starting_point
         self._hp_names = list(self._search_space_domain.keys())
@@ -104,7 +105,6 @@ class SearchState:
         self.trained_estimator = None
         self.sample_size = None
         self.trial_time = 0
-        self.best_n_iter = None
 
     def update(self, result, time_used, save_model_history=False):
         if result:
@@ -122,13 +122,12 @@ class SearchState:
             if (
                 n_iter is not None
                 and "n_estimators" in config
-                and n_iter >= self._search_space_domain["n_estimators"].lower
+                # and n_iter >= self._search_space_domain["n_estimators"].lower
             ):
                 config["n_estimators"] = n_iter
-                n_iter = None
         else:
             obj, time2eval, trained_estimator = np.inf, 0.0, None
-            metric_for_logging = config = n_iter = None
+            metric_for_logging = config = None
         self.trial_time = time2eval
         self.total_time_used += time_used
         self.total_iter += 1
@@ -156,10 +155,8 @@ class SearchState:
                     self.trained_estimator.cleanup()
                 if trained_estimator:
                     self.trained_estimator = trained_estimator
-                    self.best_n_iter = n_iter
             self.metric_for_logging = metric_for_logging
         self.val_loss, self.config = obj, config
-        self.n_iter = n_iter
 
     def get_hist_config_sig(self, sample_size, config):
         config_values = tuple([config[k] for k in self._hp_names])
@@ -262,9 +259,7 @@ class AutoMLState:
         # tune.report(**result)
         return result
 
-    def _train_with_config(
-        self, estimator, config_w_resource, sample_size=None, n_iter=None
-    ):
+    def _train_with_config(self, estimator, config_w_resource, sample_size=None):
         if not sample_size:
             sample_size = config_w_resource.get(
                 "FLAML_sample_size", len(self.y_train_all)
             )
@@ -301,7 +296,6 @@ class AutoMLState:
             self.n_jobs,
             self.learner_classes.get(estimator),
             budget,
-            n_iter,
             self.fit_kwargs,
         )
         if sampled_weight is not None:
@@ -1030,7 +1024,7 @@ class AutoML:
         self._state.time_budget = None
         self._state.n_jobs = n_jobs
         self._trained_estimator = self._state._train_with_config(
-            best_estimator, best_config, sample_size, best.n_iter
+            best_estimator, best_config, sample_size
         )[0]
         logger.info("retrain from log succeeded")
         return training_duration
@@ -1731,7 +1725,6 @@ class AutoML:
                     config,
                     estimator,
                     search_state.sample_size,
-                    search_state.n_iter,
                 )
 
     def _search_sequential(self):
@@ -1953,7 +1946,6 @@ class AutoML:
                         search_state.config,
                         estimator,
                         search_state.sample_size,
-                        search_state.n_iter,
                     )
                 if mlflow is not None and mlflow.active_run():
                     with mlflow.start_run(nested=True):
@@ -2031,7 +2023,6 @@ class AutoML:
                 self._best_estimator,
                 state.best_config,
                 self.data_size_full,
-                state.best_n_iter,
             )
             logger.info(
                 "retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
             )
@@ -2144,7 +2135,6 @@ class AutoML:
                     self._best_estimator,
                     state.best_config,
                     self.data_size_full,
-                    state.best_n_iter,
                 )
                 logger.info(
                     "retrain {} for {:.1f}s".format(
diff --git a/flaml/ml.py b/flaml/ml.py
index b2d149565..3be9d8daf 100644
--- a/flaml/ml.py
+++ b/flaml/ml.py
@@ -465,14 +465,11 @@ def train_estimator(
     n_jobs=1,
     estimator_class=None,
     budget=None,
-    n_iter=None,
     fit_kwargs={},
 ):
     start_time = time.time()
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
-    if n_iter is not None:
-        estimator.params["n_estimators"] = n_iter
     if X_train is not None:
         train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
     else:
diff --git a/flaml/model.py b/flaml/model.py
index 2deaa4ccf..4de2fc2e0 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -645,11 +645,15 @@ class CatBoostEstimator(BaseEstimator):
                 "domain": tune.loguniform(lower=0.005, upper=0.2),
                 "init_value": 0.1,
             },
+            "n_estimators": {
+                "domain": 8192,
+                "init_value": 8192,
+            },
         }
 
     @classmethod
     def size(cls, config):
-        n_estimators = 8192
+        n_estimators = config.get("n_estimators", 8192)
         max_leaves = 64
         return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8
diff --git a/flaml/training_log.py b/flaml/training_log.py
index bcc9b43c9..a71488f03 100644
--- a/flaml/training_log.py
+++ b/flaml/training_log.py
@@ -23,7 +23,6 @@ class TrainingLogRecord(object):
         config: dict,
         learner: str,
         sample_size: int,
-        n_iter: int,
     ):
         self.record_id = record_id
         self.iter_per_learner = iter_per_learner
@@ -34,7 +33,6 @@ class TrainingLogRecord(object):
         self.config = config
         self.learner = learner
         self.sample_size = sample_size
-        self.n_iter = n_iter  # n_estimators for catboost
 
     def dump(self, fp: IO[str]):
         d = vars(self)
@@ -79,7 +77,6 @@ class TrainingLogWriter(object):
         config,
         learner,
         sample_size,
-        n_iter,
     ):
         if self.file is None:
             raise IOError("Call open() to open the outpute file first.")
@@ -95,7 +92,6 @@ class TrainingLogWriter(object):
             config,
             learner,
             sample_size,
-            n_iter,
         )
         if (
             validation_loss < self.current_best_loss
diff --git a/test/test_training_log.py b/test/test_training_log.py
index 69c435ef1..ac8fba170 100644
--- a/test/test_training_log.py
+++ b/test/test_training_log.py
@@ -9,7 +9,7 @@ from flaml.training_log import training_log_reader
 
 
 class TestTrainingLog(unittest.TestCase):
-    def test_training_log(self, path="test_training_log.log"):
+    def test_training_log(self, path="test_training_log.log", estimator_list="auto"):
 
         with TemporaryDirectory() as d:
             filename = os.path.join(d, path)
@@ -27,8 +27,9 @@ class TestTrainingLog(unittest.TestCase):
                 "model_history": True,
                 "train_time_limit": 0.1,
                 "verbose": 3,
-                "ensemble": True,
+                # "ensemble": True,
                 "keep_search_state": True,
+                "estimator_list": estimator_list,
             }
             X_train, y_train = fetch_california_housing(return_X_y=True)
             automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
@@ -37,31 +38,34 @@ class TestTrainingLog(unittest.TestCase):
             if automl.best_estimator:
                 estimator, config = automl.best_estimator, automl.best_config
                 model0 = automl.best_model_for_estimator(estimator)
-                print(model0.params["n_estimators"], model0.estimator)
+                print(model0.params["n_estimators"], config)
+                # train on full data with no time limit
                 automl._state.time_budget = None
                 model, _ = automl._state._train_with_config(estimator, config)
-                print(model.estimator)
-                # model0 and model are equivalent unless model0's n_estimator is out of search space range
-                assert (
-                    str(model0.estimator) == str(model.estimator)
-                    or model0.params["n_estimators"] < 4
-                )
                 # assuming estimator & config are saved and loaded as follows
                 automl = AutoML()
                 automl.fit(
                     X_train=X_train,
                     y_train=y_train,
-                    max_iter=0,
+                    max_iter=1,
                     task="regression",
                     estimator_list=[estimator],
                     n_jobs=1,
                     starting_points={estimator: config},
                 )
+                print(automl.best_config)
                 # then the fitted model should be equivalent to model
-                # print(str(model.estimator), str(automl.model.estimator))
-                assert str(model.estimator) == str(automl.model.estimator)
+                assert (
+                    str(model.estimator) == str(automl.model.estimator)
+                    or estimator == "xgboost"
+                    and str(model.estimator.get_dump())
+                    == str(automl.model.estimator.get_dump())
+                    or estimator == "catboost"
+                    and str(model.estimator.get_all_params())
+                    == str(automl.model.estimator.get_all_params())
+                )
 
             with training_log_reader(filename) as reader:
                 count = 0
@@ -83,3 +87,10 @@ class TestTrainingLog(unittest.TestCase):
                 print("IsADirectoryError happens as expected in linux.")
             except PermissionError:
                 print("PermissionError happens as expected in windows.")
+
+    def test_each_estimator(self):
+        self.test_training_log(estimator_list=["xgboost"])
+        self.test_training_log(estimator_list=["catboost"])
+        self.test_training_log(estimator_list=["extra_tree"])
+        self.test_training_log(estimator_list=["rf"])
+        self.test_training_log(estimator_list=["lgbm"])