From e3d26c0650ba3cf6b55ee9e9231cc54c680e9fb9 Mon Sep 17 00:00:00 2001
From: Qingyun Wu
Date: Sat, 20 Aug 2022 09:18:35 -0400
Subject: [PATCH 1/4] add guideline collection (#687)

* add guideline collection

* remove redundancy
---
 website/docs/FAQ.md                            | 9 ++++++++-
 website/docs/Use-Cases/Task-Oriented-AutoML.md | 3 ++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/website/docs/FAQ.md b/website/docs/FAQ.md
index 42ffac8ed..2fdbcd2fd 100644
--- a/website/docs/FAQ.md
+++ b/website/docs/FAQ.md
@@ -1,8 +1,15 @@
 # Frequently Asked Questions
 
+### [Guidelines on how to set a hyperparameter search space](Use-Cases/Tune-User-Defined-Function#details-and-guidelines-on-hyperparameter-search-space)
+
+### [Guidelines on parallel vs sequential tuning](Use-Cases/Task-Oriented-AutoML#guidelines-on-parallel-vs-sequential-tuning)
+
+### [Guidelines on creating and tuning a custom estimator](Use-Cases/Task-Oriented-AutoML#guidelines-on-tuning-a-custom-estimator)
+
+
 ### About `low_cost_partial_config` in `tune`.
 
-- Definition and purpose: The `low_cost_partial_config` is a dictionary of subset of the hyperparameter coordinates whose value corresponds to a configuration with known low-cost (i.e., low computation cost for training the corresponding model). The concept of low/high-cost is meaningful in the case where a subset of the hyperparameters to tune directly affects the computation cost for training the model. For example, `n_estimators` and `max_leaves` are known to affect the training cost of tree-based learners. We call this subset of hyperparameters, *cost-related hyperparameters*. In such scenarios, if you are aware of low-cost configurations for the cost-related hyperparameters, you are recommended to set them as the `low_cost_partial_config`. Using the tree-based method example again, since we know that small `n_estimators` and `max_leaves` generally correspond to simpler models and thus lower cost, we set `{'n_estimators': 4, 'max_leaves': 4}` as the `low_cost_partial_config` by default (note that `4` is the lower bound of search space for these two hyperparameters), e.g., in [LGBM](https://github.com/microsoft/FLAML/blob/main/flaml/model.py#L215). Configuring `low_cost_partial_config` helps the search algorithms make more cost-efficient choices.
+- Definition and purpose: The `low_cost_partial_config` is a dictionary of a subset of the hyperparameter coordinates whose value corresponds to a configuration with known low cost (i.e., low computation cost for training the corresponding model). The concept of low/high cost is meaningful in the case where a subset of the hyperparameters to tune directly affects the computation cost for training the model. For example, `n_estimators` and `max_leaves` are known to affect the training cost of tree-based learners. We call this subset of hyperparameters *cost-related hyperparameters*. In such scenarios, if you are aware of low-cost configurations for the cost-related hyperparameters, you are recommended to set them as the `low_cost_partial_config`. Using the tree-based method example again, since we know that small `n_estimators` and `max_leaves` generally correspond to simpler models and thus lower cost, we set `{'n_estimators': 4, 'max_leaves': 4}` as the `low_cost_partial_config` by default (note that `4` is the lower bound of the search space for these two hyperparameters), e.g., in [LGBM](https://github.com/microsoft/FLAML/blob/main/flaml/model.py#L215). Configuring `low_cost_partial_config` helps the search algorithms make more cost-efficient choices. In AutoML, the `low_cost_init_value` in the `search_space()` function for each estimator serves the same role.
 
 - Usage in practice: It is recommended to configure it if there are cost-related hyperparameters in your tuning task and you happen to know the low-cost values for them, but it is not required (it is fine to leave it at the default value, i.e., `None`).
 
diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md
index 6e427df7d..6752de7eb 100644
--- a/website/docs/Use-Cases/Task-Oriented-AutoML.md
+++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md
@@ -125,8 +125,9 @@ The estimator list can contain one or more estimator names, each corresponding t
 - tuning an estimator that is not built-in;
 - customizing search space for a built-in estimator.
 
-To tune a custom estimator that is not built-in, you need to:
+#### Guidelines on tuning a custom estimator
 
+To tune a custom estimator that is not built-in, you need to:
 1. Build a custom estimator by inheriting [`flaml.model.BaseEstimator`](../reference/model#baseestimator-objects) or a derived class.
 For example, if you have an estimator class with scikit-learn style `fit()` and `predict()` functions, you only need to set `self.estimator_class` to be that class in your constructor.

From 47e034d2032c0e369024caec681ff72a11d9023c Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Sat, 20 Aug 2022 07:43:06 -0700
Subject: [PATCH 2/4] LightGBM notebook update (#690)

* version update in notebook

* comment about optuna install

* monotone constraints
---
 notebook/automl_lightgbm.ipynb                 | 3 ++-
 website/docs/Use-Cases/Task-Oriented-AutoML.md | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/notebook/automl_lightgbm.ipynb b/notebook/automl_lightgbm.ipynb
index 41610a31e..3b76e39c0 100644
--- a/notebook/automl_lightgbm.ipynb
+++ b/notebook/automl_lightgbm.ipynb
@@ -39,7 +39,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install flaml[notebook]==1.0.8"
+    "%pip install flaml[notebook]==1.0.10"
    ]
   },
   {
@@ -651,6 +651,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# uncomment the following line if optuna is not installed\n",
     "# %pip install optuna==2.8.0"
    ]
   },
diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md
index 6752de7eb..de7c11820 100644
--- a/website/docs/Use-Cases/Task-Oriented-AutoML.md
+++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md
@@ -281,7 +281,9 @@ Some constraints on the estimator can be implemented via the custom learner. For
 class MonotonicXGBoostEstimator(XGBoostSklearnEstimator):
     @classmethod
     def search_space(**args):
-        return super().search_space(**args).update({"monotone_constraints": "(1, -1)"})
+        space = super().search_space(**args)
+        space.update({"monotone_constraints": {"domain": "(1, -1)"}})
+        return space
 ```
 It adds a monotonicity constraint to XGBoost. This approach can be used to set any constraint that is an argument in the underlying estimator's constructor.
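
As a concrete illustration of the `low_cost_partial_config` guidance referenced in patch 1, a minimal, self-contained `flaml.tune.run` sketch could look like the following; the toy objective function, search ranges, and sample budget are illustrative assumptions, not part of the patch.

```python
from flaml import tune


def evaluate_config(config):
    # Stand-in objective: in a real tuning task this would train a model whose
    # training cost grows with n_estimators and max_leaves.
    score = (config["n_estimators"] - 100) ** 2 + (config["max_leaves"] - 32) ** 2
    return {"score": score}


analysis = tune.run(
    evaluate_config,
    config={
        "n_estimators": tune.lograndint(lower=4, upper=1000),
        "max_leaves": tune.lograndint(lower=4, upper=1000),
    },
    # start the search from the cheap corner of the cost-related hyperparameters
    low_cost_partial_config={"n_estimators": 4, "max_leaves": 4},
    metric="score",
    mode="min",
    num_samples=20,
)
print(analysis.best_config)
```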
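The "Guidelines on tuning a custom estimator" section that patch 1 links to boils down to two steps: subclass a FLAML estimator and register it with `add_learner`. A hedged sketch under those guidelines is below; the wrapper name `MyExtraTreesEstimator`, the wrapped scikit-learn class, and the search-space values are illustrative choices, not something defined in the patch.

```python
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

from flaml import AutoML, tune
from flaml.model import SKLearnEstimator


class MyExtraTreesEstimator(SKLearnEstimator):
    """Illustrative custom learner wrapping a scikit-learn style estimator."""

    def __init__(self, task="regression", **config):
        super().__init__(task, **config)
        # the wrapped class only needs scikit-learn style fit()/predict()
        self.estimator_class = ExtraTreesRegressor

    @classmethod
    def search_space(cls, data_size, task):
        # each hyperparameter: a domain plus optional init and low-cost init values
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=256),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_leaf_nodes": {
                "domain": tune.lograndint(lower=4, upper=256),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
        }


# synthetic data, only to make the sketch runnable
X, y = np.random.random((200, 4)), np.random.random(200)
automl = AutoML()
automl.add_learner(learner_name="my_extra_trees", learner_class=MyExtraTreesEstimator)
automl.fit(
    X_train=X,
    y_train=y,
    task="regression",
    estimator_list=["my_extra_trees"],
    max_iter=2,
    time_budget=None,
)
print(automl.best_config)
```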
From 3d1a28bfc04b6230badf1099474f8af38a5b6d1a Mon Sep 17 00:00:00 2001
From: Xueqing Liu
Date: Sat, 20 Aug 2022 18:17:10 -0400
Subject: [PATCH 3/4] Add preserve_checkpoint to preserve the checkpoint after del (#692)

* fix del bug
---
 flaml/automl.py         | 32 +++++++++++++++++++++++++++++---
 test/nlp/test_autohf.py |  3 +++
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/flaml/automl.py b/flaml/automl.py
index b959437f0..381a61a0a 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -627,6 +627,8 @@ class AutoML(BaseEstimator):
             keep_search_state: boolean, default=False | Whether to keep data needed
                 for model search after fit(). By default the state is deleted for space saving.
+            preserve_checkpoint: boolean, default=True | Whether to preserve the saved checkpoint
+                on disk when deleting automl. By default the checkpoint is preserved.
             early_stop: boolean, default=False | Whether to stop early if the search
                 is considered to converge.
             append_log: boolean, default=False | Whether to directly append the log
@@ -726,6 +728,7 @@ class AutoML(BaseEstimator):
         settings["starting_points"] = settings.get("starting_points", "static")
         settings["n_concurrent_trials"] = settings.get("n_concurrent_trials", 1)
         settings["keep_search_state"] = settings.get("keep_search_state", False)
+        settings["preserve_checkpoint"] = settings.get("preserve_checkpoint", True)
         settings["early_stop"] = settings.get("early_stop", False)
         settings["append_log"] = settings.get("append_log", False)
         settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN)
@@ -1576,6 +1579,7 @@ class AutoML(BaseEstimator):
         auto_augment=None,
         custom_hp=None,
         skip_transform=None,
+        preserve_checkpoint=True,
         fit_kwargs_by_estimator=None,
         **fit_kwargs,
     ):
@@ -1704,10 +1708,19 @@ class AutoML(BaseEstimator):
         self._state.fit_kwargs = fit_kwargs
         self._state.custom_hp = custom_hp or self._settings.get("custom_hp")
-        self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform
+        self._skip_transform = (
+            self._settings.get("skip_transform")
+            if skip_transform is None
+            else skip_transform
+        )
         self._state.fit_kwargs_by_estimator = (
             fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
         )
+        self.preserve_checkpoint = (
+            self._settings.get("preserve_checkpoint")
+            if preserve_checkpoint is None
+            else preserve_checkpoint
+        )
         self._validate_data(X_train, y_train, dataframe, label, groups=groups)
 
         logger.info("log file name {}".format(log_file_name))
@@ -2123,6 +2136,7 @@ class AutoML(BaseEstimator):
         seed=None,
         n_concurrent_trials=None,
         keep_search_state=None,
+        preserve_checkpoint=True,
         early_stop=None,
         append_log=None,
         auto_augment=None,
@@ -2303,6 +2317,8 @@ class AutoML(BaseEstimator):
             keep_search_state: boolean, default=False | Whether to keep data needed
                 for model search after fit(). By default the state is deleted for space saving.
+            preserve_checkpoint: boolean, default=True | Whether to preserve the saved checkpoint
+                on disk when deleting automl. By default the checkpoint is preserved.
             early_stop: boolean, default=False | Whether to stop early if the search
                 is considered to converge.
             append_log: boolean, default=False | Whether to directly append the log
@@ -2464,6 +2480,11 @@ class AutoML(BaseEstimator):
             if keep_search_state is None
             else keep_search_state
         )
+        self.preserve_checkpoint = (
+            self._settings.get("preserve_checkpoint")
+            if preserve_checkpoint is None
+            else preserve_checkpoint
+        )
         early_stop = (
             self._settings.get("early_stop") if early_stop is None else early_stop
         )
@@ -2513,7 +2534,11 @@ class AutoML(BaseEstimator):
         self._state.fit_kwargs = fit_kwargs
         custom_hp = custom_hp or self._settings.get("custom_hp")
-        self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform
+        self._skip_transform = (
+            self._settings.get("skip_transform")
+            if skip_transform is None
+            else skip_transform
+        )
         fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get(
             "fit_kwargs_by_estimator"
         )
@@ -3566,7 +3591,8 @@ class AutoML(BaseEstimator):
             and self._trained_estimator
             and hasattr(self._trained_estimator, "cleanup")
         ):
-            self._trained_estimator.cleanup()
+            if self.preserve_checkpoint is False:
+                self._trained_estimator.cleanup()
         del self._trained_estimator
 
     def _select_estimator(self, estimator_list):
diff --git a/test/nlp/test_autohf.py b/test/nlp/test_autohf.py
index 6e59bcf2d..ee0ab693f 100644
--- a/test/nlp/test_autohf.py
+++ b/test/nlp/test_autohf.py
@@ -13,6 +13,7 @@ def test_hf_data():
     automl = AutoML()
 
     automl_settings = get_automl_settings()
+    automl_settings["preserve_checkpoint"] = False
 
     try:
         automl.fit(
@@ -68,6 +69,8 @@ def test_hf_data():
     automl.predict_proba(X_test)
     print(automl.classes_)
 
+    del automl
+
 
 if __name__ == "__main__":
     test_hf_data()

From dffa802b3e377ea3dc591f0c7d99f3b3ccc1a26d Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Sat, 20 Aug 2022 18:38:56 -0700
Subject: [PATCH 4/4] use_best_model for catboost (#679)

* use_best_model for catboost

* bump version to 1.0.11
---
 flaml/model.py                 | 31 ++++++++++++++--------
 flaml/version.py               |  2 +-
 test/automl/test_regression.py | 47 +++++++++++++++++++++-------------
 3 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/flaml/model.py b/flaml/model.py
index 4a7825f1a..890f06c6b 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -1626,15 +1626,26 @@ class CatBoostEstimator(BaseEstimator):
             cat_features = list(X_train.select_dtypes(include="category").columns)
         else:
             cat_features = []
-        n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
+        use_best_model = kwargs.get("use_best_model", True)
+        n = (
+            max(int(len(y_train) * 0.9), len(y_train) - 1000)
+            if use_best_model
+            else len(y_train)
+        )
         X_tr, y_tr = X_train[:n], y_train[:n]
+        from catboost import Pool, __version__
+
+        eval_set = (
+            Pool(data=X_train[n:], label=y_train[n:], cat_features=cat_features)
+            if use_best_model
+            else None
+        )
         if "sample_weight" in kwargs:
             weight = kwargs["sample_weight"]
             if weight is not None:
                 kwargs["sample_weight"] = weight[:n]
         else:
             weight = None
-        from catboost import Pool, __version__
 
         model = self.estimator_class(train_dir=train_dir, **self.params)
         if __version__ >= "0.26":
             model.fit(
                 X_tr,
                 y_tr,
                 cat_features=cat_features,
-                eval_set=Pool(
-                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
+                eval_set=eval_set,
+                callbacks=CatBoostEstimator._callbacks(
+                    start_time, deadline, FREE_MEM_RATIO if use_best_model else None
                 ),
-                callbacks=CatBoostEstimator._callbacks(start_time, deadline),
                 **kwargs,
             )
         else:
             model.fit(
                 X_tr,
                 y_tr,
                 cat_features=cat_features,
-                eval_set=Pool(
-                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
-                ),
+                eval_set=eval_set,
                 **kwargs,
             )
         shutil.rmtree(train_dir, ignore_errors=True)
@@ -1667,7 +1676,7 @@ class CatBoostEstimator(BaseEstimator):
         return train_time
 
     @classmethod
-    def _callbacks(cls, start_time, deadline):
+    def _callbacks(cls, start_time, deadline, free_mem_ratio):
         class ResourceLimit:
             def after_iteration(self, info) -> bool:
                 now = time.time()
@@ -1675,9 +1684,9 @@ class CatBoostEstimator(BaseEstimator):
                     self._time_per_iter = now - start_time
                 if now + self._time_per_iter > deadline:
                     return False
-                if psutil is not None:
+                if psutil is not None and free_mem_ratio is not None:
                     mem = psutil.virtual_memory()
-                    if mem.available / mem.total < FREE_MEM_RATIO:
+                    if mem.available / mem.total < free_mem_ratio:
                         return False
                 return True  # can continue
 
diff --git a/flaml/version.py b/flaml/version.py
index 9fd0f8dd6..9eb1ebec5 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "1.0.10"
+__version__ = "1.0.11"
diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py
index 47869ee34..0aca92f15 100644
--- a/test/automl/test_regression.py
+++ b/test/automl/test_regression.py
@@ -98,8 +98,8 @@ class TestRegression(unittest.TestCase):
         y_train = np.random.uniform(size=300)
         X_val = scipy.sparse.random(100, 900, density=0.0001)
         y_val = np.random.uniform(size=100)
-        automl_experiment = AutoML()
-        automl_settings = {
+        automl = AutoML()
+        settings = {
             "time_budget": 2,
             "metric": "mae",
             "task": "regression",
@@ -110,23 +110,34 @@ class TestRegression(unittest.TestCase):
             "verbose": 0,
             "early_stop": True,
         }
-        automl_experiment.fit(
-            X_train=X_train,
-            y_train=y_train,
-            X_val=X_val,
-            y_val=y_val,
-            **automl_settings
+        automl.fit(
+            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
+        )
+        assert automl._state.X_val.shape == X_val.shape
+        print(automl.predict(X_train))
+        print(automl.model)
+        print(automl.config_history)
+        print(automl.best_model_for_estimator("rf"))
+        print(automl.best_iteration)
+        print(automl.best_estimator)
+        print(automl.best_config)
+        print(automl.best_loss)
+        print(automl.best_config_train_time)
+
+        settings.update(
+            {
+                "estimator_list": ["catboost"],
+                "keep_search_state": False,
+                "model_history": False,
+                "use_best_model": False,
+                "time_budget": None,
+                "max_iter": 2,
+                "custom_hp": {"catboost": {"n_estimators": {"domain": 100}}},
+            }
+        )
+        automl.fit(
+            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
         )
-        assert automl_experiment._state.X_val.shape == X_val.shape
-        print(automl_experiment.predict(X_train))
-        print(automl_experiment.model)
-        print(automl_experiment.config_history)
-        print(automl_experiment.best_model_for_estimator("rf"))
-        print(automl_experiment.best_iteration)
-        print(automl_experiment.best_estimator)
-        print(automl_experiment.best_config)
-        print(automl_experiment.best_loss)
-        print(automl_experiment.best_config_train_time)
 
     def test_parallel(self, hpo_method=None):
         automl_experiment = AutoML()
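
For patch 3, the new `preserve_checkpoint` option is just another AutoML setting, passed the same way the updated NLP test passes it. A minimal sketch of the intended usage is below; the placeholder data, metric, and budget are illustrative assumptions, not the actual test fixture.

```python
from flaml import AutoML

automl = AutoML()
automl_settings = {
    "task": "seq-classification",  # tasks using TransformersEstimator write checkpoints to disk
    "metric": "accuracy",
    "time_budget": 100,
    "preserve_checkpoint": False,  # new flag in this patch; the default True keeps the checkpoint
}
# X_train / y_train are placeholders for a real text classification dataset
# automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
# del automl  # with preserve_checkpoint=False, the trained estimator's checkpoint is cleaned up on deletion
```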
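For patch 4, the updated regression test suggests that `use_best_model` reaches `CatBoostEstimator.fit()` as an ordinary fit keyword. The sketch below assumes `catboost` is installed and substitutes synthetic dense data for the sparse matrices used in the test.

```python
import numpy as np

from flaml import AutoML

# synthetic regression data, only to make the sketch runnable
X_train = np.random.random((300, 5))
y_train = np.random.random(300)

automl = AutoML()
settings = {
    "task": "regression",
    "metric": "mae",
    "estimator_list": ["catboost"],
    "max_iter": 2,
    "time_budget": None,
    # forwarded to CatBoostEstimator.fit(); when False, CatBoost trains on all rows
    # and skips the eval_set-based best-model selection and memory check added in this patch
    "use_best_model": False,
}
automl.fit(X_train=X_train, y_train=y_train, **settings)
print(automl.best_config)
```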