From b7846048dcc4e78fa1733e29e4d3a2d2da818c2a Mon Sep 17 00:00:00 2001
From: Qingyun Wu
Date: Sat, 9 Jul 2022 16:04:46 -0400
Subject: [PATCH] Allow FLAML_sample_size in starting_points (#619)

* FLAML_sample_size
* clean up
* starting_points as a list
* catch AssertionError
* per estimator sample size
* import
* per estimator min_sample_size
* Update flaml/automl.py

Co-authored-by: Chi Wang

* Update test/automl/test_warmstart.py

Co-authored-by: Chi Wang

* add warnings
* adding more tests
* fix a bug in validating starting points
* improve test
* revise test
* revise test
* documentation about custom_hp
* doc and efficiency
* update test

Co-authored-by: Chi Wang
---
 flaml/automl.py                | 112 ++++++++++++++++++++++++++-------
 test/automl/test_multiclass.py |   3 +
 test/automl/test_warmstart.py  |  96 ++++++++++++++++++++++++++++
 test/nlp/test_default.py       |  16 ++++-
 4 files changed, 203 insertions(+), 24 deletions(-)

diff --git a/flaml/automl.py b/flaml/automl.py
index 3b065504a..55a27984c 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -102,13 +102,10 @@ class SearchState:
             return True

     def valid_starting_point(self, starting_point, search_space):
-        return any(
-            [
-                self.valid_starting_point_one_dim(
-                    value, search_space[name].get("domain")
-                )
-                for name, value in starting_point.items()
-            ]
+        return all(
+            self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
+            for name, value in starting_point.items()
+            if name != "FLAML_sample_size"
         )

     def __init__(
@@ -656,9 +653,17 @@ class AutoML(BaseEstimator):
                 the automl constructor, flaml will automatically (and under the hood)
                 add it as an additional element in the metric_constraints. Essentially
                 'pred_time_limit' specifies a constraint about the prediction latency
                 constraint in seconds.
-            custom_hp: dict, default=None | The custom search space specified by user
-                Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
-                domain of the custom search space can either be a value of a sample.Domain object.
+            custom_hp: dict, default=None | The custom search space specified by the user.
+                It is a nested dict: each key is an estimator name, and each value is a dict
+                of the search space for that estimator. In each per-estimator search space
+                dict, the keys are hyperparameter names and the values are dicts of info
+                ("domain", "init_value", and "low_cost_init_value") about the search space
+                associated with that hyperparameter (i.e., per-hyperparameter search space
+                dicts). When custom_hp is provided, the built-in search space, which is also
+                a nested dict of per-estimator search space dicts, is updated with custom_hp.
+                During this nested-dict update, the per-hyperparameter search space dicts are
+                replaced (rather than updated) by the ones provided in custom_hp. The value
+                for "domain" can be either a constant or a sample.Domain object.
                 e.g.,
                 ```python
@@ -2430,18 +2435,68 @@ class AutoML(BaseEstimator):
             eval_method == "holdout" and self._state.X_val is None
         )
         self._auto_augment = auto_augment
-        self._min_sample_size = min_sample_size
+
+        _sample_size_from_starting_points = {}
+        if isinstance(starting_points, dict):
+            for _estimator, _point_per_estimator in starting_points.items():
+                sample_size = (
+                    _point_per_estimator
+                    and isinstance(_point_per_estimator, dict)
+                    and _point_per_estimator.get("FLAML_sample_size")
+                )
+                if sample_size:
+                    _sample_size_from_starting_points[_estimator] = sample_size
+                elif _point_per_estimator and isinstance(_point_per_estimator, list):
+                    _sample_size_set = set(
+                        [
+                            config["FLAML_sample_size"]
+                            for config in _point_per_estimator
+                            if "FLAML_sample_size" in config
+                        ]
+                    )
+                    if _sample_size_set:
+                        _sample_size_from_starting_points[_estimator] = min(
+                            _sample_size_set
+                        )
+                    if len(_sample_size_set) > 1:
+                        logger.warning(
+                            "Using the min FLAML_sample_size of all the provided starting points for estimator {}. (Provided FLAML_sample_size are: {})".format(
+                                _estimator, _sample_size_set
+                            )
+                        )
+
+        if not sample and isinstance(starting_points, dict):
+            assert (
+                not _sample_size_from_starting_points
+            ), "When subsampling is disabled, do not include FLAML_sample_size in the starting point."
+        self._min_sample_size = _sample_size_from_starting_points or min_sample_size
+        self._min_sample_size_input = min_sample_size
         self._prepare_data(eval_method, split_ratio, n_splits)
-        self._sample = (
-            sample
-            and task != "rank"
-            and eval_method != "cv"
-            and (
-                self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
-                < self._state.data_size[0]
-            )
-        )
+        if isinstance(self._min_sample_size, dict):
+            self._sample = {
+                (
+                    k,
+                    sample
+                    and task != "rank"
+                    and eval_method != "cv"
+                    and (
+                        self._min_sample_size[k] * SAMPLE_MULTIPLY_FACTOR
+                        < self._state.data_size[0]
+                    ),
+                )
+                for k in self._min_sample_size.keys()
+            }
+        else:
+            self._sample = (
+                sample
+                and task != "rank"
+                and eval_method != "cv"
+                and (
+                    self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
+                    < self._state.data_size[0]
+                )
+            )
         if "auto" == metric:
             if _is_nlp_task(self._state.task):
                 from .nlp.utils import load_default_huggingface_metric_for_task
@@ -2752,6 +2807,16 @@ class AutoML(BaseEstimator):
             self._state.time_from_start = time.time() - self._start_time_flag
             time_left = self._state.time_budget - self._state.time_from_start
             if self._hpo_method != "optuna":
+                min_resource = self.min_resource
+                if isinstance(min_resource, dict):
+                    _min_resource_set = set(min_resource.values())
+                    min_resource_all_estimator = min(_min_resource_set)
+                    if len(_min_resource_set) > 1:
+                        logger.warning(
+                            "Using the min FLAML_sample_size of all the provided starting points as the starting sample size in the case of parallel search."
+                        )
+                else:
+                    min_resource_all_estimator = min_resource
                 search_alg = SearchAlgo(
                     metric="val_loss",
                     space=space,
@@ -2759,7 +2824,7 @@ class AutoML(BaseEstimator):
                     points_to_evaluate=self.points_to_evaluate,
                     cat_hp_cost=self.cat_hp_cost,
                     resource_attr=self.resource_attr,
-                    min_resource=self.min_resource,
+                    min_resource=min_resource_all_estimator,
                     max_resource=self.max_resource,
                     config_constraints=[
                         (partial(size, self._state), "<=", self._mem_thres)
@@ -2947,7 +3012,12 @@ class AutoML(BaseEstimator):
             search_space = search_state.search_space
             if self._sample:
                 resource_attr = "FLAML_sample_size"
-                min_resource = self._min_sample_size
+                min_resource = (
+                    self._min_sample_size[estimator]
+                    if isinstance(self._min_sample_size, dict)
+                    and estimator in self._min_sample_size
+                    else self._min_sample_size_input
+                )
                 max_resource = self._state.data_size[0]
             else:
                 resource_attr = min_resource = max_resource = None
diff --git a/test/automl/test_multiclass.py b/test/automl/test_multiclass.py
index 8e519db30..ab8987e59 100644
--- a/test/automl/test_multiclass.py
+++ b/test/automl/test_multiclass.py
@@ -474,8 +474,11 @@ class TestMultiClass(unittest.TestCase):
         starting_points = {}
         log_file_name = automl_settings["log_file_name"]
         with training_log_reader(log_file_name) as reader:
+            sample_size = 1000
             for record in reader.records():
                 config = record.config
+                config["FLAML_sample_size"] = sample_size
+                sample_size += 1000
                 learner = record.learner
                 if learner not in starting_points:
                     starting_points[learner] = []
diff --git a/test/automl/test_warmstart.py b/test/automl/test_warmstart.py
index c6eba5ac0..c443cf6ce 100644
--- a/test/automl/test_warmstart.py
+++ b/test/automl/test_warmstart.py
@@ -123,6 +123,102 @@ class TestWarmStart(unittest.TestCase):
         automl.fit(X_train, y_train)
         print(automl.best_config_per_estimator)

+    def test_FLAML_sample_size_in_starting_points(self):
+        from flaml.data import load_openml_dataset
+        from flaml import AutoML
+
+        X_train, X_test, y_train, y_test = load_openml_dataset(
+            dataset_id=1169, data_dir="./"
+        )
+
+        automl_settings = {
+            "time_budget": 3,
+            "task": "classification",
+        }
+
+        automl1 = AutoML()
+        print(len(y_train))
+        automl1.fit(X_train, y_train, **automl_settings)
+        print("automl1.best_config_per_estimator", automl1.best_config_per_estimator)
+
+        automl_settings["starting_points"] = automl1.best_config_per_estimator
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        automl_settings["starting_points"] = {
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
+                "learning_rate": 0.25912534572860507,
+                "subsample": 0.9266743941610592,
+                "colsample_bylevel": 1.0,
+                "colsample_bytree": 1.0,
+                "reg_alpha": 0.0013933617380144255,
+                "reg_lambda": 0.18096917948292954,
+                "FLAML_sample_size": 20000,
+            },
+            "xgb_limitdepth": None,
+            "lrl1": None,
+        }
+        from flaml import tune
+
+        automl_settings["custom_hp"] = {
+            "xgboost": {
+                "n_estimators": {
+                    "domain": tune.choice([10, 20]),
+                },
+            }
+        }
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        try:
+            import ray
+
+            automl_settings["n_concurrent_trials"] = 2
+        except ImportError:
+            automl_settings["n_concurrent_trials"] = 1
+        # setting different FLAML_sample_size
+        automl_settings["starting_points"] = {
+            "catboost": {
+                "early_stopping_rounds": 10,
+                "learning_rate": 0.09999999999999996,
+                "n_estimators": 1,
+                "FLAML_sample_size": 10000,
+            },
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
"learning_rate": 0.25912534572860507, + "subsample": 0.9266743941610592, + "colsample_bylevel": 1.0, + "colsample_bytree": 1.0, + "reg_alpha": 0.0013933617380144255, + "reg_lambda": 0.18096917948292954, + "FLAML_sample_size": 20000, + }, + "xgb_limitdepth": None, + "lrl1": None, + } + automl3 = AutoML() + automl3.fit(X_train, y_train, **automl_settings) + + automl_settings["sample"] = False + automl4 = AutoML() + try: + automl4.fit( + X_train, + y_train, + **automl_settings, + ) + raise RuntimeError( + "When sample=False and starting_points contain FLAML_sample_size, AssertionError is expected but not raised." + ) + except AssertionError: + pass + if __name__ == "__main__": unittest.main() diff --git a/test/nlp/test_default.py b/test/nlp/test_default.py index 8422b55f7..2118d10ea 100644 --- a/test/nlp/test_default.py +++ b/test/nlp/test_default.py @@ -57,15 +57,25 @@ def test_starting_point_not_in_search_space(): "learning_rate": { "domain": tune.choice([1e-4, 1e-5]), }, + "per_device_train_batch_size": { + "domain": 2, + }, } } automl_settings["starting_points"] = "data:test/nlp/default/" del automl_settings["fit_kwargs_by_estimator"][this_estimator_name]["model_path"] automl.fit(X_train, y_train, **automl_settings) - assert ( - len(automl._search_states[this_estimator_name].init_config) == 0 - ) # check that init config is not updated, but search space is updated + assert len(automl._search_states[this_estimator_name].init_config) == len( + automl._search_states[this_estimator_name]._search_space_domain + ) - len(automl_settings["custom_hp"][this_estimator_name]), ( + "The search space is updated with the custom_hp on {} hyperparameters of " + "the specified estimator without an initial value. Thus a valid init config " + "should only contain the cardinality of the search space minus {}".format( + len(automl_settings["custom_hp"][this_estimator_name]), + len(automl_settings["custom_hp"][this_estimator_name]), + ) + ) assert ( automl._search_states[this_estimator_name].search_space["model_path"] == "albert-base-v2"