From c57954fbbd13e2e66a5bba9d0f2fb3555c4f945f Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Fri, 3 Dec 2021 09:15:21 -0800
Subject: [PATCH] include default value in rf search space (#317)

* include default value in rf search space

* init _mem_per_iter with -1

* bump version to 0.8.2

* docstr for search space's arguments
---
 flaml/automl.py                | 41 +++++++++++++++--------------
 flaml/model.py                 | 47 ++++++++++++++++++++--------------
 flaml/version.py               |  2 +-
 notebook/flaml_automl.ipynb    |  6 ++---
 test/automl/test_multiclass.py |  4 +--
 test/automl/test_xgboost2d.py  |  2 +-
 6 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/flaml/automl.py b/flaml/automl.py
index a712a0bcd..a0e173409 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -119,7 +119,7 @@ class SearchState:
             if config and "FLAML_sample_size" in config:
                 self.sample_size = config["FLAML_sample_size"]
             else:
-                self.sample_size = self.data_size
+                self.sample_size = self.data_size[0]
             obj = result["val_loss"]
             metric_for_logging = result["metric_for_logging"]
             time2eval = result["time_total_s"]
@@ -181,7 +181,7 @@
 class AutoMLState:
     def _prepare_sample_train_data(self, sample_size):
         sampled_weight = groups = None
-        if sample_size <= self.data_size:
+        if sample_size <= self.data_size[0]:
             if isinstance(self.X_train, pd.DataFrame):
                 sampled_X_train = self.X_train.iloc[:sample_size]
             else:
@@ -205,7 +205,7 @@ class AutoMLState:
         if "FLAML_sample_size" in config_w_resource:
             sample_size = int(config_w_resource["FLAML_sample_size"])
         else:
-            sample_size = self.data_size
+            sample_size = self.data_size[0]
         (
             sampled_X_train,
             sampled_y_train,
@@ -226,11 +226,11 @@ class AutoMLState:
             None
             if self.time_budget is None
             else self.time_budget - self.time_from_start
-            if sample_size == self.data_size
+            if sample_size == self.data_size[0]
             else (self.time_budget - self.time_from_start)
             / 2
             * sample_size
-            / self.data_size
+            / self.data_size[0]
         )

         if _is_nlp_task(self.task):
@@ -1122,7 +1122,7 @@ class AutoML(BaseEstimator):
                 test_size=split_ratio,
                 random_state=RANDOM_SEED,
             )
-        self._state.data_size = X_train.shape[0]
+        self._state.data_size = X_train.shape
         self.data_size_full = len(y_train_all)
         self._state.X_train, self._state.y_train = X_train, y_train
         self._state.X_val, self._state.y_val = X_val, y_val
@@ -1555,7 +1555,7 @@ class AutoML(BaseEstimator):
         Returns:
             A float for the maximal sample size or None.
         """
-        return self._state.data_size if self._sample else None
+        return self._state.data_size[0] if self._sample else None

     @property
     def trainable(self) -> Callable[[dict], Optional[float]]:
@@ -1965,7 +1965,10 @@ class AutoML(BaseEstimator):
             sample
             and task != "rank"
             and eval_method != "cv"
-            and (self._min_sample_size * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
+            and (
+                self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
+                < self._state.data_size[0]
+            )
         )
         if "auto" == metric:
             if "binary" in self._state.task:
@@ -2205,7 +2208,7 @@ class AutoML(BaseEstimator):
                 search_state.update(result, 0)
                 if result["wall_clock_time"] is not None:
                     self._state.time_from_start = result["wall_clock_time"]
-                if search_state.sample_size == self._state.data_size:
+                if search_state.sample_size == self._state.data_size[0]:
                     self._iter_per_learner[estimator] += 1
                     if not self._fullsize_reached:
                         self._fullsize_reached = True
@@ -2275,7 +2278,7 @@ class AutoML(BaseEstimator):
             self._max_iter = 0
             self._best_estimator = estimator = self.estimator_list[0]
             self._selected = state = self._search_states[estimator]
-            state.best_config_sample_size = self._state.data_size
+            state.best_config_sample_size = self._state.data_size[0]
             state.best_config = (
                 state.init_config
                 if isinstance(state.init_config, dict)
@@ -2298,7 +2301,7 @@ class AutoML(BaseEstimator):
                 or better
                 or (not self.best_estimator)
                 or self._search_states[self.best_estimator].sample_size
-                < self._state.data_size
+                < self._state.data_size[0]
                 else time_left - est_retrain_time
             )
             if not search_state.search_alg:
@@ -2309,7 +2312,7 @@ class AutoML(BaseEstimator):
                 if self._sample:
                     prune_attr = "FLAML_sample_size"
                     min_resource = self._min_sample_size
-                    max_resource = self._state.data_size
+                    max_resource = self._state.data_size[0]
                 else:
                     prune_attr = min_resource = max_resource = None
                 learner_class = self._state.learner_classes.get(estimator)
@@ -2398,7 +2401,7 @@ class AutoML(BaseEstimator):
                 min_budget = max(10 * self._eci[0], sum(self._eci))
                 max_budget = 10000 * self._eci[0]
                 if search_state.sample_size:
-                    ratio = search_state.data_size / search_state.sample_size
+                    ratio = search_state.data_size[0] / search_state.sample_size
                     min_budget *= ratio
                     max_budget *= ratio
                 logger.info(
@@ -2408,7 +2411,7 @@ class AutoML(BaseEstimator):
             if result["wall_clock_time"] is not None:
                 self._state.time_from_start = result["wall_clock_time"]
             # logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
-            if search_state.sample_size == self._state.data_size:
+            if search_state.sample_size == self._state.data_size[0]:
                 self._iter_per_learner[estimator] += 1
                 self._fullsize_reached = True
             if search_state.best_loss < self._state.best_loss:
@@ -2519,7 +2522,7 @@ class AutoML(BaseEstimator):
                 and est_retrain_time
                 and not better
                 and self._search_states[self._best_estimator].sample_size
-                == self._state.data_size
+                == self._state.data_size[0]
                 and (
                     est_retrain_time
                     <= self._state.time_budget - self._state.time_from_start
@@ -2560,7 +2563,7 @@ class AutoML(BaseEstimator):
         self._best_iteration = 0
         self._time_taken_best_iter = 0
         self._config_history = {}
-        self._max_iter_per_learner = 1000000  # TODO
+        self._max_iter_per_learner = 10000
         self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
         self._fullsize_reached = False
         self._trained_estimator = None
@@ -2680,7 +2683,7 @@ class AutoML(BaseEstimator):
                     self._state.time_budget - self._state.time_from_start
                     > self._selected.est_retrain_time(self.data_size_full)
                     and self._selected.best_config_sample_size
-                    == self._state.data_size
+                    == self._state.data_size[0]
                 )
             ):
                 state = self._search_states[self._best_estimator]
@@ -2736,13 +2739,13 @@ class AutoML(BaseEstimator):
                 inv.append(0)
                 continue
             estimated_cost = search_state.estimated_cost4improvement
-            if search_state.sample_size < self._state.data_size:
+            if search_state.sample_size < self._state.data_size[0]:
                 estimated_cost = min(
                     estimated_cost,
                     search_state.time2eval_best
                     * min(
                         SAMPLE_MULTIPLY_FACTOR,
-                        self._state.data_size / search_state.sample_size,
+                        self._state.data_size[0] / search_state.sample_size,
                     ),
                 )
             gap = search_state.best_loss - self._state.best_loss
diff --git a/flaml/model.py b/flaml/model.py
index 61159d2ed..bc37bd6a3 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -233,9 +233,13 @@ class BaseEstimator:
         self._model = None

     @classmethod
-    def search_space(cls, **params):
+    def search_space(cls, data_size, task, **params):
         """[required method] search space.

+        Args:
+            data_size: A tuple of two integers, number of rows and columns.
+            task: A str of the task type, e.g., "binary", "multi", "regression".
+
         Returns:
             A dictionary of the search space.
             Each key is the name of a hyperparameter, and value is a dict with
@@ -674,7 +678,7 @@ class LGBMEstimator(BaseEstimator):

     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -728,7 +732,7 @@ class LGBMEstimator(BaseEstimator):
                 round(
                     config.get("num_leaves")
                     or config.get("max_leaves")
-                    or 1 << config["max_depth"]
+                    or 1 << config.get("max_depth", 16)
                 )
             )
         n_estimators = int(round(config["n_estimators"]))
@@ -752,7 +756,7 @@ class LGBMEstimator(BaseEstimator):
             self.estimator_class = LGBMClassifier
         self._time_per_iter = None
         self._train_size = 0
-        self._mem_per_iter = 1
+        self._mem_per_iter = -1
         self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0) is not None

     def _preprocess(self, X):
@@ -784,7 +788,7 @@ class LGBMEstimator(BaseEstimator):
                 or abs(self._train_size - X_train.shape[0]) > 4
             )
             and budget is not None
-            or self._mem_per_iter <= 1
+            or self._mem_per_iter < 0
             and psutil is not None
         ) and n_iter > 1:
             self.params[self.ITER_HP] = 1
@@ -806,8 +810,8 @@ class LGBMEstimator(BaseEstimator):
                 self._mem_per_iter = min(
                     self._mem1, self._mem2 / self.params[self.ITER_HP]
                 )
-                if self._mem_per_iter <= 1 and psutil is not None:
-                    n_iter = self.params[self.ITER_HP]
+                # if self._mem_per_iter <= 1 and psutil is not None:
+                #     n_iter = self.params[self.ITER_HP]
             self._time_per_iter = (
                 (self._t2 - self._t1) / (self.params[self.ITER_HP] - 1)
                 if self._t2 > self._t1
@@ -837,7 +841,7 @@ class LGBMEstimator(BaseEstimator):
                 if budget is not None
                 else n_iter,
                 int((1 - FREE_MEM_RATIO) * mem0 / self._mem_per_iter)
-                if psutil is not None
+                if psutil is not None and self._mem_per_iter > 0
                 else n_iter,
             )
             if trained and max_iter <= self.params[self.ITER_HP]:
@@ -887,7 +891,7 @@ class XGBoostEstimator(SKLearnEstimator):

     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -1086,7 +1090,7 @@ class XGBoostLimitDepthEstimator(XGBoostSklearnEstimator):
     def search_space(cls, data_size, **params):
         space = XGBoostEstimator.search_space(data_size)
         space.pop("max_leaves")
-        upper = max(6, int(np.log2(data_size)))
+        upper = max(6, int(np.log2(data_size[0])))
         space["max_depth"] = {
             "domain": tune.randint(lower=1, upper=min(upper, 16)),
             "init_value": 6,
@@ -1105,11 +1109,14 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
     """The class for tuning Random Forest."""

     HAS_CALLBACK = False
+    nrows = 101

     @classmethod
     def search_space(cls, data_size, task, **params):
-        data_size = int(data_size)
-        upper = min(2048, data_size)
+        RandomForestEstimator.nrows = int(data_size[0])
+        upper = min(2048, RandomForestEstimator.nrows)
+        init = 1 / np.sqrt(data_size[1]) if task in CLASSIFICATION else 1
+        lower = min(0.1, init)
         space = {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -1117,11 +1124,13 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
                 "low_cost_init_value": 4,
             },
             "max_features": {
-                "domain": tune.loguniform(lower=0.1, upper=1.0),
-                "init_value": 1.0,
+                "domain": tune.loguniform(lower=lower, upper=1.0),
+                "init_value": init,
             },
             "max_leaves": {
-                "domain": tune.lograndint(lower=4, upper=min(32768, data_size)),
+                "domain": tune.lograndint(
+                    lower=4, upper=min(32768, RandomForestEstimator.nrows >> 1)
+                ),
                 "init_value": 4,
                 "low_cost_init_value": 4,
             },
@@ -1129,13 +1138,13 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
         if task in CLASSIFICATION:
             space["criterion"] = {
                 "domain": tune.choice(["gini", "entropy"]),
-                # 'init_value': 'gini',
+                # "init_value": "gini",
             }
         return space

     @classmethod
     def cost_relative2lgbm(cls):
-        return 2.0
+        return 2

     def config2params(cls, config: dict) -> dict:
         params = config.copy()
@@ -1234,7 +1243,7 @@ class CatBoostEstimator(BaseEstimator):

     @classmethod
     def search_space(cls, data_size, **params):
-        upper = max(min(round(1500000 / data_size), 150), 12)
+        upper = max(min(round(1500000 / data_size[0]), 150), 12)
         return {
             "early_stopping_rounds": {
                 "domain": tune.lograndint(lower=10, upper=upper),
@@ -1380,7 +1389,7 @@ class CatBoostEstimator(BaseEstimator):
 class KNeighborsEstimator(BaseEstimator):
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(512, int(data_size / 2))
+        upper = min(512, int(data_size[0] / 2))
         return {
             "n_neighbors": {
                 "domain": tune.lograndint(lower=1, upper=upper),
diff --git a/flaml/version.py b/flaml/version.py
index 8088f7513..deded3247 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "0.8.1"
+__version__ = "0.8.2"
diff --git a/notebook/flaml_automl.ipynb b/notebook/flaml_automl.ipynb
index 9b29f95c6..fc7a95bca 100644
--- a/notebook/flaml_automl.ipynb
+++ b/notebook/flaml_automl.ipynb
@@ -893,8 +893,8 @@
     "    {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}.\n",
     "    '''\n",
     "    space = { \n",
-    "        'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size), 'init_value': 4, 'low_cost_init_value': 4},\n",
-    "        'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size), 'init_value': 1, 'low_cost_init_value': 1},\n",
+    "        'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size[0]), 'init_value': 4, 'low_cost_init_value': 4},\n",
+    "        'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size[0]), 'init_value': 1, 'low_cost_init_value': 1},\n",
     "        'n_tree_search': {'domain': tune.lograndint(lower=1, upper=32768), 'init_value': 1, 'low_cost_init_value': 1},\n",
     "        'opt_interval': {'domain': tune.lograndint(lower=1, upper=10000), 'init_value': 100},\n",
     "        'learning_rate': {'domain': tune.loguniform(lower=0.01, upper=20.0)},\n",
@@ -1278,7 +1278,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.12"
+   "version": "3.9.7"
   }
  },
 "nbformat": 4,
diff --git a/test/automl/test_multiclass.py b/test/automl/test_multiclass.py
index 28a6b148d..c39f7bf72 100644
--- a/test/automl/test_multiclass.py
+++ b/test/automl/test_multiclass.py
@@ -29,11 +29,11 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
     def search_space(cls, data_size, task):
         space = {
             "max_leaf": {
-                "domain": tune.lograndint(lower=4, upper=data_size),
+                "domain": tune.lograndint(lower=4, upper=data_size[0]),
                 "init_value": 4,
             },
             "n_iter": {
-                "domain": tune.lograndint(lower=1, upper=data_size),
+                "domain": tune.lograndint(lower=1, upper=data_size[0]),
                 "init_value": 1,
             },
             "n_tree_search": {
diff --git a/test/automl/test_xgboost2d.py b/test/automl/test_xgboost2d.py
index a73b5b68e..2c17850a0 100644
--- a/test/automl/test_xgboost2d.py
+++ b/test/automl/test_xgboost2d.py
@@ -13,7 +13,7 @@ dataset = "credit-g"
 class XGBoost2D(XGBoostSklearnEstimator):
     @classmethod
     def search_space(cls, data_size, task):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
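
Editor's note (not part of the patch): after this change, every estimator's search_space classmethod receives data_size as a (rows, columns) tuple instead of a single integer, together with the task string, so custom learners should index the tuple (typically data_size[0] for row-based bounds). The following is a minimal sketch of a custom search space written against the new signature; the class name MyCustomEstimator and the concrete bounds are illustrative only, while tune.lograndint, tune.loguniform, and SKLearnEstimator are the same FLAML APIs used in the tests above.

# Illustrative sketch only -- not part of the patch.
from flaml import tune
from flaml.model import SKLearnEstimator


class MyCustomEstimator(SKLearnEstimator):  # hypothetical custom learner
    @classmethod
    def search_space(cls, data_size, task, **params):
        # data_size is now a (n_rows, n_cols) tuple; use the row count for bounds.
        nrows = int(data_size[0])
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=min(32768, max(4, nrows))),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_features": {
                "domain": tune.loguniform(lower=0.1, upper=1.0),
                "init_value": 1.0,
            },
        }

To actually train with such a class, one would also define __init__ to set self.estimator_class and register it via automl.add_learner, as the custom-learner notebook cell touched above does.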