mirror of https://github.com/microsoft/autogen.git (synced 2025-09-26 00:24:26 +00:00)
include default value in rf search space (#317)

* include default value in rf search space
* init _mem_per_iter with -1
* bump version to 0.8.2
* docstr for search space's arguments

This commit is contained in:
parent 1545d5a6d2
commit c57954fbbd
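The diff below threads a single interface change through the code base: `data_size` changes from a scalar row count to the full `X_train.shape` tuple, so every scalar use becomes `data_size[0]` (rows) and the column count `data_size[1]` becomes available for computing defaults. The hunks span the AutoML search loop, the estimator classes, the version file, a notebook, and two test estimators. A minimal sketch of the new `search_space` contract (the estimator class and its hyperparameter bounds here are illustrative, not taken from the commit):

    from flaml import tune

    class MyEstimator:
        @classmethod
        def search_space(cls, data_size, task, **params):
            # data_size is now the X_train.shape tuple: (n_rows, n_cols)
            n_rows = int(data_size[0])
            return {
                "n_estimators": {
                    "domain": tune.lograndint(lower=4, upper=min(32768, n_rows)),
                    "init_value": 4,
                    "low_cost_init_value": 4,
                },
            }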
@@ -119,7 +119,7 @@ class SearchState:
             if config and "FLAML_sample_size" in config:
                 self.sample_size = config["FLAML_sample_size"]
             else:
-                self.sample_size = self.data_size
+                self.sample_size = self.data_size[0]
             obj = result["val_loss"]
             metric_for_logging = result["metric_for_logging"]
             time2eval = result["time_total_s"]
@@ -181,7 +181,7 @@ class SearchState:
 class AutoMLState:
     def _prepare_sample_train_data(self, sample_size):
         sampled_weight = groups = None
-        if sample_size <= self.data_size:
+        if sample_size <= self.data_size[0]:
             if isinstance(self.X_train, pd.DataFrame):
                 sampled_X_train = self.X_train.iloc[:sample_size]
             else:
@@ -205,7 +205,7 @@ class AutoMLState:
         if "FLAML_sample_size" in config_w_resource:
             sample_size = int(config_w_resource["FLAML_sample_size"])
         else:
-            sample_size = self.data_size
+            sample_size = self.data_size[0]
         (
             sampled_X_train,
             sampled_y_train,
@@ -226,11 +226,11 @@ class AutoMLState:
                 None
                 if self.time_budget is None
                 else self.time_budget - self.time_from_start
-                if sample_size == self.data_size
+                if sample_size == self.data_size[0]
                 else (self.time_budget - self.time_from_start)
                 / 2
                 * sample_size
-                / self.data_size
+                / self.data_size[0]
             )
 
         if _is_nlp_task(self.task):
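The ternary chain in the last hunk gives a trial on a partial sample half of the remaining time budget, scaled by the fraction of data it uses. A quick worked instance (the numbers are illustrative):

    time_budget, time_from_start = 60.0, 20.0  # 40.0 seconds remain
    sample_size, n_rows = 10_000, 40_000       # trial uses a quarter of the rows

    budget = (time_budget - time_from_start) / 2 * sample_size / n_rows
    assert budget == 5.0  # (40.0 / 2) * (10_000 / 40_000)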
@@ -1122,7 +1122,7 @@ class AutoML(BaseEstimator):
                 test_size=split_ratio,
                 random_state=RANDOM_SEED,
             )
-        self._state.data_size = X_train.shape[0]
+        self._state.data_size = X_train.shape
         self.data_size_full = len(y_train_all)
         self._state.X_train, self._state.y_train = X_train, y_train
         self._state.X_val, self._state.y_val = X_val, y_val
@@ -1555,7 +1555,7 @@ class AutoML(BaseEstimator):
         Returns:
             A float for the maximal sample size or None.
         """
-        return self._state.data_size if self._sample else None
+        return self._state.data_size[0] if self._sample else None
 
     @property
     def trainable(self) -> Callable[[dict], Optional[float]]:
@@ -1965,7 +1965,10 @@ class AutoML(BaseEstimator):
             sample
             and task != "rank"
             and eval_method != "cv"
-            and (self._min_sample_size * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
+            and (
+                self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
+                < self._state.data_size[0]
+            )
         )
         if "auto" == metric:
             if "binary" in self._state.task:
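The condition above turns sampling on only when the data set is more than SAMPLE_MULTIPLY_FACTOR times the minimum sample size; otherwise every trial runs on the full data. A sketch of the gate, assuming FLAML's defaults of the time (MIN_SAMPLE_TRAIN = 10000 and SAMPLE_MULTIPLY_FACTOR = 4; treat both values as assumptions):

    MIN_SAMPLE_TRAIN = 10_000      # assumed default minimum sample size
    SAMPLE_MULTIPLY_FACTOR = 4     # assumed default sample growth factor

    def sampling_enabled(n_rows: int) -> bool:
        # Sampling pays off only if the full data is at least one
        # multiply-factor step above the smallest sample.
        return MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < n_rows

    assert sampling_enabled(100_000) and not sampling_enabled(30_000)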
@@ -2205,7 +2208,7 @@ class AutoML(BaseEstimator):
             search_state.update(result, 0)
             if result["wall_clock_time"] is not None:
                 self._state.time_from_start = result["wall_clock_time"]
-            if search_state.sample_size == self._state.data_size:
+            if search_state.sample_size == self._state.data_size[0]:
                 self._iter_per_learner[estimator] += 1
                 if not self._fullsize_reached:
                     self._fullsize_reached = True
@@ -2275,7 +2278,7 @@ class AutoML(BaseEstimator):
             self._max_iter = 0
             self._best_estimator = estimator = self.estimator_list[0]
             self._selected = state = self._search_states[estimator]
-            state.best_config_sample_size = self._state.data_size
+            state.best_config_sample_size = self._state.data_size[0]
             state.best_config = (
                 state.init_config
                 if isinstance(state.init_config, dict)
@@ -2298,7 +2301,7 @@ class AutoML(BaseEstimator):
                     or better
                     or (not self.best_estimator)
                     or self._search_states[self.best_estimator].sample_size
-                    < self._state.data_size
+                    < self._state.data_size[0]
                     else time_left - est_retrain_time
                 )
                 if not search_state.search_alg:
@@ -2309,7 +2312,7 @@ class AutoML(BaseEstimator):
                 if self._sample:
                     prune_attr = "FLAML_sample_size"
                     min_resource = self._min_sample_size
-                    max_resource = self._state.data_size
+                    max_resource = self._state.data_size[0]
                 else:
                     prune_attr = min_resource = max_resource = None
                 learner_class = self._state.learner_classes.get(estimator)
@@ -2398,7 +2401,7 @@ class AutoML(BaseEstimator):
                 min_budget = max(10 * self._eci[0], sum(self._eci))
                 max_budget = 10000 * self._eci[0]
                 if search_state.sample_size:
-                    ratio = search_state.data_size / search_state.sample_size
+                    ratio = search_state.data_size[0] / search_state.sample_size
                     min_budget *= ratio
                     max_budget *= ratio
                 logger.info(
@@ -2408,7 +2411,7 @@ class AutoML(BaseEstimator):
             if result["wall_clock_time"] is not None:
                 self._state.time_from_start = result["wall_clock_time"]
             # logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
-            if search_state.sample_size == self._state.data_size:
+            if search_state.sample_size == self._state.data_size[0]:
                 self._iter_per_learner[estimator] += 1
                 self._fullsize_reached = True
             if search_state.best_loss < self._state.best_loss:
@@ -2519,7 +2522,7 @@ class AutoML(BaseEstimator):
                 and est_retrain_time
                 and not better
                 and self._search_states[self._best_estimator].sample_size
-                == self._state.data_size
+                == self._state.data_size[0]
                 and (
                     est_retrain_time
                     <= self._state.time_budget - self._state.time_from_start
@@ -2560,7 +2563,7 @@ class AutoML(BaseEstimator):
         self._best_iteration = 0
         self._time_taken_best_iter = 0
         self._config_history = {}
-        self._max_iter_per_learner = 1000000  # TODO
+        self._max_iter_per_learner = 10000
         self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
         self._fullsize_reached = False
         self._trained_estimator = None
@@ -2680,7 +2683,7 @@ class AutoML(BaseEstimator):
                 self._state.time_budget - self._state.time_from_start
                 > self._selected.est_retrain_time(self.data_size_full)
                 and self._selected.best_config_sample_size
-                == self._state.data_size
+                == self._state.data_size[0]
             )
         ):
             state = self._search_states[self._best_estimator]
@@ -2736,13 +2739,13 @@ class AutoML(BaseEstimator):
                 inv.append(0)
                 continue
             estimated_cost = search_state.estimated_cost4improvement
-            if search_state.sample_size < self._state.data_size:
+            if search_state.sample_size < self._state.data_size[0]:
                 estimated_cost = min(
                     estimated_cost,
                     search_state.time2eval_best
                     * min(
                         SAMPLE_MULTIPLY_FACTOR,
-                        self._state.data_size / search_state.sample_size,
+                        self._state.data_size[0] / search_state.sample_size,
                     ),
                 )
             gap = search_state.best_loss - self._state.best_loss

@@ -233,9 +233,13 @@ class BaseEstimator:
         self._model = None
 
     @classmethod
-    def search_space(cls, **params):
+    def search_space(cls, data_size, task, **params):
         """[required method] search space.
 
+        Args:
+            data_size: A tuple of two integers, number of rows and columns.
+            task: A str of the task type, e.g., "binary", "multi", "regression".
+
         Returns:
             A dictionary of the search space.
             Each key is the name of a hyperparameter, and value is a dict with
@@ -674,7 +678,7 @@ class LGBMEstimator(BaseEstimator):
 
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -728,7 +732,7 @@ class LGBMEstimator(BaseEstimator):
                 round(
                     config.get("num_leaves")
                     or config.get("max_leaves")
-                    or 1 << config["max_depth"]
+                    or 1 << config.get("max_depth", 16)
                 )
             )
         n_estimators = int(round(config["n_estimators"]))
@@ -752,7 +756,7 @@ class LGBMEstimator(BaseEstimator):
             self.estimator_class = LGBMClassifier
         self._time_per_iter = None
         self._train_size = 0
-        self._mem_per_iter = 1
+        self._mem_per_iter = -1
         self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0) is not None
 
     def _preprocess(self, X):
@@ -784,7 +788,7 @@ class LGBMEstimator(BaseEstimator):
                 or abs(self._train_size - X_train.shape[0]) > 4
             )
             and budget is not None
-            or self._mem_per_iter <= 1
+            or self._mem_per_iter < 0
             and psutil is not None
         ) and n_iter > 1:
             self.params[self.ITER_HP] = 1
@@ -806,8 +810,8 @@ class LGBMEstimator(BaseEstimator):
                 self._mem_per_iter = min(
                     self._mem1, self._mem2 / self.params[self.ITER_HP]
                 )
-                if self._mem_per_iter <= 1 and psutil is not None:
-                    n_iter = self.params[self.ITER_HP]
+                # if self._mem_per_iter <= 1 and psutil is not None:
+                #     n_iter = self.params[self.ITER_HP]
             self._time_per_iter = (
                 (self._t2 - self._t1) / (self.params[self.ITER_HP] - 1)
                 if self._t2 > self._t1
@@ -837,7 +841,7 @@ class LGBMEstimator(BaseEstimator):
                 if budget is not None
                 else n_iter,
                 int((1 - FREE_MEM_RATIO) * mem0 / self._mem_per_iter)
-                if psutil is not None
+                if psutil is not None and self._mem_per_iter > 0
                 else n_iter,
             )
             if trained and max_iter <= self.params[self.ITER_HP]:
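The `_mem_per_iter` hunks implement the commit message's "init _mem_per_iter with -1": the old initial value of 1 doubled as both "not yet measured" and a plausible measurement, so the `<= 1` checks could misfire after a legitimately small reading. A negative sentinel separates the two states, and the new `> 0` guard keeps the memory-based iteration cap from dividing by a non-positive value. A minimal sketch of the pattern (names and numbers are illustrative):

    FREE_MEM_RATIO = 0.2      # fraction of free memory to keep in reserve
    mem_per_iter = -1.0       # -1 means "not measured yet"; real readings are >= 0

    if mem_per_iter < 0:      # replaces the old `<= 1` check, which a real
        mem_per_iter = 0.5    # measurement of <= 1 would have retriggered
    free_mem = 8e9
    if mem_per_iter > 0:      # guard added before dividing
        max_iter = int((1 - FREE_MEM_RATIO) * free_mem / mem_per_iter)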
@@ -887,7 +891,7 @@ class XGBoostEstimator(SKLearnEstimator):
 
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -1086,7 +1090,7 @@ class XGBoostLimitDepthEstimator(XGBoostSklearnEstimator):
     def search_space(cls, data_size, **params):
         space = XGBoostEstimator.search_space(data_size)
         space.pop("max_leaves")
-        upper = max(6, int(np.log2(data_size)))
+        upper = max(6, int(np.log2(data_size[0])))
         space["max_depth"] = {
             "domain": tune.randint(lower=1, upper=min(upper, 16)),
             "init_value": 6,
@@ -1105,11 +1109,14 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
     """The class for tuning Random Forest."""
 
     HAS_CALLBACK = False
+    nrows = 101
 
     @classmethod
     def search_space(cls, data_size, task, **params):
-        data_size = int(data_size)
-        upper = min(2048, data_size)
+        RandomForestEstimator.nrows = int(data_size[0])
+        upper = min(2048, RandomForestEstimator.nrows)
+        init = 1 / np.sqrt(data_size[1]) if task in CLASSIFICATION else 1
+        lower = min(0.1, init)
         space = {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -1117,11 +1124,13 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
                 "low_cost_init_value": 4,
             },
             "max_features": {
-                "domain": tune.loguniform(lower=0.1, upper=1.0),
-                "init_value": 1.0,
+                "domain": tune.loguniform(lower=lower, upper=1.0),
+                "init_value": init,
             },
             "max_leaves": {
-                "domain": tune.lograndint(lower=4, upper=min(32768, data_size)),
+                "domain": tune.lograndint(
+                    lower=4, upper=min(32768, RandomForestEstimator.nrows >> 1)
+                ),
                 "init_value": 4,
                 "low_cost_init_value": 4,
             },
@@ -1129,13 +1138,13 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
         if task in CLASSIFICATION:
             space["criterion"] = {
                 "domain": tune.choice(["gini", "entropy"]),
-                # 'init_value': 'gini',
+                # "init_value": "gini",
             }
         return space
 
     @classmethod
     def cost_relative2lgbm(cls):
-        return 2.0
+        return 2
 
     def config2params(cls, config: dict) -> dict:
         params = config.copy()
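This is the change the commit is named for: `max_features` for random forests now starts at 1/sqrt(n_cols) on classification tasks, matching the common random-forest heuristic (scikit-learn's max_features="sqrt" expressed as a fraction of the columns) instead of the previous fixed 1.0, and `max_leaves` is capped at half the row count via the cached `nrows`. A worked instance (values illustrative):

    import numpy as np

    n_rows, n_cols = 10_000, 100
    init = 1 / np.sqrt(n_cols)               # 0.1 for 100 features
    lower = min(0.1, init)                   # search range becomes [lower, 1.0]
    upper_leaves = min(32768, n_rows >> 1)   # >> 1 halves the rows: 5000 here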
@@ -1234,7 +1243,7 @@ class CatBoostEstimator(BaseEstimator):
 
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = max(min(round(1500000 / data_size), 150), 12)
+        upper = max(min(round(1500000 / data_size[0]), 150), 12)
         return {
             "early_stopping_rounds": {
                 "domain": tune.lograndint(lower=10, upper=upper),
@@ -1380,7 +1389,7 @@ class CatBoostEstimator(BaseEstimator):
 class KNeighborsEstimator(BaseEstimator):
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(512, int(data_size / 2))
+        upper = min(512, int(data_size[0] / 2))
         return {
             "n_neighbors": {
                 "domain": tune.lograndint(lower=1, upper=upper),

@@ -1 +1 @@
-__version__ = "0.8.1"
+__version__ = "0.8.2"

@@ -893,8 +893,8 @@
     "    {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}.\n",
     "    '''\n",
     "    space = { \n",
-    "        'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size), 'init_value': 4, 'low_cost_init_value': 4},\n",
-    "        'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size), 'init_value': 1, 'low_cost_init_value': 1},\n",
+    "        'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size[0]), 'init_value': 4, 'low_cost_init_value': 4},\n",
+    "        'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size[0]), 'init_value': 1, 'low_cost_init_value': 1},\n",
     "        'n_tree_search': {'domain': tune.lograndint(lower=1, upper=32768), 'init_value': 1, 'low_cost_init_value': 1},\n",
     "        'opt_interval': {'domain': tune.lograndint(lower=1, upper=10000), 'init_value': 100},\n",
     "        'learning_rate': {'domain': tune.loguniform(lower=0.01, upper=20.0)},\n",
@@ -1278,7 +1278,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.12"
+   "version": "3.9.7"
   }
  },
 "nbformat": 4,

@@ -29,11 +29,11 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
     def search_space(cls, data_size, task):
         space = {
             "max_leaf": {
-                "domain": tune.lograndint(lower=4, upper=data_size),
+                "domain": tune.lograndint(lower=4, upper=data_size[0]),
                 "init_value": 4,
             },
             "n_iter": {
-                "domain": tune.lograndint(lower=1, upper=data_size),
+                "domain": tune.lograndint(lower=1, upper=data_size[0]),
                 "init_value": 1,
             },
             "n_tree_search": {

@@ -13,7 +13,7 @@ dataset = "credit-g"
 class XGBoost2D(XGBoostSklearnEstimator):
     @classmethod
     def search_space(cls, data_size, task):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),