From b7846048dcc4e78fa1733e29e4d3a2d2da818c2a Mon Sep 17 00:00:00 2001
From: Qingyun Wu
Date: Sat, 9 Jul 2022 16:04:46 -0400
Subject: [PATCH] Allow FLAML_sample_size in starting_points (#619)

* FLAML_sample_size
* clean up
* starting_points as a list
* catch AssertionError
* per estimator sample size
* import
* per estimator min_sample_size
* Update flaml/automl.py

Co-authored-by: Chi Wang

* Update test/automl/test_warmstart.py

Co-authored-by: Chi Wang

* add warnings
* adding more tests
* fix a bug in validating starting points
* improve test
* revise test
* revise test
* documentation about custom_hp
* doc and efficiency
* update test

Co-authored-by: Chi Wang
---
 flaml/automl.py                | 112 ++++++++++++++++++++++++++-------
 test/automl/test_multiclass.py |   3 +
 test/automl/test_warmstart.py  |  96 ++++++++++++++++++++++++++++
 test/nlp/test_default.py       |  16 ++++-
 4 files changed, 203 insertions(+), 24 deletions(-)

diff --git a/flaml/automl.py b/flaml/automl.py
index 3b065504a..55a27984c 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -102,13 +102,10 @@ class SearchState:
             return True

     def valid_starting_point(self, starting_point, search_space):
-        return any(
-            [
-                self.valid_starting_point_one_dim(
-                    value, search_space[name].get("domain")
-                )
-                for name, value in starting_point.items()
-            ]
+        return all(
+            self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
+            for name, value in starting_point.items()
+            if name != "FLAML_sample_size"
         )

     def __init__(
@@ -656,9 +653,17 @@ class AutoML(BaseEstimator):
                 the automl constructor, flaml will automatically (and under the hood)
                 add it as an additional element in the metric_constraints. Essentially
                 'pred_time_limit' specifies a constraint about the prediction latency
                 constraint in seconds.
-            custom_hp: dict, default=None | The custom search space specified by user
-                Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
-                domain of the custom search space can either be a value of a sample.Domain object.
+            custom_hp: dict, default=None | The custom search space specified by the user.
+                It is a nested dict: each key is an estimator name, and each value is a dict
+                of the search space for that estimator. In each per-estimator search space
+                dict, the keys are hyperparameter names and the values are dicts of info
+                ("domain", "init_value", and "low_cost_init_value") about the search space
+                associated with that hyperparameter (i.e., per-hyperparameter search space
+                dicts). When custom_hp is provided, the built-in search space, which is also
+                a nested dict of per-estimator search space dicts, is updated with custom_hp.
+                During this nested-dict update, the per-hyperparameter search space dicts are
+                replaced (rather than updated) by the ones provided in custom_hp. The value
+                for "domain" can be either a constant or a sample.Domain object.
                 e.g.,
                 ```python
@@ -2430,18 +2435,68 @@ class AutoML(BaseEstimator):
             eval_method == "holdout" and self._state.X_val is None
         )
         self._auto_augment = auto_augment
-        self._min_sample_size = min_sample_size
+
+        _sample_size_from_starting_points = {}
+        if isinstance(starting_points, dict):
+            for _estimator, _point_per_estimator in starting_points.items():
+                sample_size = (
+                    _point_per_estimator
+                    and isinstance(_point_per_estimator, dict)
+                    and _point_per_estimator.get("FLAML_sample_size")
+                )
+                if sample_size:
+                    _sample_size_from_starting_points[_estimator] = sample_size
+                elif _point_per_estimator and isinstance(_point_per_estimator, list):
+                    _sample_size_set = set(
+                        [
+                            config["FLAML_sample_size"]
+                            for config in _point_per_estimator
+                            if "FLAML_sample_size" in config
+                        ]
+                    )
+                    if _sample_size_set:
+                        _sample_size_from_starting_points[_estimator] = min(
+                            _sample_size_set
+                        )
+                    if len(_sample_size_set) > 1:
+                        logger.warning(
+                            "Using the min FLAML_sample_size of all the provided starting points for estimator {}. (Provided FLAML_sample_size are: {})".format(
+                                _estimator, _sample_size_set
+                            )
+                        )
+
+        if not sample and isinstance(starting_points, dict):
+            assert (
+                not _sample_size_from_starting_points
+            ), "When subsampling is disabled, do not include FLAML_sample_size in the starting point."
+        self._min_sample_size = _sample_size_from_starting_points or min_sample_size
+        self._min_sample_size_input = min_sample_size
         self._prepare_data(eval_method, split_ratio, n_splits)
-        self._sample = (
-            sample
-            and task != "rank"
-            and eval_method != "cv"
-            and (
-                self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
-                < self._state.data_size[0]
-            )
-        )
+        if isinstance(self._min_sample_size, dict):
+            self._sample = {
+                (
+                    k,
+                    sample
+                    and task != "rank"
+                    and eval_method != "cv"
+                    and (
+                        self._min_sample_size[k] * SAMPLE_MULTIPLY_FACTOR
+                        < self._state.data_size[0]
+                    ),
+                )
+                for k in self._min_sample_size.keys()
+            }
+        else:
+            self._sample = (
+                sample
+                and task != "rank"
+                and eval_method != "cv"
+                and (
+                    self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
+                    < self._state.data_size[0]
+                )
+            )
         if "auto" == metric:
             if _is_nlp_task(self._state.task):
                 from .nlp.utils import load_default_huggingface_metric_for_task
@@ -2752,6 +2807,16 @@ class AutoML(BaseEstimator):
             self._state.time_from_start = time.time() - self._start_time_flag
             time_left = self._state.time_budget - self._state.time_from_start
             if self._hpo_method != "optuna":
+                min_resource = self.min_resource
+                if isinstance(min_resource, dict):
+                    _min_resource_set = set(min_resource.values())
+                    min_resource_all_estimator = min(_min_resource_set)
+                    if len(_min_resource_set) > 1:
+                        logger.warning(
+                            "Using the min FLAML_sample_size of all the provided starting points as the starting sample size in the case of parallel search."
+                        )
+                else:
+                    min_resource_all_estimator = min_resource
                 search_alg = SearchAlgo(
                     metric="val_loss",
                     space=space,
@@ -2759,7 +2824,7 @@ class AutoML(BaseEstimator):
                     points_to_evaluate=self.points_to_evaluate,
                     cat_hp_cost=self.cat_hp_cost,
                     resource_attr=self.resource_attr,
-                    min_resource=self.min_resource,
+                    min_resource=min_resource_all_estimator,
                     max_resource=self.max_resource,
                     config_constraints=[
                         (partial(size, self._state), "<=", self._mem_thres)
@@ -2947,7 +3012,12 @@ class AutoML(BaseEstimator):
             search_space = search_state.search_space
             if self._sample:
                 resource_attr = "FLAML_sample_size"
-                min_resource = self._min_sample_size
+                min_resource = (
+                    self._min_sample_size[estimator]
+                    if isinstance(self._min_sample_size, dict)
+                    and estimator in self._min_sample_size
+                    else self._min_sample_size_input
+                )
                 max_resource = self._state.data_size[0]
             else:
                 resource_attr = min_resource = max_resource = None
diff --git a/test/automl/test_multiclass.py b/test/automl/test_multiclass.py
index 8e519db30..ab8987e59 100644
--- a/test/automl/test_multiclass.py
+++ b/test/automl/test_multiclass.py
@@ -474,8 +474,11 @@ class TestMultiClass(unittest.TestCase):
         starting_points = {}
         log_file_name = automl_settings["log_file_name"]
         with training_log_reader(log_file_name) as reader:
+            sample_size = 1000
             for record in reader.records():
                 config = record.config
+                config["FLAML_sample_size"] = sample_size
+                sample_size += 1000
                 learner = record.learner
                 if learner not in starting_points:
                     starting_points[learner] = []
diff --git a/test/automl/test_warmstart.py b/test/automl/test_warmstart.py
index c6eba5ac0..c443cf6ce 100644
--- a/test/automl/test_warmstart.py
+++ b/test/automl/test_warmstart.py
@@ -123,6 +123,102 @@ class TestWarmStart(unittest.TestCase):
         automl.fit(X_train, y_train)
         print(automl.best_config_per_estimator)

+    def test_FLAML_sample_size_in_starting_points(self):
+        from flaml.data import load_openml_dataset
+        from flaml import AutoML
+
+        X_train, X_test, y_train, y_test = load_openml_dataset(
+            dataset_id=1169, data_dir="./"
+        )
+
+        automl_settings = {
+            "time_budget": 3,
+            "task": "classification",
+        }
+
+        automl1 = AutoML()
+        print(len(y_train))
+        automl1.fit(X_train, y_train, **automl_settings)
+        print("automl1.best_config_per_estimator", automl1.best_config_per_estimator)
+
+        automl_settings["starting_points"] = automl1.best_config_per_estimator
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        automl_settings["starting_points"] = {
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
+                "learning_rate": 0.25912534572860507,
+                "subsample": 0.9266743941610592,
+                "colsample_bylevel": 1.0,
+                "colsample_bytree": 1.0,
+                "reg_alpha": 0.0013933617380144255,
+                "reg_lambda": 0.18096917948292954,
+                "FLAML_sample_size": 20000,
+            },
+            "xgb_limitdepth": None,
+            "lrl1": None,
+        }
+        from flaml import tune
+
+        automl_settings["custom_hp"] = {
+            "xgboost": {
+                "n_estimators": {
+                    "domain": tune.choice([10, 20]),
+                },
+            }
+        }
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        try:
+            import ray
+
+            automl_settings["n_concurrent_trials"] = 2
+        except ImportError:
+            automl_settings["n_concurrent_trials"] = 1
+        # setting different FLAML_sample_size
+        automl_settings["starting_points"] = {
+            "catboost": {
+                "early_stopping_rounds": 10,
+                "learning_rate": 0.09999999999999996,
+                "n_estimators": 1,
+                "FLAML_sample_size": 10000,
+            },
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
"learning_rate": 0.25912534572860507, + "subsample": 0.9266743941610592, + "colsample_bylevel": 1.0, + "colsample_bytree": 1.0, + "reg_alpha": 0.0013933617380144255, + "reg_lambda": 0.18096917948292954, + "FLAML_sample_size": 20000, + }, + "xgb_limitdepth": None, + "lrl1": None, + } + automl3 = AutoML() + automl3.fit(X_train, y_train, **automl_settings) + + automl_settings["sample"] = False + automl4 = AutoML() + try: + automl4.fit( + X_train, + y_train, + **automl_settings, + ) + raise RuntimeError( + "When sample=False and starting_points contain FLAML_sample_size, AssertionError is expected but not raised." + ) + except AssertionError: + pass + if __name__ == "__main__": unittest.main() diff --git a/test/nlp/test_default.py b/test/nlp/test_default.py index 8422b55f7..2118d10ea 100644 --- a/test/nlp/test_default.py +++ b/test/nlp/test_default.py @@ -57,15 +57,25 @@ def test_starting_point_not_in_search_space(): "learning_rate": { "domain": tune.choice([1e-4, 1e-5]), }, + "per_device_train_batch_size": { + "domain": 2, + }, } } automl_settings["starting_points"] = "data:test/nlp/default/" del automl_settings["fit_kwargs_by_estimator"][this_estimator_name]["model_path"] automl.fit(X_train, y_train, **automl_settings) - assert ( - len(automl._search_states[this_estimator_name].init_config) == 0 - ) # check that init config is not updated, but search space is updated + assert len(automl._search_states[this_estimator_name].init_config) == len( + automl._search_states[this_estimator_name]._search_space_domain + ) - len(automl_settings["custom_hp"][this_estimator_name]), ( + "The search space is updated with the custom_hp on {} hyperparameters of " + "the specified estimator without an initial value. Thus a valid init config " + "should only contain the cardinality of the search space minus {}".format( + len(automl_settings["custom_hp"][this_estimator_name]), + len(automl_settings["custom_hp"][this_estimator_name]), + ) + ) assert ( automl._search_states[this_estimator_name].search_space["model_path"] == "albert-base-v2"