Allow FLAML_sample_size in starting_points (#619)

* FLAML_sample_size

* clean up

* starting_points as a list

* catch AssertionError

* per estimator sample size

* import

* per estimator min_sample_size

* Update flaml/automl.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* Update test/automl/test_warmstart.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* add warnings

* adding more tests

* fix a bug in validating starting points

* improve test

* revise test

* revise test

* documentation about custom_hp

* doc and efficiency

* update test

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
This commit is contained in:
Qingyun Wu 2022-07-09 16:04:46 -04:00 committed by GitHub
parent 6cb6a2a19a
commit b7846048dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 203 additions and 24 deletions

View File

@ -102,13 +102,10 @@ class SearchState:
return True
def valid_starting_point(self, starting_point, search_space):
return any(
[
self.valid_starting_point_one_dim(
value, search_space[name].get("domain")
)
for name, value in starting_point.items()
]
return all(
self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
for name, value in starting_point.items()
if name != "FLAML_sample_size"
)
def __init__(
@ -656,9 +653,17 @@ class AutoML(BaseEstimator):
the automl constructor, flaml will automatically (and under the hood)
add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
specifies a constraint on the prediction latency in seconds.
custom_hp: dict, default=None | The custom search space specified by user
Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
domain of the custom search space can either be a value of a sample.Domain object.
custom_hp: dict, default=None | The custom search space specified by user.
It is a nested dict whose keys are the estimator names and whose values are the
per-estimator search space dicts. In a per-estimator search space dict,
the keys are the hyperparameter names, and the values are dicts of info ("domain",
"init_value", and "low_cost_init_value") about the search space associated with
the hyperparameter (i.e., per-hyperparameter search space dicts). When custom_hp
is provided, the built-in search space, which is also a nested dict of per-estimator
search space dicts, will be updated with custom_hp. Note that during this nested dict update,
the per-hyperparameter search space dicts will be replaced (instead of updated) by the ones
provided in custom_hp. Note that the value for "domain" can either be a constant
or a sample.Domain object.
e.g.,
```python
@ -2430,18 +2435,68 @@ class AutoML(BaseEstimator):
eval_method == "holdout" and self._state.X_val is None
)
self._auto_augment = auto_augment
self._min_sample_size = min_sample_size
_sample_size_from_starting_points = {}
if isinstance(starting_points, dict):
for _estimator, _point_per_estimator in starting_points.items():
sample_size = (
_point_per_estimator
and isinstance(_point_per_estimator, dict)
and _point_per_estimator.get("FLAML_sample_size")
)
if sample_size:
_sample_size_from_starting_points[_estimator] = sample_size
elif _point_per_estimator and isinstance(_point_per_estimator, list):
_sample_size_set = set(
[
config["FLAML_sample_size"]
for config in _point_per_estimator
if "FLAML_sample_size" in config
]
)
if _sample_size_set:
_sample_size_from_starting_points[_estimator] = min(
_sample_size_set
)
if len(_sample_size_set) > 1:
logger.warning(
"Using the min FLAML_sample_size of all the provided starting points for estimator {}. (Provided FLAML_sample_size are: {})".format(
_estimator, _sample_size_set
)
)
if not sample and isinstance(starting_points, dict):
assert (
not _sample_size_from_starting_points
), "When subsampling is disabled, do not include FLAML_sample_size in the starting point."
self._min_sample_size = _sample_size_from_starting_points or min_sample_size
self._min_sample_size_input = min_sample_size
self._prepare_data(eval_method, split_ratio, n_splits)
self._sample = (
sample
and task != "rank"
and eval_method != "cv"
and (
self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
< self._state.data_size[0]
if isinstance(self._min_sample_size, dict):
self._sample = {
(
k,
sample
and task != "rank"
and eval_method != "cv"
and (
self._min_sample_size[k] * SAMPLE_MULTIPLY_FACTOR
< self._state.data_size[0]
),
)
for k in self._min_sample_size.keys()
}
else:
self._sample = (
sample
and task != "rank"
and eval_method != "cv"
and (
self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
< self._state.data_size[0]
)
)
)
if "auto" == metric:
if _is_nlp_task(self._state.task):
from .nlp.utils import load_default_huggingface_metric_for_task
@ -2752,6 +2807,16 @@ class AutoML(BaseEstimator):
self._state.time_from_start = time.time() - self._start_time_flag
time_left = self._state.time_budget - self._state.time_from_start
if self._hpo_method != "optuna":
min_resource = self.min_resource
if isinstance(min_resource, dict):
_min_resource_set = set(min_resource.values())
min_resource_all_estimator = min(_min_resource_set)
if len(_min_resource_set) > 1:
logger.warning(
"Using the min FLAML_sample_size of all the provided starting points as the starting sample size in the case of parallel search."
)
else:
min_resource_all_estimator = min_resource
search_alg = SearchAlgo(
metric="val_loss",
space=space,
@ -2759,7 +2824,7 @@ class AutoML(BaseEstimator):
points_to_evaluate=self.points_to_evaluate,
cat_hp_cost=self.cat_hp_cost,
resource_attr=self.resource_attr,
min_resource=self.min_resource,
min_resource=min_resource_all_estimator,
max_resource=self.max_resource,
config_constraints=[
(partial(size, self._state), "<=", self._mem_thres)
@ -2947,7 +3012,12 @@ class AutoML(BaseEstimator):
search_space = search_state.search_space
if self._sample:
resource_attr = "FLAML_sample_size"
min_resource = self._min_sample_size
min_resource = (
self._min_sample_size[estimator]
if isinstance(self._min_sample_size, dict)
and estimator in self._min_sample_size
else self._min_sample_size_input
)
max_resource = self._state.data_size[0]
else:
resource_attr = min_resource = max_resource = None

View File

@ -474,8 +474,11 @@ class TestMultiClass(unittest.TestCase):
starting_points = {}
log_file_name = automl_settings["log_file_name"]
with training_log_reader(log_file_name) as reader:
sample_size = 1000
for record in reader.records():
config = record.config
config["FLAML_sample_size"] = sample_size
sample_size += 1000
learner = record.learner
if learner not in starting_points:
starting_points[learner] = []

View File

@ -123,6 +123,102 @@ class TestWarmStart(unittest.TestCase):
automl.fit(X_train, y_train)
print(automl.best_config_per_estimator)
def test_FLAML_sample_size_in_starting_points(self):
    """Exercise warm-starting with ``FLAML_sample_size`` in ``starting_points``.

    The scenarios run in order, each reusing and extending ``automl_settings``:

    1. Feed the ``best_config_per_estimator`` of a first run back in as
       ``starting_points`` of a second run.
    2. Provide a hand-written xgboost starting point that carries
       ``FLAML_sample_size``, together with a ``custom_hp`` search space.
    3. Provide starting points with different ``FLAML_sample_size`` values
       for different estimators (with two concurrent trials when ``ray``
       can be imported).
    4. Set ``sample=False`` while the starting points still contain
       ``FLAML_sample_size`` and expect ``fit`` to raise ``AssertionError``.
    """
    from flaml.data import load_openml_dataset
    from flaml import AutoML
    from flaml import tune

    X_train, X_test, y_train, y_test = load_openml_dataset(
        dataset_id=1169, data_dir="./"
    )
    automl_settings = {
        "time_budget": 3,
        "task": "classification",
    }

    # Scenario 1: warm start from the best configs of a previous run.
    automl1 = AutoML()
    print(len(y_train))
    automl1.fit(X_train, y_train, **automl_settings)
    print("automl1.best_config_per_estimator", automl1.best_config_per_estimator)
    automl_settings["starting_points"] = automl1.best_config_per_estimator
    automl2 = AutoML()
    automl2.fit(X_train, y_train, **automl_settings)

    # Shared xgboost starting point; each scenario gets its own copy so that
    # the scenarios never share one dict object.
    xgboost_starting_point = {
        "n_estimators": 4,
        "max_leaves": 4,
        "min_child_weight": 0.26208115308159446,
        "learning_rate": 0.25912534572860507,
        "subsample": 0.9266743941610592,
        "colsample_bylevel": 1.0,
        "colsample_bytree": 1.0,
        "reg_alpha": 0.0013933617380144255,
        "reg_lambda": 0.18096917948292954,
        "FLAML_sample_size": 20000,
    }

    # Scenario 2: a single FLAML_sample_size plus a custom search space.
    automl_settings["starting_points"] = {
        "xgboost": dict(xgboost_starting_point),
        "xgb_limitdepth": None,
        "lrl1": None,
    }
    automl_settings["custom_hp"] = {
        "xgboost": {
            "n_estimators": {
                "domain": tune.choice([10, 20]),
            },
        }
    }
    automl2 = AutoML()
    automl2.fit(X_train, y_train, **automl_settings)

    # Use parallel trials only when ray can be imported.
    try:
        import ray  # noqa: F401  (imported only to probe availability)

        automl_settings["n_concurrent_trials"] = 2
    except ImportError:
        automl_settings["n_concurrent_trials"] = 1

    # Scenario 3: setting different FLAML_sample_size per estimator.
    automl_settings["starting_points"] = {
        "catboost": {
            "early_stopping_rounds": 10,
            "learning_rate": 0.09999999999999996,
            "n_estimators": 1,
            "FLAML_sample_size": 10000,
        },
        "xgboost": dict(xgboost_starting_point),
        "xgb_limitdepth": None,
        "lrl1": None,
    }
    automl3 = AutoML()
    automl3.fit(X_train, y_train, **automl_settings)

    # Scenario 4: FLAML_sample_size must be rejected when sampling is off.
    automl_settings["sample"] = False
    automl4 = AutoML()
    with self.assertRaises(
        AssertionError,
        msg="When sample=False and starting_points contain FLAML_sample_size, AssertionError is expected but not raised.",
    ):
        automl4.fit(
            X_train,
            y_train,
            **automl_settings,
        )
if __name__ == "__main__":
unittest.main()

View File

@ -57,15 +57,25 @@ def test_starting_point_not_in_search_space():
"learning_rate": {
"domain": tune.choice([1e-4, 1e-5]),
},
"per_device_train_batch_size": {
"domain": 2,
},
}
}
automl_settings["starting_points"] = "data:test/nlp/default/"
del automl_settings["fit_kwargs_by_estimator"][this_estimator_name]["model_path"]
automl.fit(X_train, y_train, **automl_settings)
assert (
len(automl._search_states[this_estimator_name].init_config) == 0
) # check that init config is not updated, but search space is updated
assert len(automl._search_states[this_estimator_name].init_config) == len(
automl._search_states[this_estimator_name]._search_space_domain
) - len(automl_settings["custom_hp"][this_estimator_name]), (
"The search space is updated with the custom_hp on {} hyperparameters of "
"the specified estimator without an initial value. Thus a valid init config "
"should only contain the cardinality of the search space minus {}".format(
len(automl_settings["custom_hp"][this_estimator_name]),
len(automl_settings["custom_hp"][this_estimator_name]),
)
)
assert (
automl._search_states[this_estimator_name].search_space["model_path"]
== "albert-base-v2"