Mirror of https://github.com/microsoft/autogen.git, synced 2025-09-08 15:56:13 +00:00
Allow FLAML_sample_size in starting_points (#619)

* FLAML_sample_size
* clean up
* starting_points as a list
* catch AssertionError
* per estimator sample size
* import
* per estimator min_sample_size
* Update flaml/automl.py
* Update test/automl/test_warmstart.py
* add warnings
* adding more tests
* fix a bug in validating starting points
* improve test
* revise test
* revise test
* documentation about custom_hp
* doc and efficiency
* update test

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
This commit is contained in:
parent 6cb6a2a19a
commit b7846048dc

flaml/automl.py (112 changed lines)
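In user-facing terms, the change lets a starting point passed to `AutoML.fit` carry a `FLAML_sample_size` key, which seeds the sample-size (resource) dimension for that estimator. A minimal sketch of the intended usage, modeled on the tests added below — the synthetic dataset, budget, and hyperparameter values here are illustrative, not from the commit:

```python
from flaml import AutoML
from sklearn.datasets import make_classification

# Synthetic stand-in data; the commit's own tests use an OpenML dataset.
X_train, y_train = make_classification(n_samples=100000, n_features=20, random_state=0)

automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    time_budget=10,
    # New in this commit: a per-estimator starting point may carry
    # "FLAML_sample_size", seeding the sample-size (resource) dimension
    # for that estimator instead of the global min_sample_size.
    starting_points={
        "xgboost": {"n_estimators": 4, "max_leaves": 4, "FLAML_sample_size": 20000},
        "lrl1": None,
    },
)
```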
flaml/automl.py:

```diff
@@ -102,13 +102,10 @@ class SearchState:
             return True

     def valid_starting_point(self, starting_point, search_space):
-        return any(
-            [
-                self.valid_starting_point_one_dim(
-                    value, search_space[name].get("domain")
-                )
-                for name, value in starting_point.items()
-            ]
+        return all(
+            self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
+            for name, value in starting_point.items()
+            if name != "FLAML_sample_size"
         )

     def __init__(
```
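Two fixes land in `valid_starting_point`: `any` becomes `all` (a point is valid only if every supplied hyperparameter lies in its domain — the "fix a bug in validating starting points" item above), and `FLAML_sample_size` is skipped because it is a resource attribute, not a dimension of the estimator's search space. A standalone illustration of why `any` was wrong; the one-dimensional check here is a hypothetical stand-in, not FLAML's `valid_starting_point_one_dim`:

```python
# Hypothetical stand-in for SearchState.valid_starting_point_one_dim:
def valid_one_dim(value, domain):
    return value in domain

point = {"n_estimators": 4, "max_leaves": -1, "FLAML_sample_size": 20000}
domains = {"n_estimators": range(1, 33), "max_leaves": range(4, 33)}

checks = [
    valid_one_dim(value, domains[name])
    for name, value in point.items()
    if name != "FLAML_sample_size"  # resource attribute, not a search dimension
]
print(any(checks))  # True: the old check accepts a point with an invalid max_leaves
print(all(checks))  # False: the fixed check rejects it
```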
````diff
@@ -656,9 +653,17 @@ class AutoML(BaseEstimator):
             the automl constructor, flaml will automatically (and under the hood)
             add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
             specifies a constraint about the prediction latency constraint in seconds.
-        custom_hp: dict, default=None | The custom search space specified by user
-            Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
-            domain of the custom search space can either be a value of a sample.Domain object.
+        custom_hp: dict, default=None | The custom search space specified by user.
+            It is a nested dict with keys being the estimator names, and values being dicts
+            per estimator search space. In the per estimator search space dict,
+            the keys are the hyperparameter names, and values are dicts of info ("domain",
+            "init_value", and "low_cost_init_value") about the search space associated with
+            the hyperparameter (i.e., per hyperparameter search space dict). When custom_hp
+            is provided, the built-in search space which is also a nested dict of per estimator
+            search space dict, will be updated with custom_hp. Note that during this nested dict update,
+            the per hyperparameter search space dicts will be replaced (instead of updated) by the ones
+            provided in custom_hp. Note that the value for "domain" can either be a constant
+            or a sample.Domain object.
             e.g.,

             ```python
````
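The docstring's own example is cut off at the hunk boundary. The key semantics: `custom_hp` merges into the built-in nested space at the estimator level, but each per-hyperparameter dict is replaced wholesale. A hedged usage sketch consistent with that description — the specific entries are assumptions drawn from the tests in this commit, and the constant-domain form mirrors the NLP test below:

```python
from flaml import AutoML, tune
from sklearn.datasets import make_classification

X_train, y_train = make_classification(n_samples=10000, random_state=0)

custom_hp = {
    "xgboost": {
        # Replaces FLAML's built-in per-hyperparameter dict for n_estimators
        # wholesale (so e.g. the built-in "low_cost_init_value" is dropped).
        "n_estimators": {
            "domain": tune.choice([10, 20]),
            "init_value": 10,
        },
        # "domain" may also be a constant, pinning the hyperparameter.
        "colsample_bytree": {"domain": 1.0},
    }
}
automl = AutoML()
automl.fit(X_train, y_train, task="classification", time_budget=10, custom_hp=custom_hp)
```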
```diff
@@ -2430,18 +2435,68 @@ class AutoML(BaseEstimator):
             eval_method == "holdout" and self._state.X_val is None
         )
         self._auto_augment = auto_augment
-        self._min_sample_size = min_sample_size
+
+        _sample_size_from_starting_points = {}
+        if isinstance(starting_points, dict):
+            for _estimator, _point_per_estimator in starting_points.items():
+                sample_size = (
+                    _point_per_estimator
+                    and isinstance(_point_per_estimator, dict)
+                    and _point_per_estimator.get("FLAML_sample_size")
+                )
+                if sample_size:
+                    _sample_size_from_starting_points[_estimator] = sample_size
+                elif _point_per_estimator and isinstance(_point_per_estimator, list):
+                    _sample_size_set = set(
+                        [
+                            config["FLAML_sample_size"]
+                            for config in _point_per_estimator
+                            if "FLAML_sample_size" in config
+                        ]
+                    )
+                    if _sample_size_set:
+                        _sample_size_from_starting_points[_estimator] = min(
+                            _sample_size_set
+                        )
+                    if len(_sample_size_set) > 1:
+                        logger.warning(
+                            "Using the min FLAML_sample_size of all the provided starting points for estimator {}. (Provided FLAML_sample_size are: {})".format(
+                                _estimator, _sample_size_set
+                            )
+                        )
+
+        if not sample and isinstance(starting_points, dict):
+            assert (
+                not _sample_size_from_starting_points
+            ), "When subsampling is disabled, do not include FLAML_sample_size in the starting point."
+        self._min_sample_size = _sample_size_from_starting_points or min_sample_size
+        self._min_sample_size_input = min_sample_size
         self._prepare_data(eval_method, split_ratio, n_splits)

-        self._sample = (
-            sample
-            and task != "rank"
-            and eval_method != "cv"
-            and (
-                self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
-                < self._state.data_size[0]
-            )
-        )
+        if isinstance(self._min_sample_size, dict):
+            self._sample = {
+                (
+                    k,
+                    sample
+                    and task != "rank"
+                    and eval_method != "cv"
+                    and (
+                        self._min_sample_size[k] * SAMPLE_MULTIPLY_FACTOR
+                        < self._state.data_size[0]
+                    ),
+                )
+                for k in self._min_sample_size.keys()
+            }
+        else:
+            self._sample = (
+                sample
+                and task != "rank"
+                and eval_method != "cv"
+                and (
+                    self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
+                    < self._state.data_size[0]
+                )
+            )
         if "auto" == metric:
             if _is_nlp_task(self._state.task):
                 from .nlp.utils import load_default_huggingface_metric_for_task
```
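This block normalizes the two accepted shapes of a starting point — a single config dict, or a list of config dicts — into a per-estimator minimum sample size. A standalone re-trace of that extraction logic (no FLAML import needed; the configs are invented for illustration):

```python
starting_points = {
    "xgboost": {"n_estimators": 4, "FLAML_sample_size": 20000},  # single config
    "lgbm": [  # list of configs: the min FLAML_sample_size wins (with a warning)
        {"n_estimators": 4, "FLAML_sample_size": 40000},
        {"n_estimators": 8, "FLAML_sample_size": 10000},
    ],
    "lrl1": None,  # no starting point for this estimator
}

sizes = {}
for est, point in starting_points.items():
    if point and isinstance(point, dict) and point.get("FLAML_sample_size"):
        sizes[est] = point["FLAML_sample_size"]
    elif point and isinstance(point, list):
        size_set = {c["FLAML_sample_size"] for c in point if "FLAML_sample_size" in c}
        if size_set:
            sizes[est] = min(size_set)

print(sizes)  # {'xgboost': 20000, 'lgbm': 10000}
# sizes (or min_sample_size when empty) becomes self._min_sample_size;
# the original min_sample_size argument is kept as self._min_sample_size_input.
```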
```diff
@@ -2752,6 +2807,16 @@ class AutoML(BaseEstimator):
         self._state.time_from_start = time.time() - self._start_time_flag
         time_left = self._state.time_budget - self._state.time_from_start
         if self._hpo_method != "optuna":
+            min_resource = self.min_resource
+            if isinstance(min_resource, dict):
+                _min_resource_set = set(min_resource.values())
+                min_resource_all_estimator = min(_min_resource_set)
+                if len(_min_resource_set) > 1:
+                    logger.warning(
+                        "Using the min FLAML_sample_size of all the provided starting points as the starting sample size in the case of parallel search."
+                    )
+            else:
+                min_resource_all_estimator = min_resource
             search_alg = SearchAlgo(
                 metric="val_loss",
                 space=space,
```
```diff
@@ -2759,7 +2824,7 @@ class AutoML(BaseEstimator):
                 points_to_evaluate=self.points_to_evaluate,
                 cat_hp_cost=self.cat_hp_cost,
                 resource_attr=self.resource_attr,
-                min_resource=self.min_resource,
+                min_resource=min_resource_all_estimator,
                 max_resource=self.max_resource,
                 config_constraints=[
                     (partial(size, self._state), "<=", self._mem_thres)
```
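For parallel (non-optuna) search there is a single search algorithm shared across estimators, so a per-estimator `min_resource` dict has to collapse to one number; these two hunks take the minimum and warn when the values differ. The collapse in isolation (values are illustrative):

```python
min_resource = {"catboost": 10000, "xgboost": 20000}

if isinstance(min_resource, dict):
    _min_resource_set = set(min_resource.values())
    min_resource_all_estimator = min(_min_resource_set)
    if len(_min_resource_set) > 1:
        print("warning: using the min FLAML_sample_size across starting points")
else:
    min_resource_all_estimator = min_resource

print(min_resource_all_estimator)  # 10000, passed to SearchAlgo(min_resource=...)
```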
```diff
@@ -2947,7 +3012,12 @@ class AutoML(BaseEstimator):
             search_space = search_state.search_space
             if self._sample:
                 resource_attr = "FLAML_sample_size"
-                min_resource = self._min_sample_size
+                min_resource = (
+                    self._min_sample_size[estimator]
+                    if isinstance(self._min_sample_size, dict)
+                    and estimator in self._min_sample_size
+                    else self._min_sample_size_input
+                )
                 max_resource = self._state.data_size[0]
             else:
                 resource_attr = min_resource = max_resource = None
```
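Sequential search, by contrast, resolves the starting resource per estimator: the estimator's own `FLAML_sample_size` if one was supplied, otherwise the user-facing `min_sample_size` argument (10000 by default at the time of this commit). In isolation:

```python
_min_sample_size = {"catboost": 10000}  # extracted from starting_points
_min_sample_size_input = 10000          # the fit(min_sample_size=...) argument

for estimator in ("catboost", "xgboost"):
    min_resource = (
        _min_sample_size[estimator]
        if isinstance(_min_sample_size, dict) and estimator in _min_sample_size
        else _min_sample_size_input
    )
    print(estimator, min_resource)  # catboost 10000, xgboost 10000
```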
The remaining hunks touch the test suite.

```diff
@@ -474,8 +474,11 @@ class TestMultiClass(unittest.TestCase):
         starting_points = {}
         log_file_name = automl_settings["log_file_name"]
         with training_log_reader(log_file_name) as reader:
+            sample_size = 1000
             for record in reader.records():
                 config = record.config
+                config["FLAML_sample_size"] = sample_size
+                sample_size += 1000
                 learner = record.learner
                 if learner not in starting_points:
                     starting_points[learner] = []
```
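The change above exercises the list form: each logged config gets a distinct `FLAML_sample_size` before being appended to `starting_points[learner]`. The resulting shape, sketched with made-up values:

```python
# Shape produced by the loop above (hyperparameter values invented here):
starting_points = {
    "lgbm": [
        {"n_estimators": 4, "num_leaves": 4, "FLAML_sample_size": 1000},
        {"n_estimators": 23, "num_leaves": 14, "FLAML_sample_size": 2000},
    ],
}
# Per the extraction logic in flaml/automl.py above, lgbm would start from
# min(1000, 2000) == 1000 samples, with a warning about the differing sizes.
```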
test/automl/test_warmstart.py:

```diff
@@ -123,6 +123,102 @@ class TestWarmStart(unittest.TestCase):
         automl.fit(X_train, y_train)
         print(automl.best_config_per_estimator)

+    def test_FLAML_sample_size_in_starting_points(self):
+        from flaml.data import load_openml_dataset
+        from flaml import AutoML
+
+        X_train, X_test, y_train, y_test = load_openml_dataset(
+            dataset_id=1169, data_dir="./"
+        )
+
+        automl_settings = {
+            "time_budget": 3,
+            "task": "classification",
+        }
+
+        automl1 = AutoML()
+        print(len(y_train))
+        automl1.fit(X_train, y_train, **automl_settings)
+        print("automl1.best_config_per_estimator", automl1.best_config_per_estimator)
+
+        automl_settings["starting_points"] = automl1.best_config_per_estimator
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        automl_settings["starting_points"] = {
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
+                "learning_rate": 0.25912534572860507,
+                "subsample": 0.9266743941610592,
+                "colsample_bylevel": 1.0,
+                "colsample_bytree": 1.0,
+                "reg_alpha": 0.0013933617380144255,
+                "reg_lambda": 0.18096917948292954,
+                "FLAML_sample_size": 20000,
+            },
+            "xgb_limitdepth": None,
+            "lrl1": None,
+        }
+        from flaml import tune
+
+        automl_settings["custom_hp"] = {
+            "xgboost": {
+                "n_estimators": {
+                    "domain": tune.choice([10, 20]),
+                },
+            }
+        }
+        automl2 = AutoML()
+        automl2.fit(X_train, y_train, **automl_settings)
+
+        try:
+            import ray
+
+            automl_settings["n_concurrent_trials"] = 2
+        except ImportError:
+            automl_settings["n_concurrent_trials"] = 1
+        # setting different FLAML_sample_size
+        automl_settings["starting_points"] = {
+            "catboost": {
+                "early_stopping_rounds": 10,
+                "learning_rate": 0.09999999999999996,
+                "n_estimators": 1,
+                "FLAML_sample_size": 10000,
+            },
+            "xgboost": {
+                "n_estimators": 4,
+                "max_leaves": 4,
+                "min_child_weight": 0.26208115308159446,
+                "learning_rate": 0.25912534572860507,
+                "subsample": 0.9266743941610592,
+                "colsample_bylevel": 1.0,
+                "colsample_bytree": 1.0,
+                "reg_alpha": 0.0013933617380144255,
+                "reg_lambda": 0.18096917948292954,
+                "FLAML_sample_size": 20000,
+            },
+            "xgb_limitdepth": None,
+            "lrl1": None,
+        }
+        automl3 = AutoML()
+        automl3.fit(X_train, y_train, **automl_settings)
+
+        automl_settings["sample"] = False
+        automl4 = AutoML()
+        try:
+            automl4.fit(
+                X_train,
+                y_train,
+                **automl_settings,
+            )
+            raise RuntimeError(
+                "When sample=False and starting_points contain FLAML_sample_size, AssertionError is expected but not raised."
+            )
+        except AssertionError:
+            pass
+
+
 if __name__ == "__main__":
     unittest.main()
```
```diff
@@ -57,15 +57,25 @@ def test_starting_point_not_in_search_space():
            "learning_rate": {
                "domain": tune.choice([1e-4, 1e-5]),
            },
+            "per_device_train_batch_size": {
+                "domain": 2,
+            },
        }
    }
    automl_settings["starting_points"] = "data:test/nlp/default/"
    del automl_settings["fit_kwargs_by_estimator"][this_estimator_name]["model_path"]

    automl.fit(X_train, y_train, **automl_settings)
-    assert (
-        len(automl._search_states[this_estimator_name].init_config) == 0
-    )  # check that init config is not updated, but search space is updated
+    assert len(automl._search_states[this_estimator_name].init_config) == len(
+        automl._search_states[this_estimator_name]._search_space_domain
+    ) - len(automl_settings["custom_hp"][this_estimator_name]), (
+        "The search space is updated with the custom_hp on {} hyperparameters of "
+        "the specified estimator without an initial value. Thus a valid init config "
+        "should only contain the cardinality of the search space minus {}".format(
+            len(automl_settings["custom_hp"][this_estimator_name]),
+            len(automl_settings["custom_hp"][this_estimator_name]),
+        )
+    )
    assert (
        automl._search_states[this_estimator_name].search_space["model_path"]
        == "albert-base-v2"
```
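The strengthened assertion encodes a counting argument: a `custom_hp` entry supplied without an "init_value" drops out of the init config, so a valid init config should have as many entries as there are search dimensions minus the number of overridden hyperparameters. Schematically, with this hunk's two-entry `custom_hp` (the total dimension count below is invented for illustration):

```python
# Schematic arithmetic behind the new assert (dimension count is illustrative):
n_dimensions = 5  # stand-in for len(automl._search_states[est]._search_space_domain)
n_overridden = 2  # len(automl_settings["custom_hp"][est]) in this test
assert n_dimensions - n_overridden == 3  # expected len(init_config)
```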