Merge branch 'main' into main

zsk 2022-08-21 13:58:18 -04:00 committed by GitHub
commit 126c41f3ee
8 changed files with 97 additions and 37 deletions

View File

@@ -627,6 +627,8 @@ class AutoML(BaseEstimator):
            keep_search_state: boolean, default=False | Whether to keep data needed
                for model search after fit(). By default the state is deleted for
                space saving.
+            preserve_checkpoint: boolean, default=True | Whether to preserve the saved checkpoint
+                on disk when deleting automl. By default the checkpoint is preserved.
            early_stop: boolean, default=False | Whether to stop early if the
                search is considered to converge.
            append_log: boolean, default=False | Whether to directly append the log
@@ -726,6 +728,7 @@ class AutoML(BaseEstimator):
        settings["starting_points"] = settings.get("starting_points", "static")
        settings["n_concurrent_trials"] = settings.get("n_concurrent_trials", 1)
        settings["keep_search_state"] = settings.get("keep_search_state", False)
+        settings["preserve_checkpoint"] = settings.get("preserve_checkpoint", True)
        settings["early_stop"] = settings.get("early_stop", False)
        settings["append_log"] = settings.get("append_log", False)
        settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN)
@@ -1576,6 +1579,7 @@ class AutoML(BaseEstimator):
        auto_augment=None,
        custom_hp=None,
        skip_transform=None,
+        preserve_checkpoint=True,
        fit_kwargs_by_estimator=None,
        **fit_kwargs,
    ):
@@ -1704,10 +1708,19 @@ class AutoML(BaseEstimator):
        self._state.fit_kwargs = fit_kwargs
        self._state.custom_hp = custom_hp or self._settings.get("custom_hp")
-        self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform
+        self._skip_transform = (
+            self._settings.get("skip_transform")
+            if skip_transform is None
+            else skip_transform
+        )
        self._state.fit_kwargs_by_estimator = (
            fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
        )
+        self.preserve_checkpoint = (
+            self._settings.get("preserve_checkpoint")
+            if preserve_checkpoint is None
+            else preserve_checkpoint
+        )
        self._validate_data(X_train, y_train, dataframe, label, groups=groups)
        logger.info("log file name {}".format(log_file_name))
@@ -2123,6 +2136,7 @@ class AutoML(BaseEstimator):
        seed=None,
        n_concurrent_trials=None,
        keep_search_state=None,
+        preserve_checkpoint=True,
        early_stop=None,
        append_log=None,
        auto_augment=None,
@@ -2303,6 +2317,8 @@ class AutoML(BaseEstimator):
            keep_search_state: boolean, default=False | Whether to keep data needed
                for model search after fit(). By default the state is deleted for
                space saving.
+            preserve_checkpoint: boolean, default=True | Whether to preserve the saved checkpoint
+                on disk when deleting automl. By default the checkpoint is preserved.
            early_stop: boolean, default=False | Whether to stop early if the
                search is considered to converge.
            append_log: boolean, default=False | Whether to directly append the log
@@ -2464,6 +2480,11 @@ class AutoML(BaseEstimator):
            if keep_search_state is None
            else keep_search_state
        )
+        self.preserve_checkpoint = (
+            self._settings.get("preserve_checkpoint")
+            if preserve_checkpoint is None
+            else preserve_checkpoint
+        )
        early_stop = (
            self._settings.get("early_stop") if early_stop is None else early_stop
        )
@@ -2513,7 +2534,11 @@ class AutoML(BaseEstimator):
        self._state.fit_kwargs = fit_kwargs
        custom_hp = custom_hp or self._settings.get("custom_hp")
-        self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform
+        self._skip_transform = (
+            self._settings.get("skip_transform")
+            if skip_transform is None
+            else skip_transform
+        )
        fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get(
            "fit_kwargs_by_estimator"
        )
@@ -3566,7 +3591,8 @@ class AutoML(BaseEstimator):
            and self._trained_estimator
            and hasattr(self._trained_estimator, "cleanup")
        ):
-            self._trained_estimator.cleanup()
+            if self.preserve_checkpoint is False:
+                self._trained_estimator.cleanup()
            del self._trained_estimator

    def _select_estimator(self, estimator_list):
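The changes above add a `preserve_checkpoint` flag to `AutoML` (settable in the constructor settings or as a `fit()` argument, default `True`); the trained estimator's `cleanup()` is now called only when the flag is `False`, so a saved checkpoint stays on disk by default when the object is deleted. Below is a minimal, hypothetical usage sketch; the dataset and time budget are illustrative and not from this commit.

```python
from sklearn.datasets import fetch_california_housing
from flaml import AutoML

X_train, y_train = fetch_california_housing(return_X_y=True)

automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="regression",
    time_budget=10,             # illustrative budget in seconds
    preserve_checkpoint=False,  # new flag in this commit; default is True
)
# For estimators that save a checkpoint (e.g. transformers), deleting the object
# now removes the checkpoint only because preserve_checkpoint is False.
del automl
```

With the default `preserve_checkpoint=True`, the same `del` would leave any saved checkpoint untouched.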

View File

@@ -1626,15 +1626,26 @@ class CatBoostEstimator(BaseEstimator):
            cat_features = list(X_train.select_dtypes(include="category").columns)
        else:
            cat_features = []
-        n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
+        use_best_model = kwargs.get("use_best_model", True)
+        n = (
+            max(int(len(y_train) * 0.9), len(y_train) - 1000)
+            if use_best_model
+            else len(y_train)
+        )
        X_tr, y_tr = X_train[:n], y_train[:n]
+        from catboost import Pool, __version__
+        eval_set = (
+            Pool(data=X_train[n:], label=y_train[n:], cat_features=cat_features)
+            if use_best_model
+            else None
+        )
        if "sample_weight" in kwargs:
            weight = kwargs["sample_weight"]
            if weight is not None:
                kwargs["sample_weight"] = weight[:n]
        else:
            weight = None
-        from catboost import Pool, __version__

        model = self.estimator_class(train_dir=train_dir, **self.params)
        if __version__ >= "0.26":
@@ -1642,10 +1653,10 @@ class CatBoostEstimator(BaseEstimator):
                X_tr,
                y_tr,
                cat_features=cat_features,
-                eval_set=Pool(
-                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
-                ),
-                callbacks=CatBoostEstimator._callbacks(start_time, deadline),
+                eval_set=eval_set,
+                callbacks=CatBoostEstimator._callbacks(
+                    start_time, deadline, FREE_MEM_RATIO if use_best_model else None
+                ),
                **kwargs,
            )
        else:
@@ -1653,9 +1664,7 @@ class CatBoostEstimator(BaseEstimator):
                X_tr,
                y_tr,
                cat_features=cat_features,
-                eval_set=Pool(
-                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
-                ),
+                eval_set=eval_set,
                **kwargs,
            )
        shutil.rmtree(train_dir, ignore_errors=True)
@@ -1667,7 +1676,7 @@ class CatBoostEstimator(BaseEstimator):
        return train_time

    @classmethod
-    def _callbacks(cls, start_time, deadline):
+    def _callbacks(cls, start_time, deadline, free_mem_ratio):
        class ResourceLimit:
            def after_iteration(self, info) -> bool:
                now = time.time()
@@ -1675,9 +1684,9 @@ class CatBoostEstimator(BaseEstimator):
                    self._time_per_iter = now - start_time
                if now + self._time_per_iter > deadline:
                    return False
-                if psutil is not None:
+                if psutil is not None and free_mem_ratio is not None:
                    mem = psutil.virtual_memory()
-                    if mem.available / mem.total < FREE_MEM_RATIO:
+                    if mem.available / mem.total < free_mem_ratio:
                        return False
                return True  # can continue
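The CatBoost changes above read a `use_best_model` keyword from the fit kwargs: when it is `False`, no holdout `Pool` is built for `eval_set`, all rows are used for training, and the free-memory callback check is disabled because `free_mem_ratio` is passed as `None`. A hedged sketch of forwarding that kwarg through `AutoML.fit()`, mirroring the regression test updated later in this commit (the dataset is illustrative, and catboost must be installed):

```python
from sklearn.datasets import fetch_california_housing
from flaml import AutoML

X_train, y_train = fetch_california_housing(return_X_y=True)

automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="regression",
    estimator_list=["catboost"],
    max_iter=2,            # keep the illustration cheap
    time_budget=None,
    use_best_model=False,  # forwarded via **fit_kwargs to CatBoostEstimator.fit
)
```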

View File

@@ -1 +1 @@
-__version__ = "1.0.10"
+__version__ = "1.0.11"

View File

@@ -39,7 +39,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-        "%pip install flaml[notebook]==1.0.8"
+        "%pip install flaml[notebook]==1.0.10"
    ]
   },
   {
@@ -651,6 +651,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+        "# uncomment the following line if optuna is not installed\n",
        "# %pip install optuna==2.8.0"
    ]
   },

View File

@@ -98,8 +98,8 @@ class TestRegression(unittest.TestCase):
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
-        automl_experiment = AutoML()
-        automl_settings = {
+        automl = AutoML()
+        settings = {
            "time_budget": 2,
            "metric": "mae",
            "task": "regression",
@@ -110,23 +110,34 @@ class TestRegression(unittest.TestCase):
            "verbose": 0,
            "early_stop": True,
        }
-        automl_experiment.fit(
-            X_train=X_train,
-            y_train=y_train,
-            X_val=X_val,
-            y_val=y_val,
-            **automl_settings
-        )
-        assert automl_experiment._state.X_val.shape == X_val.shape
-        print(automl_experiment.predict(X_train))
-        print(automl_experiment.model)
-        print(automl_experiment.config_history)
-        print(automl_experiment.best_model_for_estimator("rf"))
-        print(automl_experiment.best_iteration)
-        print(automl_experiment.best_estimator)
-        print(automl_experiment.best_config)
-        print(automl_experiment.best_loss)
-        print(automl_experiment.best_config_train_time)
+        automl.fit(
+            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
+        )
+        assert automl._state.X_val.shape == X_val.shape
+        print(automl.predict(X_train))
+        print(automl.model)
+        print(automl.config_history)
+        print(automl.best_model_for_estimator("rf"))
+        print(automl.best_iteration)
+        print(automl.best_estimator)
+        print(automl.best_config)
+        print(automl.best_loss)
+        print(automl.best_config_train_time)
+        settings.update(
+            {
+                "estimator_list": ["catboost"],
+                "keep_search_state": False,
+                "model_history": False,
+                "use_best_model": False,
+                "time_budget": None,
+                "max_iter": 2,
+                "custom_hp": {"catboost": {"n_estimators": {"domain": 100}}},
+            }
+        )
+        automl.fit(
+            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
+        )

    def test_parallel(self, hpo_method=None):
        automl_experiment = AutoML()

View File

@@ -13,6 +13,7 @@ def test_hf_data():
    automl = AutoML()

    automl_settings = get_automl_settings()
+    automl_settings["preserve_checkpoint"] = False

    try:
        automl.fit(
@@ -68,6 +69,8 @@ def test_hf_data():
    automl.predict_proba(X_test)
    print(automl.classes_)

+    del automl

if __name__ == "__main__":
    test_hf_data()

View File

@@ -1,8 +1,15 @@
 # Frequently Asked Questions

+### [Guidelines on how to set a hyperparameter search space](Use-Cases/Tune-User-Defined-Function#details-and-guidelines-on-hyperparameter-search-space)
+
+### [Guidelines on parallel vs sequential tuning](Use-Cases/Task-Oriented-AutoML#guidelines-on-parallel-vs-sequential-tuning)
+
+### [Guidelines on creating and tuning a custom estimator](Use-Cases/Task-Oriented-AutoML#guidelines-on-tuning-a-custom-estimator)
+
 ### About `low_cost_partial_config` in `tune`.

 - Definition and purpose: The `low_cost_partial_config` is a dictionary of a subset of the hyperparameter coordinates whose value corresponds to a configuration with known low cost (i.e., low computation cost for training the corresponding model). The concept of low/high cost is meaningful in the case where a subset of the hyperparameters to tune directly affects the computation cost for training the model. For example, `n_estimators` and `max_leaves` are known to affect the training cost of tree-based learners. We call this subset of hyperparameters *cost-related hyperparameters*. In such scenarios, if you are aware of low-cost configurations for the cost-related hyperparameters, you are recommended to set them as the `low_cost_partial_config`. Using the tree-based method example again, since we know that small `n_estimators` and `max_leaves` generally correspond to simpler models and thus lower cost, we set `{'n_estimators': 4, 'max_leaves': 4}` as the `low_cost_partial_config` by default (note that `4` is the lower bound of the search space for these two hyperparameters), e.g., in [LGBM](https://github.com/microsoft/FLAML/blob/main/flaml/model.py#L215). Configuring `low_cost_partial_config` helps the search algorithms make more cost-efficient choices.
 In AutoML, the `low_cost_init_value` in the `search_space()` function for each estimator serves the same role.
 - Usage in practice: It is recommended to configure it if there are cost-related hyperparameters in your tuning task and you happen to know the low-cost values for them, but it is not required (it is fine to leave it at the default value, i.e., `None`).
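As an illustration of the guidance above (a sketch added here for clarity, not part of the FAQ diff), the snippet below tunes two cost-related hyperparameters with `flaml.tune` and supplies their known cheap values via `low_cost_partial_config`; the evaluation function is a made-up stand-in for real model training:

```python
from flaml import tune


def evaluate_config(config):
    # Stand-in objective: pretend cost grows with n_estimators and max_leaves
    # and return a metric for the tuner to minimize.
    score = (config["n_estimators"] - 50) ** 2 + (config["max_leaves"] - 20) ** 2
    return {"score": score}


analysis = tune.run(
    evaluate_config,
    config={
        "n_estimators": tune.lograndint(lower=4, upper=1000),
        "max_leaves": tune.lograndint(lower=4, upper=1000),
    },
    low_cost_partial_config={"n_estimators": 4, "max_leaves": 4},  # known cheap values
    metric="score",
    mode="min",
    num_samples=20,
)
print(analysis.best_config)
```

Leaving `low_cost_partial_config` as `None` is also valid; the search then simply starts without a cost-aware hint.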

View File

@@ -125,8 +125,9 @@ The estimator list can contain one or more estimator names, each corresponding t
 - tuning an estimator that is not built-in;
 - customizing search space for a built-in estimator.

-To tune a custom estimator that is not built-in, you need to:
+#### Guidelines on tuning a custom estimator
+To tune a custom estimator that is not built-in, you need to:

 1. Build a custom estimator by inheriting [`flaml.model.BaseEstimator`](../reference/model#baseestimator-objects) or a derived class.
 For example, if you have an estimator class with scikit-learn style `fit()` and `predict()` functions, you only need to set `self.estimator_class` to be that class in your constructor.
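A hedged sketch of step 1 (added here for illustration, not part of this commit's diff): a custom estimator can wrap a scikit-learn style model by setting `self.estimator_class` and declaring a `search_space()`. `KernelRidge` and the `alpha` range are arbitrary illustrative choices.

```python
from flaml import tune
from flaml.model import SKLearnEstimator


class MyKernelRidgeEstimator(SKLearnEstimator):
    """Illustrative custom estimator wrapping sklearn's KernelRidge."""

    def __init__(self, task="regression", **config):
        super().__init__(task, **config)
        from sklearn.kernel_ridge import KernelRidge

        # Point FLAML at the scikit-learn style class with fit()/predict().
        self.estimator_class = KernelRidge

    @classmethod
    def search_space(cls, data_size, task):
        # Hyperparameters to tune, in FLAML's {"domain", "init_value"} format.
        return {
            "alpha": {
                "domain": tune.loguniform(lower=1e-6, upper=1.0),
                "init_value": 1.0,
            },
        }
```

The class can then be registered with `automl.add_learner()` and listed in `estimator_list`, as in the sketch after the monotonic-constraint example below.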
@@ -280,7 +281,9 @@ Some constraints on the estimator can be implemented via the custom learner. For
 class MonotonicXGBoostEstimator(XGBoostSklearnEstimator):
     @classmethod
     def search_space(**args):
-        return super().search_space(**args).update({"monotone_constraints": "(1, -1)"})
+        space = super().search_space(**args)
+        space.update({"monotone_constraints": {"domain": "(1, -1)"}})
+        return space
```

It adds a monotonicity constraint to XGBoost. This approach can be used to set any constraint that is an argument in the underlying estimator's constructor.
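A hedged usage sketch (not part of this commit's diff) of plugging a custom learner such as `MonotonicXGBoostEstimator` into AutoML via `add_learner`; it assumes xgboost is installed, and the two-feature synthetic dataset is chosen only so the "(1, -1)" constraint matches the number of columns:

```python
from sklearn.datasets import make_regression
from flaml import AutoML

# Two features so the "(1, -1)" monotone constraint lines up with the columns.
X_train, y_train = make_regression(n_samples=200, n_features=2, noise=0.1, random_state=0)

automl = AutoML()
# MonotonicXGBoostEstimator is the class defined in the snippet above.
automl.add_learner(learner_name="monotonic_xgb", learner_class=MonotonicXGBoostEstimator)
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="regression",
    estimator_list=["monotonic_xgb"],
    time_budget=10,  # small illustrative budget in seconds
)
```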