n_estimators for catboost

This commit is contained in:
Chi Wang 2021-10-16 10:40:01 -07:00
parent 9e9356f436
commit 7d6e860102
5 changed files with 33 additions and 35 deletions

View File

@ -89,6 +89,7 @@ class SearchState:
and starting_point.get(name) is not None and starting_point.get(name) is not None
): ):
self.init_config[name] = starting_point[name] self.init_config[name] = starting_point[name]
if isinstance(starting_point, list): if isinstance(starting_point, list):
self.init_config = starting_point self.init_config = starting_point
self._hp_names = list(self._search_space_domain.keys()) self._hp_names = list(self._search_space_domain.keys())
@ -104,7 +105,6 @@ class SearchState:
self.trained_estimator = None self.trained_estimator = None
self.sample_size = None self.sample_size = None
self.trial_time = 0 self.trial_time = 0
self.best_n_iter = None
def update(self, result, time_used, save_model_history=False): def update(self, result, time_used, save_model_history=False):
if result: if result:
@ -122,13 +122,12 @@ class SearchState:
if ( if (
n_iter is not None n_iter is not None
and "n_estimators" in config and "n_estimators" in config
and n_iter >= self._search_space_domain["n_estimators"].lower # and n_iter >= self._search_space_domain["n_estimators"].lower
): ):
config["n_estimators"] = n_iter config["n_estimators"] = n_iter
n_iter = None
else: else:
obj, time2eval, trained_estimator = np.inf, 0.0, None obj, time2eval, trained_estimator = np.inf, 0.0, None
metric_for_logging = config = n_iter = None metric_for_logging = config = None
self.trial_time = time2eval self.trial_time = time2eval
self.total_time_used += time_used self.total_time_used += time_used
self.total_iter += 1 self.total_iter += 1
@ -156,10 +155,8 @@ class SearchState:
self.trained_estimator.cleanup() self.trained_estimator.cleanup()
if trained_estimator: if trained_estimator:
self.trained_estimator = trained_estimator self.trained_estimator = trained_estimator
self.best_n_iter = n_iter
self.metric_for_logging = metric_for_logging self.metric_for_logging = metric_for_logging
self.val_loss, self.config = obj, config self.val_loss, self.config = obj, config
self.n_iter = n_iter
def get_hist_config_sig(self, sample_size, config): def get_hist_config_sig(self, sample_size, config):
config_values = tuple([config[k] for k in self._hp_names]) config_values = tuple([config[k] for k in self._hp_names])
@ -262,9 +259,7 @@ class AutoMLState:
# tune.report(**result) # tune.report(**result)
return result return result
def _train_with_config( def _train_with_config(self, estimator, config_w_resource, sample_size=None):
self, estimator, config_w_resource, sample_size=None, n_iter=None
):
if not sample_size: if not sample_size:
sample_size = config_w_resource.get( sample_size = config_w_resource.get(
"FLAML_sample_size", len(self.y_train_all) "FLAML_sample_size", len(self.y_train_all)
@ -301,7 +296,6 @@ class AutoMLState:
self.n_jobs, self.n_jobs,
self.learner_classes.get(estimator), self.learner_classes.get(estimator),
budget, budget,
n_iter,
self.fit_kwargs, self.fit_kwargs,
) )
if sampled_weight is not None: if sampled_weight is not None:
@ -1030,7 +1024,7 @@ class AutoML:
self._state.time_budget = None self._state.time_budget = None
self._state.n_jobs = n_jobs self._state.n_jobs = n_jobs
self._trained_estimator = self._state._train_with_config( self._trained_estimator = self._state._train_with_config(
best_estimator, best_config, sample_size, best.n_iter best_estimator, best_config, sample_size
)[0] )[0]
logger.info("retrain from log succeeded") logger.info("retrain from log succeeded")
return training_duration return training_duration
@ -1731,7 +1725,6 @@ class AutoML:
config, config,
estimator, estimator,
search_state.sample_size, search_state.sample_size,
search_state.n_iter,
) )
def _search_sequential(self): def _search_sequential(self):
@ -1953,7 +1946,6 @@ class AutoML:
search_state.config, search_state.config,
estimator, estimator,
search_state.sample_size, search_state.sample_size,
search_state.n_iter,
) )
if mlflow is not None and mlflow.active_run(): if mlflow is not None and mlflow.active_run():
with mlflow.start_run(nested=True): with mlflow.start_run(nested=True):
@ -2031,7 +2023,6 @@ class AutoML:
self._best_estimator, self._best_estimator,
state.best_config, state.best_config,
self.data_size_full, self.data_size_full,
state.best_n_iter,
) )
logger.info( logger.info(
"retrain {} for {:.1f}s".format(self._best_estimator, retrain_time) "retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
@ -2144,7 +2135,6 @@ class AutoML:
self._best_estimator, self._best_estimator,
state.best_config, state.best_config,
self.data_size_full, self.data_size_full,
state.best_n_iter,
) )
logger.info( logger.info(
"retrain {} for {:.1f}s".format( "retrain {} for {:.1f}s".format(

View File

@ -465,14 +465,11 @@ def train_estimator(
n_jobs=1, n_jobs=1,
estimator_class=None, estimator_class=None,
budget=None, budget=None,
n_iter=None,
fit_kwargs={}, fit_kwargs={},
): ):
start_time = time.time() start_time = time.time()
estimator_class = estimator_class or get_estimator_class(task, estimator_name) estimator_class = estimator_class or get_estimator_class(task, estimator_name)
estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs) estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
if n_iter is not None:
estimator.params["n_estimators"] = n_iter
if X_train is not None: if X_train is not None:
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs) train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
else: else:

View File

@ -645,11 +645,15 @@ class CatBoostEstimator(BaseEstimator):
"domain": tune.loguniform(lower=0.005, upper=0.2), "domain": tune.loguniform(lower=0.005, upper=0.2),
"init_value": 0.1, "init_value": 0.1,
}, },
"n_estimators": {
"domain": 8192,
"init_value": 8192,
},
} }
@classmethod @classmethod
def size(cls, config): def size(cls, config):
n_estimators = 8192 n_estimators = config.get("n_estimators", 8192)
max_leaves = 64 max_leaves = 64
return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8 return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

View File

@ -23,7 +23,6 @@ class TrainingLogRecord(object):
config: dict, config: dict,
learner: str, learner: str,
sample_size: int, sample_size: int,
n_iter: int,
): ):
self.record_id = record_id self.record_id = record_id
self.iter_per_learner = iter_per_learner self.iter_per_learner = iter_per_learner
@ -34,7 +33,6 @@ class TrainingLogRecord(object):
self.config = config self.config = config
self.learner = learner self.learner = learner
self.sample_size = sample_size self.sample_size = sample_size
self.n_iter = n_iter # n_estimators for catboost
def dump(self, fp: IO[str]): def dump(self, fp: IO[str]):
d = vars(self) d = vars(self)
@ -79,7 +77,6 @@ class TrainingLogWriter(object):
config, config,
learner, learner,
sample_size, sample_size,
n_iter,
): ):
if self.file is None: if self.file is None:
raise IOError("Call open() to open the outpute file first.") raise IOError("Call open() to open the outpute file first.")
@ -95,7 +92,6 @@ class TrainingLogWriter(object):
config, config,
learner, learner,
sample_size, sample_size,
n_iter,
) )
if ( if (
validation_loss < self.current_best_loss validation_loss < self.current_best_loss

View File

@ -9,7 +9,7 @@ from flaml.training_log import training_log_reader
class TestTrainingLog(unittest.TestCase): class TestTrainingLog(unittest.TestCase):
def test_training_log(self, path="test_training_log.log"): def test_training_log(self, path="test_training_log.log", estimator_list="auto"):
with TemporaryDirectory() as d: with TemporaryDirectory() as d:
filename = os.path.join(d, path) filename = os.path.join(d, path)
@ -27,8 +27,9 @@ class TestTrainingLog(unittest.TestCase):
"model_history": True, "model_history": True,
"train_time_limit": 0.1, "train_time_limit": 0.1,
"verbose": 3, "verbose": 3,
"ensemble": True, # "ensemble": True,
"keep_search_state": True, "keep_search_state": True,
"estimator_list": estimator_list,
} }
X_train, y_train = fetch_california_housing(return_X_y=True) X_train, y_train = fetch_california_housing(return_X_y=True)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings) automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
@ -37,31 +38,34 @@ class TestTrainingLog(unittest.TestCase):
if automl.best_estimator: if automl.best_estimator:
estimator, config = automl.best_estimator, automl.best_config estimator, config = automl.best_estimator, automl.best_config
model0 = automl.best_model_for_estimator(estimator) model0 = automl.best_model_for_estimator(estimator)
print(model0.params["n_estimators"], model0.estimator) print(model0.params["n_estimators"], config)
# train on full data with no time limit
automl._state.time_budget = None automl._state.time_budget = None
model, _ = automl._state._train_with_config(estimator, config) model, _ = automl._state._train_with_config(estimator, config)
print(model.estimator)
# model0 and model are equivalent unless model0's n_estimator is out of search space range
assert (
str(model0.estimator) == str(model.estimator)
or model0.params["n_estimators"] < 4
)
# assuming estimator & config are saved and loaded as follows # assuming estimator & config are saved and loaded as follows
automl = AutoML() automl = AutoML()
automl.fit( automl.fit(
X_train=X_train, X_train=X_train,
y_train=y_train, y_train=y_train,
max_iter=0, max_iter=1,
task="regression", task="regression",
estimator_list=[estimator], estimator_list=[estimator],
n_jobs=1, n_jobs=1,
starting_points={estimator: config}, starting_points={estimator: config},
) )
print(automl.best_config)
# then the fitted model should be equivalent to model # then the fitted model should be equivalent to model
# print(str(model.estimator), str(automl.model.estimator)) assert (
assert str(model.estimator) == str(automl.model.estimator) str(model.estimator) == str(automl.model.estimator)
or estimator == "xgboost"
and str(model.estimator.get_dump())
== str(automl.model.estimator.get_dump())
or estimator == "catboost"
and str(model.estimator.get_all_params())
== str(automl.model.estimator.get_all_params())
)
with training_log_reader(filename) as reader: with training_log_reader(filename) as reader:
count = 0 count = 0
@ -83,3 +87,10 @@ class TestTrainingLog(unittest.TestCase):
print("IsADirectoryError happens as expected in linux.") print("IsADirectoryError happens as expected in linux.")
except PermissionError: except PermissionError:
print("PermissionError happens as expected in windows.") print("PermissionError happens as expected in windows.")
def test_each_estimator(self):
self.test_training_log(estimator_list=["xgboost"])
self.test_training_log(estimator_list=["catboost"])
self.test_training_log(estimator_list=["extra_tree"])
self.test_training_log(estimator_list=["rf"])
self.test_training_log(estimator_list=["lgbm"])