n_estimators for catboost

Chi Wang 2021-10-16 10:40:01 -07:00
parent 9e9356f436
commit 7d6e860102
5 changed files with 33 additions and 35 deletions
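In short: instead of threading a separate n_iter value (the early-stopped tree count that catboost reports) through SearchState, train_estimator, and the training log, SearchState.update() now writes that count back into the trial's config as n_estimators. CatBoostEstimator gains an n_estimators entry (fixed at 8192) in its search space so the value round-trips through logged configs, and the retrain and logging paths drop their n_iter arguments.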

View File: flaml/automl.py

@@ -89,6 +89,7 @@ class SearchState:
and starting_point.get(name) is not None
):
self.init_config[name] = starting_point[name]
if isinstance(starting_point, list):
self.init_config = starting_point
self._hp_names = list(self._search_space_domain.keys())
@@ -104,7 +105,6 @@ class SearchState:
self.trained_estimator = None
self.sample_size = None
self.trial_time = 0
self.best_n_iter = None
def update(self, result, time_used, save_model_history=False):
if result:
@@ -122,13 +122,12 @@ class SearchState:
if (
n_iter is not None
and "n_estimators" in config
and n_iter >= self._search_space_domain["n_estimators"].lower
# and n_iter >= self._search_space_domain["n_estimators"].lower
):
config["n_estimators"] = n_iter
n_iter = None
else:
obj, time2eval, trained_estimator = np.inf, 0.0, None
metric_for_logging = config = n_iter = None
metric_for_logging = config = None
self.trial_time = time2eval
self.total_time_used += time_used
self.total_iter += 1
@@ -156,10 +155,8 @@ class SearchState:
self.trained_estimator.cleanup()
if trained_estimator:
self.trained_estimator = trained_estimator
self.best_n_iter = n_iter
self.metric_for_logging = metric_for_logging
self.val_loss, self.config = obj, config
self.n_iter = n_iter
def get_hist_config_sig(self, sample_size, config):
config_values = tuple([config[k] for k in self._hp_names])
@@ -262,9 +259,7 @@ class AutoMLState:
# tune.report(**result)
return result
def _train_with_config(
self, estimator, config_w_resource, sample_size=None, n_iter=None
):
def _train_with_config(self, estimator, config_w_resource, sample_size=None):
if not sample_size:
sample_size = config_w_resource.get(
"FLAML_sample_size", len(self.y_train_all)
@@ -301,7 +296,6 @@ class AutoMLState:
self.n_jobs,
self.learner_classes.get(estimator),
budget,
n_iter,
self.fit_kwargs,
)
if sampled_weight is not None:
@@ -1030,7 +1024,7 @@ class AutoML:
self._state.time_budget = None
self._state.n_jobs = n_jobs
self._trained_estimator = self._state._train_with_config(
best_estimator, best_config, sample_size, best.n_iter
best_estimator, best_config, sample_size
)[0]
logger.info("retrain from log succeeded")
return training_duration
@@ -1731,7 +1725,6 @@ class AutoML:
config,
estimator,
search_state.sample_size,
search_state.n_iter,
)
def _search_sequential(self):
@@ -1953,7 +1946,6 @@ class AutoML:
search_state.config,
estimator,
search_state.sample_size,
search_state.n_iter,
)
if mlflow is not None and mlflow.active_run():
with mlflow.start_run(nested=True):
@@ -2031,7 +2023,6 @@ class AutoML:
self._best_estimator,
state.best_config,
self.data_size_full,
state.best_n_iter,
)
logger.info(
"retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
@@ -2144,7 +2135,6 @@ class AutoML:
self._best_estimator,
state.best_config,
self.data_size_full,
state.best_n_iter,
)
logger.info(
"retrain {} for {:.1f}s".format(

View File: flaml/ml.py

@@ -465,14 +465,11 @@ def train_estimator(
n_jobs=1,
estimator_class=None,
budget=None,
n_iter=None,
fit_kwargs={},
):
start_time = time.time()
estimator_class = estimator_class or get_estimator_class(task, estimator_name)
estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
if n_iter is not None:
estimator.params["n_estimators"] = n_iter
if X_train is not None:
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
else:
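With the n_iter parameter removed, callers now put the tree count into config_dic up front. A minimal usage sketch, assuming keyword names taken from the function body above (X_train, y_train, config_dic, task, estimator_name) and illustrative hyperparameter values:

import numpy as np
from flaml.ml import train_estimator

X = np.random.rand(100, 4)
y = np.random.rand(100)
# No post-hoc estimator.params["n_estimators"] override any more;
# the count rides along inside config_dic.
model, train_time = train_estimator(
    X_train=X,
    y_train=y,
    config_dic={"learning_rate": 0.1, "n_estimators": 10},
    task="regression",
    estimator_name="lgbm",
)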

View File: flaml/model.py

@@ -645,11 +645,15 @@ class CatBoostEstimator(BaseEstimator):
"domain": tune.loguniform(lower=0.005, upper=0.2),
"init_value": 0.1,
},
"n_estimators": {
"domain": 8192,
"init_value": 8192,
},
}
@classmethod
def size(cls, config):
n_estimators = 8192
n_estimators = config.get("n_estimators", 8192)
max_leaves = 64
return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8
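For scale, size() estimates the model's in-memory footprint in bytes from the leaf and tree counts; a quick arithmetic check with the new defaults:

n_estimators, max_leaves = 8192, 64
size_bytes = (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8
print(f"{size_bytes:,.0f} bytes")  # 29,163,520 bytes, roughly 28 MiB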

View File: flaml/training_log.py

@@ -23,7 +23,6 @@ class TrainingLogRecord(object):
config: dict,
learner: str,
sample_size: int,
n_iter: int,
):
self.record_id = record_id
self.iter_per_learner = iter_per_learner
@@ -34,7 +33,6 @@ class TrainingLogRecord(object):
self.config = config
self.learner = learner
self.sample_size = sample_size
self.n_iter = n_iter # n_estimators for catboost
def dump(self, fp: IO[str]):
d = vars(self)
@@ -79,7 +77,6 @@ class TrainingLogWriter(object):
config,
learner,
sample_size,
n_iter,
):
if self.file is None:
raise IOError("Call open() to open the output file first.")
@@ -95,7 +92,6 @@ class TrainingLogWriter(object):
config,
learner,
sample_size,
n_iter,
)
if (
validation_loss < self.current_best_loss
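With n_iter gone from the record schema, consumers of the log read the chosen tree count from the stored config instead. A usage sketch with training_log_reader, which the test below imports (the log path is the test's default):

from flaml.training_log import training_log_reader

with training_log_reader("test_training_log.log") as reader:
    for record in reader.records():
        # The tree count now lives inside record.config;
        # there is no record.n_iter field any more.
        print(record.learner, record.config.get("n_estimators"))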

View File: test/test_training_log.py

@@ -9,7 +9,7 @@ from flaml.training_log import training_log_reader
class TestTrainingLog(unittest.TestCase):
def test_training_log(self, path="test_training_log.log"):
def test_training_log(self, path="test_training_log.log", estimator_list="auto"):
with TemporaryDirectory() as d:
filename = os.path.join(d, path)
@@ -27,8 +27,9 @@ class TestTrainingLog(unittest.TestCase):
"model_history": True,
"train_time_limit": 0.1,
"verbose": 3,
"ensemble": True,
# "ensemble": True,
"keep_search_state": True,
"estimator_list": estimator_list,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
@@ -37,31 +38,34 @@ class TestTrainingLog(unittest.TestCase):
if automl.best_estimator:
estimator, config = automl.best_estimator, automl.best_config
model0 = automl.best_model_for_estimator(estimator)
print(model0.params["n_estimators"], model0.estimator)
print(model0.params["n_estimators"], config)
# train on full data with no time limit
automl._state.time_budget = None
model, _ = automl._state._train_with_config(estimator, config)
print(model.estimator)
# model0 and model are equivalent unless model0's n_estimators is out of the search space range
assert (
str(model0.estimator) == str(model.estimator)
or model0.params["n_estimators"] < 4
)
# assuming estimator & config are saved and loaded as follows
automl = AutoML()
automl.fit(
X_train=X_train,
y_train=y_train,
max_iter=0,
max_iter=1,
task="regression",
estimator_list=[estimator],
n_jobs=1,
starting_points={estimator: config},
)
print(automl.best_config)
# then the fitted model should be equivalent to model
# print(str(model.estimator), str(automl.model.estimator))
assert str(model.estimator) == str(automl.model.estimator)
assert (
str(model.estimator) == str(automl.model.estimator)
or estimator == "xgboost"
and str(model.estimator.get_dump())
== str(automl.model.estimator.get_dump())
or estimator == "catboost"
and str(model.estimator.get_all_params())
== str(automl.model.estimator.get_all_params())
)
with training_log_reader(filename) as reader:
count = 0
@@ -83,3 +87,10 @@ class TestTrainingLog(unittest.TestCase):
print("IsADirectoryError happens as expected in linux.")
except PermissionError:
print("PermissionError happens as expected in windows.")
def test_each_estimator(self):
self.test_training_log(estimator_list=["xgboost"])
self.test_training_log(estimator_list=["catboost"])
self.test_training_log(estimator_list=["extra_tree"])
self.test_training_log(estimator_list=["rf"])
self.test_training_log(estimator_list=["lgbm"])