n_estimators for catboost

commit 7d6e860102 (parent 9e9356f436)
@@ -89,6 +89,7 @@ class SearchState:
                 and starting_point.get(name) is not None
             ):
                 self.init_config[name] = starting_point[name]
+
         if isinstance(starting_point, list):
             self.init_config = starting_point
         self._hp_names = list(self._search_space_domain.keys())

@@ -104,7 +105,6 @@ class SearchState:
         self.trained_estimator = None
         self.sample_size = None
         self.trial_time = 0
-        self.best_n_iter = None

     def update(self, result, time_used, save_model_history=False):
         if result:

@@ -122,13 +122,12 @@ class SearchState:
             if (
                 n_iter is not None
                 and "n_estimators" in config
-                and n_iter >= self._search_space_domain["n_estimators"].lower
+                # and n_iter >= self._search_space_domain["n_estimators"].lower
             ):
                 config["n_estimators"] = n_iter
-                n_iter = None
         else:
             obj, time2eval, trained_estimator = np.inf, 0.0, None
-            metric_for_logging = config = n_iter = None
+            metric_for_logging = config = None
         self.trial_time = time2eval
         self.total_time_used += time_used
         self.total_iter += 1

@@ -156,10 +155,8 @@ class SearchState:
                     self.trained_estimator.cleanup()
                 if trained_estimator:
                     self.trained_estimator = trained_estimator
-                    self.best_n_iter = n_iter
         self.metric_for_logging = metric_for_logging
         self.val_loss, self.config = obj, config
-        self.n_iter = n_iter

     def get_hist_config_sig(self, sample_size, config):
         config_values = tuple([config[k] for k in self._hp_names])

@@ -262,9 +259,7 @@ class AutoMLState:
         # tune.report(**result)
         return result

-    def _train_with_config(
-        self, estimator, config_w_resource, sample_size=None, n_iter=None
-    ):
+    def _train_with_config(self, estimator, config_w_resource, sample_size=None):
         if not sample_size:
             sample_size = config_w_resource.get(
                 "FLAML_sample_size", len(self.y_train_all)

@@ -301,7 +296,6 @@ class AutoMLState:
             self.n_jobs,
             self.learner_classes.get(estimator),
             budget,
-            n_iter,
             self.fit_kwargs,
         )
         if sampled_weight is not None:

@@ -1030,7 +1024,7 @@ class AutoML:
         self._state.time_budget = None
         self._state.n_jobs = n_jobs
         self._trained_estimator = self._state._train_with_config(
-            best_estimator, best_config, sample_size, best.n_iter
+            best_estimator, best_config, sample_size
         )[0]
         logger.info("retrain from log succeeded")
         return training_duration

@@ -1731,7 +1725,6 @@ class AutoML:
                 config,
                 estimator,
                 search_state.sample_size,
-                search_state.n_iter,
             )

     def _search_sequential(self):

@@ -1953,7 +1946,6 @@ class AutoML:
                         search_state.config,
                         estimator,
                         search_state.sample_size,
-                        search_state.n_iter,
                     )
                 if mlflow is not None and mlflow.active_run():
                     with mlflow.start_run(nested=True):

@@ -2031,7 +2023,6 @@ class AutoML:
                 self._best_estimator,
                 state.best_config,
                 self.data_size_full,
-                state.best_n_iter,
             )
             logger.info(
                 "retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)

@@ -2144,7 +2135,6 @@ class AutoML:
                     self._best_estimator,
                     state.best_config,
                     self.data_size_full,
-                    state.best_n_iter,
                 )
                 logger.info(
                     "retrain {} for {:.1f}s".format(
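The `@@ -122,13 +122,12 @@` hunk above is the heart of the bookkeeping change: when a trial reports an early-stopped iteration count, `update()` now writes it straight into the trial's config (the lower-bound guard against the search space is commented out, so the write-back happens whenever `n_estimators` is tuned), and the separate `n_iter`/`best_n_iter` fields are dropped. A minimal sketch of that data flow, with hypothetical values:

    # config as sampled from the search space; 137 is a made-up
    # iteration count reported by early stopping
    config = {"learning_rate": 0.1, "n_estimators": 8192}
    n_iter = 137

    if n_iter is not None and "n_estimators" in config:
        config["n_estimators"] = n_iter  # the config is now self-contained

    print(config)  # {'learning_rate': 0.1, 'n_estimators': 137}

Because the best config carries `n_estimators` itself, the remaining hunks can remove the extra `n_iter` argument from retraining, logging, and `_train_with_config`.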
@@ -465,14 +465,11 @@ def train_estimator(
     n_jobs=1,
     estimator_class=None,
     budget=None,
-    n_iter=None,
     fit_kwargs={},
 ):
     start_time = time.time()
     estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
-    if n_iter is not None:
-        estimator.params["n_estimators"] = n_iter
     if X_train is not None:
         train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
     else:
@@ -645,11 +645,15 @@ class CatBoostEstimator(BaseEstimator):
                 "domain": tune.loguniform(lower=0.005, upper=0.2),
                 "init_value": 0.1,
             },
+            "n_estimators": {
+                "domain": 8192,
+                "init_value": 8192,
+            },
         }

     @classmethod
     def size(cls, config):
-        n_estimators = 8192
+        n_estimators = config.get("n_estimators", 8192)
         max_leaves = 64
         return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8
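With `n_estimators` in the search space, `size()` can read the tuned value from the config instead of assuming the 8192 cap. As a quick sanity check of the formula, here is a small sketch; the `catboost_size` helper is ours, written only for illustration (per 64-leaf tree the formula counts 64*3 + 63*4 + 1 = 445 floats at 8 bytes each):

    def catboost_size(config):
        # mirrors CatBoostEstimator.size() after this commit
        n_estimators = config.get("n_estimators", 8192)
        max_leaves = 64
        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

    print(catboost_size({}))                     # 29163520.0 bytes, ~28 MiB
    print(catboost_size({"n_estimators": 100}))  # 356000.0 bytes, ~0.3 MiB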
@@ -23,7 +23,6 @@ class TrainingLogRecord(object):
         config: dict,
         learner: str,
         sample_size: int,
-        n_iter: int,
     ):
         self.record_id = record_id
         self.iter_per_learner = iter_per_learner

@@ -34,7 +33,6 @@ class TrainingLogRecord(object):
         self.config = config
         self.learner = learner
         self.sample_size = sample_size
-        self.n_iter = n_iter  # n_estimators for catboost

     def dump(self, fp: IO[str]):
         d = vars(self)

@@ -79,7 +77,6 @@ class TrainingLogWriter(object):
         config,
         learner,
         sample_size,
-        n_iter,
     ):
         if self.file is None:
             raise IOError("Call open() to open the outpute file first.")

@@ -95,7 +92,6 @@ class TrainingLogWriter(object):
             config,
             learner,
             sample_size,
-            n_iter,
         )
         if (
             validation_loss < self.current_best_loss
@@ -9,7 +9,7 @@ from flaml.training_log import training_log_reader


 class TestTrainingLog(unittest.TestCase):
-    def test_training_log(self, path="test_training_log.log"):
+    def test_training_log(self, path="test_training_log.log", estimator_list="auto"):

         with TemporaryDirectory() as d:
             filename = os.path.join(d, path)

@@ -27,8 +27,9 @@ class TestTrainingLog(unittest.TestCase):
                 "model_history": True,
                 "train_time_limit": 0.1,
                 "verbose": 3,
-                "ensemble": True,
+                # "ensemble": True,
                 "keep_search_state": True,
+                "estimator_list": estimator_list,
             }
             X_train, y_train = fetch_california_housing(return_X_y=True)
             automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

@@ -37,31 +38,34 @@ class TestTrainingLog(unittest.TestCase):
             if automl.best_estimator:
                 estimator, config = automl.best_estimator, automl.best_config
                 model0 = automl.best_model_for_estimator(estimator)
-                print(model0.params["n_estimators"], model0.estimator)
+                print(model0.params["n_estimators"], config)

+                # train on full data with no time limit
                 automl._state.time_budget = None
                 model, _ = automl._state._train_with_config(estimator, config)
-                print(model.estimator)
-                # model0 and model are equivalent unless model0's n_estimator is out of search space range
-                assert (
-                    str(model0.estimator) == str(model.estimator)
-                    or model0.params["n_estimators"] < 4
-                )

                 # assuming estimator & config are saved and loaded as follows
                 automl = AutoML()
                 automl.fit(
                     X_train=X_train,
                     y_train=y_train,
-                    max_iter=0,
+                    max_iter=1,
                     task="regression",
                     estimator_list=[estimator],
                     n_jobs=1,
                     starting_points={estimator: config},
                 )
+                print(automl.best_config)
                 # then the fitted model should be equivalent to model
-                # print(str(model.estimator), str(automl.model.estimator))
-                assert str(model.estimator) == str(automl.model.estimator)
+                assert (
+                    str(model.estimator) == str(automl.model.estimator)
+                    or estimator == "xgboost"
+                    and str(model.estimator.get_dump())
+                    == str(automl.model.estimator.get_dump())
+                    or estimator == "catboost"
+                    and str(model.estimator.get_all_params())
+                    == str(automl.model.estimator.get_all_params())
+                )

             with training_log_reader(filename) as reader:
                 count = 0

@@ -83,3 +87,10 @@ class TestTrainingLog(unittest.TestCase):
             print("IsADirectoryError happens as expected in linux.")
         except PermissionError:
             print("PermissionError happens as expected in windows.")
+
+    def test_each_estimator(self):
+        self.test_training_log(estimator_list=["xgboost"])
+        self.test_training_log(estimator_list=["catboost"])
+        self.test_training_log(estimator_list=["extra_tree"])
+        self.test_training_log(estimator_list=["rf"])
+        self.test_training_log(estimator_list=["lgbm"])
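Taken end to end, the commit makes a best config self-describing: the learned tree count rides along in `config["n_estimators"]` rather than in a parallel `n_iter` channel. A minimal usage sketch mirroring the updated test; the dataset, the short time budget, and the call into the private `_state._train_with_config` helper all follow the test above and are illustrative rather than a public API:

    from flaml import AutoML
    from sklearn.datasets import fetch_california_housing

    X_train, y_train = fetch_california_housing(return_X_y=True)
    automl = AutoML()
    automl.fit(
        X_train=X_train,
        y_train=y_train,
        task="regression",
        time_budget=3,
        keep_search_state=True,  # keeps the data around for _train_with_config
    )

    # best_config now includes n_estimators when relevant (e.g. catboost),
    # so retraining needs no separate n_iter argument:
    estimator, config = automl.best_estimator, automl.best_config
    automl._state.time_budget = None  # retrain on full data, no time limit
    model, _ = automl._state._train_with_config(estimator, config)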