use_best_model for catboost (#679)

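* add a use_best_model option for catboost: when it is False, train on all of the data with no held-out eval set and skip the free-memory early stop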

* bump version to 1.0.11
This commit is contained in:
Chi Wang 2022-08-20 18:38:56 -07:00 committed by GitHub
parent 3d1a28bfc0
commit dffa802b3e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 50 additions and 30 deletions

View File

@ -1626,15 +1626,26 @@ class CatBoostEstimator(BaseEstimator):
cat_features = list(X_train.select_dtypes(include="category").columns)
else:
cat_features = []
n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
use_best_model = kwargs.get("use_best_model", True)
n = (
max(int(len(y_train) * 0.9), len(y_train) - 1000)
if use_best_model
else len(y_train)
)
X_tr, y_tr = X_train[:n], y_train[:n]
from catboost import Pool, __version__
eval_set = (
Pool(data=X_train[n:], label=y_train[n:], cat_features=cat_features)
if use_best_model
else None
)
if "sample_weight" in kwargs:
weight = kwargs["sample_weight"]
if weight is not None:
kwargs["sample_weight"] = weight[:n]
else:
weight = None
from catboost import Pool, __version__
model = self.estimator_class(train_dir=train_dir, **self.params)
if __version__ >= "0.26":
@ -1642,10 +1653,10 @@ class CatBoostEstimator(BaseEstimator):
X_tr,
y_tr,
cat_features=cat_features,
eval_set=Pool(
data=X_train[n:], label=y_train[n:], cat_features=cat_features
eval_set=eval_set,
callbacks=CatBoostEstimator._callbacks(
start_time, deadline, FREE_MEM_RATIO if use_best_model else None
),
callbacks=CatBoostEstimator._callbacks(start_time, deadline),
**kwargs,
)
else:
@ -1653,9 +1664,7 @@ class CatBoostEstimator(BaseEstimator):
X_tr,
y_tr,
cat_features=cat_features,
eval_set=Pool(
data=X_train[n:], label=y_train[n:], cat_features=cat_features
),
eval_set=eval_set,
**kwargs,
)
shutil.rmtree(train_dir, ignore_errors=True)
@ -1667,7 +1676,7 @@ class CatBoostEstimator(BaseEstimator):
return train_time
@classmethod
def _callbacks(cls, start_time, deadline):
def _callbacks(cls, start_time, deadline, free_mem_ratio):
class ResourceLimit:
def after_iteration(self, info) -> bool:
now = time.time()
@ -1675,9 +1684,9 @@ class CatBoostEstimator(BaseEstimator):
self._time_per_iter = now - start_time
if now + self._time_per_iter > deadline:
return False
if psutil is not None:
if psutil is not None and free_mem_ratio is not None:
mem = psutil.virtual_memory()
if mem.available / mem.total < FREE_MEM_RATIO:
if mem.available / mem.total < free_mem_ratio:
return False
return True # can continue

View File

@ -1 +1 @@
__version__ = "1.0.10"
__version__ = "1.0.11"

View File

@ -98,8 +98,8 @@ class TestRegression(unittest.TestCase):
y_train = np.random.uniform(size=300)
X_val = scipy.sparse.random(100, 900, density=0.0001)
y_val = np.random.uniform(size=100)
automl_experiment = AutoML()
automl_settings = {
automl = AutoML()
settings = {
"time_budget": 2,
"metric": "mae",
"task": "regression",
@ -110,23 +110,34 @@ class TestRegression(unittest.TestCase):
"verbose": 0,
"early_stop": True,
}
automl_experiment.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
)
assert automl._state.X_val.shape == X_val.shape
print(automl.predict(X_train))
print(automl.model)
print(automl.config_history)
print(automl.best_model_for_estimator("rf"))
print(automl.best_iteration)
print(automl.best_estimator)
print(automl.best_config)
print(automl.best_loss)
print(automl.best_config_train_time)
settings.update(
{
"estimator_list": ["catboost"],
"keep_search_state": False,
"model_history": False,
"use_best_model": False,
"time_budget": None,
"max_iter": 2,
"custom_hp": {"catboost": {"n_estimators": {"domain": 100}}},
}
)
automl.fit(
X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
)
assert automl_experiment._state.X_val.shape == X_val.shape
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.best_model_for_estimator("rf"))
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(automl_experiment.best_config)
print(automl_experiment.best_loss)
print(automl_experiment.best_config_train_time)
def test_parallel(self, hpo_method=None):
automl_experiment = AutoML()