diff --git a/flaml/model.py b/flaml/model.py
index 4a7825f1a..890f06c6b 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -1626,15 +1626,26 @@ class CatBoostEstimator(BaseEstimator):
             cat_features = list(X_train.select_dtypes(include="category").columns)
         else:
             cat_features = []
-        n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
+        use_best_model = kwargs.get("use_best_model", True)
+        n = (
+            max(int(len(y_train) * 0.9), len(y_train) - 1000)
+            if use_best_model
+            else len(y_train)
+        )
         X_tr, y_tr = X_train[:n], y_train[:n]
+        from catboost import Pool, __version__
+
+        eval_set = (
+            Pool(data=X_train[n:], label=y_train[n:], cat_features=cat_features)
+            if use_best_model
+            else None
+        )
         if "sample_weight" in kwargs:
             weight = kwargs["sample_weight"]
             if weight is not None:
                 kwargs["sample_weight"] = weight[:n]
         else:
             weight = None
-        from catboost import Pool, __version__
 
         model = self.estimator_class(train_dir=train_dir, **self.params)
         if __version__ >= "0.26":
@@ -1642,10 +1653,10 @@ class CatBoostEstimator(BaseEstimator):
                 X_tr,
                 y_tr,
                 cat_features=cat_features,
-                eval_set=Pool(
-                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
+                eval_set=eval_set,
+                callbacks=CatBoostEstimator._callbacks(
+                    start_time, deadline, FREE_MEM_RATIO if use_best_model else None
                 ),
-                callbacks=CatBoostEstimator._callbacks(start_time, deadline),
                 **kwargs,
             )
         else:
@@ -1653,9 +1664,7 @@ class CatBoostEstimator(BaseEstimator):
                 X_tr,
                 y_tr,
                 cat_features=cat_features,
-                eval_set=Pool(
-                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
-                ),
+                eval_set=eval_set,
                 **kwargs,
             )
         shutil.rmtree(train_dir, ignore_errors=True)
@@ -1667,7 +1676,7 @@ class CatBoostEstimator(BaseEstimator):
         return train_time
 
     @classmethod
-    def _callbacks(cls, start_time, deadline):
+    def _callbacks(cls, start_time, deadline, free_mem_ratio):
         class ResourceLimit:
             def after_iteration(self, info) -> bool:
                 now = time.time()
@@ -1675,9 +1684,9 @@ class CatBoostEstimator(BaseEstimator):
                     self._time_per_iter = now - start_time
                 if now + self._time_per_iter > deadline:
                     return False
-                if psutil is not None:
+                if psutil is not None and free_mem_ratio is not None:
                     mem = psutil.virtual_memory()
-                    if mem.available / mem.total < FREE_MEM_RATIO:
+                    if mem.available / mem.total < free_mem_ratio:
                         return False
                 return True  # can continue
 
diff --git a/flaml/version.py b/flaml/version.py
index 9fd0f8dd6..9eb1ebec5 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "1.0.10"
+__version__ = "1.0.11"
diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py
index 47869ee34..0aca92f15 100644
--- a/test/automl/test_regression.py
+++ b/test/automl/test_regression.py
@@ -98,8 +98,8 @@ class TestRegression(unittest.TestCase):
         y_train = np.random.uniform(size=300)
         X_val = scipy.sparse.random(100, 900, density=0.0001)
         y_val = np.random.uniform(size=100)
-        automl_experiment = AutoML()
-        automl_settings = {
+        automl = AutoML()
+        settings = {
             "time_budget": 2,
             "metric": "mae",
             "task": "regression",
@@ -110,23 +110,34 @@ class TestRegression(unittest.TestCase):
             "verbose": 0,
             "early_stop": True,
         }
-        automl_experiment.fit(
-            X_train=X_train,
-            y_train=y_train,
-            X_val=X_val,
-            y_val=y_val,
-            **automl_settings
+        automl.fit(
+            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
+        )
+        assert automl._state.X_val.shape == X_val.shape
+        print(automl.predict(X_train))
+        print(automl.model)
+        print(automl.config_history)
+        print(automl.best_model_for_estimator("rf"))
+        print(automl.best_iteration)
+        print(automl.best_estimator)
+        print(automl.best_config)
+        print(automl.best_loss)
+        print(automl.best_config_train_time)
+
+        settings.update(
+            {
+                "estimator_list": ["catboost"],
+                "keep_search_state": False,
+                "model_history": False,
+                "use_best_model": False,
+                "time_budget": None,
+                "max_iter": 2,
+                "custom_hp": {"catboost": {"n_estimators": {"domain": 100}}},
+            }
+        )
+        automl.fit(
+            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
         )
-        assert automl_experiment._state.X_val.shape == X_val.shape
-        print(automl_experiment.predict(X_train))
-        print(automl_experiment.model)
-        print(automl_experiment.config_history)
-        print(automl_experiment.best_model_for_estimator("rf"))
-        print(automl_experiment.best_iteration)
-        print(automl_experiment.best_estimator)
-        print(automl_experiment.best_config)
-        print(automl_experiment.best_loss)
-        print(automl_experiment.best_config_train_time)
 
     def test_parallel(self, hpo_method=None):
         automl_experiment = AutoML()
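
Note: a minimal sketch (not part of the patch itself) of the hold-out rule that this change gates on `use_best_model`. The split heuristic is copied from the diff above; the helper name `holdout_split_size` is hypothetical and used only for illustration. As the updated test shows, the flag reaches `CatBoostEstimator._fit` as an ordinary fit keyword argument, e.g. `automl.fit(..., use_best_model=False)`.

# Illustrative sketch only; holdout_split_size is not FLAML API.
# With use_best_model=True (the default), the last ~10% of rows (capped at
# 1000) are held out for CatBoost's eval_set; with False, all rows are used
# for training and the eval Pool plus the free-memory callback check are skipped.
def holdout_split_size(n_rows: int, use_best_model: bool = True) -> int:
    return max(int(n_rows * 0.9), n_rows - 1000) if use_best_model else n_rows

assert holdout_split_size(300) == 270                        # 30 rows go to eval_set
assert holdout_split_size(300, use_best_model=False) == 300  # no hold-out, eval_set=None
assert holdout_split_size(100_000) == 99_000                 # hold-out capped at 1000 rows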