use_best_model for catboost (#679)

* use_best_model for catboost
* bump version to 1.0.11
2026-01-06 12:10:58 +00:00 · 2022-08-20 18:38:56 -07:00 · 2022-08-20 18:38:56 -07:00 · dffa802b3e
commit dffa802b3e
parent 3d1a28bfc0
3 changed files with 50 additions and 30 deletions
--- a/flaml/model.py
+++ b/flaml/model.py
@ -1626,15 +1626,26 @@ class CatBoostEstimator(BaseEstimator):
            cat_features = list(X_train.select_dtypes(include="category").columns)
        else:
            cat_features = []
-        n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
+        use_best_model = kwargs.get("use_best_model", True)
+        n = (
+            max(int(len(y_train) * 0.9), len(y_train) - 1000)
+            if use_best_model
+            else len(y_train)
+        )
        X_tr, y_tr = X_train[:n], y_train[:n]
+        from catboost import Pool, __version__
+
+        eval_set = (
+            Pool(data=X_train[n:], label=y_train[n:], cat_features=cat_features)
+            if use_best_model
+            else None
+        )
        if "sample_weight" in kwargs:
            weight = kwargs["sample_weight"]
            if weight is not None:
                kwargs["sample_weight"] = weight[:n]
        else:
            weight = None
-        from catboost import Pool, __version__

        model = self.estimator_class(train_dir=train_dir, **self.params)
        if __version__ >= "0.26":
@ -1642,10 +1653,10 @@ class CatBoostEstimator(BaseEstimator):
                X_tr,
                y_tr,
                cat_features=cat_features,
-                eval_set=Pool(
-                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
+                eval_set=eval_set,
+                callbacks=CatBoostEstimator._callbacks(
+                    start_time, deadline, FREE_MEM_RATIO if use_best_model else None
                ),
-                callbacks=CatBoostEstimator._callbacks(start_time, deadline),
                **kwargs,
            )
        else:
@ -1653,9 +1664,7 @@ class CatBoostEstimator(BaseEstimator):
                X_tr,
                y_tr,
                cat_features=cat_features,
-                eval_set=Pool(
-                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
-                ),
+                eval_set=eval_set,
                **kwargs,
            )
        shutil.rmtree(train_dir, ignore_errors=True)
@ -1667,7 +1676,7 @@ class CatBoostEstimator(BaseEstimator):
        return train_time

    @classmethod
-    def _callbacks(cls, start_time, deadline):
+    def _callbacks(cls, start_time, deadline, free_mem_ratio):
        class ResourceLimit:
            def after_iteration(self, info) -> bool:
                now = time.time()
@ -1675,9 +1684,9 @@ class CatBoostEstimator(BaseEstimator):
                    self._time_per_iter = now - start_time
                if now + self._time_per_iter > deadline:
                    return False
-                if psutil is not None:
+                if psutil is not None and free_mem_ratio is not None:
                    mem = psutil.virtual_memory()
-                    if mem.available / mem.total < FREE_MEM_RATIO:
+                    if mem.available / mem.total < free_mem_ratio:
                        return False
                return True  # can continue

--- a/flaml/version.py
+++ b/flaml/version.py
@ -1 +1 @@
-__version__ = "1.0.10"
+__version__ = "1.0.11"
--- a/test/automl/test_regression.py
+++ b/test/automl/test_regression.py
@ -98,8 +98,8 @@ class TestRegression(unittest.TestCase):
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
-        automl_experiment = AutoML()
-        automl_settings = {
+        automl = AutoML()
+        settings = {
            "time_budget": 2,
            "metric": "mae",
            "task": "regression",
@ -110,23 +110,34 @@ class TestRegression(unittest.TestCase):
            "verbose": 0,
            "early_stop": True,
        }
-        automl_experiment.fit(
-            X_train=X_train,
-            y_train=y_train,
-            X_val=X_val,
-            y_val=y_val,
-            **automl_settings
+        automl.fit(
+            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
+        )
+        assert automl._state.X_val.shape == X_val.shape
+        print(automl.predict(X_train))
+        print(automl.model)
+        print(automl.config_history)
+        print(automl.best_model_for_estimator("rf"))
+        print(automl.best_iteration)
+        print(automl.best_estimator)
+        print(automl.best_config)
+        print(automl.best_loss)
+        print(automl.best_config_train_time)
+
+        settings.update(
+            {
+                "estimator_list": ["catboost"],
+                "keep_search_state": False,
+                "model_history": False,
+                "use_best_model": False,
+                "time_budget": None,
+                "max_iter": 2,
+                "custom_hp": {"catboost": {"n_estimators": {"domain": 100}}},
+            }
+        )
+        automl.fit(
+            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
        )
-        assert automl_experiment._state.X_val.shape == X_val.shape
-        print(automl_experiment.predict(X_train))
-        print(automl_experiment.model)
-        print(automl_experiment.config_history)
-        print(automl_experiment.best_model_for_estimator("rf"))
-        print(automl_experiment.best_iteration)
-        print(automl_experiment.best_estimator)
-        print(automl_experiment.best_config)
-        print(automl_experiment.best_loss)
-        print(automl_experiment.best_config_train_time)

    def test_parallel(self, hpo_method=None):
        automl_experiment = AutoML()