sample weight in xgboost (#54)

Chi Wang 2021-03-31 22:11:56 -07:00 committed by GitHub
parent f28d093522
commit 37d7518a4c
3 changed files with 16 additions and 14 deletions

View File

@@ -19,7 +19,7 @@ adding customized learners or metrics. FLAML is powered by a new, [cost-effectiv
 hyperparameter optimization](https://github.com/microsoft/FLAML/tree/main/flaml/tune)
 and learner selection method invented by Microsoft Research.
 FLAML leverages the structure of the search space to choose a search order optimized for both cost and error. For example, the system tends to propose cheap configurations at the beginning stage of the search,
-but quickly moves to configurations with high model complexity and large sample size when needed in the later stage of the search. For another example, it favors cheap learners in the beginning but penalizes them later if the error improvement is slow. The cost-bounded search and cost-based prioritization make a big difference in the the search efficiency under budget constraints.
+but quickly moves to configurations with high model complexity and large sample size when needed in the later stage of the search. For another example, it favors cheap learners in the beginning but penalizes them later if the error improvement is slow. The cost-bounded search and cost-based prioritization make a big difference in the search efficiency under budget constraints.
 FLAML is easy to use:
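
As context for the README excerpt above, here is a minimal usage sketch (not part of this commit). The data is hypothetical; the fit arguments mirror the settings used in the test code further down.

import numpy as np
from flaml import AutoML

# Hypothetical toy data; any sklearn-style X, y works.
X_train = np.random.random((100, 5))
y_train = np.random.uniform(size=100)

automl = AutoML()
automl.fit(
    X_train=X_train, y_train=y_train,
    task="regression",   # task type
    time_budget=2,       # search budget in seconds, as in the tests below
    metric="mse",        # optimization metric
)
print(automl.predict(X_train))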

View File

@@ -383,13 +383,14 @@ class XGBoostEstimator(SKLearnEstimator):
         if not issparse(X_train):
             self.params['tree_method'] = 'hist'
             X_train = self._preprocess(X_train)
-        dtrain = xgb.DMatrix(X_train, label=y_train)
+        if 'sample_weight' in kwargs:
+            dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs[
+                'sample_weight'])
+        else:
+            dtrain = xgb.DMatrix(X_train, label=y_train)
         if self._max_leaves>0:
-            if 'sample_weight' in kwargs:
-                self._model = xgb.train(self.params, dtrain,
-                                        self._n_estimators, weight=kwargs['sample_weight'])
-            else:
-                self._model = xgb.train(self.params, dtrain, self._n_estimators)
+            self._model = xgb.train(self.params, dtrain, self._n_estimators)
             del dtrain
             train_time = time.time() - start_time
             return train_time
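
The change above moves the sample weights onto the DMatrix: xgb.train() has no weight argument, so in the xgboost native API per-sample weights must be attached to the training data. A standalone sketch of that pattern, with toy data (not part of this commit):

import numpy as np
import xgboost as xgb

X = np.random.random((100, 5))
y = np.random.uniform(size=100)
w = np.ones(100)  # per-sample weights (all ones here, i.e. unweighted)

# Weights belong to the data, not to the training call.
dtrain = xgb.DMatrix(X, label=y, weight=w)
params = {'objective': 'reg:squarederror', 'tree_method': 'hist'}
model = xgb.train(params, dtrain, num_boost_round=10)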

View File

@@ -249,6 +249,10 @@ class TestAutoML(unittest.TestCase):
     def test_sparse_matrix_regression(self):
+        X_train = scipy.sparse.random(300, 900, density=0.0001)
+        y_train = np.random.uniform(size=300)
+        X_val = scipy.sparse.random(100, 900, density=0.0001)
+        y_val = np.random.uniform(size=100)
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 2,
@@ -259,10 +263,6 @@
             "model_history": True,
             "verbose": 0,
         }
-        X_train = scipy.sparse.random(300, 900, density=0.0001)
-        y_train = np.random.uniform(size=300)
-        X_val = scipy.sparse.random(100, 900, density=0.0001)
-        y_val = np.random.uniform(size=100)
         automl_experiment.fit(X_train=X_train, y_train=y_train,
                               X_val=X_val, y_val=y_val,
                               **automl_settings)
@@ -325,6 +325,8 @@ class TestAutoML(unittest.TestCase):
     def test_sparse_matrix_regression_cv(self):
+        X_train = scipy.sparse.random(8, 100)
+        y_train = np.random.uniform(size=8)
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 2,
@@ -333,10 +335,9 @@
             "log_file_name": "test/sparse_regression.log",
             "n_jobs": 1,
             "model_history": True,
-            "metric": "mse"
+            "metric": "mse",
+            "sample_weight": np.ones(len(y_train)),
         }
-        X_train = scipy.sparse.random(8, 100)
-        y_train = np.random.uniform(size=8)
         automl_experiment.fit(X_train=X_train, y_train=y_train,
                               **automl_settings)
         print(automl_experiment.predict(X_train))
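
Condensed from the updated test above, a standalone sketch of supplying sample_weight through AutoML.fit, which is passed along as a fit kwarg and consumed by the chosen estimator (e.g. the XGBoostEstimator change earlier in this commit). The uniform weights here simply mirror the test.

import numpy as np
import scipy.sparse
from flaml import AutoML

X_train = scipy.sparse.random(8, 100)
y_train = np.random.uniform(size=8)

automl = AutoML()
automl.fit(
    X_train=X_train, y_train=y_train,
    task="regression",
    metric="mse",
    time_budget=2,
    n_jobs=1,
    sample_weight=np.ones(len(y_train)),  # uniform weights, as in the test
)
print(automl.predict(X_train))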