From 37d7518a4cca7dc426ef1a12e92f0111ecb0c025 Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Wed, 31 Mar 2021 22:11:56 -0700
Subject: [PATCH] sample weight in xgboost (#54)

---
 README.md           |  2 +-
 flaml/model.py      | 13 +++++++------
 test/test_automl.py | 15 ++++++++-------
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index a0a7e2a5f..f2aebd0ae 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ adding customized learners or metrics. FLAML is powered by a new, [cost-effectiv
 hyperparameter optimization](https://github.com/microsoft/FLAML/tree/main/flaml/tune)
 and learner selection method invented by Microsoft Research.
 FLAML leverages the structure of the search space to choose a search order optimized for both cost and error. For example, the system tends to propose cheap configurations at the beginning stage of the search,
-but quickly moves to configurations with high model complexity and large sample size when needed in the later stage of the search. For another example, it favors cheap learners in the beginning but penalizes them later if the error improvement is slow. The cost-bounded search and cost-based prioritization make a big difference in the the search efficiency under budget constraints.
+but quickly moves to configurations with high model complexity and large sample size when needed in the later stage of the search. For another example, it favors cheap learners in the beginning but penalizes them later if the error improvement is slow. The cost-bounded search and cost-based prioritization make a big difference in the search efficiency under budget constraints.
 
 FLAML is easy to use:
 
diff --git a/flaml/model.py b/flaml/model.py
index b629376b1..70b0c816e 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -383,13 +383,14 @@ class XGBoostEstimator(SKLearnEstimator):
         if not issparse(X_train):
             self.params['tree_method'] = 'hist'
             X_train = self._preprocess(X_train)
-        dtrain = xgb.DMatrix(X_train, label=y_train)
+        if 'sample_weight' in kwargs:
+            dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs[
+                'sample_weight'])
+        else:
+            dtrain = xgb.DMatrix(X_train, label=y_train)
+
         if self._max_leaves>0:
-            if 'sample_weight' in kwargs:
-                self._model = xgb.train(self.params, dtrain,
-                    self._n_estimators, weight=kwargs['sample_weight'])
-            else:
-                self._model = xgb.train(self.params, dtrain, self._n_estimators)
+            self._model = xgb.train(self.params, dtrain, self._n_estimators)
             del dtrain
         train_time = time.time() - start_time
         return train_time
diff --git a/test/test_automl.py b/test/test_automl.py
index c8b2960e5..dbf084a75 100644
--- a/test/test_automl.py
+++ b/test/test_automl.py
@@ -249,6 +249,10 @@ class TestAutoML(unittest.TestCase):
 
     def test_sparse_matrix_regression(self):
 
+        X_train = scipy.sparse.random(300, 900, density=0.0001)
+        y_train = np.random.uniform(size=300)
+        X_val = scipy.sparse.random(100, 900, density=0.0001)
+        y_val = np.random.uniform(size=100)
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 2,
@@ -259,10 +263,6 @@ class TestAutoML(unittest.TestCase):
             "model_history": True,
             "verbose": 0,
         }
-        X_train = scipy.sparse.random(300, 900, density=0.0001)
-        y_train = np.random.uniform(size=300)
-        X_val = scipy.sparse.random(100, 900, density=0.0001)
-        y_val = np.random.uniform(size=100)
         automl_experiment.fit(X_train=X_train, y_train=y_train,
                               X_val=X_val, y_val=y_val,
                               **automl_settings)
@@ -325,6 +325,8 @@ class TestAutoML(unittest.TestCase):
 
     def test_sparse_matrix_regression_cv(self):
 
+        X_train = scipy.sparse.random(8, 100)
+        y_train = np.random.uniform(size=8)
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 2,
@@ -333,10 +335,9 @@ class TestAutoML(unittest.TestCase):
             "log_file_name": "test/sparse_regression.log",
             "n_jobs": 1,
             "model_history": True,
-            "metric": "mse"
+            "metric": "mse",
+            "sample_weight": np.ones(len(y_train)),
         }
-        X_train = scipy.sparse.random(8, 100)
-        y_train = np.random.uniform(size=8)
         automl_experiment.fit(X_train=X_train, y_train=y_train,
                               **automl_settings)
         print(automl_experiment.predict(X_train))
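
Usage sketch (illustrative only, not part of the patch): the snippet below mirrors test_sparse_matrix_regression_cv above. The sample_weight keyword and the fit() settings are taken from that test; with this patch, XGBoostEstimator forwards the weights to xgb.DMatrix(..., weight=...) instead of passing them to xgb.train(). Passing task/metric/time_budget/n_jobs directly as fit() keyword arguments is an assumption based on how the test unpacks automl_settings.

import numpy as np
import scipy.sparse
from flaml import AutoML

# Tiny sparse regression problem, as in the cv test above.
X_train = scipy.sparse.random(8, 100)
y_train = np.random.uniform(size=8)

automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task='regression',
    metric='mse',
    time_budget=2,
    n_jobs=1,
    # Per-sample weights; after this patch they reach
    # xgb.DMatrix(X, label=y, weight=...) inside XGBoostEstimator.fit.
    sample_weight=np.ones(len(y_train)),
)
print(automl.predict(X_train))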