sample weight in xgboost (#54)

Chi Wang 2021-03-31 22:11:56 -07:00 committed by GitHub
parent f28d093522
commit 37d7518a4c
3 changed files with 16 additions and 14 deletions

View File

@@ -19,7 +19,7 @@ adding customized learners or metrics. FLAML is powered by a new, [cost-effectiv
 hyperparameter optimization](https://github.com/microsoft/FLAML/tree/main/flaml/tune)
 and learner selection method invented by Microsoft Research.
 FLAML leverages the structure of the search space to choose a search order optimized for both cost and error. For example, the system tends to propose cheap configurations at the beginning stage of the search,
-but quickly moves to configurations with high model complexity and large sample size when needed in the later stage of the search. For another example, it favors cheap learners in the beginning but penalizes them later if the error improvement is slow. The cost-bounded search and cost-based prioritization make a big difference in the the search efficiency under budget constraints.
+but quickly moves to configurations with high model complexity and large sample size when needed in the later stage of the search. For another example, it favors cheap learners in the beginning but penalizes them later if the error improvement is slow. The cost-bounded search and cost-based prioritization make a big difference in the search efficiency under budget constraints.
 FLAML is easy to use:
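
As context for the README excerpt above, here is a minimal usage sketch (not part of this commit). The data is hypothetical; the fit arguments mirror the settings used in the test code further down.

import numpy as np
from flaml import AutoML

# Hypothetical toy data; any sklearn-style X, y works.
X_train = np.random.random((100, 5))
y_train = np.random.uniform(size=100)

automl = AutoML()
automl.fit(
    X_train=X_train, y_train=y_train,
    task="regression",   # task type
    time_budget=2,       # search budget in seconds, as in the tests below
    metric="mse",        # optimization metric
)
print(automl.predict(X_train))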

View File

@@ -383,13 +383,14 @@ class XGBoostEstimator(SKLearnEstimator):
         if not issparse(X_train):
             self.params['tree_method'] = 'hist'
             X_train = self._preprocess(X_train)
-        dtrain = xgb.DMatrix(X_train, label=y_train)
+        if 'sample_weight' in kwargs:
+            dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs[
+                'sample_weight'])
+        else:
+            dtrain = xgb.DMatrix(X_train, label=y_train)
         if self._max_leaves>0:
-            if 'sample_weight' in kwargs:
-                self._model = xgb.train(self.params, dtrain,
-                                        self._n_estimators, weight=kwargs['sample_weight'])
-            else:
-                self._model = xgb.train(self.params, dtrain, self._n_estimators)
+            self._model = xgb.train(self.params, dtrain, self._n_estimators)
             del dtrain
             train_time = time.time() - start_time
             return train_time
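
The change above moves the sample weights onto the DMatrix: xgb.train() has no weight argument, so in the xgboost native API per-sample weights must be attached to the training data. A standalone sketch of that pattern, with toy data (not part of this commit):

import numpy as np
import xgboost as xgb

X = np.random.random((100, 5))
y = np.random.uniform(size=100)
w = np.ones(100)  # per-sample weights (all ones here, i.e. unweighted)

# Weights belong to the data, not to the training call.
dtrain = xgb.DMatrix(X, label=y, weight=w)
params = {'objective': 'reg:squarederror', 'tree_method': 'hist'}
model = xgb.train(params, dtrain, num_boost_round=10)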

View File

@@ -249,6 +249,10 @@ class TestAutoML(unittest.TestCase):
     def test_sparse_matrix_regression(self):
+        X_train = scipy.sparse.random(300, 900, density=0.0001)
+        y_train = np.random.uniform(size=300)
+        X_val = scipy.sparse.random(100, 900, density=0.0001)
+        y_val = np.random.uniform(size=100)
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 2,
@@ -259,10 +263,6 @@
             "model_history": True,
             "verbose": 0,
         }
-        X_train = scipy.sparse.random(300, 900, density=0.0001)
-        y_train = np.random.uniform(size=300)
-        X_val = scipy.sparse.random(100, 900, density=0.0001)
-        y_val = np.random.uniform(size=100)
         automl_experiment.fit(X_train=X_train, y_train=y_train,
                               X_val=X_val, y_val=y_val,
                               **automl_settings)
@@ -325,6 +325,8 @@ class TestAutoML(unittest.TestCase):
     def test_sparse_matrix_regression_cv(self):
+        X_train = scipy.sparse.random(8, 100)
+        y_train = np.random.uniform(size=8)
         automl_experiment = AutoML()
         automl_settings = {
             "time_budget": 2,
@@ -333,10 +335,9 @@
             "log_file_name": "test/sparse_regression.log",
             "n_jobs": 1,
             "model_history": True,
-            "metric": "mse"
+            "metric": "mse",
+            "sample_weight": np.ones(len(y_train)),
         }
-        X_train = scipy.sparse.random(8, 100)
-        y_train = np.random.uniform(size=8)
         automl_experiment.fit(X_train=X_train, y_train=y_train,
                               **automl_settings)
         print(automl_experiment.predict(X_train))
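
Condensed from the updated test above, a standalone sketch of supplying sample_weight through AutoML.fit, which is passed along as a fit kwarg and consumed by the chosen estimator (e.g. the XGBoostEstimator change earlier in this commit). The uniform weights here simply mirror the test.

import numpy as np
import scipy.sparse
from flaml import AutoML

X_train = scipy.sparse.random(8, 100)
y_train = np.random.uniform(size=8)

automl = AutoML()
automl.fit(
    X_train=X_train, y_train=y_train,
    task="regression",
    metric="mse",
    time_budget=2,
    n_jobs=1,
    sample_weight=np.ones(len(y_train)),  # uniform weights, as in the test
)
print(automl.predict(X_train))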