constraint (#132)

* constraint

* ensemble
Chi Wang 2021-07-10 09:02:17 -07:00 committed by GitHub
parent b04b00dc9d
commit 072e9e4588
6 changed files with 72 additions and 39 deletions

flaml/automl.py

@@ -162,7 +162,6 @@ class AutoMLState:
def _compute_with_config_base(self,
estimator,
config_w_resource):
- compute_start_time = time.time()
if 'FLAML_sample_size' in config_w_resource:
sample_size = int(config_w_resource['FLAML_sample_size'])
else:
@@ -181,14 +180,14 @@ class AutoMLState:
budget = time_left if sample_size == self.data_size else \
time_left / 2 * sample_size / self.data_size
- trained_estimator, val_loss, train_loss, time2eval, _ = \
+ trained_estimator, val_loss, train_loss, time2eval, pred_time = \
compute_estimator(
sampled_X_train,
sampled_y_train,
self.X_val,
self.y_val,
self.weight_val,
- budget,
+ min(budget, self.train_time_limit),
self.kf,
config,
self.task,
@@ -201,7 +200,7 @@ class AutoMLState:
self.log_training_metric,
self.fit_kwargs)
result = {
- 'total_time': time.time() - compute_start_time,
+ 'pred_time': pred_time,
'time2eval': time2eval,
'train_loss': train_loss,
'val_loss': val_loss,
@@ -799,6 +798,8 @@ class AutoML:
n_splits=N_SPLITS,
log_training_metric=False,
mem_thres=MEM_THRES,
+ pred_time_limit=np.inf,
+ train_time_limit=np.inf,
X_val=None,
y_val=None,
sample_weight_val=None,
@@ -813,7 +814,7 @@ class AutoML:
Args:
X_train: A numpy array or a pandas dataframe of training data in
- shape (n, m)
+ shape (n, m)
y_train: A numpy array or a pandas series of labels in shape (n,)
dataframe: A dataframe of training data including label column
label: A str of the label column name
@@ -835,7 +836,7 @@ class AutoML:
return metric_to_minimize, metrics_to_log
which returns a float number as the minimization objective,
- and a tuple of floats as the metrics to log
+ and a tuple of floats or a dictionary as the metrics to log
task: A string of the task type, e.g.,
'classification', 'regression'
n_jobs: An integer of the number of threads for training
@@ -865,6 +866,8 @@ class AutoML:
log_training_metric: A boolean of whether to log the training
metric for each model.
mem_thres: A float of the memory size constraint in bytes
+ pred_time_limit: A float of the prediction latency constraint in seconds
+ train_time_limit: A float of the training time constraint in seconds
X_val: None or a numpy array or a pandas dataframe of validation data
y_val: None or a numpy array or a pandas series of validation labels
sample_weight_val: None or a numpy array of the sample weight of
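Taken together, the two new arguments let a caller bound prediction latency and per-trial training time straight from fit(). A minimal usage sketch, assuming a scikit-learn toy dataset (the budget values are illustrative, not from this commit):

from flaml import AutoML
from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(
    X_train=X_train, y_train=y_train,
    task='classification', time_budget=10,
    pred_time_limit=1e-4,  # cap on prediction latency (seconds per sample)
    train_time_limit=5,    # cap on the time spent in a single training run
)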
@@ -955,6 +958,8 @@ class AutoML:
self._ensemble = ensemble
self._max_iter = max_iter
self._mem_thres = mem_thres
+ self._pred_time_limit = pred_time_limit
+ self._state.train_time_limit = train_time_limit
self._log_type = log_type
self.split_ratio = split_ratio
self._save_model_history = model_history
@@ -1047,6 +1052,10 @@ class AutoML:
points_to_evaluate = [search_state.init_config]
low_cost_partial_config = search_state.low_cost_partial_config
if self._hpo_method in ('bs', 'cfo', 'grid'):
+ metric_constraints = []
+ if np.isfinite(self._pred_time_limit):
+ metric_constraints.append(
+ ('pred_time', '<=', self._pred_time_limit))
algo = SearchAlgo(
metric='val_loss', mode='min', space=search_space,
points_to_evaluate=points_to_evaluate,
@@ -1055,7 +1064,10 @@ class AutoML:
prune_attr=prune_attr,
min_resource=min_resource,
max_resource=max_resource,
- config_constraints=[(learner_class.size, '<=', self._mem_thres)]
+ config_constraints=[
+ (learner_class.size, '<=', self._mem_thres)
+ ],
+ metric_constraints=metric_constraints,
)
else:
algo = SearchAlgo(
@@ -1077,7 +1089,7 @@ class AutoML:
analysis = tune.run(
search_state.training_function,
search_alg=search_state.search_alg,
- time_budget_s=budget_left,
+ time_budget_s=min(budget_left, self._state.train_time_limit),
verbose=max(self.verbose - 1, 0),
use_ray=False)
time_used = time.time() - start_run_time
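For reference, each metric constraint built above is a (metric_name, '<=' or '>=', threshold) triple that the searcher checks against the values a trial reports. A standalone sketch of the same call pattern with CFO (the search space and threshold are invented for illustration):

from flaml import CFO, tune

search_alg = CFO(
    metric='val_loss', mode='min',
    space={'x': tune.randint(1, 10)},
    # soft constraint on a reported metric, same shape as in the hunk above
    metric_constraints=[('pred_time', '<=', 1e-5)],
)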

flaml/ml.py

@@ -4,6 +4,7 @@
'''
import time
+ from joblib.externals.cloudpickle.cloudpickle import instance
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
@@ -127,7 +128,9 @@ def get_test_loss(
start = time.time()
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
if isinstance(eval_metric, str):
+ pred_start = time.time()
test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
+ pred_time = (time.time() - pred_start) / X_test.shape[0]
test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
labels, weight_test)
if train_loss is not False:
@@ -136,11 +139,14 @@ def get_test_loss(
eval_metric, test_pred_y,
y_train, labels, fit_kwargs.get('sample_weight'))
else: # customized metric function
- test_loss, train_loss = eval_metric(
+ test_loss, metrics = eval_metric(
X_test, y_test, estimator, labels, X_train, y_train,
weight_test, fit_kwargs.get('sample_weight'))
+ if isinstance(metrics, dict):
+ pred_time = metrics.get('pred_time', 0)
+ train_loss = metrics
train_time = time.time() - start
- return test_loss, train_time, train_loss
+ return test_loss, train_time, train_loss, pred_time
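One note on the timing above: dividing by the number of rows makes pred_time a per-sample latency, so the pred_time constraint is comparable across validation sets of different sizes. The same pattern in isolation, with a placeholder scikit-learn model standing in for the tuned estimator:

import time
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=200).fit(X, y)  # placeholder estimator
start = time.time()
model.predict(X)
pred_time = (time.time() - start) / len(X)  # seconds per sample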
def train_model(estimator, X_train, y_train, budget, fit_kwargs={}):
@@ -154,27 +160,27 @@ def evaluate_model(
fit_kwargs={}
):
if 'holdout' in eval_method:
- val_loss, train_loss, train_time = evaluate_model_holdout(
+ val_loss, train_loss, train_time, pred_time = evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val, weight_val, budget,
- task, eval_metric, best_val_loss, train_loss=train_loss,
+ task, eval_metric, train_loss=train_loss,
fit_kwargs=fit_kwargs)
else:
- val_loss, train_loss, train_time = evaluate_model_CV(
+ val_loss, train_loss, train_time, pred_time = evaluate_model_CV(
estimator, X_train, y_train, budget, kf, task,
eval_metric, best_val_loss, train_loss=train_loss,
fit_kwargs=fit_kwargs)
- return val_loss, train_loss, train_time
+ return val_loss, train_loss, train_time, pred_time
def evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val,
- weight_val, budget, task, eval_metric, best_val_loss, train_loss=False,
+ weight_val, budget, task, eval_metric, train_loss=False,
fit_kwargs={}
):
- val_loss, train_time, train_loss = get_test_loss(
+ val_loss, train_time, train_loss, pred_time = get_test_loss(
estimator, X_train, y_train, X_val, y_val, weight_val, eval_metric,
task, budget=budget, train_loss=train_loss, fit_kwargs=fit_kwargs)
- return val_loss, train_loss, train_time
+ return val_loss, train_loss, train_time, pred_time
def evaluate_model_CV(
@@ -182,9 +188,10 @@ def evaluate_model_CV(
task, eval_metric, best_val_loss, train_loss=False, fit_kwargs={}
):
start_time = time.time()
- total_val_loss = total_train_loss = 0
- train_time = 0
- valid_fold_num = 0
+ total_val_loss = 0
+ total_train_loss = None
+ train_time = pred_time = 0
+ valid_fold_num = total_fold_num = 0
n = kf.get_n_splits()
X_train_split, y_train_split = X_train_all, y_train_all
if task == 'regression':
@@ -224,23 +231,28 @@ def evaluate_model_CV(
if weight is not None:
fit_kwargs['sample_weight'], weight_val = weight[
train_index], weight[val_index]
- val_loss_i, train_time_i, train_loss_i = get_test_loss(
+ val_loss_i, train_time_i, train_loss_i, pred_time_i = get_test_loss(
estimator, X_train, y_train, X_val, y_val, weight_val,
eval_metric, task, labels, budget_per_train,
train_loss=train_loss, fit_kwargs=fit_kwargs)
if weight is not None:
fit_kwargs['sample_weight'] = weight
valid_fold_num += 1
+ total_fold_num += 1
total_val_loss += val_loss_i
if train_loss is not False:
if isinstance(total_train_loss, list):
total_train_loss = [
total_train_loss[i] + v for i, v in enumerate(train_loss_i)]
- elif total_train_loss != 0:
+ elif isinstance(total_train_loss, dict):
+ total_train_loss = {
+ k: total_train_loss[k] + v for k, v in train_loss_i.items()}
+ elif total_train_loss is not None:
total_train_loss += train_loss_i
else:
total_train_loss = train_loss_i
train_time += train_time_i
+ pred_time += pred_time_i
if valid_fold_num == n:
val_loss_list.append(total_val_loss / valid_fold_num)
total_val_loss = valid_fold_num = 0
@@ -248,16 +260,20 @@ def evaluate_model_CV(
val_loss_list.append(total_val_loss / valid_fold_num)
break
val_loss = np.max(val_loss_list)
+ n = total_fold_num
if train_loss is not False:
if isinstance(total_train_loss, list):
train_loss = [v / n for v in total_train_loss]
+ elif isinstance(total_train_loss, dict):
+ train_loss = {k: v / n for k, v in total_train_loss.items()}
else:
train_loss = total_train_loss / n
+ pred_time /= n
budget -= time.time() - start_time
if val_loss < best_val_loss and budget > budget_per_train:
estimator.cleanup()
estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs)
- return val_loss, train_loss, train_time
+ return val_loss, train_loss, train_time, pred_time
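The dict branch above mirrors the existing scalar and list cases: per-fold metric dicts are summed key-wise and divided by total_fold_num at the end, and pred_time is averaged the same way. A toy illustration of the key-wise accumulation (values invented):

total_train_loss = {'test_loss': 0.40, 'train_loss': 0.30, 'pred_time': 2e-6}
train_loss_i = {'test_loss': 0.50, 'train_loss': 0.20, 'pred_time': 1e-6}
# one fold's metrics folded into the running totals, as in the hunk above
total_train_loss = {
    k: total_train_loss[k] + v for k, v in train_loss_i.items()}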
def compute_estimator(
@@ -266,17 +282,15 @@ def compute_estimator(
best_val_loss=np.Inf, n_jobs=1, estimator_class=None, train_loss=False,
fit_kwargs={}
):
- start_time = time.time()
estimator_class = estimator_class or get_estimator_class(
task, estimator_name)
estimator = estimator_class(
**config_dic, task=task, n_jobs=n_jobs)
- val_loss, train_loss, train_time = evaluate_model(
+ val_loss, train_loss, train_time, pred_time = evaluate_model(
estimator, X_train, y_train, X_val, y_val, weight_val, budget, kf, task,
eval_method, eval_metric, best_val_loss, train_loss=train_loss,
fit_kwargs=fit_kwargs)
- all_time = time.time() - start_time
- return estimator, val_loss, train_loss, train_time, all_time
+ return estimator, val_loss, train_loss, train_time, pred_time
def train_estimator(

flaml/model.py

@@ -42,6 +42,7 @@ class BaseEstimator:
self._task = task
if '_estimator_type' in params:
self._estimator_type = params['_estimator_type']
+ del self.params['_estimator_type']
else:
self._estimator_type = "regressor" if task == 'regression' \
else "classifier"
@@ -152,7 +153,7 @@ class BaseEstimator:
return {}
@classmethod
- def size(cls, config):
+ def size(cls, config: dict) -> float:
'''[optional method] memory size of the estimator in bytes
Args:
@@ -165,7 +166,7 @@ class BaseEstimator:
return 1.0
@classmethod
- def cost_relative2lgbm(cls):
+ def cost_relative2lgbm(cls) -> float:
'''[optional method] relative cost compared to lightgbm'''
return 1.0
@@ -445,7 +446,8 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
**params
):
super().__init__(task, **params)
- self.params = params
+ del self.params['objective']
+ del self.params['max_bin']
self.params.update({
"n_estimators": int(round(n_estimators)),
'max_leaves': int(round(max_leaves)),
@@ -514,7 +516,8 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
n_estimators=4, max_features=1.0, criterion='gini', **params
):
super().__init__(task, **params)
- self.params = params
+ del self.params['objective']
+ del self.params['max_bin']
self.params.update({
"n_estimators": int(round(n_estimators)),
"n_jobs": n_jobs,
@@ -525,8 +528,6 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
else:
self.estimator_class = RandomForestClassifier
self.params['criterion'] = criterion
- self._time_per_iter = None
- self._train_size = 0
def get_params(self, deep=False):
params = super().get_params()
@@ -761,7 +762,6 @@ class KNeighborsEstimator(BaseEstimator):
self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
):
super().__init__(task, **params)
- self.params = params
self.params.update({
'n_neighbors': int(round(n_neighbors)),
'weights': params.get('weights', 'distance'),

flaml/version.py

@@ -1 +1 @@
- __version__ = "0.5.6"
+ __version__ = "0.5.7"

test/test_automl.py

@@ -95,14 +95,19 @@ class MyXGB2(XGBoostEstimator):
def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,
weight_test=None, weight_train=None):
from sklearn.metrics import log_loss
+ import time
+ start = time.time()
y_pred = estimator.predict_proba(X_test)
+ pred_time = (time.time() - start) / len(X_test)
test_loss = log_loss(y_test, y_pred, labels=labels,
sample_weight=weight_test)
y_pred = estimator.predict_proba(X_train)
train_loss = log_loss(y_train, y_pred, labels=labels,
sample_weight=weight_train)
alpha = 0.5
- return test_loss * (1 + alpha) - alpha * train_loss, [test_loss, train_loss]
+ return test_loss * (1 + alpha) - alpha * train_loss, {
+ "test_loss": test_loss, "train_loss": train_loss, "pred_time": pred_time
+ }
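Returning a dict as the second value (instead of the old two-element list) is what lets get_test_loss pick up the latency via metrics.get('pred_time', 0) and log the rest. A hedged sketch of wiring this metric into a run (dataset and budgets are illustrative):

from flaml import AutoML
from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, task='classification',
           metric=custom_metric, time_budget=10, pred_time_limit=1e-5)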
class TestAutoML(unittest.TestCase):
@@ -133,8 +138,8 @@ class TestAutoML(unittest.TestCase):
learner_class=MyRegularizedGreedyForest)
X_train, y_train = load_wine(return_X_y=True)
settings = {
"time_budget": 10, # total running time in seconds
"estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'],
"time_budget": 5, # total running time in seconds
"estimator_list": ['rf', 'xgboost', 'catboost'],
"task": 'classification', # task type
"sample": True, # whether to subsample training data
"log_file_name": "test/wine.log",
@@ -163,6 +168,7 @@ class TestAutoML(unittest.TestCase):
"n_jobs": 1,
"model_history": True,
"sample_weight": np.ones(len(y_train)),
"pred_time_limit": 1e-5,
}
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)

test/test_training_log.py

@@ -26,7 +26,8 @@ class TestTrainingLog(unittest.TestCase):
"mem_thres": 1024 * 1024,
"n_jobs": 1,
"model_history": True,
"verbose": 2,
"train_time_limit": 0.01,
"verbose": 3,
}
X_train, y_train = load_boston(return_X_y=True)
automl_experiment.fit(X_train=X_train, y_train=y_train,