# autogen/flaml/ml.py
'''!
 * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''
import time
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
    accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
    f1_score, mean_absolute_percentage_error, ndcg_score
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
    XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
    LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
    ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
from .data import group_counts

import logging

logger = logging.getLogger(__name__)


def get_estimator_class(task, estimator_name):
    '''When adding a new learner, add an elif branch here.'''
    if 'xgboost' == estimator_name:
        if 'regression' == task:
            estimator_class = XGBoostEstimator
        else:
            estimator_class = XGBoostSklearnEstimator
    elif 'rf' == estimator_name:
        estimator_class = RandomForestEstimator
    elif 'lgbm' == estimator_name:
        estimator_class = LGBMEstimator
    elif 'lrl1' == estimator_name:
        estimator_class = LRL1Classifier
    elif 'lrl2' == estimator_name:
        estimator_class = LRL2Classifier
    elif 'catboost' == estimator_name:
        estimator_class = CatBoostEstimator
    elif 'extra_tree' == estimator_name:
        estimator_class = ExtraTreeEstimator
    elif 'kneighbor' == estimator_name:
        estimator_class = KNeighborsEstimator
    elif 'prophet' in estimator_name:
        estimator_class = FBProphet
    elif estimator_name == 'arima':
        estimator_class = ARIMA
    elif estimator_name == 'sarimax':
        estimator_class = SARIMAX
    else:
        raise ValueError(
            estimator_name + ' is not a built-in learner. '
            'Please use AutoML.add_learner() to add a customized learner.')
    return estimator_class
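
# A minimal usage sketch (added for illustration, not part of the original
# module): the lookup is keyed by estimator name, with 'xgboost' further
# specialized by task.
#
#   assert get_estimator_class('regression', 'xgboost') is XGBoostEstimator
#   assert get_estimator_class('binary', 'xgboost') is XGBoostSklearnEstimator
#   assert get_estimator_class('binary', 'lgbm') is LGBMEstimator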


def sklearn_metric_loss_score(
    metric_name, y_predict, y_true, labels=None, sample_weight=None,
    groups=None,
):
    '''Loss using the specified metric

    Args:
        metric_name: A string of the metric name, one of
            'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
            'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg',
            'micro_f1', 'macro_f1'.
        y_predict: A 1d or 2d numpy array of the predictions which can be
            used to calculate the metric. E.g., 2d for log_loss and 1d
            for others.
        y_true: A 1d numpy array of the true labels.
        labels: A 1d numpy array of the unique labels.
        sample_weight: A 1d numpy array of the sample weight.
        groups: A 1d numpy array of the group labels.

    Returns:
        score: A float number of the loss, the lower the better.
    '''
    metric_name = metric_name.lower()
    if 'r2' == metric_name:
        score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == 'rmse':
        score = np.sqrt(mean_squared_error(
            y_true, y_predict, sample_weight=sample_weight))
    elif metric_name == 'mae':
        score = mean_absolute_error(
            y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == 'mse':
        score = mean_squared_error(
            y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == 'accuracy':
        score = 1.0 - accuracy_score(
            y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == 'roc_auc':
        score = 1.0 - roc_auc_score(
            y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == 'roc_auc_ovr':
        score = 1.0 - roc_auc_score(
            y_true, y_predict, sample_weight=sample_weight, multi_class='ovr')
    elif metric_name == 'roc_auc_ovo':
        score = 1.0 - roc_auc_score(
            y_true, y_predict, sample_weight=sample_weight, multi_class='ovo')
    elif 'log_loss' == metric_name:
        score = log_loss(
            y_true, y_predict, labels=labels, sample_weight=sample_weight)
    elif 'mape' == metric_name:
        try:
            score = mean_absolute_percentage_error(y_true, y_predict)
        except ValueError:
            return np.inf
    elif 'micro_f1' == metric_name:
        score = 1 - f1_score(
            y_true, y_predict, sample_weight=sample_weight, average='micro')
    elif 'macro_f1' == metric_name:
        score = 1 - f1_score(
            y_true, y_predict, sample_weight=sample_weight, average='macro')
    elif 'f1' == metric_name:
        score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
    elif 'ap' == metric_name:
        score = 1 - average_precision_score(
            y_true, y_predict, sample_weight=sample_weight)
    elif 'ndcg' in metric_name:
        if '@' in metric_name:
            k = int(metric_name.split('@', 1)[-1])
            counts = group_counts(groups)
            score = 0
            psum = 0
            for c in counts:
                score -= ndcg_score(np.asarray([y_true[psum:psum + c]]),
                                    np.asarray([y_predict[psum:psum + c]]), k=k)
                psum += c
            score /= len(counts)
            score += 1
        else:
            score = 1 - ndcg_score([y_true], [y_predict])
    else:
        raise ValueError(
            metric_name + ' is not a built-in metric; '
            'currently built-in metrics are: '
            'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo, '
            'log_loss, mape, f1, micro_f1, macro_f1, ap, ndcg. '
            'Please pass a customized metric function to AutoML.fit(metric=func).')
    return score
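
# A hedged sketch (added for illustration, not part of the original module):
# every built-in metric is reported as a loss, so score-type metrics such as
# accuracy and r2 come back as 1 - score.
#
#   sklearn_metric_loss_score('rmse', np.array([2.1, 0.9]), np.array([2.0, 1.0]))
#   # -> approximately 0.1
#   sklearn_metric_loss_score('accuracy', np.array([1, 0, 1]), np.array([1, 1, 1]))
#   # -> approximately 0.333, i.e., 1 - accuracy_score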


def get_y_pred(estimator, X, eval_metric, obj):
    if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
        y_pred_classes = estimator.predict_proba(X)
        y_pred = y_pred_classes[
            :, 1] if y_pred_classes.ndim > 1 else y_pred_classes
    elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
        y_pred = estimator.predict_proba(X)
    else:
        y_pred = estimator.predict(X)
    return y_pred
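
# Note (added for clarity, not in the original): for binary 'roc_auc'/'ap' the
# positive-class probability column is extracted; for the remaining
# probability-based metrics (including binary log_loss) the full
# predict_proba output is passed through; everything else falls back to hard
# labels from predict().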


def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_test,
                    groups_test, eval_metric, obj, labels=None,
                    log_training_metric=False, fit_kwargs={}):
    if isinstance(eval_metric, str):
        pred_start = time.time()
        test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
        pred_time = (time.time() - pred_start) / X_test.shape[0]
        test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
                                              labels, weight_test, groups_test)
        metric_for_logging = {}
        if log_training_metric:
            train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
            metric_for_logging['train_loss'] = sklearn_metric_loss_score(
                eval_metric, train_pred_y, y_train, labels,
                fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
    else:  # customized metric function
        test_loss, metric_for_logging = eval_metric(
            X_test, y_test, estimator, labels, X_train, y_train, weight_test,
            fit_kwargs.get('sample_weight'), config, groups_test,
            fit_kwargs.get('groups'))
        pred_time = 0  # default in case the custom metric does not report it
        if isinstance(metric_for_logging, dict):
            pred_time = metric_for_logging.get('pred_time', 0)
        test_pred_y = None
        # eval_metric may return test_pred_y, but not necessarily; set None for now
    return test_loss, metric_for_logging, pred_time, test_pred_y
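
# The customized-metric branch above implies the following callable signature,
# reconstructed from the call site (argument names here are illustrative):
#
#   def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,
#                     weight_test, weight_train, config, groups_test,
#                     groups_train):
#       '''Return (loss, metric_for_logging); metric_for_logging may be a
#       dict and may carry a 'pred_time' entry.'''
#       ...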


def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
                  groups_test, eval_metric, obj, labels=None, budget=None,
                  log_training_metric=False, fit_kwargs={}):
    start = time.time()
    # if groups_test is not None:
    #     fit_kwargs['groups_val'] = groups_test
    #     fit_kwargs['X_val'] = X_test
    #     fit_kwargs['y_val'] = y_test
    estimator.fit(X_train, y_train, budget, **fit_kwargs)
    test_loss, metric_for_logging, pred_time, _ = _eval_estimator(
        config, estimator, X_train, y_train, X_test, y_test,
        weight_test, groups_test, eval_metric, obj,
        labels, log_training_metric, fit_kwargs)
    train_time = time.time() - start
    return test_loss, metric_for_logging, train_time, pred_time


def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
                      task, eval_metric, best_val_loss,
                      log_training_metric=False, fit_kwargs={}):
    start_time = time.time()
    total_val_loss = 0
    total_metric = None
    metric = None
    train_time = pred_time = 0
    valid_fold_num = total_fold_num = 0
    n = kf.get_n_splits()
    X_train_split, y_train_split = X_train_all, y_train_all
    if task in ('binary', 'multi'):
        labels = np.unique(y_train_all)
    else:
        labels = None
    groups = None
    shuffle = True
    if isinstance(kf, RepeatedStratifiedKFold):
        kf = kf.split(X_train_split, y_train_split)
    elif isinstance(kf, GroupKFold):
        groups = kf.groups
        kf = kf.split(X_train_split, y_train_split, groups)
        shuffle = False
    elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
        y_train_all = pd.DataFrame(y_train_all, columns=['y'])
        train = X_train_all.join(y_train_all)
        kf = kf.split(train)
        shuffle = False
    elif isinstance(kf, TimeSeriesSplit):
        kf = kf.split(X_train_split, y_train_split)
    else:
        kf = kf.split(X_train_split)
    rng = np.random.RandomState(2020)
    val_loss_list = []
    budget_per_train = budget / n
    if 'sample_weight' in fit_kwargs:
        weight = fit_kwargs['sample_weight']
        weight_val = None
    else:
        weight = weight_val = None
    for train_index, val_index in kf:
        if shuffle:
            train_index = rng.permutation(train_index)
        if isinstance(X_train_all, pd.DataFrame):
            X_train, X_val = X_train_split.iloc[
                train_index], X_train_split.iloc[val_index]
        else:
            X_train, X_val = X_train_split[
                train_index], X_train_split[val_index]
        y_train, y_val = y_train_split[train_index], y_train_split[val_index]
        estimator.cleanup()
        if weight is not None:
            fit_kwargs['sample_weight'], weight_val = weight[
                train_index], weight[val_index]
        if groups is not None:
            fit_kwargs['groups'] = groups[train_index]
            groups_val = groups[val_index]
        else:
            groups_val = None
        val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
            config, estimator, X_train, y_train, X_val, y_val, weight_val,
            groups_val, eval_metric, task, labels, budget_per_train,
            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
        if weight is not None:
            fit_kwargs['sample_weight'] = weight
        valid_fold_num += 1
        total_fold_num += 1
        total_val_loss += val_loss_i
        if log_training_metric or not isinstance(eval_metric, str):
            if isinstance(total_metric, list):
                total_metric = [
                    total_metric[i] + v for i, v in enumerate(metric_i)]
            elif isinstance(total_metric, dict):
                total_metric = {
                    k: total_metric[k] + v for k, v in metric_i.items()}
            elif total_metric is not None:
                total_metric += metric_i
            else:
                total_metric = metric_i
        train_time += train_time_i
        pred_time += pred_time_i
        if valid_fold_num == n:
            val_loss_list.append(total_val_loss / valid_fold_num)
            total_val_loss = valid_fold_num = 0
        elif time.time() - start_time >= budget:
            val_loss_list.append(total_val_loss / valid_fold_num)
            break
    val_loss = np.max(val_loss_list)
    n = total_fold_num
    if log_training_metric or not isinstance(eval_metric, str):
        if isinstance(total_metric, list):
            metric = [v / n for v in total_metric]
        elif isinstance(total_metric, dict):
            metric = {k: v / n for k, v in total_metric.items()}
        else:
            metric = total_metric / n
    pred_time /= n
    # budget -= time.time() - start_time
    # if val_loss < best_val_loss and budget > budget_per_train:
    #     estimator.cleanup()
    #     estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs)
    return val_loss, metric, train_time, pred_time
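
# Note (added for clarity, not in the original): val_loss_list collects the
# mean fold loss each time n folds complete, plus a partial mean if the time
# budget expires mid-pass, and the reported val_loss is np.max over those
# means, i.e., a pessimistic aggregate across passes of the splitter.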


def compute_estimator(
    X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf,
    config_dic, task, estimator_name, eval_method, eval_metric,
    best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
    fit_kwargs={}
):
    estimator_class = estimator_class or get_estimator_class(
        task, estimator_name)
    estimator = estimator_class(
        **config_dic, task=task, n_jobs=n_jobs)
    if 'holdout' in eval_method:
        val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
            config_dic, estimator, X_train, y_train, X_val, y_val, weight_val,
            groups_val, eval_metric, task, budget=budget,
            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
    else:
        val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
            config_dic, estimator, X_train, y_train, budget, kf, task,
            eval_metric, best_val_loss, log_training_metric=log_training_metric,
            fit_kwargs=fit_kwargs)
    return estimator, val_loss, metric_for_logging, train_time, pred_time
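
# A hedged usage sketch (added; the data names and hyperparameter config are
# assumptions, not from the original). 'holdout' evaluates once on
# (X_val, y_val); any other eval_method falls through to evaluate_model_CV.
#
#   estimator, val_loss, metric, train_time, pred_time = compute_estimator(
#       X_train, y_train, X_val, y_val, weight_val=None, groups_val=None,
#       budget=60, kf=None, config_dic=config,  # learner hyperparameters
#       task='binary', estimator_name='lgbm', eval_method='holdout',
#       eval_metric='roc_auc')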


def train_estimator(
    X_train, y_train, config_dic, task,
    estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={}
):
    start_time = time.time()
    estimator_class = estimator_class or get_estimator_class(
        task, estimator_name)
    estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
    if X_train is not None:
        train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
    else:
        estimator = estimator.estimator_class(**estimator.params)
    train_time = time.time() - start_time
    return estimator, train_time


def get_classification_objective(num_labels: int) -> str:
    if num_labels == 2:
        objective_name = 'binary'
    else:
        objective_name = 'multi'
    return objective_name
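
# Doctest-style check (added; follows directly from the definition):
#
#   >>> get_classification_objective(2)
#   'binary'
#   >>> get_classification_objective(10)
#   'multi'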


def norm_confusion_matrix(y_true, y_pred):
    '''Normalized confusion matrix

    Args:
        y_true: A numpy array or a pandas series of true labels
        y_pred: A numpy array or a pandas series of predicted labels

    Returns:
        A confusion matrix normalized over the true labels (each row sums to 1)
    '''
    from sklearn.metrics import confusion_matrix
    conf_mat = confusion_matrix(y_true, y_pred)
    norm_conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]
    return norm_conf_mat
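
# A small worked example (added; values are exact): rows correspond to true
# classes and are normalized to sum to 1.
#
#   >>> norm_confusion_matrix([0, 0, 1], [0, 1, 1])
#   array([[0.5, 0.5],
#          [0. , 1. ]])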


def multi_class_curves(y_true, y_pred_proba, curve_func):
    '''Binarize the data for multi-class tasks and produce ROC or precision-recall curves

    Args:
        y_true: A numpy array or a pandas series of true labels
        y_pred_proba: A numpy array or a pandas dataframe of predicted probabilities
        curve_func: A function to produce a curve (e.g., roc_curve or precision_recall_curve)

    Returns:
        A tuple of two dictionaries with the same set of keys (class indices).
        The first dictionary curve_x stores the x coordinates of each curve, e.g.,
        curve_x[0] is a 1D array of the x coordinates of class 0.
        The second dictionary curve_y stores the y coordinates of each curve, e.g.,
        curve_y[0] is a 1D array of the y coordinates of class 0.
    '''
    from sklearn.preprocessing import label_binarize
    classes = np.unique(y_true)
    y_true_binary = label_binarize(y_true, classes=classes)
    curve_x, curve_y = {}, {}
    for i in range(len(classes)):
        curve_x[i], curve_y[i], _ = curve_func(y_true_binary[:, i], y_pred_proba[:, i])
    return curve_x, curve_y
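
# A hedged usage sketch (added, not part of the original module): per-class
# ROC curves from one-vs-rest binarized labels. Assumes the columns of
# y_pred_proba are ordered by np.unique(y_true), matching the loop above.
#
#   from sklearn.metrics import roc_curve
#   fpr, tpr = multi_class_curves(y_true, y_pred_proba, roc_curve)
#   # fpr[0], tpr[0] trace the ROC curve of class 0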