2021-09-11 21:19:18 -07:00
|
|
|
"""!
|
|
|
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
2021-04-08 09:29:55 -07:00
|
|
|
* Licensed under the MIT License.
|
2021-09-11 21:19:18 -07:00
|
|
|
"""
|
2021-04-08 09:29:55 -07:00
|
|
|
|
2021-02-05 21:41:14 -08:00
|
|
|
import time
|
2021-04-08 09:29:55 -07:00
|
|
|
import numpy as np
|
|
|
|
import pandas as pd
|
2021-09-11 21:19:18 -07:00
|
|
|
from sklearn.metrics import (
|
|
|
|
mean_squared_error,
|
|
|
|
r2_score,
|
|
|
|
roc_auc_score,
|
|
|
|
accuracy_score,
|
|
|
|
mean_absolute_error,
|
|
|
|
log_loss,
|
|
|
|
average_precision_score,
|
|
|
|
f1_score,
|
|
|
|
mean_absolute_percentage_error,
|
|
|
|
ndcg_score,
|
|
|
|
)
|
2021-08-23 16:26:46 -04:00
|
|
|
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
|
2021-04-08 09:29:55 -07:00
|
|
|
from .model import (
|
2021-09-11 21:19:18 -07:00
|
|
|
XGBoostEstimator,
|
|
|
|
XGBoostSklearnEstimator,
|
|
|
|
RandomForestEstimator,
|
|
|
|
LGBMEstimator,
|
|
|
|
LRL1Classifier,
|
|
|
|
LRL2Classifier,
|
|
|
|
CatBoostEstimator,
|
|
|
|
ExtraTreeEstimator,
|
|
|
|
KNeighborsEstimator,
|
|
|
|
Prophet,
|
|
|
|
ARIMA,
|
|
|
|
SARIMAX,
|
|
|
|
)
|
2021-09-01 16:25:04 -07:00
|
|
|
from .data import group_counts
|
2021-02-05 21:41:14 -08:00
|
|
|
|
|
|
|
import logging
|
2021-09-11 21:19:18 -07:00
|
|
|
|
2021-02-05 21:41:14 -08:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
def get_estimator_class(task, estimator_name):
    """Map a built-in learner name to its estimator class.

    When adding a new learner, a new branch needs to be added here.

    Args:
        task: A string of the task type. It is only consulted for
            "xgboost": "regression" selects XGBoostEstimator, any other
            task selects XGBoostSklearnEstimator.
        estimator_name: A string of the learner name.

    Returns:
        The estimator class that corresponds to estimator_name.

    Raises:
        ValueError: If estimator_name is not a built-in learner.
    """
    if estimator_name == "xgboost":
        return XGBoostEstimator if task == "regression" else XGBoostSklearnEstimator
    if estimator_name == "rf":
        return RandomForestEstimator
    if estimator_name == "lgbm":
        return LGBMEstimator
    if estimator_name == "lrl1":
        return LRL1Classifier
    if estimator_name == "lrl2":
        return LRL2Classifier
    if estimator_name == "catboost":
        return CatBoostEstimator
    if estimator_name == "extra_tree":
        return ExtraTreeEstimator
    if estimator_name == "kneighbor":
        return KNeighborsEstimator
    # Unlike the other learners this is a substring match: any name that
    # contains "prophet" resolves to Prophet.
    if "prophet" in estimator_name:
        return Prophet
    if estimator_name == "arima":
        return ARIMA
    if estimator_name == "sarimax":
        return SARIMAX
    raise ValueError(
        estimator_name + " is not a built-in learner. "
        "Please use AutoML.add_learner() to add a customized learner."
    )
|
|
|
|
|
2021-04-08 09:29:55 -07:00
|
|
|
|
|
|
|
def sklearn_metric_loss_score(
    metric_name,
    y_predict,
    y_true,
    labels=None,
    sample_weight=None,
    groups=None,
):
    """Loss using the specified metric

    Metrics for which higher is better (r2, accuracy, roc_auc*, f1 variants,
    ap, ndcg) are converted to a loss as ``1 - metric``.

    Args:
        metric_name: A string of the metric name (compared in lower case),
            one of
            'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
            'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg',
            'micro_f1', 'macro_f1'.
            Any other name containing 'ndcg' is treated as NDCG; a suffix of
            the form '@k' (e.g. 'ndcg@5') sets the cutoff k and makes the
            score an average over the groups.
        y_predict: A 1d or 2d numpy array of the predictions which can be
            used to calculate the metric. E.g., 2d for log_loss and 1d
            for others.
        y_true: A 1d numpy array of the true labels.
        labels: A 1d numpy array of the unique labels. Only used by
            'log_loss'.
        sample_weight: A 1d numpy array of the sample weight. Not used by
            'mape' and 'ndcg'.
        groups: A 1d numpy array of the group labels. Only used by
            'ndcg@k', where it is passed to group_counts() to obtain the
            sizes of the consecutive groups.

    Returns:
        score: A float number of the loss, the lower the better.
            For 'mape', np.inf is returned when sklearn raises ValueError.

    Raises:
        ValueError: If metric_name is not a built-in metric.
    """
    metric_name = metric_name.lower()
    if metric_name == "r2":
        score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "rmse":
        score = np.sqrt(
            mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
        )
    elif metric_name == "mae":
        score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "mse":
        score = mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "accuracy":
        score = 1.0 - accuracy_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "roc_auc":
        score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "roc_auc_ovr":
        score = 1.0 - roc_auc_score(
            y_true, y_predict, sample_weight=sample_weight, multi_class="ovr"
        )
    elif metric_name == "roc_auc_ovo":
        score = 1.0 - roc_auc_score(
            y_true, y_predict, sample_weight=sample_weight, multi_class="ovo"
        )
    elif metric_name == "log_loss":
        score = log_loss(y_true, y_predict, labels=labels, sample_weight=sample_weight)
    elif metric_name == "mape":
        try:
            score = mean_absolute_percentage_error(y_true, y_predict)
        except ValueError:
            # Inputs sklearn rejects are reported as the worst possible loss.
            return np.inf
    elif metric_name == "micro_f1":
        score = 1 - f1_score(
            y_true, y_predict, sample_weight=sample_weight, average="micro"
        )
    elif metric_name == "macro_f1":
        score = 1 - f1_score(
            y_true, y_predict, sample_weight=sample_weight, average="macro"
        )
    elif metric_name == "f1":
        score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == "ap":
        score = 1 - average_precision_score(
            y_true, y_predict, sample_weight=sample_weight
        )
    elif "ndcg" in metric_name:
        if "@" in metric_name:
            k = int(metric_name.split("@", 1)[-1])
            counts = group_counts(groups)
            score = 0
            psum = 0
            # Samples of one group are contiguous: walk the arrays group by
            # group and accumulate the negated NDCG@k of each slice.
            for c in counts:
                score -= ndcg_score(
                    np.asarray([y_true[psum : psum + c]]),
                    np.asarray([y_predict[psum : psum + c]]),
                    k=k,
                )
                psum += c
            # Mean over groups, then shift so the loss is 1 - mean NDCG@k.
            score /= len(counts)
            score += 1
        else:
            # Without a cutoff, everything is scored as one single ranking.
            score = 1 - ndcg_score([y_true], [y_predict])
    else:
        raise ValueError(
            metric_name + " is not a built-in metric, "
            "currently built-in metrics are: "
            "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo, "
            "log_loss, mape, f1, micro_f1, macro_f1, ap, ndcg. "
            "please pass a customized metric function to AutoML.fit(metric=func)"
        )
    return score
|
|
|
|
|
|
|
|
|
2021-09-01 16:25:04 -07:00
|
|
|
def get_y_pred(estimator, X, eval_metric, obj):
    """Produce the kind of prediction that eval_metric needs.

    Args:
        estimator: An object providing predict() and predict_proba().
        X: The data to predict on.
        eval_metric: A string of the metric name.
        obj: The task/objective; only checked for containing "binary".

    Returns:
        For 'roc_auc' or 'ap' on a binary objective: the second column of
        predict_proba(X) when it is 2d, otherwise the 1d output unchanged.
        For 'log_loss', 'roc_auc', 'roc_auc_ovr' or 'roc_auc_ovo': the full
        output of predict_proba(X).
        For anything else: the output of predict(X).
    """
    if eval_metric in ("roc_auc", "ap") and "binary" in obj:
        proba = estimator.predict_proba(X)
        if proba.ndim > 1:
            return proba[:, 1]
        return proba
    if eval_metric in ("log_loss", "roc_auc", "roc_auc_ovr", "roc_auc_ovo"):
        return estimator.predict_proba(X)
    return estimator.predict(X)
|
|
|
|
|
|
|
|
|
2021-09-11 21:19:18 -07:00
|
|
|
def _eval_estimator(
    config,
    estimator,
    X_train,
    y_train,
    X_test,
    y_test,
    weight_test,
    groups_test,
    eval_metric,
    obj,
    labels=None,
    log_training_metric=False,
    fit_kwargs=None,
):
    """Evaluate an already fitted estimator on the test data.

    Args:
        config: The configuration; only forwarded to a custom metric.
        estimator: The fitted estimator.
        X_train: Training data, used for the training loss / custom metric.
        y_train: Training labels.
        X_test: Test data.
        y_test: Test labels.
        weight_test: Sample weight of the test data, or None.
        groups_test: Group labels of the test data, or None.
        eval_metric: Either a string naming a built-in metric, or a callable
            invoked as ``eval_metric(X_test, y_test, estimator, labels,
            X_train, y_train, weight_test, weight_train, config,
            groups_test, groups_train)`` that returns
            ``(test_loss, metric_for_logging)``.
        obj: The task/objective, forwarded to get_y_pred().
        labels: The unique labels, or None.
        log_training_metric: Whether to also compute the loss on the training
            data (built-in metrics only); stored under "train_loss".
        fit_kwargs: The keyword arguments used for fitting; "sample_weight"
            and "groups" are read from it as the training weight/groups.
            None is treated as empty.

    Returns:
        A tuple (test_loss, metric_for_logging, pred_time, test_pred_y).
        For a built-in metric, pred_time is the prediction time per test row
        and test_pred_y is the prediction on X_test. For a custom metric,
        pred_time is metric_for_logging["pred_time"] when metric_for_logging
        is a dict that has it (0 otherwise) and test_pred_y is None.
    """
    if fit_kwargs is None:
        fit_kwargs = {}
    if isinstance(eval_metric, str):
        pred_start = time.time()
        test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
        pred_time = (time.time() - pred_start) / X_test.shape[0]
        test_loss = sklearn_metric_loss_score(
            eval_metric, test_pred_y, y_test, labels, weight_test, groups_test
        )
        metric_for_logging = {}
        if log_training_metric:
            train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
            metric_for_logging["train_loss"] = sklearn_metric_loss_score(
                eval_metric,
                train_pred_y,
                y_train,
                labels,
                fit_kwargs.get("sample_weight"),
                fit_kwargs.get("groups"),
            )
    else:  # customized metric function
        test_loss, metric_for_logging = eval_metric(
            X_test,
            y_test,
            estimator,
            labels,
            X_train,
            y_train,
            weight_test,
            fit_kwargs.get("sample_weight"),
            config,
            groups_test,
            fit_kwargs.get("groups"),
        )
        # A custom metric may log a non-dict value (the cross-validation
        # aggregation accepts that), so only look up "pred_time" on a dict.
        if isinstance(metric_for_logging, dict):
            pred_time = metric_for_logging.get("pred_time", 0)
        else:
            pred_time = 0
        test_pred_y = None
        # eval_metric may return test_pred_y but not necessarily. Setting None for now.
    return test_loss, metric_for_logging, pred_time, test_pred_y
|
|
|
|
|
|
|
|
|
2021-09-11 21:19:18 -07:00
|
|
|
def get_test_loss(
    config,
    estimator,
    X_train,
    y_train,
    X_test,
    y_test,
    weight_test,
    groups_test,
    eval_metric,
    obj,
    labels=None,
    budget=None,
    log_training_metric=False,
    fit_kwargs={},
):
    """Fit the estimator on the training data and evaluate it on the test data.

    Args:
        config: The configuration, forwarded to the evaluation.
        estimator: The estimator to fit; fitted in place via
            ``estimator.fit(X_train, y_train, budget, **fit_kwargs)``.
        X_train: Training data.
        y_train: Training labels.
        X_test: Test data.
        y_test: Test labels.
        weight_test: Sample weight of the test data, or None.
        groups_test: Group labels of the test data, or None.
        eval_metric: A metric name or a custom metric function.
        obj: The task/objective.
        labels: The unique labels, or None.
        budget: The budget handed to estimator.fit().
        log_training_metric: Whether the training metric is computed too.
        fit_kwargs: Extra keyword arguments for estimator.fit().

    Returns:
        A tuple (test_loss, metric_for_logging, train_time, pred_time).
        train_time is the wall-clock time measured around both the fit and
        the evaluation.
    """
    began_at = time.time()
    estimator.fit(X_train, y_train, budget, **fit_kwargs)
    # The fourth element (test predictions) is not needed by callers here.
    test_loss, metric_for_logging, pred_time, _unused_pred = _eval_estimator(
        config=config,
        estimator=estimator,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        weight_test=weight_test,
        groups_test=groups_test,
        eval_metric=eval_metric,
        obj=obj,
        labels=labels,
        log_training_metric=log_training_metric,
        fit_kwargs=fit_kwargs,
    )
    elapsed = time.time() - began_at
    return test_loss, metric_for_logging, elapsed, pred_time
|
2021-02-05 21:41:14 -08:00
|
|
|
|
|
|
|
|
2021-09-11 21:19:18 -07:00
|
|
|
def evaluate_model_CV(
    config,
    estimator,
    X_train_all,
    y_train_all,
    budget,
    kf,
    task,
    eval_metric,
    best_val_loss,
    log_training_metric=False,
    fit_kwargs=None,
):
    """Evaluate an estimator with cross validation.

    Args:
        config: The configuration, forwarded to get_test_loss().
        estimator: The estimator; cleaned up and refitted for every fold.
        X_train_all: All the training data (DataFrame or indexable array).
        y_train_all: All the training labels.
        budget: The time budget in seconds for the whole cross validation;
            each fold gets ``budget / kf.get_n_splits()``.
        kf: The splitter. RepeatedStratifiedKFold, GroupKFold (which must
            carry a ``groups`` attribute) and TimeSeriesSplit are handled
            specifically; any other splitter is called as ``kf.split(X)``.
        task: The task type. 'binary' and 'multi' enable label collection,
            'forecast' changes how a TimeSeriesSplit is applied.
        eval_metric: A metric name or a custom metric function.
        best_val_loss: Currently unused.
        log_training_metric: Whether the training metric is computed too.
        fit_kwargs: Extra keyword arguments for fitting. "sample_weight" is
            split per fold, "groups" is set per fold for GroupKFold. The
            dict passed in is never modified. None is treated as empty.

    Returns:
        A tuple (val_loss, metric, train_time, pred_time). val_loss is the
        maximum over the recorded mean fold losses, metric is the logged
        metric averaged over the evaluated folds (None when nothing is
        logged), train_time is the summed time of all folds and pred_time
        is the mean prediction time over the evaluated folds.
    """
    # Work on a private copy: the per-fold "sample_weight"/"groups" entries
    # written below must not leak into the caller's dict (or into a shared
    # default), not even when a fold raises.
    fit_kwargs = {} if fit_kwargs is None else dict(fit_kwargs)
    start_time = time.time()
    total_val_loss = 0
    total_metric = None
    metric = None
    train_time = pred_time = 0
    valid_fold_num = total_fold_num = 0
    n = kf.get_n_splits()
    # Keep handles on the original data: y_train_all is rebound to a
    # DataFrame in the forecast branch, but folds index the original labels.
    X_train_split, y_train_split = X_train_all, y_train_all
    if task in ("binary", "multi"):
        labels = np.unique(y_train_all)
    else:
        labels = None
    groups = None
    shuffle = True
    if isinstance(kf, RepeatedStratifiedKFold):
        kf = kf.split(X_train_split, y_train_split)
    elif isinstance(kf, GroupKFold):
        groups = kf.groups
        kf = kf.split(X_train_split, y_train_split, groups)
        shuffle = False
    elif isinstance(kf, TimeSeriesSplit) and task == "forecast":
        y_train_all = pd.DataFrame(y_train_all, columns=["y"])
        train = X_train_all.join(y_train_all)
        kf = kf.split(train)
        shuffle = False
    elif isinstance(kf, TimeSeriesSplit):
        kf = kf.split(X_train_split, y_train_split)
    else:
        kf = kf.split(X_train_split)
    rng = np.random.RandomState(2020)
    val_loss_list = []
    budget_per_train = budget / n
    # Full-length sample weight (if any); sliced per fold below.
    weight = fit_kwargs.get("sample_weight")
    weight_val = None
    for train_index, val_index in kf:
        if shuffle:
            train_index = rng.permutation(train_index)
        if isinstance(X_train_all, pd.DataFrame):
            X_train = X_train_split.iloc[train_index]
            X_val = X_train_split.iloc[val_index]
        else:
            X_train, X_val = X_train_split[train_index], X_train_split[val_index]
        y_train, y_val = y_train_split[train_index], y_train_split[val_index]
        estimator.cleanup()
        if weight is not None:
            fit_kwargs["sample_weight"], weight_val = (
                weight[train_index],
                weight[val_index],
            )
        if groups is not None:
            fit_kwargs["groups"] = groups[train_index]
            groups_val = groups[val_index]
        else:
            groups_val = None
        val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
            config,
            estimator,
            X_train,
            y_train,
            X_val,
            y_val,
            weight_val,
            groups_val,
            eval_metric,
            task,
            labels,
            budget_per_train,
            log_training_metric=log_training_metric,
            fit_kwargs=fit_kwargs,
        )
        valid_fold_num += 1
        total_fold_num += 1
        total_val_loss += val_loss_i
        if log_training_metric or not isinstance(eval_metric, str):
            if isinstance(total_metric, dict):
                total_metric = {k: total_metric[k] + v for k, v in metric_i.items()}
            elif total_metric is not None:
                total_metric += metric_i
            else:
                total_metric = metric_i
        train_time += train_time_i
        pred_time += pred_time_i
        if valid_fold_num == n:
            # A full round of n folds is done: record its mean loss.
            val_loss_list.append(total_val_loss / valid_fold_num)
            total_val_loss = valid_fold_num = 0
        elif time.time() - start_time >= budget:
            # Out of budget: record the mean of the folds finished so far.
            val_loss_list.append(total_val_loss / valid_fold_num)
            break
    val_loss = np.max(val_loss_list)
    # From here on n is the number of folds that were actually evaluated.
    n = total_fold_num
    if log_training_metric or not isinstance(eval_metric, str):
        if isinstance(total_metric, dict):
            metric = {k: v / n for k, v in total_metric.items()}
        else:
            metric = total_metric / n
    pred_time /= n
    return val_loss, metric, train_time, pred_time
|
2021-02-05 21:41:14 -08:00
|
|
|
|
|
|
|
|
2021-04-08 09:29:55 -07:00
|
|
|
def compute_estimator(
    X_train,
    y_train,
    X_val,
    y_val,
    weight_val,
    groups_val,
    budget,
    kf,
    config_dic,
    task,
    estimator_name,
    eval_method,
    eval_metric,
    best_val_loss=np.inf,
    n_jobs=1,
    estimator_class=None,
    log_training_metric=False,
    fit_kwargs=None,
):
    """Build an estimator from a configuration, train it and evaluate it.

    Args:
        X_train: Training data.
        y_train: Training labels.
        X_val: Validation data (holdout only).
        y_val: Validation labels (holdout only).
        weight_val: Sample weight of the validation data (holdout only).
        groups_val: Group labels of the validation data (holdout only).
        budget: The time budget in seconds.
        kf: The cross-validation splitter (non-holdout only).
        config_dic: A dict of the hyperparameters, expanded into the
            estimator constructor.
        task: The task type.
        estimator_name: The learner name, used to look up the estimator
            class when estimator_class is not given.
        eval_method: "holdout" evaluates on (X_val, y_val); any other value
            runs cross validation on the training data.
        eval_metric: A metric name or a custom metric function.
        best_val_loss: The best validation loss so far; forwarded to the
            cross validation.
        n_jobs: The number of jobs passed to the estimator constructor.
        estimator_class: The estimator class; looked up by name if falsy.
        log_training_metric: Whether the training metric is computed too.
        fit_kwargs: Extra keyword arguments for fitting. None is treated
            as empty.

    Returns:
        A tuple (estimator, val_loss, metric_for_logging, train_time,
        pred_time).
    """
    # A fresh dict per call, so nothing downstream can pollute a shared
    # default.
    if fit_kwargs is None:
        fit_kwargs = {}
    estimator_class = estimator_class or get_estimator_class(task, estimator_name)
    estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
    if "holdout" == eval_method:
        val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
            config_dic,
            estimator,
            X_train,
            y_train,
            X_val,
            y_val,
            weight_val,
            groups_val,
            eval_metric,
            task,
            budget=budget,
            log_training_metric=log_training_metric,
            fit_kwargs=fit_kwargs,
        )
    else:
        val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
            config_dic,
            estimator,
            X_train,
            y_train,
            budget,
            kf,
            task,
            eval_metric,
            best_val_loss,
            log_training_metric=log_training_metric,
            fit_kwargs=fit_kwargs,
        )
    return estimator, val_loss, metric_for_logging, train_time, pred_time
|
2021-02-05 21:41:14 -08:00
|
|
|
|
|
|
|
|
2021-04-08 09:29:55 -07:00
|
|
|
def train_estimator(
    X_train,
    y_train,
    config_dic,
    task,
    estimator_name,
    n_jobs=1,
    estimator_class=None,
    budget=None,
    n_iter=None,
    fit_kwargs={},
):
    """Build an estimator from a configuration and optionally fit it.

    Args:
        X_train: Training data, or None to skip fitting.
        y_train: Training labels.
        config_dic: A dict of the hyperparameters, expanded into the
            estimator constructor.
        task: The task type.
        estimator_name: The learner name, used to look up the estimator
            class when estimator_class is not given.
        n_jobs: The number of jobs passed to the estimator constructor.
        estimator_class: The estimator class; looked up by name if falsy.
        budget: The budget handed to fit().
        n_iter: If not None, written to ``params["n_estimators"]`` of the
            estimator before fitting.
        fit_kwargs: Extra keyword arguments for fit().

    Returns:
        A tuple (estimator, train_time). With training data, estimator is
        the fitted wrapper and train_time is whatever its fit() returned.
        Without training data, estimator is a new instance of the wrapper's
        ``estimator_class`` built from its params, and train_time is the
        time spent in this function.
    """
    began_at = time.time()
    if not estimator_class:
        estimator_class = get_estimator_class(task, estimator_name)
    learner = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
    if n_iter is not None:
        learner.params["n_estimators"] = n_iter
    if X_train is None:
        # Nothing to fit on: hand back an unfitted underlying model instead.
        unfitted = learner.estimator_class(**learner.params)
        return unfitted, time.time() - began_at
    fit_time = learner.fit(X_train, y_train, budget, **fit_kwargs)
    return learner, fit_time
|
|
|
|
|
|
|
|
|
|
|
|
def get_classification_objective(num_labels: int) -> str:
    """Name the classification objective for a given number of labels.

    Args:
        num_labels: The number of distinct labels.

    Returns:
        "binary" when num_labels is exactly 2, "multi" for any other value.
    """
    return "binary" if num_labels == 2 else "multi"
|
2021-06-04 10:31:33 -07:00
|
|
|
|
|
|
|
|
|
|
|
def norm_confusion_matrix(y_true, y_pred):
    """Confusion matrix with every row divided by its row total

    Args:
        y_true: A numpy array or a pandas series of true labels
        y_pred: A numpy array or a pandas series of predicted labels

    Returns:
        A float matrix in which entry (i, j) is the confusion-matrix count
        (i, j) divided by the sum of row i. A row whose counts sum to zero
        is divided by zero.
    """
    from sklearn.metrics import confusion_matrix

    counts = confusion_matrix(y_true, y_pred)
    # Column vector of row totals so the division broadcasts row-wise.
    row_totals = counts.sum(axis=1, keepdims=True)
    return counts.astype("float") / row_totals
|
|
|
|
|
|
|
|
|
|
|
|
def multi_class_curves(y_true, y_pred_proba, curve_func):
    """Binarize the labels of a multi-class task and compute one curve per class

    Args:
        y_true: A numpy array or a pandas series of true labels
        y_pred_proba: A 2d array of predicted probabilities, indexed as
            y_pred_proba[:, i]. Column i is paired with the i-th unique
            label of y_true (presumably the model's class order matches;
            verify against the caller).
        curve_func: A function to produce a curve (e.g., roc_curve or
            precision_recall_curve); called as curve_func(y_binary, y_score)
            and expected to return three values.

    Returns:
        A tuple of two dictionaries with the same set of keys (class indices)
        The first dictionary curve_x stores the first value returned by
            curve_func for each class, e.g., curve_x[0] is the one for class 0
        The second dictionary curve_y stores the second value returned by
            curve_func for each class, e.g., curve_y[0] is the one for class 0
        The third value returned by curve_func is discarded.
    """
    from sklearn.preprocessing import label_binarize

    classes = np.unique(y_true)
    y_true_binary = label_binarize(y_true, classes=classes)

    curve_x, curve_y = {}, {}
    for class_idx in range(len(classes)):
        xs, ys, _ = curve_func(
            y_true_binary[:, class_idx], y_pred_proba[:, class_idx]
        )
        curve_x[class_idx] = xs
        curve_y[class_idx] = ys
    return curve_x, curve_y
|