skzhang1 2022-08-10 00:42:47 +00:00
parent 7851a463aa
commit e3c9da50da
2 changed files with 59 additions and 26 deletions

View File

@@ -364,7 +364,7 @@ class AutoMLState:
             state.best_loss,
             state.n_jobs,
             state.learner_classes.get(estimator),
-            state.cv_strategy,
+            state.cv_score_agg_func,
             state.log_training_metric,
             this_estimator_kwargs,
         )
@ -729,6 +729,7 @@ class AutoML(BaseEstimator):
settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN) settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN)
settings["use_ray"] = settings.get("use_ray", False) settings["use_ray"] = settings.get("use_ray", False)
settings["metric_constraints"] = settings.get("metric_constraints", []) settings["metric_constraints"] = settings.get("metric_constraints", [])
settings["cv_score_agg_func"] = settings.get("cv_score_agg_func", None)
settings["fit_kwargs_by_estimator"] = settings.get( settings["fit_kwargs_by_estimator"] = settings.get(
"fit_kwargs_by_estimator", {} "fit_kwargs_by_estimator", {}
) )
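With the default registered above, the aggregate function can also be supplied once at construction time instead of on every `fit` call. A minimal sketch, assuming the usual FLAML pattern where constructor keyword arguments are stored in the settings dict and picked up later by `fit` (as the `self._settings.get("cv_score_agg_func")` fallback in a later hunk suggests); `my_agg_func` is a placeholder name:

```python
from flaml import AutoML

def my_agg_func(metrics_across_folds):
    # Placeholder aggregator: mean fold loss, nothing extra to log.
    losses = [fold[0] for fold in metrics_across_folds]
    return sum(losses) / len(losses), None

automl = AutoML(cv_score_agg_func=my_agg_func)
# Equivalent per-call form: automl.fit(X, y, task=..., cv_score_agg_func=my_agg_func)
```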
@ -2071,7 +2072,7 @@ class AutoML(BaseEstimator):
use_ray=None, use_ray=None,
metric_constraints=None, metric_constraints=None,
custom_hp=None, custom_hp=None,
cv_strategy=None, cv_score_agg_func=None,
fit_kwargs_by_estimator=None, fit_kwargs_by_estimator=None,
**fit_kwargs, **fit_kwargs,
): ):
@@ -2289,16 +2290,34 @@ class AutoML(BaseEstimator):
                 }
                 ```
-            cv_strategy: customized function, the strategy of conducting cross-validation. Default to average the optimization metric across folds.
-                We give an example here:
-                ```python
-                def cv_strategy(val_loss_folds):
-                    return sum(val_loss_folds)/len(val_loss_folds)
-                ```
-                where val_loss_folds is the list that stores the metrics values of all folds. In this example, we return the average of the optimization
-                metric across all folds (default strategy).
+            cv_score_agg_func: customized cross-validation scores aggregate function. Default is to average metrics across folds. If specified, this function needs to
+                have the following signature:
+                ```python
+                def cv_score_agg_func(metrics_across_folds):
+                    return metric_to_minimize, metrics_to_log
+                ```
+                The input "metrics_across_folds" is a list of 2-tuples; each tuple records the loss and metrics information of the corresponding fold.
+                In each tuple, the first element is a float representing the loss score to minimize, and the second is a dict of all the metrics to log, or None.
+                The function returns the final aggregate over all folds: a float for the minimization objective, and a dict of the metrics to log, or None.
+                E.g.,
+                ```python
+                def cv_score_agg_func(metrics_across_folds):
+                    metric_to_minimize = sum([tem[0] for tem in metrics_across_folds])/len(metrics_across_folds)
+                    metrics_to_log = None
+                    for single_fold in metrics_across_folds:
+                        if single_fold[1] is None:
+                            break
+                        elif metrics_to_log is None:
+                            metrics_to_log = single_fold[1]
+                        else:
+                            metrics_to_log = {k: metrics_to_log[k] + v for k, v in single_fold[1].items()}
+                    if metrics_to_log:
+                        n = len(metrics_across_folds)
+                        metrics_to_log = {k: v / n for k, v in metrics_to_log.items()}
+                    return metric_to_minimize, metrics_to_log
+                ```
             fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
                 For TransformersEstimator, available fit_kwargs can be found from
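To illustrate the documented contract end to end, here is a hedged usage sketch with a custom aggregator that scores each configuration by its worst fold rather than the average; the dataset, number of splits, and time budget are placeholders:

```python
from flaml import AutoML
from sklearn.datasets import fetch_california_housing

def worst_fold_agg(metrics_across_folds):
    # Pessimistic aggregation: minimize the largest (worst) fold loss.
    losses = [fold[0] for fold in metrics_across_folds]
    # Nothing extra to log, so the second element is None.
    return max(losses), None

X, y = fetch_california_housing(return_X_y=True)
automl = AutoML()
automl.fit(
    X,
    y,
    task="regression",
    eval_method="cv",
    n_splits=3,
    cv_score_agg_func=worst_fold_agg,
    time_budget=60,
)
```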
@ -2460,7 +2479,7 @@ class AutoML(BaseEstimator):
eval_method = self._decide_eval_method(eval_method, time_budget) eval_method = self._decide_eval_method(eval_method, time_budget)
self._state.eval_method = eval_method self._state.eval_method = eval_method
logger.info("Evaluation method: {}".format(eval_method)) logger.info("Evaluation method: {}".format(eval_method))
self._state.cv_strategy = cv_strategy self._state.cv_score_agg_func = cv_score_agg_func or self._settings.get("cv_score_agg_func")
self._retrain_in_budget = retrain_full == "budget" and ( self._retrain_in_budget = retrain_full == "budget" and (
eval_method == "holdout" and self._state.X_val is None eval_method == "holdout" and self._state.X_val is None

View File

@@ -438,14 +438,28 @@ def evaluate_model_CV(
     task,
     eval_metric,
     best_val_loss,
-    cv_strategy,
+    cv_score_agg_func,
     log_training_metric=False,
     fit_kwargs={},
 ):
-    if cv_strategy is None:
-        cv_strategy = lambda val_loss_folds: sum(val_loss_folds)/len(val_loss_folds)
+    if cv_score_agg_func is None:
+        def cv_score_agg_func(metrics_across_folds):
+            metric_to_minimize = sum([tem[0] for tem in metrics_across_folds])/len(metrics_across_folds)
+            metrics_to_log = None
+            for single_fold in metrics_across_folds:
+                if single_fold[1] is None:
+                    break
+                elif metrics_to_log is None:
+                    metrics_to_log = single_fold[1]
+                else:
+                    metrics_to_log = {k: metrics_to_log[k] + v for k, v in single_fold[1].items()}
+            if metrics_to_log:
+                n = len(metrics_across_folds)
+                metrics_to_log = {k: v / n for k, v in metrics_to_log.items()}
+            return metric_to_minimize, metrics_to_log
     start_time = time.time()
     val_loss_folds = []
+    log_metric_folds = []
     total_metric = None
     metric = None
     train_time = pred_time = 0
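To make the default aggregation above concrete, a small standalone sketch of the same logic, lifted out of `evaluate_model_CV` and applied to made-up fold results:

```python
def default_cv_score_agg_func(metrics_across_folds):
    # Mean of the fold losses; metric dicts are summed and divided by the fold count.
    metric_to_minimize = sum(m[0] for m in metrics_across_folds) / len(metrics_across_folds)
    metrics_to_log = None
    for _, fold_metrics in metrics_across_folds:
        if fold_metrics is None:
            break  # stop accumulating once a fold has no metrics dict
        if metrics_to_log is None:
            metrics_to_log = fold_metrics
        else:
            metrics_to_log = {k: metrics_to_log[k] + v for k, v in fold_metrics.items()}
    if metrics_to_log:
        n = len(metrics_across_folds)
        metrics_to_log = {k: v / n for k, v in metrics_to_log.items()}
    return metric_to_minimize, metrics_to_log

folds = [(0.20, {"r2": 0.80, "mse": 0.20}), (0.30, {"r2": 0.70, "mse": 0.30})]
print(default_cv_score_agg_func(folds))
# (0.25, {'r2': 0.75, 'mse': 0.25})
print(default_cv_score_agg_func([(0.20, None), (0.30, None)]))
# (0.25, None)
```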
@ -520,8 +534,8 @@ def evaluate_model_CV(
total_fold_num += 1 total_fold_num += 1
val_loss_folds.append(val_loss_i) val_loss_folds.append(val_loss_i)
if log_training_metric or not isinstance(eval_metric, str): if log_training_metric or not isinstance(eval_metric, str):
if isinstance(total_metric, dict): if isinstance(metric_i, dict):
total_metric = {k: total_metric[k] + v for k, v in metric_i.items()} log_metric_folds.append(metric_i)
elif total_metric is not None: elif total_metric is not None:
total_metric += metric_i total_metric += metric_i
else: else:
@ -529,17 +543,17 @@ def evaluate_model_CV(
train_time += train_time_i train_time += train_time_i
pred_time += pred_time_i pred_time += pred_time_i
if valid_fold_num == n: if valid_fold_num == n:
val_loss_list.append(cv_strategy(val_loss_folds)) val_loss_list.append(cv_score_agg_func(list(zip(val_loss_folds,[None]*len(val_loss_folds))))[0])
val_loss_folds = []
valid_fold_num = 0 valid_fold_num = 0
val_loss_folds = []
elif time.time() - start_time >= budget: elif time.time() - start_time >= budget:
val_loss_list.append(cv_strategy(val_loss_folds)) val_loss_list.append(cv_score_agg_func(list(zip(val_loss_folds,[None]*len(val_loss_folds))))[0])
break break
val_loss = np.max(val_loss_list) val_loss = np.max(val_loss_list)
n = total_fold_num n = total_fold_num
if log_training_metric or not isinstance(eval_metric, str): if log_training_metric or not isinstance(eval_metric, str):
if isinstance(total_metric, dict): if len(log_metric_folds):
metric = {k: v / n for k, v in total_metric.items()} metric = cv_score_agg_func(list(zip([0]*len(log_metric_folds),log_metric_folds)))[1]
else: else:
metric = total_metric / n metric = total_metric / n
pred_time /= n pred_time /= n
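A note on the two call sites above: the loss-only path pads the fold losses with `None` metrics, and the metrics-only path pads the logged dicts with zero losses, so a custom aggregator always receives the same list-of-2-tuples shape and only one element of its result is used. A small illustration of the shapes (values are placeholders):

```python
val_loss_folds = [0.21, 0.19, 0.24]
log_metric_folds = [{"r2": 0.79}, {"r2": 0.81}, {"r2": 0.76}]

# Shape used when only the objective is needed (element [0] of the result):
loss_call = list(zip(val_loss_folds, [None] * len(val_loss_folds)))
# -> [(0.21, None), (0.19, None), (0.24, None)]

# Shape used when only the logged metrics are needed (element [1] of the result):
metric_call = list(zip([0] * len(log_metric_folds), log_metric_folds))
# -> [(0, {"r2": 0.79}), (0, {"r2": 0.81}), (0, {"r2": 0.76})]
```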
@@ -563,7 +577,7 @@ def compute_estimator(
     best_val_loss=np.Inf,
     n_jobs=1,
     estimator_class=None,
-    cv_strategy=None,
+    cv_score_agg_func=None,
     log_training_metric=False,
     fit_kwargs={},
 ):
@ -610,7 +624,7 @@ def compute_estimator(
task, task,
eval_metric, eval_metric,
best_val_loss, best_val_loss,
cv_strategy, cv_score_agg_func,
log_training_metric=log_training_metric, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs, fit_kwargs=fit_kwargs,
) )