"""!
|
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
* Licensed under the MIT License. See LICENSE file in the
|
|
* project root for license information.
|
|
"""
|
|
import time
|
|
from typing import Callable, Optional
|
|
from functools import partial
|
|
import numpy as np
|
|
from scipy.sparse import issparse
|
|
from sklearn.model_selection import (
|
|
train_test_split,
|
|
RepeatedStratifiedKFold,
|
|
RepeatedKFold,
|
|
GroupKFold,
|
|
TimeSeriesSplit,
|
|
GroupShuffleSplit,
|
|
)
|
|
from sklearn.utils import shuffle
|
|
import pandas as pd
|
|
import logging
|
|
|
|
from .ml import (
|
|
compute_estimator,
|
|
train_estimator,
|
|
get_estimator_class,
|
|
get_classification_objective,
|
|
)
|
|
from .config import (
|
|
MIN_SAMPLE_TRAIN,
|
|
MEM_THRES,
|
|
RANDOM_SEED,
|
|
SMALL_LARGE_THRES,
|
|
CV_HOLDOUT_THRESHOLD,
|
|
SPLIT_RATIO,
|
|
N_SPLITS,
|
|
SAMPLE_MULTIPLY_FACTOR,
|
|
)
|
|
from .data import concat, CLASSIFICATION, TS_FORECAST, FORECAST
|
|
from . import tune
|
|
from .training_log import training_log_reader, training_log_writer
|
|
|
|
logger = logging.getLogger(__name__)
|
|
logger_formatter = logging.Formatter(
|
|
"[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S"
|
|
)
|
|
|
|
try:
|
|
import mlflow
|
|
except ImportError:
|
|
mlflow = None
|
|
|
|
|
|
class SearchState:
    @property
    def search_space(self):
        return self._search_space_domain

    @property
    def estimated_cost4improvement(self):
        # Estimated cost for the next improvement: the larger of the time it
        # took to reach the current best and the time spent since then.
        return max(
            self.time_best_found - self.time_best_found_old,
            self.total_time_used - self.time_best_found,
        )

    def __init__(self, learner_class, data_size, task, starting_point=None):
        self.init_eci = learner_class.cost_relative2lgbm()
        self._search_space_domain = {}
        self.init_config = {}
        self.low_cost_partial_config = {}
        self.cat_hp_cost = {}
        self.data_size = data_size
        self.ls_ever_converged = False
        self.learner_class = learner_class
        search_space = learner_class.search_space(data_size=data_size, task=task)
        for name, space in search_space.items():
            assert "domain" in space
            self._search_space_domain[name] = space["domain"]
            if "init_value" in space:
                self.init_config[name] = space["init_value"]
            if "low_cost_init_value" in space:
                self.low_cost_partial_config[name] = space["low_cost_init_value"]
            if "cat_hp_cost" in space:
                self.cat_hp_cost[name] = space["cat_hp_cost"]
            # if a starting point is provided, set the init config to be
            # the starting point provided
            if (
                isinstance(starting_point, dict)
                and starting_point.get(name) is not None
            ):
                self.init_config[name] = starting_point[name]
        if isinstance(starting_point, list):
            self.init_config = starting_point
        self._hp_names = list(self._search_space_domain.keys())
        self.search_alg = None
        self.best_config = None
        self.best_loss = self.best_loss_old = np.inf
        self.total_time_used = 0
        self.total_iter = 0
        self.base_eci = None
        self.time_best_found = 0
        self.time2eval_best = 0
        self.time2eval_best_old = 0
        self.trained_estimator = None
        self.sample_size = None
        self.trial_time = 0

    def update(self, result, time_used, save_model_history=False):
        if result:
            config = result["config"]
            if config and "FLAML_sample_size" in config:
                self.sample_size = config["FLAML_sample_size"]
            else:
                self.sample_size = self.data_size
            obj = result["val_loss"]
            metric_for_logging = result["metric_for_logging"]
            time2eval = result["time_total_s"]
            trained_estimator = result["trained_estimator"]
            del result["trained_estimator"]  # free up RAM
            n_iter = trained_estimator and trained_estimator.params.get("n_estimators")
            if n_iter is not None and "n_estimators" in config:
                config["n_estimators"] = n_iter
        else:
            obj, time2eval, trained_estimator = np.inf, 0.0, None
            metric_for_logging = config = None
        self.trial_time = time2eval
        self.total_time_used += time_used
        self.total_iter += 1

        if self.base_eci is None:
            self.base_eci = time_used
        if (obj is not None) and (self.best_loss is None or obj < self.best_loss):
            self.best_loss_old = self.best_loss if self.best_loss < np.inf else 2 * obj
            self.best_loss = obj
            self.time_best_found_old = self.time_best_found
            self.time_best_found = self.total_time_used
            self.iter_best_found = self.total_iter
            self.best_config = config
            self.best_config_sample_size = self.sample_size
            self.best_config_train_time = time_used
            if time2eval:
                self.time2eval_best_old = self.time2eval_best
                self.time2eval_best = time2eval
            if (
                self.trained_estimator
                and trained_estimator
                and self.trained_estimator != trained_estimator
                and not save_model_history
            ):
                self.trained_estimator.cleanup()
            if trained_estimator:
                self.trained_estimator = trained_estimator
        self.metric_for_logging = metric_for_logging
        self.val_loss, self.config = obj, config

    def get_hist_config_sig(self, sample_size, config):
        config_values = tuple([config[k] for k in self._hp_names])
        config_sig = str(sample_size) + "_" + str(config_values)
        return config_sig

    def est_retrain_time(self, retrain_sample_size):
        assert (
            self.best_config_sample_size is not None
        ), "need to first get best_config_sample_size"
        # estimate retraining time by scaling the best config's evaluation
        # time linearly with the sample size
        return self.time2eval_best * retrain_sample_size / self.best_config_sample_size

class AutoMLState:
    def _prepare_sample_train_data(self, sample_size):
        sampled_weight = groups = None
        if sample_size <= self.data_size:
            if isinstance(self.X_train, pd.DataFrame):
                sampled_X_train = self.X_train.iloc[:sample_size]
            else:
                sampled_X_train = self.X_train[:sample_size]
            sampled_y_train = self.y_train[:sample_size]
            weight = self.fit_kwargs.get("sample_weight")
            if weight is not None:
                sampled_weight = weight[:sample_size]
            if self.groups is not None:
                groups = self.groups[:sample_size]
        else:
            sampled_X_train = self.X_train_all
            sampled_y_train = self.y_train_all
            if "sample_weight" in self.fit_kwargs:
                sampled_weight = self.sample_weight_all
            if self.groups is not None:
                groups = self.groups_all
        return sampled_X_train, sampled_y_train, sampled_weight, groups

    def _compute_with_config_base(self, estimator, config_w_resource):
        if "FLAML_sample_size" in config_w_resource:
            sample_size = int(config_w_resource["FLAML_sample_size"])
        else:
            sample_size = self.data_size
        (
            sampled_X_train,
            sampled_y_train,
            sampled_weight,
            groups,
        ) = self._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            weight = self.fit_kwargs["sample_weight"]
            self.fit_kwargs["sample_weight"] = sampled_weight
        else:
            weight = None
        if groups is not None:
            self.fit_kwargs["groups"] = groups
        config = config_w_resource.copy()
        if "FLAML_sample_size" in config:
            del config["FLAML_sample_size"]
        time_left = self.time_budget - self.time_from_start
        budget = (
            time_left
            if sample_size == self.data_size
            else time_left / 2 * sample_size / self.data_size
        )

        (
            trained_estimator,
            val_loss,
            metric_for_logging,
            _,
            pred_time,
        ) = compute_estimator(
            sampled_X_train,
            sampled_y_train,
            self.X_val,
            self.y_val,
            self.weight_val,
            self.groups_val,
            min(budget, self.train_time_limit),
            self.kf,
            config,
            self.task,
            estimator,
            self.eval_method,
            self.metric,
            self.best_loss,
            self.n_jobs,
            self.learner_classes.get(estimator),
            self.log_training_metric,
            self.fit_kwargs,
        )
        result = {
            "pred_time": pred_time,
            "wall_clock_time": time.time() - self._start_time_flag,
            "metric_for_logging": metric_for_logging,
            "val_loss": val_loss,
            "trained_estimator": trained_estimator if self.save_model_history else None,
        }
        if sampled_weight is not None:
            self.fit_kwargs["sample_weight"] = weight
        return result

    def _train_with_config(self, estimator, config_w_resource, sample_size=None):
        if not sample_size:
            sample_size = config_w_resource.get(
                "FLAML_sample_size", len(self.y_train_all)
            )
        config = config_w_resource.get("ml", config_w_resource).copy()
        if "FLAML_sample_size" in config:
            del config["FLAML_sample_size"]
        if "learner" in config:
            del config["learner"]
        (
            sampled_X_train,
            sampled_y_train,
            sampled_weight,
            groups,
        ) = self._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            weight = self.fit_kwargs["sample_weight"]
            self.fit_kwargs["sample_weight"] = sampled_weight
        else:
            weight = None
        if groups is not None:
            self.fit_kwargs["groups"] = groups
        budget = (
            None
            if self.time_budget is None
            else self.time_budget - self.time_from_start
        )
        estimator, train_time = train_estimator(
            sampled_X_train,
            sampled_y_train,
            config,
            self.task,
            estimator,
            self.n_jobs,
            self.learner_classes.get(estimator),
            budget,
            self.fit_kwargs,
        )
        if sampled_weight is not None:
            self.fit_kwargs["sample_weight"] = weight
        return estimator, train_time

def size(state: AutoMLState, config: dict) -> float:
    """Size function.

    Returns:
        The memory size in bytes for a config.
    """
    config = config.get("ml", config)
    estimator = config["learner"]
    learner_class = state.learner_classes.get(estimator)
    return learner_class.size(config)

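# Note: `size` is shaped for use as a config constraint for the searcher,
# e.g. `(partial(size, state), "<=", mem_thres)`; _search_parallel() below
# passes exactly this constraint to bound model memory during the search.
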
class AutoML:
    """The AutoML class

    Example:

    .. code-block:: python

        automl = AutoML()
        automl_settings = {
            "time_budget": 60,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": 'test/mylog.log',
        }
        automl.fit(X_train = X_train, y_train = y_train,
            **automl_settings)

    """

    from .version import __version__

    def __init__(self):
        self._track_iter = 0
        self._state = AutoMLState()
        self._state.learner_classes = {}

    @property
    def model_history(self):
        """A dictionary of iter->model, storing the models when
        the best model is updated each time.
        """
        return self._model_history

    @property
    def config_history(self):
        """A dictionary of iter->(estimator, config, time),
        storing the best estimator, config, and the time when the best
        model is updated each time.
        """
        return self._config_history

    @property
    def model(self):
        """An object with `predict()` and `predict_proba()` method (for
        classification), storing the best trained model.
        """
        return self.__dict__.get("_trained_estimator")

    def best_model_for_estimator(self, estimator_name):
        """Return the best model found for a particular estimator

        Args:
            estimator_name: a str of the estimator's name

        Returns:
            An object with `predict()` and `predict_proba()` method (for
            classification), storing the best trained model for estimator_name.
        """
        state = self._search_states.get(estimator_name)
        return state and getattr(state, "trained_estimator", None)

    @property
    def best_estimator(self):
        """A string indicating the best estimator found."""
        return self._best_estimator

    @property
    def best_iteration(self):
        """An integer of the iteration number where the best
        config is found."""
        return self._best_iteration

    @property
    def best_config(self):
        """A dictionary of the best configuration."""
        state = self._search_states.get(self._best_estimator)
        return state and getattr(state, "best_config", None)

    @property
    def best_config_per_estimator(self):
        """A dictionary of all estimators' best configuration."""
        return {
            e: e_search_state.best_config
            for e, e_search_state in self._search_states.items()
        }

    @property
    def best_loss(self):
        """A float of the best loss found."""
        return self._state.best_loss

    @property
    def best_config_train_time(self):
        """A float of the seconds taken by training the best config."""
        return getattr(
            self._search_states[self._best_estimator], "best_config_train_time", None
        )

    @property
    def classes_(self):
        """A list of n_classes elements for class labels."""
        attr = getattr(self, "_label_transformer", None)
        if attr:
            return attr.classes_.tolist()
        attr = getattr(self, "_trained_estimator", None)
        if attr:
            return attr.classes_.tolist()
        return None

    @property
    def time_to_find_best_model(self) -> float:
        """Time taken to find the best model, in seconds."""
        return self.__dict__.get("_time_taken_best_iter")

    def predict(self, X_test):
        """Predict label from features.

        Args:
            X_test: A numpy array of featurized instances, shape n * m,
                or for 'ts_forecast' task:
                a pandas dataframe with the first column containing
                timestamp values (datetime type) or an integer n for
                the predict steps (only valid when the estimator is
                arima or sarimax). Other columns in the dataframe
                are assumed to be exogenous variables (categorical
                or numeric).

                .. code-block:: python

                    multivariate_X_test = pd.DataFrame({
                        'timeStamp': pd.date_range(start='1/1/2022', end='1/07/2022'),
                        'categorical_col': ['yes', 'yes', 'no', 'no', 'yes', 'no', 'yes'],
                        'continuous_col': [105, 107, 120, 118, 110, 112, 115]
                    })
                    model.predict(multivariate_X_test)

        Returns:
            An array-like of shape n * 1; each element is a predicted
            label for an instance.
        """
        estimator = getattr(self, "_trained_estimator", None)
        if estimator is None:
            logger.warning(
                "No estimator is trained. Please run fit with enough budget."
            )
            return None
        X_test = self._preprocess(X_test)
        y_pred = estimator.predict(X_test)
        if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
            y_pred = y_pred.flatten()
        if self._label_transformer:
            return self._label_transformer.inverse_transform(
                pd.Series(y_pred.astype(int))
            )
        else:
            return y_pred

    def predict_proba(self, X_test):
        """Predict the probability of each class from features, only works for
        classification problems.

        Args:
            X_test: A numpy array of featurized instances, shape n * m.

        Returns:
            A numpy array of shape n * c. c is the # classes. Each element at
            (i, j) is the probability for instance i to be in class j.
        """
        X_test = self._preprocess(X_test)
        proba = self._trained_estimator.predict_proba(X_test)
        return proba

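    # Illustrative sketch (assumed variable names): turning predict_proba
    # output into top-1 labels aligned with `classes_` after a classification
    # fit():
    #
    #   proba = automl.predict_proba(X_test)          # shape (n, n_classes)
    #   top1 = proba.argmax(axis=1)                   # most likely class index
    #   labels = np.asarray(automl.classes_)[top1]
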
    def _preprocess(self, X):
        if isinstance(X, int):
            return X
        if self._state.task == TS_FORECAST:
            X = pd.DataFrame(X)
        if issparse(X):
            X = X.tocsr()
        if self._transformer:
            X = self._transformer.transform(X, self._state.task)
        return X

    def _validate_data(
        self,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
        if X_train_all is not None and y_train_all is not None:
            assert (
                isinstance(X_train_all, np.ndarray)
                or issparse(X_train_all)
                or isinstance(X_train_all, pd.DataFrame)
            ), (
                "X_train_all must be a numpy array, a pandas dataframe, "
                "or Scipy sparse matrix."
            )
            assert isinstance(y_train_all, np.ndarray) or isinstance(
                y_train_all, pd.Series
            ), "y_train_all must be a numpy array or a pandas series."
            assert (
                X_train_all.size != 0 and y_train_all.size != 0
            ), "Input data must not be empty."
            if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
                X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            assert (
                X_train_all.shape[0] == y_train_all.shape[0]
            ), "# rows in X_train must match length of y_train."
            self._df = isinstance(X_train_all, pd.DataFrame)
            self._nrow, self._ndim = X_train_all.shape
            if self._state.task == TS_FORECAST:
                X_train_all = pd.DataFrame(X_train_all)
                assert (
                    X_train_all[X_train_all.columns[0]].dtype.name == "datetime64[ns]"
                ), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            assert isinstance(
                dataframe, pd.DataFrame
            ), "dataframe must be a pandas DataFrame"
            assert label in dataframe.columns, "label must be a column name in dataframe"
            self._df = True
            if self._state.task == TS_FORECAST:
                assert (
                    dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
                ), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
            X = dataframe.drop(columns=label)
            self._nrow, self._ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError("either X_train+y_train or dataframe+label are required")
        if issparse(X_train_all):
            self._transformer = self._label_transformer = False
            self._X_train_all, self._y_train_all = X, y
        else:
            from .data import DataTransformer

            self._transformer = DataTransformer()
            self._X_train_all, self._y_train_all = self._transformer.fit_transform(
                X, y, self._state.task
            )
            self._label_transformer = self._transformer.label_transformer
        self._sample_weight_full = self._state.fit_kwargs.get("sample_weight")
        if X_val is not None and y_val is not None:
            assert (
                isinstance(X_val, np.ndarray)
                or issparse(X_val)
                or isinstance(X_val, pd.DataFrame)
            ), (
                "X_val must be None, a numpy array, a pandas dataframe, "
                "or Scipy sparse matrix."
            )
            assert isinstance(y_val, np.ndarray) or isinstance(
                y_val, pd.Series
            ), "y_val must be None, a numpy array or a pandas series."
            assert X_val.size != 0 and y_val.size != 0, (
                "Validation data are expected to be nonempty. "
                "Use None for X_val and y_val if no validation data."
            )
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            assert (
                X_val.shape[0] == y_val.shape[0]
            ), "# rows in X_val must match length of y_val."
            if self._transformer:
                self._state.X_val = self._transformer.transform(X_val, self._state.task)
            else:
                self._state.X_val = X_val
            if self._label_transformer:
                self._state.y_val = self._label_transformer.transform(
                    y_val, self._state.task
                )
            else:
                self._state.y_val = y_val
        else:
            self._state.X_val = self._state.y_val = None
        if groups is not None and len(groups) != self._nrow:
            # groups is given as group counts
            self._state.groups = np.concatenate([[i] * c for i, c in enumerate(groups)])
            assert (
                len(self._state.groups) == self._nrow
            ), "the sum of group counts must match the number of examples"
            self._state.groups_val = (
                np.concatenate([[i] * c for i, c in enumerate(groups_val)])
                if groups_val is not None
                else None
            )
        else:
            self._state.groups_val = groups_val
            self._state.groups = groups

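    # Illustrative note: `groups` accepts either per-row group labels or group
    # counts. For example, groups=[2, 3] on 5 rows expands to the labels
    # [0, 0, 1, 1, 1] via the np.concatenate expression above.
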
    def _prepare_data(self, eval_method, split_ratio, n_splits):
        X_val, y_val = self._state.X_val, self._state.y_val
        if issparse(X_val):
            X_val = X_val.tocsr()
        X_train_all, y_train_all = self._X_train_all, self._y_train_all
        if issparse(X_train_all):
            X_train_all = X_train_all.tocsr()
        if (
            self._state.task in CLASSIFICATION
            and self._auto_augment
            and self._state.fit_kwargs.get("sample_weight") is None
            and self._split_type not in ["time", "group"]
        ):
            # logger.info(f"label {pd.unique(y_train_all)}")
            label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes by duplicating their rows until each class
            # has at least rare_threshold examples
            rare_threshold = 20
            rare = counts < rare_threshold
            rare_label, rare_counts = label_set[rare], counts[rare]
            for i, label in enumerate(rare_label):
                count = rare_count = rare_counts[i]
                rare_index = y_train_all == label
                n = len(y_train_all)
                while count < rare_threshold:
                    if self._df:
                        X_train_all = concat(
                            X_train_all, X_train_all.iloc[:n].loc[rare_index]
                        )
                    else:
                        X_train_all = concat(
                            X_train_all, X_train_all[:n][rare_index, :]
                        )
                    if isinstance(y_train_all, pd.Series):
                        y_train_all = concat(
                            y_train_all, y_train_all.iloc[:n].loc[rare_index]
                        )
                    else:
                        y_train_all = np.concatenate(
                            [y_train_all, y_train_all[:n][rare_index]]
                        )
                    count += rare_count
                logger.info(f"class {label} augmented from {rare_count} to {count}")
        SHUFFLE_SPLIT_TYPES = ["uniform", "stratified"]
        if self._split_type in SHUFFLE_SPLIT_TYPES:
            if self._sample_weight_full is not None:
                X_train_all, y_train_all, self._state.sample_weight_all = shuffle(
                    X_train_all,
                    y_train_all,
                    self._sample_weight_full,
                    random_state=RANDOM_SEED,
                )
                self._state.fit_kwargs["sample_weight"] = self._state.sample_weight_all
            else:
                X_train_all, y_train_all = shuffle(
                    X_train_all, y_train_all, random_state=RANDOM_SEED
                )
            if self._df:
                X_train_all.reset_index(drop=True, inplace=True)
                if isinstance(y_train_all, pd.Series):
                    y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
        self._state.groups_all = self._state.groups
        if X_val is None and eval_method == "holdout":
            # if eval_method = holdout, make holdout data
            if self._split_type == "time":
                if self._state.task == TS_FORECAST:
                    num_samples = X_train_all.shape[0]
                    period = self._state.fit_kwargs["period"]
                    assert (
                        period < num_samples
                    ), f"period={period} must be < #examples={num_samples}"
                    split_idx = num_samples - period
                    X_train = X_train_all[:split_idx]
                    y_train = y_train_all[:split_idx]
                    X_val = X_train_all[split_idx:]
                    y_val = y_train_all[split_idx:]
                else:
                    if "sample_weight" in self._state.fit_kwargs:
                        (
                            X_train,
                            X_val,
                            y_train,
                            y_val,
                            self._state.fit_kwargs["sample_weight"],
                            self._state.weight_val,
                        ) = train_test_split(
                            X_train_all,
                            y_train_all,
                            self._state.fit_kwargs["sample_weight"],
                            test_size=split_ratio,
                            shuffle=False,
                        )
                    else:
                        X_train, X_val, y_train, y_val = train_test_split(
                            X_train_all,
                            y_train_all,
                            test_size=split_ratio,
                            shuffle=False,
                        )
            elif self._split_type == "group":
                gss = GroupShuffleSplit(
                    n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED
                )
                for train_idx, val_idx in gss.split(
                    X_train_all, y_train_all, self._state.groups_all
                ):
                    if self._df:
                        X_train = X_train_all.iloc[train_idx]
                        X_val = X_train_all.iloc[val_idx]
                    else:
                        X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
                    y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
                    self._state.groups = self._state.groups_all[train_idx]
                    self._state.groups_val = self._state.groups_all[val_idx]
            elif self._state.task in CLASSIFICATION:
                # for classification, make sure the labels are complete in both
                # training and validation data
                label_set, first = np.unique(y_train_all, return_index=True)
                rest = []
                last = 0
                first.sort()
                for i in range(len(first)):
                    rest.extend(range(last, first[i]))
                    last = first[i] + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if self._df else X_train_all[first]
                X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest]
                y_rest = y_train_all[rest]
                stratify = y_rest if self._split_type == "stratified" else None
                if "sample_weight" in self._state.fit_kwargs:
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        weight_train,
                        weight_val,
                    ) = train_test_split(
                        X_rest,
                        y_rest,
                        self._state.fit_kwargs["sample_weight"][rest],
                        test_size=split_ratio,
                        random_state=RANDOM_SEED,
                    )
                    weight1 = self._state.fit_kwargs["sample_weight"][first]
                    self._state.weight_val = concat(weight1, weight_val)
                    self._state.fit_kwargs["sample_weight"] = concat(
                        weight1, weight_train
                    )
                else:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_rest,
                        y_rest,
                        test_size=split_ratio,
                        stratify=stratify,
                        random_state=RANDOM_SEED,
                    )
                X_train = concat(X_first, X_train)
                y_train = (
                    concat(label_set, y_train)
                    if self._df
                    else np.concatenate([label_set, y_train])
                )
                X_val = concat(X_first, X_val)
                y_val = (
                    concat(label_set, y_val)
                    if self._df
                    else np.concatenate([label_set, y_val])
                )
            elif self._state.task == "regression":
                if "sample_weight" in self._state.fit_kwargs:
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        self._state.fit_kwargs["sample_weight"],
                        self._state.weight_val,
                    ) = train_test_split(
                        X_train_all,
                        y_train_all,
                        self._state.fit_kwargs["sample_weight"],
                        test_size=split_ratio,
                        random_state=RANDOM_SEED,
                    )
                else:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train_all,
                        y_train_all,
                        test_size=split_ratio,
                        random_state=RANDOM_SEED,
                    )
        self._state.data_size = X_train.shape[0]
        self.data_size_full = len(y_train_all)
        self._state.X_train, self._state.y_train = X_train, y_train
        self._state.X_val, self._state.y_val = X_val, y_val
        self._state.X_train_all = X_train_all
        self._state.y_train_all = y_train_all
        if self._split_type == "group":
            # logger.info("Using GroupKFold")
            assert (
                len(self._state.groups_all) == y_train_all.size
            ), "the length of groups must match the number of examples"
            assert (
                len(np.unique(self._state.groups_all)) >= n_splits
            ), "the number of groups must be equal or larger than n_splits"
            self._state.kf = GroupKFold(n_splits)
            self._state.kf.groups = self._state.groups_all
        elif self._split_type == "stratified":
            # logger.info("Using StratifiedKFold")
            assert y_train_all.size >= n_splits, (
                f"{n_splits}-fold cross validation"
                f" requires input data with at least {n_splits} examples."
            )
            assert y_train_all.size >= 2 * n_splits, (
                f"{n_splits}-fold cross validation with metric=r2 "
                f"requires input data with at least {n_splits*2} examples."
            )
            self._state.kf = RepeatedStratifiedKFold(
                n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED
            )
        elif self._split_type == "time":
            # logger.info("Using TimeSeriesSplit")
            if self._state.task == TS_FORECAST:
                period = self._state.fit_kwargs["period"]
                if period * (n_splits + 1) > y_train_all.size:
                    n_splits = int(y_train_all.size / period - 1)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {3 * period} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                self._state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
            else:
                self._state.kf = TimeSeriesSplit(n_splits=n_splits)
        else:
            # logger.info("Using RepeatedKFold")
            self._state.kf = RepeatedKFold(
                n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED
            )

    def add_learner(self, learner_name, learner_class):
        """Add a customized learner

        Args:
            learner_name: A string of the learner's name
            learner_class: A subclass of flaml.model.BaseEstimator
        """
        self._state.learner_classes[learner_name] = learner_class

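    # Illustrative sketch (assumed class name): registering a custom learner
    # before fit() so it participates in the search:
    #
    #   from flaml.model import BaseEstimator
    #
    #   class MyLearner(BaseEstimator):
    #       ...  # define search_space(), fit(), etc.
    #
    #   automl.add_learner("my_learner", MyLearner)
    #   automl.fit(X_train, y_train, estimator_list=["my_learner"])
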
    def get_estimator_from_log(self, log_file_name, record_id, task):
        """Get the estimator from log file

        Args:
            log_file_name: A string of the log file name
            record_id: An integer of the record ID in the file,
                0 corresponds to the first trial
            task: A string of the task type,
                'binary', 'multi', 'regression', 'ts_forecast', 'rank'

        Returns:
            An estimator object for the given configuration
        """

        with training_log_reader(log_file_name) as reader:
            record = reader.get_record(record_id)
            estimator = record.learner
            config = record.config

        estimator, _ = train_estimator(
            None,
            None,
            config,
            task,
            estimator,
            estimator_class=self._state.learner_classes.get(estimator),
        )
        return estimator

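    # Illustrative sketch (assumed log file name): rebuild the estimator
    # object recorded as the third trial of a previous run:
    #
    #   est = automl.get_estimator_from_log(
    #       "test/mylog.log", record_id=2, task="binary")
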
    def retrain_from_log(
        self,
        log_file_name,
        X_train=None,
        y_train=None,
        dataframe=None,
        label=None,
        time_budget=0,
        task="classification",
        eval_method="auto",
        split_ratio=SPLIT_RATIO,
        n_splits=N_SPLITS,
        split_type=None,
        groups=None,
        n_jobs=-1,
        train_best=True,
        train_full=False,
        record_id=-1,
        auto_augment=True,
        **fit_kwargs,
    ):
        """Retrain from log file

        Args:
            log_file_name: A string of the log file name
            X_train: A numpy array of training data in shape n*m
                For 'ts_forecast' task, the first column of X_train
                must be the timestamp column (datetime type). Other
                columns in the dataframe are assumed to be exogenous
                variables (categorical or numeric).
            y_train: A numpy array of labels in shape n*1
            dataframe: A dataframe of training data including label column.
                For 'ts_forecast' task, dataframe must be specified and should
                have at least two columns: timestamp and label, where the first
                column is the timestamp column (datetime type). Other columns
                in the dataframe are assumed to be exogenous variables
                (categorical or numeric).
            label: A str of the label column name, e.g., 'label';
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                If not, dataframe and label must be provided.
            time_budget: A float number of the time budget in seconds.
            task: A string of the task type, e.g.,
                'classification', 'regression', 'ts_forecast', 'rank'.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            split_type: str or None, default=None | the data split type.
                For classification tasks, valid choices are [
                None, 'stratified', 'uniform', 'time', 'group']. None -> stratified.
                For regression tasks, valid choices are [None, 'uniform', 'time'].
                None -> uniform.
                For ts_forecast tasks, must be None or 'time'.
                For ranking task, must be None or 'group'.
            groups: None or array-like | Group labels (with matching length to
                y_train) or group counts (with sum equal to length of y_train)
                for training data.
            n_jobs: An integer of the number of threads for training. Use all
                available resources when n_jobs == -1.
            train_best: A boolean of whether to train the best config in the
                time budget; if false, train the last config in the budget.
            train_full: A boolean of whether to train on the full data. If true,
                eval_method and sample_size in the log file will be ignored.
            record_id: the ID of the training log record from which the model will
                be retrained. By default `record_id = -1` which means this will be
                ignored. `record_id = 0` corresponds to the first trial, and
                when `record_id >= 0`, `time_budget` will be ignored.
            auto_augment: boolean, default=True | Whether to automatically
                augment rare classes.
            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight.
        """
        if task == FORECAST:
            self._state.task = TS_FORECAST
        else:
            self._state.task = task
        self._state.fit_kwargs = fit_kwargs
        self._validate_data(X_train, y_train, dataframe, label, groups=groups)

        logger.info("log file name {}".format(log_file_name))

        best_config = None
        best_val_loss = float("+inf")
        best_estimator = None
        sample_size = None
        time_used = 0.0
        training_duration = 0
        best = None
        with training_log_reader(log_file_name) as reader:
            if record_id >= 0:
                best = reader.get_record(record_id)
            else:
                for record in reader.records():
                    time_used = record.wall_clock_time
                    if time_used > time_budget:
                        break
                    training_duration = time_used
                    val_loss = record.validation_loss
                    if val_loss <= best_val_loss or not train_best:
                        if val_loss == best_val_loss and train_best:
                            # tie on loss: prefer the record trained on a
                            # larger sample size
                            size = record.sample_size
                            if size > sample_size:
                                best = record
                                best_val_loss = val_loss
                                sample_size = size
                        else:
                            best = record
                            size = record.sample_size
                            best_val_loss = val_loss
                            sample_size = size
                if not training_duration:
                    logger.warning(
                        f"No estimator found within time_budget={time_budget}"
                    )
                    from .model import BaseEstimator as Estimator

                    self._trained_estimator = Estimator()
                    return training_duration
        if not best:
            return
        best_estimator = best.learner
        best_config = best.config
        sample_size = len(self._y_train_all) if train_full else best.sample_size

        logger.info(
            "estimator = {}, config = {}, #training instances = {}".format(
                best_estimator, best_config, sample_size
            )
        )
        # Partially copied from fit() function
        # Initialize some attributes required for retrain_from_log;
        # self._state.task was already set above, including the
        # FORECAST -> TS_FORECAST alias.
        self._decide_split_type(split_type)
        if record_id >= 0:
            eval_method = "cv"
        elif eval_method == "auto":
            eval_method = self._decide_eval_method(time_budget)
        self.modelcount = 0
        self._auto_augment = auto_augment
        self._prepare_data(eval_method, split_ratio, n_splits)
        self._state.time_budget = None
        self._state.n_jobs = n_jobs
        self._trained_estimator = self._state._train_with_config(
            best_estimator, best_config, sample_size
        )[0]
        logger.info("retrain from log succeeded")
        return training_duration

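    # Illustrative sketch (assumed file/variable names): retrain the best
    # model found within the first 60 seconds of an earlier fit() run:
    #
    #   automl = AutoML()
    #   automl.retrain_from_log(
    #       log_file_name="test/mylog.log",
    #       X_train=X_train, y_train=y_train,
    #       task="classification", time_budget=60)
    #   y_pred = automl.predict(X_test)
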
    def _decide_split_type(self, split_type):
        if self._state.task == "classification":
            self._state.task = get_classification_objective(
                len(np.unique(self._y_train_all))
            )
        if self._state.task in CLASSIFICATION:
            assert split_type in [None, "stratified", "uniform", "time", "group"]
            self._split_type = (
                split_type or self._state.groups is None and "stratified" or "group"
            )
        elif self._state.task == "regression":
            assert split_type in [None, "uniform", "time", "group"]
            self._split_type = split_type or "uniform"
        elif self._state.task == TS_FORECAST:
            assert split_type in [None, "time"]
            self._split_type = "time"
            assert isinstance(
                self._state.fit_kwargs.get("period"), int
            ), f"missing a required integer 'period' for '{TS_FORECAST}' task."
        elif self._state.task == "rank":
            assert (
                self._state.groups is not None
            ), "groups must be specified for ranking task."
            assert split_type in [None, "group"]
            self._split_type = "group"

    def _decide_eval_method(self, time_budget):
        if self._state.X_val is not None:
            return "holdout"
        nrow, dim = self._nrow, self._ndim
        # note the precedence: `A or B and C` parses as `A or (B and C)`
        if (
            time_budget is None
            or nrow * dim / 0.9 < SMALL_LARGE_THRES * (time_budget / 3600)
            and nrow < CV_HOLDOUT_THRESHOLD
        ):
            # time allows or sampling can be used and cv is necessary
            return "cv"
        else:
            return "holdout"

    @property
    def search_space(self) -> dict:
        """Search space.
        Must be called after fit(...) (use max_iter=0 to prevent actual fitting)

        Returns:
            A dict of the search space
        """
        estimator_list = self.estimator_list
        if len(estimator_list) == 1:
            estimator = estimator_list[0]
            space = self._search_states[estimator].search_space.copy()
            space["learner"] = estimator
            return space
        choices = []
        for estimator in estimator_list:
            space = self._search_states[estimator].search_space.copy()
            space["learner"] = estimator
            choices.append(space)
        return {"ml": tune.choice(choices)}

    @property
    def low_cost_partial_config(self) -> dict:
        """Low cost partial config

        Returns:
            A dict.
            (a) if there is only one estimator in estimator_list, each key is a
            hyperparameter name.
            (b) otherwise, it is a nested dict with 'ml' as the key, and
            a list of the low_cost_partial_configs as the value, corresponding
            to each learner's low_cost_partial_config; the estimator index as
            an integer corresponding to the cheapest learner is appended to the
            list at the end.

        """
        if len(self.estimator_list) == 1:
            estimator = self.estimator_list[0]
            c = self._search_states[estimator].low_cost_partial_config
            return c
        else:
            configs = []
            for estimator in self.estimator_list:
                c = self._search_states[estimator].low_cost_partial_config
                configs.append(c)
            configs.append(
                np.argmin(
                    [
                        self._state.learner_classes.get(estimator).cost_relative2lgbm()
                        for estimator in self.estimator_list
                    ]
                )
            )
            config = {"ml": configs}
            return config

    @property
    def cat_hp_cost(self) -> dict:
        """Categorical hyperparameter cost

        Returns:
            A dict.
            (a) if there is only one estimator in estimator_list, each key is a
            hyperparameter name.
            (b) otherwise, it is a nested dict with 'ml' as the key, and
            a list of the cat_hp_cost's as the value, corresponding
            to each learner's cat_hp_cost; the cost relative to lgbm for each
            learner (as a list itself) is appended to the list at the end.

        """
        if len(self.estimator_list) == 1:
            estimator = self.estimator_list[0]
            c = self._search_states[estimator].cat_hp_cost
            return c
        else:
            configs = []
            for estimator in self.estimator_list:
                c = self._search_states[estimator].cat_hp_cost
                configs.append(c)
            configs.append(
                [
                    self._state.learner_classes.get(estimator).cost_relative2lgbm()
                    for estimator in self.estimator_list
                ]
            )
            config = {"ml": configs}
            return config

    @property
    def points_to_evaluate(self) -> list:
        """Initial points to evaluate

        Returns:
            A list of dicts. Each dict is the initial point for each learner.
        """
        points = []
        for estimator in self.estimator_list:
            if isinstance(self._search_states[estimator].init_config, list):
                configs = self._search_states[estimator].init_config
            else:
                configs = [self._search_states[estimator].init_config]
            for config in configs:
                config["learner"] = estimator
                if len(self.estimator_list) > 1:
                    points.append({"ml": config})
                else:
                    points.append(config)
        return points

    @property
    def prune_attr(self) -> Optional[str]:
        """Attribute for pruning

        Returns:
            A string for the sample size attribute or None
        """
        return "FLAML_sample_size" if self._sample else None

    @property
    def min_resource(self) -> Optional[float]:
        """Attribute for pruning

        Returns:
            A float for the minimal sample size or None
        """
        return self._min_sample_size if self._sample else None

    @property
    def max_resource(self) -> Optional[float]:
        """Attribute for pruning

        Returns:
            A float for the maximal sample size or None
        """
        return self._state.data_size if self._sample else None

    @property
    def trainable(self) -> Callable[[dict], Optional[float]]:
        """Training function

        Returns:
            A function that evaluates each config and returns the loss
        """
        self._state.time_from_start = 0
        for estimator in self.estimator_list:
            search_state = self._search_states[estimator]
            if not hasattr(search_state, "training_function"):
                search_state.training_function = partial(
                    AutoMLState._compute_with_config_base, self._state, estimator
                )
        states = self._search_states
        mem_res = self._mem_thres

        def train(config: dict):
            sample_size = config.get("FLAML_sample_size")
            config = config.get("ml", config).copy()
            if sample_size:
                config["FLAML_sample_size"] = sample_size
            estimator = config["learner"]
            # check memory constraints before training
            if states[estimator].learner_class.size(config) <= mem_res:
                del config["learner"]
                result = states[estimator].training_function(config)
                return result
            else:
                return {
                    "pred_time": 0,
                    "wall_clock_time": None,
                    "metric_for_logging": np.inf,
                    "val_loss": np.inf,
                    "trained_estimator": None,
                }

        return train

    @property
    def metric_constraints(self) -> list:
        """Metric constraints

        Returns:
            A list of the metric constraints
        """
        constraints = []
        if np.isfinite(self._pred_time_limit):
            constraints.append(("pred_time", "<=", self._pred_time_limit))
        return constraints

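    # Illustrative sketch (assumed setup): `trainable`, `search_space`, and
    # `points_to_evaluate` are shaped to plug into ray.tune directly, which is
    # what _search_parallel() below does:
    #
    #   automl.fit(X_train, y_train, max_iter=0)  # build spaces, skip search
    #   analysis = ray.tune.run(
    #       automl.trainable, config=automl.search_space,
    #       metric="val_loss", mode="min", num_samples=10)
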
    def fit(
        self,
        X_train=None,
        y_train=None,
        dataframe=None,
        label=None,
        metric="auto",
        task="classification",
        n_jobs=-1,
        log_file_name="flaml.log",
        estimator_list="auto",
        time_budget=60,
        max_iter=1000000,
        sample=True,
        ensemble=False,
        eval_method="auto",
        log_type="better",
        model_history=False,
        split_ratio=SPLIT_RATIO,
        n_splits=N_SPLITS,
        log_training_metric=False,
        mem_thres=MEM_THRES,
        pred_time_limit=np.inf,
        train_time_limit=np.inf,
        X_val=None,
        y_val=None,
        sample_weight_val=None,
        groups_val=None,
        groups=None,
        verbose=3,
        retrain_full=True,
        split_type=None,
        learner_selector="sample",
        hpo_method=None,
        starting_points={},
        seed=None,
        n_concurrent_trials=1,
        keep_search_state=False,
        early_stop=False,
        append_log=False,
        auto_augment=True,
        min_sample_size=MIN_SAMPLE_TRAIN,
        use_ray=False,
        **fit_kwargs,
    ):
        """Find a model for a given task

        Args:
            X_train: A numpy array or a pandas dataframe of training data in
                shape (n, m). For 'ts_forecast' task, the first column of X_train
                must be the timestamp column (datetime type). Other columns in
                the dataframe are assumed to be exogenous variables (categorical or numeric).
            y_train: A numpy array or a pandas series of labels in shape (n, ).
            dataframe: A dataframe of training data including label column.
                For 'ts_forecast' task, dataframe must be specified and must have
                at least two columns, timestamp and label, where the first
                column is the timestamp column (datetime type). Other columns in
                the dataframe are assumed to be exogenous variables (categorical or numeric).
            label: A str of the label column name, e.g., 'label';
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                If not, dataframe and label must be provided.
            metric: A string of the metric name or a function,
                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
                'mape'.
                If passing a customized metric function, the function needs to
                have the following signature:

                .. code-block:: python

                    def custom_metric(
                        X_test, y_test, estimator, labels,
                        X_train, y_train, weight_test=None, weight_train=None,
                        config=None, groups_test=None, groups_train=None,
                    ):
                        return metric_to_minimize, metrics_to_log

                which returns a float number as the minimization objective,
                and a dictionary as the metrics to log.
            task: A string of the task type, e.g.,
                'classification', 'regression', 'ts_forecast', 'rank'.
            n_jobs: An integer of the number of threads for training.
            log_file_name: A string of the log file name.
            estimator_list: A list of strings for estimator names, or 'auto'
                e.g.,

                .. code-block:: python

                    ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']

            time_budget: A float number of the time budget in seconds.
            max_iter: An integer of the maximal number of iterations.
            sample: A boolean of whether to sample the training data during
                search.
            ensemble: boolean or dict | default=False. Whether to perform
                ensemble after search. Can be a dict with keys 'passthrough'
                and 'final_estimator' to specify the passthrough and
                final_estimator in the stacker.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            log_type: A string of the log type, one of
                ['better', 'all'].
                'better' only logs configs with better loss than previous iters
                'all' logs all the tried configs.
            model_history: A boolean of whether to keep the history of best
                models in the history property. Make sure memory is large
                enough if setting to True.
            log_training_metric: A boolean of whether to log the training
                metric for each model.
            mem_thres: A float of the memory size constraint in bytes.
            pred_time_limit: A float of the prediction latency constraint in seconds.
            train_time_limit: A float of the training time constraint in seconds.
            X_val: None or a numpy array or a pandas dataframe of validation data.
            y_val: None or a numpy array or a pandas series of validation labels.
            sample_weight_val: None or a numpy array of the sample weight of
                validation data of the same shape as y_val.
            groups_val: None or array-like | group labels (with matching length
                to y_val) or group counts (with sum equal to length of y_val)
                for validation data. Need to be consistent with groups.
            groups: None or array-like | Group labels (with matching length to
                y_train) or group counts (with sum equal to length of y_train)
                for training data.
            verbose: int, default=3 | Controls the verbosity, higher means more
                messages.
            retrain_full: bool or str, default=True | whether to retrain the
                selected model on the full training data when using holdout.
                True - retrain only after search finishes; False - no retraining;
                'budget' - do best effort to retrain without violating the time
                budget.
            split_type: str or None, default=None | the data split type.
                For classification tasks, valid choices are [
                None, 'stratified', 'uniform', 'time']. None -> stratified.
                For regression tasks, valid choices are [None, 'uniform', 'time'].
                None -> uniform.
                For ts_forecast tasks, must be None or 'time'.
                For ranking task, must be None or 'group'.
            hpo_method: str or None, default=None | The hyperparameter
                optimization method. By default, CFO is used for sequential
                search and BlendSearch is used for parallel search.
                No need to set when using flaml's default search space or using
                a simple customized search space. When set to 'bs', BlendSearch
                is used. BlendSearch can be tried when the search space is
                complex, for example, containing multiple disjoint, discontinuous
                subspaces. When set to 'random', random search is used.
            starting_points: A dictionary to specify the starting hyperparameter
                config for the estimators.
                Keys are the name of the estimators, and values are the starting
                hyperparameter configurations for the corresponding estimators.
                The value can be a single hyperparameter configuration dict or a list
                of hyperparameter configuration dicts.
                In the following code example, we get starting_points from the
                automl_experiment and use them in the new_automl_experiment.
                e.g.,

                .. code-block:: python

                    from flaml import AutoML
                    automl_experiment = AutoML()
                    X_train, y_train = load_iris(return_X_y=True)
                    automl_experiment.fit(X_train, y_train)
                    starting_points = automl_experiment.best_config_per_estimator

                    new_automl_experiment = AutoML()
                    new_automl_experiment.fit(X_train, y_train,
                        starting_points=starting_points)

            seed: int or None, default=None | The random seed for np.random.
            n_concurrent_trials: [Experimental] int, default=1 | The number of
                concurrent trials. For n_concurrent_trials > 1, installation of
                ray is required: `pip install flaml[ray]`.
            keep_search_state: boolean, default=False | Whether to keep search
                state after fit(). By default the state is deleted for space
                saving.
            early_stop: boolean, default=False | Whether to stop early if the
                search is considered to converge.
            append_log: boolean, default=False | Whether to directly append the log
                records to the input log file if it exists.
            auto_augment: boolean, default=True | Whether to automatically
                augment rare classes.
            min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
                size when sample=True.
            use_ray: boolean, default=False | Whether to use ray to run the training
                in separate processes. This can be used to prevent OOM for large
                datasets, but will incur more overhead in time. Only use it if
                you run into OOM failures.
            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight. Include period as
                a key word argument for 'ts_forecast' task.
        """
        self._state._start_time_flag = self._start_time_flag = time.time()
        if task == FORECAST:
            self._state.task = TS_FORECAST
        else:
            self._state.task = task
        self._state.log_training_metric = log_training_metric
        self._state.fit_kwargs = fit_kwargs
        self._state.weight_val = sample_weight_val

        self._validate_data(
            X_train, y_train, dataframe, label, X_val, y_val, groups_val, groups
        )
        self._search_states = {}  # key: estimator name; value: SearchState
        self._random = np.random.RandomState(RANDOM_SEED)
        if seed is not None:
            np.random.seed(seed)
        self._seed = seed + 19823 if seed is not None else 20
        self._learner_selector = learner_selector
        old_level = logger.getEffectiveLevel()
        self.verbose = verbose
        # map verbosity to a logging level, e.g., verbose=3 -> level 20 (INFO)
        logger.setLevel(50 - verbose * 10)
        if (not mlflow or not mlflow.active_run()) and not logger.handlers:
            # Add the console handler.
            _ch = logging.StreamHandler()
            _ch.setFormatter(logger_formatter)
            logger.addHandler(_ch)
        self._decide_split_type(split_type)
        logger.info(f"Data split method: {self._split_type}")
        if eval_method == "auto" or self._state.X_val is not None:
            eval_method = self._decide_eval_method(time_budget)
        self._state.eval_method = eval_method
        logger.info("Evaluation method: {}".format(eval_method))

        self._retrain_in_budget = retrain_full == "budget" and (
            eval_method == "holdout" and self._state.X_val is None
        )
        self._retrain_final = (
            retrain_full is True
            and eval_method == "holdout"
            and self._state.X_val is None
            or eval_method == "cv"
            or max_iter == 1
        )
        self._auto_augment = auto_augment
        self._min_sample_size = min_sample_size
        self._prepare_data(eval_method, split_ratio, n_splits)
        self._sample = (
            sample
            and task != "rank"
            and eval_method != "cv"
            and (self._min_sample_size * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
        )
        if "auto" == metric:
            if "binary" in self._state.task:
                metric = "roc_auc"
            elif "multi" in self._state.task:
                metric = "log_loss"
            elif self._state.task == TS_FORECAST:
                metric = "mape"
            elif self._state.task == "rank":
                metric = "ndcg"
            else:
                metric = "r2"
        self._state.metric = metric
        if metric in [
            "r2",
            "accuracy",
            "roc_auc",
            "roc_auc_ovr",
            "roc_auc_ovo",
            "f1",
            "ap",
            "micro_f1",
            "macro_f1",
            "ndcg",
        ]:
            error_metric = f"1-{metric}"
        elif isinstance(metric, str):
            error_metric = metric
        else:
            error_metric = "customized metric"
        logger.info(f"Minimizing error metric: {error_metric}")

        if "auto" == estimator_list:
            if self._state.task == TS_FORECAST:
                try:
                    import prophet

                    estimator_list = ["prophet", "arima", "sarimax"]
                except ImportError:
                    estimator_list = ["arima", "sarimax"]
            elif self._state.task == "rank":
                estimator_list = ["lgbm", "xgboost"]
            else:
                try:
                    import catboost

                    estimator_list = ["lgbm", "rf", "catboost", "xgboost", "extra_tree"]
                except ImportError:
                    estimator_list = ["lgbm", "rf", "xgboost", "extra_tree"]
                if "regression" != self._state.task:
                    estimator_list += ["lrl1"]
        for estimator_name in estimator_list:
            if estimator_name not in self._state.learner_classes:
                self.add_learner(
                    estimator_name,
                    get_estimator_class(self._state.task, estimator_name),
                )
        # set up learner search space
        for estimator_name in estimator_list:
            estimator_class = self._state.learner_classes[estimator_name]
            estimator_class.init()
            self._search_states[estimator_name] = SearchState(
                learner_class=estimator_class,
                data_size=self._state.data_size,
                task=self._state.task,
                starting_point=starting_points.get(estimator_name),
            )
        logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
        self.estimator_list = estimator_list
        self._state.time_budget = time_budget or 1e10
        self._active_estimators = estimator_list.copy()
        self._ensemble = ensemble
        self._max_iter = max_iter
        self._mem_thres = mem_thres
        self._pred_time_limit = pred_time_limit
        self._state.train_time_limit = train_time_limit
        self._log_type = log_type
        self.split_ratio = split_ratio
        self._state.save_model_history = model_history
        self._state.n_jobs = n_jobs
        self._n_concurrent_trials = n_concurrent_trials
        self._early_stop = early_stop
        self._use_ray = use_ray or n_concurrent_trials > 1
        # default hpo method: BlendSearch when using ray, CFO otherwise
        self._hpo_method = hpo_method or ("bs" if self._use_ray else "cfo")
        if log_file_name:
            with training_log_writer(log_file_name, append_log) as save_helper:
                self._training_log = save_helper
                self._search()
        else:
            self._training_log = None
            self._search()
        if self._best_estimator:
            logger.info("fit succeeded")
            logger.info(
                f"Time taken to find the best model: {self._time_taken_best_iter}"
            )
            if (
                self._hpo_method in ("cfo", "bs")
                and (self._time_taken_best_iter >= time_budget * 0.7)
                and not all(
                    state.search_alg and state.search_alg.searcher.is_ls_ever_converged
                    for state in self._search_states.values()
                )
            ):
                logger.warning(
                    "Time taken to find the best model is {0:.0f}% of the "
                    "provided time budget and not all estimators' hyperparameter "
                    "search converged. Consider increasing the time budget.".format(
                        self._time_taken_best_iter / time_budget * 100
                    )
                )

        if not keep_search_state:
            # release space
            del self._X_train_all, self._y_train_all, self._state.kf
            del self._state.X_train, self._state.X_train_all, self._state.X_val
            del self._state.y_train, self._state.y_train_all, self._state.y_val
            del self._sample_weight_full, self._state.fit_kwargs
            del self._state.groups, self._state.groups_all, self._state.groups_val
            for state in self._search_states.values():
                if state.trained_estimator:
                    del state.trained_estimator
        logger.setLevel(old_level)

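    # Illustrative sketch (assumed data variables): a custom metric following
    # the signature documented in fit(); it returns the value to minimize plus
    # a dict of metrics to log:
    #
    #   def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,
    #                     weight_test=None, weight_train=None, config=None,
    #                     groups_test=None, groups_train=None):
    #       from sklearn.metrics import log_loss
    #       loss = log_loss(y_test, estimator.predict_proba(X_test), labels=labels)
    #       return loss, {"val_log_loss": loss}
    #
    #   automl.fit(X_train, y_train, metric=custom_metric, time_budget=60)
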
def _search_parallel(self):
|
|
try:
|
|
from ray import __version__ as ray_version
|
|
|
|
assert ray_version >= "1.0.0"
|
|
import ray
|
|
from ray.tune.suggest import ConcurrencyLimiter
|
|
except (ImportError, AssertionError):
|
|
raise ImportError(
|
|
"n_concurrent_trial>1 or use_ray=True requires installation of ray. "
|
|
"Please run pip install flaml[ray]"
|
|
)
|
|
if self._hpo_method in ("cfo", "grid"):
|
|
from flaml import CFO as SearchAlgo
|
|
elif "bs" == self._hpo_method:
|
|
from flaml import BlendSearch as SearchAlgo
|
|
elif "random" == self._hpo_method:
|
|
from ray.tune.suggest import BasicVariantGenerator as SearchAlgo
|
|
from ray.tune.sample import Domain
|
|
else:
|
|
raise NotImplementedError(
|
|
f"hpo_method={self._hpo_method} is not recognized. "
|
|
"'cfo' and 'bs' are supported."
|
|
)
        space = self.search_space
        if self._hpo_method == "random":
            # Any point in points_to_evaluate must consist of hyperparameters
            # that are tunable, which can be identified by checking whether
            # the corresponding value in the search space is an instance of
            # the 'Domain' class from flaml or ray.tune
            points_to_evaluate = self.points_to_evaluate.copy()
            to_del = []
            for k, v in space.items():
                if not isinstance(v, Domain):
                    to_del.append(k)
            for k in to_del:
                for p in points_to_evaluate:
                    if k in p:
                        del p[k]
            search_alg = SearchAlgo(
                max_concurrent=self._n_concurrent_trials,
                points_to_evaluate=points_to_evaluate,
            )
        else:
            self._state.time_from_start = time.time() - self._start_time_flag
            time_left = self._state.time_budget - self._state.time_from_start
            search_alg = SearchAlgo(
                metric="val_loss",
                space=space,
                low_cost_partial_config=self.low_cost_partial_config,
                points_to_evaluate=self.points_to_evaluate,
                cat_hp_cost=self.cat_hp_cost,
                prune_attr=self.prune_attr,
                min_resource=self.min_resource,
                max_resource=self.max_resource,
                config_constraints=[
                    (partial(size, self._state), "<=", self._mem_thres)
                ],
                metric_constraints=self.metric_constraints,
                seed=self._seed,
                time_budget_s=time_left,
            )
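            # ConcurrencyLimiter caps the number of in-flight trials at
            # self._n_concurrent_trials.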
            search_alg = ConcurrencyLimiter(search_alg, self._n_concurrent_trials)
        resources_per_trial = (
            {"cpu": self._state.n_jobs} if self._state.n_jobs > 1 else None
        )
        analysis = ray.tune.run(
            self.trainable,
            search_alg=search_alg,
            config=space,
            metric="val_loss",
            mode="min",
            resources_per_trial=resources_per_trial,
            time_budget_s=self._state.time_budget,
            num_samples=self._max_iter,
            verbose=max(self.verbose - 2, 0),
            raise_on_failed_trial=False,
        )
        # logger.info([trial.last_result for trial in analysis.trials])
        trials = sorted(
            (
                trial
                for trial in analysis.trials
                if trial.last_result
                and trial.last_result["wall_clock_time"] is not None
            ),
            key=lambda x: x.last_result["wall_clock_time"],
        )
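        # Replay the finished trials in wall-clock order so the best-so-far
        # bookkeeping below matches the order in which results were produced.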
        for _track_iter, trial in enumerate(trials):
            result = trial.last_result
            better = False
            if result:
                config = result["config"]
                estimator = config.get("ml", config)["learner"]
                search_state = self._search_states[estimator]
                search_state.update(result, 0, self._state.save_model_history)
                if result["wall_clock_time"] is not None:
                    self._state.time_from_start = result["wall_clock_time"]
                if search_state.sample_size == self._state.data_size:
                    self._iter_per_learner[estimator] += 1
                    if not self._fullsize_reached:
                        self._fullsize_reached = True
                if search_state.best_loss < self._state.best_loss:
                    self._state.best_loss = search_state.best_loss
                    self._best_estimator = estimator
                    self._config_history[_track_iter] = (
                        self._best_estimator,
                        config,
                        self._time_taken_best_iter,
                    )
                    if self._state.save_model_history:
                        self._model_history[
                            _track_iter
                        ] = search_state.trained_estimator
                    self._trained_estimator = search_state.trained_estimator
                    self._best_iteration = _track_iter
                    self._time_taken_best_iter = self._state.time_from_start
                    better = True
                self._search_states[estimator].best_config = config
                if (better or self._log_type == "all") and self._training_log:
                    self._training_log.append(
                        self._iter_per_learner[estimator],
                        search_state.metric_for_logging,
                        search_state.trial_time,
                        self._state.time_from_start,
                        search_state.val_loss,
                        config,
                        estimator,
                        search_state.sample_size,
                    )

    def _search_sequential(self):
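        """Search for the best configuration sequentially, one trial at a time."""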
        try:
            from ray import __version__ as ray_version

            assert ray_version >= "1.0.0"
            from ray.tune.suggest import ConcurrencyLimiter
        except (ImportError, AssertionError):
            from .searcher.suggestion import ConcurrencyLimiter
        if self._hpo_method in ("cfo", "grid"):
            from flaml import CFO as SearchAlgo
        elif "optuna" == self._hpo_method:
            try:
                from ray import __version__ as ray_version

                assert ray_version >= "1.0.0"
                from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
            except (ImportError, AssertionError):
                from .searcher.suggestion import OptunaSearch as SearchAlgo
        elif "bs" == self._hpo_method:
            from flaml import BlendSearch as SearchAlgo
        elif "random" == self._hpo_method:
            from flaml.searcher import RandomSearch as SearchAlgo
        elif "cfocat" == self._hpo_method:
            from flaml.searcher.cfo_cat import CFOCat as SearchAlgo
        else:
            raise NotImplementedError(
                f"hpo_method={self._hpo_method} is not recognized. "
                "'cfo', 'bs', 'optuna', 'random' and 'cfocat' are supported."
            )

        est_retrain_time = next_trial_time = 0
        best_config_sig = None
        better = True  # whether we find a better model in one trial
        if self._ensemble:
            self.best_model = {}
        if self._max_iter < 2 and self.estimator_list:
            # when max_iter < 2, no need to search
            # TODO: otherwise, need to make sure SearchStates.init_config is inside search space
            self._max_iter = 0
            self._best_estimator = estimator = self.estimator_list[0]
            self._selected = state = self._search_states[estimator]
            state.best_config_sample_size = self._state.data_size
            state.best_config = (
                state.init_config
                if isinstance(state.init_config, dict)
                else state.init_config[0]
            )
        for self._track_iter in range(self._max_iter):
            if self._estimator_index is None:
                estimator = self._active_estimators[0]
            else:
                estimator = self._select_estimator(self._active_estimators)
                if not estimator:
                    break
            logger.info(f"iteration {self._track_iter}, current learner {estimator}")
            search_state = self._search_states[estimator]
            self._state.time_from_start = time.time() - self._start_time_flag
            time_left = self._state.time_budget - self._state.time_from_start
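            # Reserve est_retrain_time out of the remaining budget when the best
            # model may still need to be retrained on the full data.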
            budget_left = (
                time_left
                if not self._retrain_in_budget
                or better
                or (not self.best_estimator)
                or self._search_states[self.best_estimator].sample_size
                < self._state.data_size
                else time_left - est_retrain_time
            )
            if not search_state.search_alg:
                search_state.training_function = partial(
                    AutoMLState._compute_with_config_base, self._state, estimator
                )
                search_space = search_state.search_space
                if self._sample:
                    prune_attr = "FLAML_sample_size"
                    min_resource = self._min_sample_size
                    max_resource = self._state.data_size
                else:
                    prune_attr = min_resource = max_resource = None
                learner_class = self._state.learner_classes.get(estimator)
                if "grid" == self._hpo_method:  # for synthetic exp only
                    points_to_evaluate = []
                    space = search_space
                    keys = list(space.keys())
                    domain0, domain1 = space[keys[0]], space[keys[1]]
                    for x1 in range(domain0.lower, domain0.upper + 1):
                        for x2 in range(domain1.lower, domain1.upper + 1):
                            points_to_evaluate.append(
                                {
                                    keys[0]: x1,
                                    keys[1]: x2,
                                }
                            )
                    self._max_iter_per_learner = len(points_to_evaluate)
                    low_cost_partial_config = None
                else:
                    points_to_evaluate = (
                        search_state.init_config
                        if isinstance(search_state.init_config, list)
                        else [search_state.init_config]
                    )
                    low_cost_partial_config = search_state.low_cost_partial_config
                if self._hpo_method in ("bs", "cfo", "grid", "cfocat", "random"):
                    algo = SearchAlgo(
                        metric="val_loss",
                        mode="min",
                        space=search_space,
                        points_to_evaluate=points_to_evaluate,
                        low_cost_partial_config=low_cost_partial_config,
                        cat_hp_cost=search_state.cat_hp_cost,
                        prune_attr=prune_attr,
                        min_resource=min_resource,
                        max_resource=max_resource,
                        config_constraints=[
                            (learner_class.size, "<=", self._mem_thres)
                        ],
                        metric_constraints=self.metric_constraints,
                        seed=self._seed,
                    )
                else:
                    algo = SearchAlgo(
                        metric="val_loss",
                        mode="min",
                        space=search_space,
                        points_to_evaluate=[
                            p for p in points_to_evaluate if len(p) == len(search_space)
                        ],
                    )
                search_state.search_alg = ConcurrencyLimiter(algo, max_concurrent=1)
                # search_state.search_alg = algo
            else:
                search_space = None
                if self._hpo_method in ("bs", "cfo", "cfocat"):
                    search_state.search_alg.searcher.set_search_properties(
                        metric=None,
                        mode=None,
                        setting={
                            "metric_target": self._state.best_loss,
                        },
                    )
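            # Run one trial for the selected learner within the remaining budget
            # (and the per-train time limit, if any).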
            start_run_time = time.time()
            analysis = tune.run(
                search_state.training_function,
                search_alg=search_state.search_alg,
                time_budget_s=min(budget_left, self._state.train_time_limit),
                verbose=max(self.verbose - 3, 0),
                use_ray=False,
            )
            time_used = time.time() - start_run_time
            better = False
            if analysis.trials:
                result = analysis.trials[-1].last_result
                search_state.update(
                    result,
                    time_used=time_used,
                    save_model_history=self._state.save_model_history,
                )
                if self._estimator_index is None:
                    # update init eci estimate
                    eci_base = search_state.init_eci
                    self._eci.append(search_state.estimated_cost4improvement)
                    for e in self.estimator_list[1:]:
                        self._eci.append(
                            self._search_states[e].init_eci / eci_base * self._eci[0]
                        )
                    self._estimator_index = 0
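                    # Heuristic budget estimates from the expected cost for
                    # improvement (ECI): "necessary" ~ max(10x the first trial's
                    # ECI, sum over learners); "sufficient" ~ 10000x the first
                    # trial's ECI; both scaled up when only a sample was used.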
                    min_budget = max(10 * self._eci[0], sum(self._eci))
                    max_budget = 10000 * self._eci[0]
                    if search_state.sample_size:
                        ratio = search_state.data_size / search_state.sample_size
                        min_budget *= ratio
                        max_budget *= ratio
                    logger.info(
                        f"Estimated sufficient time budget={max_budget:.0f}s."
                        f" Estimated necessary time budget={min_budget:.0f}s."
                    )
                if result["wall_clock_time"] is not None:
                    self._state.time_from_start = result["wall_clock_time"]
                # logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
                if search_state.sample_size == self._state.data_size:
                    self._iter_per_learner[estimator] += 1
                    self._fullsize_reached = True
                if search_state.best_loss < self._state.best_loss:
                    best_config_sig = estimator + search_state.get_hist_config_sig(
                        self.data_size_full, search_state.best_config
                    )
                    self._state.best_loss = search_state.best_loss
                    self._best_estimator = estimator
                    est_retrain_time = (
                        search_state.est_retrain_time(self.data_size_full)
                        if (best_config_sig not in self._retrained_config)
                        else 0
                    )
                    self._config_history[self._track_iter] = (
                        estimator,
                        search_state.best_config,
                        self._state.time_from_start,
                    )
                    if self._state.save_model_history:
                        self._model_history[
                            self._track_iter
                        ] = search_state.trained_estimator
                    elif self._trained_estimator:
                        del self._trained_estimator
                        self._trained_estimator = None
                    if not self._retrain_final:
                        self._trained_estimator = search_state.trained_estimator
                    self._best_iteration = self._track_iter
                    self._time_taken_best_iter = self._state.time_from_start
                    better = True
                    next_trial_time = search_state.time2eval_best
                if search_state.trained_estimator and not (
                    self._state.save_model_history or self._ensemble
                ):
                    # free RAM
                    if search_state.trained_estimator != self._trained_estimator:
                        search_state.trained_estimator.cleanup()
                        del search_state.trained_estimator
                        search_state.trained_estimator = None
                if better or self._log_type == "all":
                    if self._training_log:
                        self._training_log.append(
                            self._iter_per_learner[estimator],
                            search_state.metric_for_logging,
                            search_state.trial_time,
                            self._state.time_from_start,
                            search_state.val_loss,
                            search_state.config,
                            estimator,
                            search_state.sample_size,
                        )
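                    # Mirror the per-iteration record to MLflow when a run is active.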
                    if mlflow is not None and mlflow.active_run():
                        with mlflow.start_run(nested=True):
                            mlflow.log_metric(
                                "iter_counter", self._iter_per_learner[estimator]
                            )
                            mlflow.log_param(
                                "metric_for_logging", search_state.metric_for_logging
                            )
                            mlflow.log_metric("trial_time", search_state.trial_time)
                            mlflow.log_metric(
                                "wall_clock_time", self._state.time_from_start
                            )
                            mlflow.log_metric("validation_loss", search_state.val_loss)
                            mlflow.log_param("config", search_state.config)
                            mlflow.log_param("learner", estimator)
                            mlflow.log_param("sample_size", search_state.sample_size)
                            mlflow.log_metric(
                                "best_validation_loss", search_state.best_loss
                            )
                            mlflow.log_param("best_config", search_state.best_config)
                            mlflow.log_param("best_learner", self._best_estimator)
                logger.info(
                    " at {:.1f}s,\testimator {}'s best error={:.4f},\tbest estimator {}'s best error={:.4f}".format(
                        self._state.time_from_start,
                        estimator,
                        search_state.best_loss,
                        self._best_estimator,
                        self._state.best_loss,
                    )
                )
                if (
                    self._hpo_method in ("cfo", "bs")
                    and all(
                        state.search_alg
                        and state.search_alg.searcher.is_ls_ever_converged
                        for state in self._search_states.values()
                    )
                    and (
                        self._state.time_from_start
                        > self._warn_threshold * self._time_taken_best_iter
                    )
                ):
                    logger.warning(
                        "Local search has converged at least once for the "
                        "hyperparameters of every estimator, and the total search "
                        f"time exceeds {self._warn_threshold} times the time taken "
                        "to find the best model."
                    )
                    if self._early_stop:
                        logger.warning("Stopping search as early_stop is set to True.")
                        break
                    self._warn_threshold *= 10
            else:
                logger.info(f"stop trying learner {estimator}")
                if self._estimator_index is not None:
                    self._active_estimators.remove(estimator)
                    self._estimator_index -= 1
                search_state.search_alg.searcher._is_ls_ever_converged = True
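            # Retrain the current best model on the full data inside the budget
            # when the remaining time suffices for retraining but not for another
            # trial on top of it.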
            if (
                self._retrain_in_budget
                and best_config_sig
                and est_retrain_time
                and not better
                and self._search_states[self._best_estimator].sample_size
                == self._state.data_size
                and (
                    est_retrain_time
                    <= self._state.time_budget - self._state.time_from_start
                    <= est_retrain_time + next_trial_time
                )
            ):
                state = self._search_states[self._best_estimator]
                self._trained_estimator, retrain_time = self._state._train_with_config(
                    self._best_estimator,
                    state.best_config,
                    self.data_size_full,
                )
                logger.info(
                    "retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
                )
                self._retrained_config[
                    best_config_sig
                ] = state.best_config_train_time = retrain_time
                est_retrain_time = 0
            self._state.time_from_start = time.time() - self._start_time_flag
            if (
                self._state.time_from_start >= self._state.time_budget
                or not self._active_estimators
            ):
                break
            if self._ensemble and self._best_estimator:
                time_left = self._state.time_budget - self._state.time_from_start
                time_ensemble = self._search_states[self._best_estimator].time2eval_best
                if time_left < time_ensemble < 2 * time_left:
                    break

    def _search(self):
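        """Initialize the search states, run the search, and post-process:
        checkpoint the log, then ensemble or retrain the best model if requested."""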
        # initialize the search_states
        self._eci = []
        self._state.best_loss = float("+inf")
        self._state.time_from_start = 0
        self._estimator_index = None
        self._best_iteration = 0
        self._time_taken_best_iter = 0
        self._model_history = {}
        self._config_history = {}
        self._max_iter_per_learner = 1000000  # TODO
        self._iter_per_learner = {e: 0 for e in self.estimator_list}
        self._fullsize_reached = False
        self._trained_estimator = None
        self._best_estimator = None
        self._retrained_config = {}
        self._warn_threshold = 10
        self._selected = None
        self.modelcount = 0

        if not self._use_ray:
            self._search_sequential()
        else:
            self._search_parallel()
        # Add a checkpoint for the current best config to the log.
        if self._training_log:
            self._training_log.checkpoint()
        if self._best_estimator:
            self._selected = self._search_states[self._best_estimator]
            self.modelcount = sum(
                search_state.total_iter for search_state in self._search_states.values()
            )
            if self._trained_estimator:
                logger.info(f"selected model: {self._trained_estimator.model}")
            if self._ensemble and self._state.task in (
                "binary",
                "multi",
                "regression",
            ):
                search_states = list(
                    x for x in self._search_states.items() if x[1].best_config
                )
                search_states.sort(key=lambda x: x[1].best_loss)
                estimators = [
                    (
                        x[0],
                        x[1].learner_class(
                            task=self._state.task,
                            n_jobs=self._state.n_jobs,
                            **x[1].best_config,
                        ),
                    )
                    for x in search_states[:2]
                ]
                estimators += [
                    (
                        x[0],
                        x[1].learner_class(
                            task=self._state.task,
                            n_jobs=self._state.n_jobs,
                            **x[1].best_config,
                        ),
                    )
                    for x in search_states[2:]
                    if x[1].best_loss < 4 * self._selected.best_loss
                ]
                logger.info(estimators)
                if len(estimators) <= 1:
                    return
                if self._state.task in CLASSIFICATION:
                    from sklearn.ensemble import StackingClassifier as Stacker
                else:
                    from sklearn.ensemble import StackingRegressor as Stacker
                if isinstance(self._ensemble, dict):
                    final_estimator = self._ensemble.get(
                        "final_estimator", self._trained_estimator
                    )
                    passthrough = self._ensemble.get("passthrough", True)
                else:
                    final_estimator = self._trained_estimator
                    passthrough = True
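                # A dict-valued `ensemble` customizes the stacker, e.g. (sketch):
                #   automl.fit(..., ensemble={"final_estimator": LogisticRegression(),
                #                             "passthrough": False})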
                stacker = Stacker(
                    estimators,
                    final_estimator,
                    n_jobs=self._state.n_jobs,
                    passthrough=passthrough,
                )
                if self._sample_weight_full is not None:
                    self._state.fit_kwargs["sample_weight"] = self._sample_weight_full
                for e in estimators:
                    e[1].__class__.init()
                try:
                    stacker.fit(
                        self._X_train_all, self._y_train_all, **self._state.fit_kwargs
                    )
                    logger.info(f"ensemble: {stacker}")
                    self._trained_estimator = stacker
                    self._trained_estimator.model = stacker
                except ValueError as e:
                    if passthrough:
                        logger.warning(
                            "Using passthrough=False for ensemble because the data contain categorical features."
                        )
                        stacker = Stacker(
                            estimators,
                            final_estimator,
                            n_jobs=self._state.n_jobs,
                            passthrough=False,
                        )
                        stacker.fit(
                            self._X_train_all,
                            self._y_train_all,
                            **self._state.fit_kwargs,
                        )
                        logger.info(f"ensemble: {stacker}")
                        self._trained_estimator = stacker
                        self._trained_estimator.model = stacker
                    else:
                        raise e
            elif self._retrain_final:
                # reset time budget for retraining
                if self._max_iter > 1:
                    self._state.time_from_start -= self._state.time_budget
                if (
                    self._state.task == TS_FORECAST
                    or self._trained_estimator is None
                    or (
                        self._state.time_budget - self._state.time_from_start
                        > self._selected.est_retrain_time(self.data_size_full)
                        and self._selected.best_config_sample_size
                        == self._state.data_size
                    )
                ):
                    state = self._search_states[self._best_estimator]
                    (
                        self._trained_estimator,
                        retrain_time,
                    ) = self._state._train_with_config(
                        self._best_estimator,
                        state.best_config,
                        self.data_size_full,
                    )
                    logger.info(
                        "retrain {} for {:.1f}s".format(
                            self._best_estimator, retrain_time
                        )
                    )
                    state.best_config_train_time = retrain_time
                    if self._trained_estimator:
                        logger.info(f"retrained model: {self._trained_estimator.model}")
                else:
                    logger.info("not retraining because the time budget is too small.")
        if self.model and mlflow is not None and mlflow.active_run():
            mlflow.sklearn.log_model(self.model, "best_model")

    def __del__(self):
        if (
            hasattr(self, "_trained_estimator")
            and self._trained_estimator
            and hasattr(self._trained_estimator, "cleanup")
        ):
            self._trained_estimator.cleanup()
            del self._trained_estimator

    def _select_estimator(self, estimator_list):
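        """Select the next learner to try: round-robin, or by the smallest
        estimated cost for improvement (ECI) with randomized exploration."""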
        if self._learner_selector == "roundrobin":
            self._estimator_index += 1
            if self._estimator_index == len(estimator_list):
                self._estimator_index = 0
            return estimator_list[self._estimator_index]
        min_estimated_cost, selected = np.Inf, None
        inv = []
        untried_exists = False
        for i, estimator in enumerate(estimator_list):
            if estimator in self._search_states and (
                self._search_states[estimator].sample_size
            ):  # sample_size=None means no result yet
                search_state = self._search_states[estimator]
                if (
                    self._search_states[estimator].time2eval_best
                    > self._state.time_budget - self._state.time_from_start
                    or self._iter_per_learner[estimator] >= self._max_iter_per_learner
                ):
                    inv.append(0)
                    continue
                estimated_cost = search_state.estimated_cost4improvement
                if search_state.sample_size < self._state.data_size:
                    estimated_cost = min(
                        estimated_cost,
                        search_state.time2eval_best
                        * min(
                            SAMPLE_MULTIPLY_FACTOR,
                            self._state.data_size / search_state.sample_size,
                        ),
                    )
                gap = search_state.best_loss - self._state.best_loss
                if gap > 0 and not self._ensemble:
                    delta_loss = (
                        search_state.best_loss_old - search_state.best_loss
                    ) or search_state.best_loss
                    delta_time = (
                        search_state.total_time_used - search_state.time_best_found_old
                    ) or 1e-10
                    speed = delta_loss / delta_time
                    if speed:
                        estimated_cost = max(2 * gap / speed, estimated_cost)
                estimated_cost = estimated_cost or 1e-9
                inv.append(1 / estimated_cost)
            else:
                estimated_cost = self._eci[i]
                inv.append(0)
                untried_exists = True
            if estimated_cost < min_estimated_cost:
                min_estimated_cost = estimated_cost
                selected = estimator
        if untried_exists or not selected:
            state = self._search_states.get(selected)
            if not (state and state.sample_size):
                return selected
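        # Otherwise, sample the next learner with probability proportional to
        # 1 / estimated_cost, so cheaper-to-improve learners are tried more often.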
        s = sum(inv)
        p = self._random.rand()
        q = 0
        for i in range(len(inv)):
            if inv[i]:
                q += inv[i] / s
                if p < q:
                    return estimator_list[i]