Mirror of https://github.com/microsoft/autogen.git, synced 2025-09-13 10:16:06 +00:00

* Refactor into automl subpackage. Moved some of the packages into an automl subpackage to tidy up before the task-based refactor. This is in response to discussions with the group and a comment on the first task-based PR. The only changes here are moving subpackages and modules into the new automl, fixing imports to work with this structure, and fixing some dependencies in setup.py.
* Fix doc building post automl subpackage refactor
* Fix broken links in website post automl subpackage refactor
* Fix broken links in website post automl subpackage refactor
* Remove vw from test deps as this is breaking the build
* Move default back to the top level. I'd moved this to automl as that's where it's used internally, but had missed that this is actually part of the public interface, so it makes sense to live where it was.
* Re-add top-level modules with deprecation warnings. flaml.data, flaml.ml and flaml.model are re-added to the top level, being re-exported from flaml.automl for backwards compatibility. Adding a deprecation warning so that we can have a planned removal later.
* Fix model.py line endings
* WIP
* WIP - Notes below. Got to the point where the methods from AutoML are pulled to GenericTask. Started removing private markers and removing the passing of automl to these methods. Done with decide_split_type, started on prepare_data. Need to do the others after.
* Re-add generic_task
* Fix tests: add Task.__str__
* Fix tests: test for ray.ObjectRef
* Hotwire TS_Sklearn wrapper to fix test fail
* Remove unused data size field from Task
* Fix import for CLASSIFICATION in notebook
* Update flaml/automl/data.py
  Co-authored-by: Chi Wang <wang.chi@microsoft.com>
* Fix review comments
* Fix task -> str in custom learner constructor
* Remove unused CLASSIFICATION imports
* Hotwire TS_Sklearn wrapper to fix test fail by setting optimizer_for_horizon == False
* Revert changes to the automl_classification notebook and pin FLAML version
* Fix imports in reverted notebook
* Fix FLAML version in automl notebooks
* Fix ml.py line endings
* Fix CLASSIFICATION task import in automl_classification notebook
* Uncomment pip install in notebook and revert import. Not convinced this will work because of installing an older version of the package into the environment in which we're running the tests, but let's see.
* Revert c6a5dd1a0
* Revert "Revert c6a5dd1a0". This reverts commit e55e35adea03993de87b23f092b14c6af623d487.
* Black format model.py
* Bump version to 1.1.2 in automl_xgboost
* Add docstrings to the Task ABC
* Fix import in custom_learner
* Fix 'optimize_for_horizon' for ts_sklearn
* Remove debugging print statements
* Check for is_forecast() before is_classification() in decide_split_type
* Attempt to fix formatting fail
* Another attempt to fix formatting fail
* And another attempt to fix formatting fail
* Add type annotations for task arg in signatures and docstrings
* Fix formatting
* Fix linting

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
Co-authored-by: EgorKraevTransferwise <egor.kraev@transferwise.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Kevin Chen <chenkevin.8787@gmail.com>
850 lines
35 KiB
Python
import logging
import time
from typing import List, Optional

import pandas as pd
import numpy as np
from scipy.sparse import issparse
from sklearn.utils import shuffle
from sklearn.model_selection import (
    train_test_split,
    RepeatedStratifiedKFold,
    RepeatedKFold,
    GroupKFold,
    TimeSeriesSplit,
    GroupShuffleSplit,
    StratifiedGroupKFold,
)

from flaml.automl.data import TS_TIMESTAMP_COL, concat
from flaml.automl.ml import EstimatorSubclass, default_cv_score_agg_func, get_val_loss
from flaml.automl.model import (
    XGBoostSklearnEstimator,
    XGBoostLimitDepthEstimator,
    RandomForestEstimator,
    LGBMEstimator,
    LRL1Classifier,
    LRL2Classifier,
    CatBoostEstimator,
    ExtraTreesEstimator,
    KNeighborsEstimator,
    TransformersEstimator,
    TransformersEstimatorModelSelection,
)
from flaml.automl.task.task import (
    Task,
    get_classification_objective,
    TS_FORECAST,
    TS_FORECASTPANEL,
)
from flaml.config import RANDOM_SEED

logger = logging.getLogger(__name__)


class GenericTask(Task):
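    """Generic task implementation of the `Task` abstract base class.

    Handles tabular machine-learning tasks (classification, regression,
    ranking, time-series forecasting) as well as NLP tasks, and maps
    estimator names to their FLAML estimator classes.
    """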

    estimators = {
        "xgboost": XGBoostSklearnEstimator,
        "xgb_limitdepth": XGBoostLimitDepthEstimator,
        "rf": RandomForestEstimator,
        "lgbm": LGBMEstimator,
        "lrl1": LRL1Classifier,
        "lrl2": LRL2Classifier,
        "catboost": CatBoostEstimator,
        "extra_tree": ExtraTreesEstimator,
        "kneighbor": KNeighborsEstimator,
        "transformer": TransformersEstimator,
        "transformer_ms": TransformersEstimatorModelSelection,
    }

    def validate_data(
        self,
        automl,
        state,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
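        """Validate the training data (and optional validation data), transform it,
        and store the results on `automl` and `state`.

        Either `X_train_all` and `y_train_all`, or `dataframe` and `label`, must be
        provided; otherwise a ValueError is raised. For NLP tasks, also checks that
        the input columns contain text (or tokenized) data.
        """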
        if X_train_all is not None and y_train_all is not None:
            assert (
                isinstance(X_train_all, np.ndarray)
                or issparse(X_train_all)
                or isinstance(X_train_all, pd.DataFrame)
            ), "X_train_all must be a numpy array, a pandas dataframe, or Scipy sparse matrix."
            assert isinstance(
                y_train_all, (np.ndarray, pd.Series)
            ), "y_train_all must be a numpy array or a pandas series."
            assert X_train_all.size != 0 and y_train_all.size != 0, "Input data must not be empty."
            if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
                X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            assert (
                X_train_all.shape[0] == y_train_all.shape[0]
            ), "# rows in X_train must match length of y_train."
            automl._df = isinstance(X_train_all, pd.DataFrame)
            automl._nrow, automl._ndim = X_train_all.shape
            if self.is_ts_forecast():
                X_train_all = pd.DataFrame(X_train_all)
                X_train_all, y_train_all = self._validate_ts_data(X_train_all, y_train_all)
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            assert isinstance(dataframe, pd.DataFrame), "dataframe must be a pandas DataFrame"
            assert label in dataframe.columns, "label must be a column name in dataframe"
            automl._df = True
            if self.is_ts_forecast():
                dataframe = self._validate_ts_data(dataframe)
            X = dataframe.drop(columns=label)
            automl._nrow, automl._ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError("either X_train+y_train or dataframe+label are required")

        # check the validity of input dimensions for NLP tasks,
        # so need to check _is_nlp_task, not the estimator
        if self.is_nlp():
            from flaml.automl.nlp.utils import is_a_list_of_str

            is_all_str = True
            is_all_list = True
            for column in X.columns:
                assert X[column].dtype.name in (
                    "object",
                    "string",
                ), "If the task is an NLP task, X can only contain text columns"
                for each_cell in X[column]:
                    if each_cell is not None:
                        is_str = isinstance(each_cell, str)
                        is_list_of_int = isinstance(each_cell, list) and all(
                            isinstance(x, int) for x in each_cell
                        )
                        is_list_of_str = is_a_list_of_str(each_cell)
                        if self.is_token_classification():
                            assert is_list_of_str, (
                                "For the token-classification task, the input column needs to be a list of strings, "
                                "instead of a string, e.g., ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']. "
                                "For more examples, please refer to test/nlp/test_autohf_tokenclassification.py"
                            )
                        else:
                            assert is_str or is_list_of_int, (
                                "Each column of the input must either be str (untokenized) "
                                "or a list of integers (tokenized)"
                            )
                        is_all_str &= is_str
                        is_all_list &= is_list_of_int or is_list_of_str
            assert is_all_str or is_all_list, (
                "Currently FLAML only supports two modes for NLP: either all columns of X are string (non-tokenized), "
                "or all columns of X are integer ids (tokenized)"
            )

        if issparse(X_train_all) or automl._skip_transform:
            automl._transformer = automl._label_transformer = False
            automl._X_train_all, automl._y_train_all = X, y
        else:
            from flaml.automl.data import DataTransformer

            automl._transformer = DataTransformer()

            (
                automl._X_train_all,
                automl._y_train_all,
            ) = automl._transformer.fit_transform(X, y, self)
            automl._label_transformer = automl._transformer.label_transformer
            if self.is_token_classification():
                if hasattr(automl._label_transformer, "label_list"):
                    state.fit_kwargs.update({"label_list": automl._label_transformer.label_list})
                elif "label_list" not in state.fit_kwargs:
                    for each_fit_kwargs in state.fit_kwargs_by_estimator.values():
                        assert "label_list" in each_fit_kwargs, (
                            "For the token-classification task, you must either (1) pass token labels; "
                            "or (2) pass id labels and the label list. "
                            "Please refer to the documentation for more details: "
                            "https://microsoft.github.io/FLAML/docs/Examples/AutoML-NLP#a-simple-token-classification-example"
                        )
            automl._feature_names_in_ = (
                automl._X_train_all.columns.to_list()
                if hasattr(automl._X_train_all, "columns")
                else None
            )

        automl._sample_weight_full = state.fit_kwargs.get(
            "sample_weight"
        )  # NOTE: _validate_data is before kwargs is updated to fit_kwargs_by_estimator
        if X_val is not None and y_val is not None:
            assert (
                isinstance(X_val, np.ndarray)
                or issparse(X_val)
                or isinstance(X_val, pd.DataFrame)
            ), "X_val must be None, a numpy array, a pandas dataframe, or Scipy sparse matrix."
            assert isinstance(
                y_val, (np.ndarray, pd.Series)
            ), "y_val must be None, a numpy array or a pandas series."
            assert X_val.size != 0 and y_val.size != 0, (
                "Validation data are expected to be nonempty. "
                "Use None for X_val and y_val if no validation data."
            )
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            assert (
                X_val.shape[0] == y_val.shape[0]
            ), "# rows in X_val must match length of y_val."
            if automl._transformer:
                state.X_val = automl._transformer.transform(X_val)
            else:
                state.X_val = X_val
            # If it's NLG_TASKS, y_val is a pandas series containing the output sequence tokens,
            # so we cannot use label_transformer.transform to process it
            if automl._label_transformer:
                state.y_val = automl._label_transformer.transform(y_val)
            else:
                state.y_val = y_val
        else:
            state.X_val = state.y_val = None

        if groups is not None and len(groups) != automl._nrow:
            # groups is given as group counts
            state.groups = np.concatenate([[i] * c for i, c in enumerate(groups)])
            assert (
                len(state.groups) == automl._nrow
            ), "the sum of group counts must match the number of examples"
            state.groups_val = (
                np.concatenate([[i] * c for i, c in enumerate(groups_val)])
                if groups_val is not None
                else None
            )
        else:
            state.groups_val = groups_val
            state.groups = groups

    @staticmethod
    def _validate_ts_data(
        dataframe,
        y_train_all=None,
    ):
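        """Validate time-series data: the first column must contain timestamps,
        duplicate rows are dropped, and a warning is logged when no frequency
        can be inferred (i.e., timestamps are missing).

        Returns (X, y) when `y_train_all` is given, otherwise the validated dataframe.
        """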
        assert (
            dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
        ), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
        if y_train_all is not None:
            y_df = (
                pd.DataFrame(y_train_all)
                if isinstance(y_train_all, pd.Series)
                else pd.DataFrame(y_train_all, columns=["labels"])
            )
            dataframe = dataframe.join(y_df)
        duplicates = dataframe.duplicated()
        if any(duplicates):
            logger.warning(
                "Duplicate timestamp values found in timestamp column. "
                f"\n{dataframe.loc[duplicates, dataframe.columns[0]]}"
            )
            dataframe = dataframe.drop_duplicates()
            logger.warning("Removed duplicate rows based on all columns")
            assert not dataframe[dataframe.columns[0]].duplicated().any(), (
                "Duplicate timestamp values with different values for other columns."
            )
        ts_series = pd.to_datetime(dataframe[dataframe.columns[0]])
        inferred_freq = pd.infer_freq(ts_series)
        if inferred_freq is None:
            logger.warning(
                "Missing timestamps detected. To avoid error with estimators, set estimator list to ['prophet']. "
            )
        if y_train_all is not None:
            return dataframe.iloc[:, :-1], dataframe.iloc[:, -1]
        return dataframe

    def prepare_data(
        self,
        state,
        X_train_all,
        y_train_all,
        auto_augment,
        eval_method,
        split_type,
        split_ratio,
        n_splits,
        data_is_df,
        sample_weight_full,
    ) -> None:
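        """Prepare the training/validation split and the cross-validation splitter.

        For classification with `auto_augment`, rare classes are upsampled first.
        When `eval_method == "holdout"`, a holdout split is made according to
        `split_type` and `split_ratio`; otherwise `state.kf` is set to the
        appropriate scikit-learn splitter for `n_splits`-fold cross validation.
        """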
        X_val, y_val = state.X_val, state.y_val
        if issparse(X_val):
            X_val = X_val.tocsr()
        if issparse(X_train_all):
            X_train_all = X_train_all.tocsr()
        if (
            self.is_classification()
            and auto_augment
            and state.fit_kwargs.get("sample_weight")
            is None  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            and split_type in ["stratified", "uniform"]
            and not self.is_token_classification()
        ):
            # logger.info(f"label {pd.unique(y_train_all)}")
            label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes
            rare_threshold = 20
            rare = counts < rare_threshold
            rare_label, rare_counts = label_set[rare], counts[rare]
            for i, label in enumerate(rare_label):
                count = rare_count = rare_counts[i]
                rare_index = y_train_all == label
                n = len(y_train_all)
                while count < rare_threshold:
                    if data_is_df:
                        X_train_all = concat(X_train_all, X_train_all.iloc[:n].loc[rare_index])
                    else:
                        X_train_all = concat(X_train_all, X_train_all[:n][rare_index, :])
                    if isinstance(y_train_all, pd.Series):
                        y_train_all = concat(y_train_all, y_train_all.iloc[:n].loc[rare_index])
                    else:
                        y_train_all = np.concatenate([y_train_all, y_train_all[:n][rare_index]])
                    count += rare_count
                logger.info(f"class {label} augmented from {rare_count} to {count}")
        SHUFFLE_SPLIT_TYPES = ["uniform", "stratified"]
        if split_type in SHUFFLE_SPLIT_TYPES:
            if sample_weight_full is not None:
                X_train_all, y_train_all, state.sample_weight_all = shuffle(
                    X_train_all,
                    y_train_all,
                    sample_weight_full,
                    random_state=RANDOM_SEED,
                )
                state.fit_kwargs[
                    "sample_weight"
                ] = state.sample_weight_all  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if isinstance(state.sample_weight_all, pd.Series):
                    state.sample_weight_all.reset_index(drop=True, inplace=True)
            else:
                X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
            if data_is_df:
                X_train_all.reset_index(drop=True, inplace=True)
                if isinstance(y_train_all, pd.Series):
                    y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
        state.groups_all = state.groups
        if X_val is None and eval_method == "holdout":
            # if eval_method = holdout, make holdout data
            if split_type == "time":
                if self.is_ts_forecast():
                    period = state.fit_kwargs[
                        "period"
                    ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                    if self.is_ts_forecastpanel():
                        X_train_all["time_idx"] -= X_train_all["time_idx"].min()
                        X_train_all["time_idx"] = X_train_all["time_idx"].astype("int")
                        ids = state.fit_kwargs["group_ids"].copy()
                        ids.append(TS_TIMESTAMP_COL)
                        ids.append("time_idx")
                        y_train_all = pd.DataFrame(y_train_all)
                        y_train_all[ids] = X_train_all[ids]
                        X_train_all = X_train_all.sort_values(ids)
                        y_train_all = y_train_all.sort_values(ids)
                        training_cutoff = X_train_all["time_idx"].max() - period
                        X_train = X_train_all[lambda x: x.time_idx <= training_cutoff]
                        y_train = y_train_all[lambda x: x.time_idx <= training_cutoff].drop(columns=ids)
                        X_val = X_train_all[lambda x: x.time_idx > training_cutoff]
                        y_val = y_train_all[lambda x: x.time_idx > training_cutoff].drop(columns=ids)
                    else:
                        num_samples = X_train_all.shape[0]
                        assert (
                            period < num_samples
                        ), f"period={period} >= #examples={num_samples}"
                        split_idx = num_samples - period
                        X_train = X_train_all[:split_idx]
                        y_train = y_train_all[:split_idx]
                        X_val = X_train_all[split_idx:]
                        y_val = y_train_all[split_idx:]
                else:
                    if (
                        "sample_weight" in state.fit_kwargs
                    ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        (
                            X_train,
                            X_val,
                            y_train,
                            y_val,
                            state.fit_kwargs["sample_weight"],
                            state.weight_val,
                        ) = train_test_split(
                            X_train_all,
                            y_train_all,
                            state.fit_kwargs["sample_weight"],
                            test_size=split_ratio,
                            shuffle=False,
                        )
                    else:
                        X_train, X_val, y_train, y_val = train_test_split(
                            X_train_all,
                            y_train_all,
                            test_size=split_ratio,
                            shuffle=False,
                        )
            elif split_type == "group":
                gss = GroupShuffleSplit(n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED)
                for train_idx, val_idx in gss.split(X_train_all, y_train_all, state.groups_all):
                    if data_is_df:
                        X_train = X_train_all.iloc[train_idx]
                        X_val = X_train_all.iloc[val_idx]
                    else:
                        X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
                    y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
                    state.groups = state.groups_all[train_idx]
                    state.groups_val = state.groups_all[val_idx]
            elif self.is_classification():
                # for classification, make sure the labels are complete in both
                # training and validation data
                label_set, first = np.unique(y_train_all, return_index=True)
                rest = []
                last = 0
                first.sort()
                for i in range(len(first)):
                    rest.extend(range(last, first[i]))
                    last = first[i] + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if data_is_df else X_train_all[first]
                X_rest = X_train_all.iloc[rest] if data_is_df else X_train_all[rest]
                y_rest = y_train_all[rest]
                stratify = y_rest if split_type == "stratified" else None
                if (
                    "sample_weight" in state.fit_kwargs
                ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        weight_train,
                        weight_val,
                    ) = train_test_split(
                        X_rest,
                        y_rest,
                        state.fit_kwargs["sample_weight"][rest],
                        test_size=split_ratio,
                        stratify=stratify,
                        random_state=RANDOM_SEED,
                    )
                    weight1 = state.fit_kwargs["sample_weight"][first]
                    state.weight_val = concat(weight1, weight_val)
                    state.fit_kwargs["sample_weight"] = concat(weight1, weight_train)
                else:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_rest,
                        y_rest,
                        test_size=split_ratio,
                        stratify=stratify,
                        random_state=RANDOM_SEED,
                    )
                X_train = concat(X_first, X_train)
                y_train = (
                    concat(label_set, y_train)
                    if data_is_df
                    else np.concatenate([label_set, y_train])
                )
                X_val = concat(X_first, X_val)
                y_val = (
                    concat(label_set, y_val)
                    if data_is_df
                    else np.concatenate([label_set, y_val])
                )
            elif self.is_regression():
                if (
                    "sample_weight" in state.fit_kwargs
                ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        state.fit_kwargs["sample_weight"],
                        state.weight_val,
                    ) = train_test_split(
                        X_train_all,
                        y_train_all,
                        state.fit_kwargs["sample_weight"],
                        test_size=split_ratio,
                        random_state=RANDOM_SEED,
                    )
                else:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train_all,
                        y_train_all,
                        test_size=split_ratio,
                        random_state=RANDOM_SEED,
                    )
        state.data_size = X_train.shape
        state.X_train, state.y_train = X_train, y_train
        state.X_val, state.y_val = X_val, y_val
        state.X_train_all = X_train_all
        state.y_train_all = y_train_all
        if eval_method == "holdout":
            state.kf = None
            return
        if split_type == "group":
            # logger.info("Using GroupKFold")
            assert (
                len(state.groups_all) == y_train_all.size
            ), "the length of groups must match the number of examples"
            assert (
                len(np.unique(state.groups_all)) >= n_splits
            ), "the number of groups must be equal to or larger than n_splits"
            state.kf = GroupKFold(n_splits)
        elif split_type == "stratified":
            # logger.info("Using StratifiedKFold")
            assert y_train_all.size >= n_splits, (
                f"{n_splits}-fold cross validation"
                f" requires input data with at least {n_splits} examples."
            )
            assert y_train_all.size >= 2 * n_splits, (
                f"{n_splits}-fold cross validation with metric=r2 "
                f"requires input data with at least {n_splits * 2} examples."
            )
            state.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        elif split_type == "time":
            # logger.info("Using TimeSeriesSplit")
            if self.is_ts_forecast() and not self.is_ts_forecastpanel():
                period = state.fit_kwargs[
                    "period"
                ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if period * (n_splits + 1) > y_train_all.size:
                    n_splits = int(y_train_all.size / period - 1)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {3 * period} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
            elif self.is_ts_forecastpanel():
                n_groups = X_train.groupby(state.fit_kwargs.get("group_ids")).ngroups
                period = state.fit_kwargs.get("period")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period * n_groups)
            else:
                state.kf = TimeSeriesSplit(n_splits=n_splits)
        elif isinstance(split_type, str):
            # logger.info("Using RepeatedKFold")
            state.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        else:
            # logger.info("Using splitter object")
            state.kf = split_type
        if isinstance(state.kf, (GroupKFold, StratifiedGroupKFold)):
            # self._split_type is either "group", a GroupKFold object, or a StratifiedGroupKFold object
            state.kf.groups = state.groups_all

    def decide_split_type(
        self,
        split_type,
        y_train_all,
        fit_kwargs,
        groups=None,
    ) -> str:
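        """Resolve the data split type for this task.

        Refines a generic "classification" task into binary/multiclass based on
        the number of unique labels, validates user-provided splitter objects,
        and maps "auto" to a task-appropriate default split type.
        """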
        if self.name == "classification":
            self.name = get_classification_objective(len(np.unique(y_train_all)))
        if not isinstance(split_type, str):
            assert hasattr(split_type, "split") and hasattr(
                split_type, "get_n_splits"
            ), "split_type must be a string or a splitter object with split and get_n_splits methods."
            assert (
                not isinstance(split_type, GroupKFold) or groups is not None
            ), "GroupKFold requires groups to be provided."
            return split_type

        elif self.is_ts_forecast():
            assert split_type in ["auto", "time"]
            assert isinstance(
                fit_kwargs.get("period"),
                int,  # NOTE: _decide_split_type is before kwargs is updated to fit_kwargs_by_estimator
            ), f"missing a required integer 'period' for '{TS_FORECAST}' task."
            if fit_kwargs.get("group_ids"):
                # TODO (MARK) This will likely not play well with the task class
                self.name = TS_FORECASTPANEL
                assert isinstance(
                    fit_kwargs.get("group_ids"), list
                ), f"missing a required List[str] 'group_ids' for '{TS_FORECASTPANEL}' task."
            return "time"

        elif self.is_classification():
            assert split_type in ["auto", "stratified", "uniform", "time", "group"]
            return (
                split_type
                if split_type != "auto"
                else "stratified" if groups is None else "group"
            )

        elif self.is_regression():
            assert split_type in ["auto", "uniform", "time", "group"]
            return split_type if split_type != "auto" else "uniform"

        elif self.is_rank():
            assert groups is not None, "groups must be specified for ranking task."
            assert split_type in ["auto", "group"]
            return "group"

        elif self.is_nlg():
            assert split_type in ["auto", "uniform", "time", "group"]
            return split_type if split_type != "auto" else "uniform"

    def preprocess(self, X, transformer=None):
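        """Convert raw input `X` (list, int, sparse matrix, or dataframe) into the
        representation expected by the estimators, applying `transformer` if given.
        """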
        if isinstance(X, List):
            try:
                if isinstance(X[0], List):
                    X = [x for x in zip(*X)]
                X = pd.DataFrame(
                    dict(
                        [
                            (transformer._str_columns[idx], X[idx])
                            if isinstance(X[0], List)
                            else (transformer._str_columns[idx], [X[idx]])
                            for idx in range(len(X))
                        ]
                    )
                )
            except IndexError:
                raise IndexError("Test data contains more columns than training data, exiting")
        elif isinstance(X, int):
            return X
        elif issparse(X):
            X = X.tocsr()
        if self.is_ts_forecast():
            X = pd.DataFrame(X)
        if transformer:
            X = transformer.transform(X)
        return X

    def evaluate_model_CV(
        self,
        config: dict,
        estimator: EstimatorSubclass,
        X_train_all,
        y_train_all,
        budget,
        kf,
        eval_metric,
        best_val_loss,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs: Optional[dict] = None,
        free_mem_ratio=0,
    ):
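        """Evaluate `estimator` with hyperparameters `config` via cross validation.

        Trains and scores the estimator on each fold produced by `kf`, stopping
        early once `budget` (in seconds) is exhausted, and aggregates the fold
        results with `cv_score_agg_func`.

        Returns:
            A tuple of (val_loss, metric, train_time, pred_time).
        """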
        if fit_kwargs is None:
            fit_kwargs = {}
        if cv_score_agg_func is None:
            cv_score_agg_func = default_cv_score_agg_func
        start_time = time.time()
        val_loss_folds = []
        log_metric_folds = []
        metric = None
        train_time = pred_time = 0
        total_fold_num = 0
        n = kf.get_n_splits()
        X_train_split, y_train_split = X_train_all, y_train_all
        if self.is_classification():
            labels = np.unique(y_train_all)
        else:
            labels = fit_kwargs.get(
                "label_list"
            )  # pass the label list on to compute the evaluation metric
        groups = None
        shuffle = getattr(kf, "shuffle", not self.is_ts_forecast())
        if isinstance(kf, RepeatedStratifiedKFold):
            kf = kf.split(X_train_split, y_train_split)
        elif isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
            groups = kf.groups
            kf = kf.split(X_train_split, y_train_split, groups)
            shuffle = False
        elif isinstance(kf, TimeSeriesSplit):
            kf = kf.split(X_train_split, y_train_split)
        else:
            kf = kf.split(X_train_split)
        rng = np.random.RandomState(2020)
        budget_per_train = budget and budget / n
        if "sample_weight" in fit_kwargs:
            weight = fit_kwargs["sample_weight"]
            weight_val = None
        else:
            weight = weight_val = None
        for train_index, val_index in kf:
            if shuffle:
                train_index = rng.permutation(train_index)
            if isinstance(X_train_all, pd.DataFrame):
                X_train = X_train_split.iloc[train_index]
                X_val = X_train_split.iloc[val_index]
            else:
                X_train, X_val = X_train_split[train_index], X_train_split[val_index]
            y_train, y_val = y_train_split[train_index], y_train_split[val_index]
            estimator.cleanup()
            if weight is not None:
                fit_kwargs["sample_weight"], weight_val = (
                    weight[train_index],
                    weight[val_index],
                )
            if groups is not None:
                fit_kwargs["groups"] = (
                    groups[train_index]
                    if isinstance(groups, np.ndarray)
                    else groups.iloc[train_index]
                )
                groups_val = (
                    groups[val_index]
                    if isinstance(groups, np.ndarray)
                    else groups.iloc[val_index]
                )
            else:
                groups_val = None
            val_loss_i, metric_i, train_time_i, pred_time_i = get_val_loss(
                config,
                estimator,
                X_train,
                y_train,
                X_val,
                y_val,
                weight_val,
                groups_val,
                eval_metric,
                self,
                labels,
                budget_per_train,
                log_training_metric=log_training_metric,
                fit_kwargs=fit_kwargs,
                free_mem_ratio=free_mem_ratio,
            )
            if isinstance(metric_i, dict) and "intermediate_results" in metric_i.keys():
                del metric_i["intermediate_results"]
            if weight is not None:
                fit_kwargs["sample_weight"] = weight
            total_fold_num += 1
            val_loss_folds.append(val_loss_i)
            log_metric_folds.append(metric_i)
            train_time += train_time_i
            pred_time += pred_time_i
            if budget and time.time() - start_time >= budget:
                break
        val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
        n = total_fold_num
        pred_time /= n
        return val_loss, metric, train_time, pred_time

    def default_estimator_list(self, estimator_list: List[str]) -> List[str]:
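        """Return the default estimator list for this task when `estimator_list`
        is "auto"; otherwise return `estimator_list` unchanged.
        """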
        if estimator_list != "auto":
            return estimator_list
        if self.is_rank():
            estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"]
        elif self.is_nlp():
            estimator_list = ["transformer"]
        elif self.is_ts_forecastpanel():
            estimator_list = ["tft"]
        else:
            try:
                import catboost

                estimator_list = [
                    "lgbm",
                    "rf",
                    "catboost",
                    "xgboost",
                    "extra_tree",
                    "xgb_limitdepth",
                ]
            except ImportError:
                estimator_list = [
                    "lgbm",
                    "rf",
                    "xgboost",
                    "extra_tree",
                    "xgb_limitdepth",
                ]
            if self.is_ts_forecast():
                # catboost is removed because it has a `name` parameter, making it incompatible with hcrystalball
                if "catboost" in estimator_list:
                    estimator_list.remove("catboost")
                if self.is_ts_forecastregression():
                    try:
                        import prophet

                        estimator_list += ["prophet", "arima", "sarimax"]
                    except ImportError:
                        estimator_list += ["arima", "sarimax"]
            elif not self.is_regression():
                estimator_list += ["lrl1"]

        return estimator_list

    def default_metric(self, metric: str) -> str:
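        """Return the default optimization metric for this task when `metric` is
        "auto"; otherwise return `metric` unchanged.
        """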
        if metric != "auto":
            return metric

        if self.is_nlp():
            from flaml.automl.nlp.utils import (
                load_default_huggingface_metric_for_task,
            )

            return load_default_huggingface_metric_for_task(self.name)
        elif self.is_binary():
            return "roc_auc"
        elif self.is_multiclass():
            return "log_loss"
        elif self.is_ts_forecast():
            return "mape"
        elif self.is_rank():
            return "ndcg"
        else:
            return "r2"