autogen/flaml/automl/task/generic_task.py
Mark Harley 27b2712016
Extract task class from automl (#857)
* Refactor into automl subpackage

Moved some of the packages into an automl subpackage to tidy before the
task-based refactor. This is in response to discussions with the group
and a comment on the first task-based PR.

Only changes here are moving subpackages and modules into the new
automl, fixing imports to work with this structure and fixing some
dependencies in setup.py.

* Fix doc building post automl subpackage refactor

* Fix broken links in website post automl subpackage refactor

* Fix broken links in website post automl subpackage refactor

* Remove vw from test deps as this is breaking the build

* Move default back to the top-level

I'd moved this to automl as that's where it's used internally, but had
missed that this is actually part of the public interface so makes sense
to live where it was.

* Re-add top level modules with deprecation warnings

flaml.data, flaml.ml and flaml.model are re-added to the top level,
being re-exported from flaml.automl for backwards compatibility. Adding
a deprecation warning so that we can have a planned removal later.

* Fix model.py line-endings

* WIP

* WIP - Notes below

Got to the point where the methods from AutoML are pulled to
GenericTask. Started removing private markers and removing the passing
of automl to these methods. Done with decide_split_type, started on
prepare_data. Need to do the others after

* Re-add generic_task

* Fix tests: add Task.__str__

* Fix tests: test for ray.ObjectRef

* Hotwire TS_Sklearn wrapper to fix test fail

* Remove unused data size field from Task

* Fix import for CLASSIFICATION in notebook

* Update flaml/automl/data.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* Fix review comments

* Fix task -> str in custom learner constructor

* Remove unused CLASSIFICATION imports

* Hotwire TS_Sklearn wrapper to fix test fail by setting
optimizer_for_horizon == False

* Revert changes to the automl_classification and pin FLAML version

* Fix imports in reverted notebook

* Fix FLAML version in automl notebooks

* Fix ml.py line endings

* Fix CLASSIFICATION task import in automl_classification notebook

* Uncomment pip install in notebook and revert import

Not convinced this will work because of installing an older version of
the package into the environment in which we're running the tests, but
let's see.

* Revert c6a5dd1a0

* Revert "Revert c6a5dd1a0"

This reverts commit e55e35adea03993de87b23f092b14c6af623d487.

* Black format model.py

* Bump version to 1.1.2 in automl_xgboost

* Add docstrings to the Task ABC

* Fix import in custom_learner

* fix 'optimize_for_horizon' for ts_sklearn

* remove debugging print statements

* Check for is_forecast() before is_classification() in decide_split_type

* Attempt to fix formatting fail

* Another attempt to fix formatting fail

* And another attempt to fix formatting fail

* Add type annotations for task arg in signatures and docstrings

* Fix formatting

* Fix linting

---------

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
Co-authored-by: EgorKraevTransferwise <egor.kraev@transferwise.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Kevin Chen <chenkevin.8787@gmail.com>
2023-03-11 02:39:08 +00:00

850 lines
35 KiB
Python

import logging
import time
from typing import List, Optional
import pandas as pd
import numpy as np
from scipy.sparse import issparse
from sklearn.utils import shuffle
from sklearn.model_selection import (
train_test_split,
RepeatedStratifiedKFold,
RepeatedKFold,
GroupKFold,
TimeSeriesSplit,
GroupShuffleSplit,
StratifiedGroupKFold,
)
from flaml.automl.data import TS_TIMESTAMP_COL, concat
from flaml.automl.ml import EstimatorSubclass, default_cv_score_agg_func, get_val_loss
from flaml.automl.model import (
XGBoostSklearnEstimator,
XGBoostLimitDepthEstimator,
RandomForestEstimator,
LGBMEstimator,
LRL1Classifier,
LRL2Classifier,
CatBoostEstimator,
ExtraTreesEstimator,
KNeighborsEstimator,
TransformersEstimator,
TransformersEstimatorModelSelection,
)
from flaml.automl.task.task import (
Task,
get_classification_objective,
TS_FORECAST,
TS_FORECASTPANEL,
)
from flaml.config import RANDOM_SEED
logger = logging.getLogger(__name__)
class GenericTask(Task):
    """Task implementation for the generic (tabular) AutoML workflow.

    Covers classification, regression, ranking, NLP and time-series
    forecasting by dispatching on the task predicates inherited from
    `Task` (e.g. `is_classification()`, `is_ts_forecast()`).
    """

    # Registry mapping the estimator shorthand names accepted in
    # `estimator_list` to their FLAML wrapper classes.
    estimators = {
        "xgboost": XGBoostSklearnEstimator,
        "xgb_limitdepth": XGBoostLimitDepthEstimator,
        "rf": RandomForestEstimator,
        "lgbm": LGBMEstimator,
        "lrl1": LRL1Classifier,
        "lrl2": LRL2Classifier,
        "catboost": CatBoostEstimator,
        "extra_tree": ExtraTreesEstimator,
        "kneighbor": KNeighborsEstimator,
        "transformer": TransformersEstimator,
        "transformer_ms": TransformersEstimatorModelSelection,
    }
def validate_data(
self,
automl,
state,
X_train_all,
y_train_all,
dataframe,
label,
X_val=None,
y_val=None,
groups_val=None,
groups=None,
):
if X_train_all is not None and y_train_all is not None:
assert (
isinstance(X_train_all, np.ndarray)
or issparse(X_train_all)
or isinstance(X_train_all, pd.DataFrame)
), (
"X_train_all must be a numpy array, a pandas dataframe, "
"or Scipy sparse matrix."
)
assert isinstance(y_train_all, np.ndarray) or isinstance(
y_train_all, pd.Series
), "y_train_all must be a numpy array or a pandas series."
assert (
X_train_all.size != 0 and y_train_all.size != 0
), "Input data must not be empty."
if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
if isinstance(y_train_all, np.ndarray):
y_train_all = y_train_all.flatten()
assert (
X_train_all.shape[0] == y_train_all.shape[0]
), "# rows in X_train must match length of y_train."
automl._df = isinstance(X_train_all, pd.DataFrame)
automl._nrow, automl._ndim = X_train_all.shape
if self.is_ts_forecast():
X_train_all = pd.DataFrame(X_train_all)
X_train_all, y_train_all = self._validate_ts_data(
X_train_all, y_train_all
)
X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None:
assert isinstance(
dataframe, pd.DataFrame
), "dataframe must be a pandas DataFrame"
assert label in dataframe.columns, "label must a column name in dataframe"
automl._df = True
if self.is_ts_forecast():
dataframe = self._validate_ts_data(dataframe)
X = dataframe.drop(columns=label)
automl._nrow, automl._ndim = X.shape
y = dataframe[label]
else:
raise ValueError("either X_train+y_train or dataframe+label are required")
# check the validity of input dimensions for NLP tasks, so need to check _is_nlp_task not estimator
if self.is_nlp():
from flaml.automl.nlp.utils import is_a_list_of_str
is_all_str = True
is_all_list = True
for column in X.columns:
assert X[column].dtype.name in (
"object",
"string",
), "If the task is an NLP task, X can only contain text columns"
for each_cell in X[column]:
if each_cell is not None:
is_str = isinstance(each_cell, str)
is_list_of_int = isinstance(each_cell, list) and all(
isinstance(x, int) for x in each_cell
)
is_list_of_str = is_a_list_of_str(each_cell)
if self.is_token_classification():
assert is_list_of_str, (
"For the token-classification task, the input column needs to be a list of string,"
"instead of string, e.g., ['EU', 'rejects','German', 'call','to','boycott','British','lamb','.',].",
"For more examples, please refer to test/nlp/test_autohf_tokenclassification.py",
)
else:
assert is_str or is_list_of_int, (
"Each column of the input must either be str (untokenized) "
"or a list of integers (tokenized)"
)
is_all_str &= is_str
is_all_list &= is_list_of_int or is_list_of_str
assert is_all_str or is_all_list, (
"Currently FLAML only supports two modes for NLP: either all columns of X are string (non-tokenized), "
"or all columns of X are integer ids (tokenized)"
)
if issparse(X_train_all) or automl._skip_transform:
automl._transformer = automl._label_transformer = False
automl._X_train_all, automl._y_train_all = X, y
else:
from flaml.automl.data import DataTransformer
automl._transformer = DataTransformer()
(
automl._X_train_all,
automl._y_train_all,
) = automl._transformer.fit_transform(X, y, self)
automl._label_transformer = automl._transformer.label_transformer
if self.is_token_classification():
if hasattr(automl._label_transformer, "label_list"):
state.fit_kwargs.update(
{"label_list": automl._label_transformer.label_list}
)
elif "label_list" not in state.fit_kwargs:
for each_fit_kwargs in state.fit_kwargs_by_estimator.values():
assert (
"label_list" in each_fit_kwargs
), "For the token-classification task, you must either (1) pass token labels; or (2) pass id labels and the label list. "
"Please refer to the documentation for more details: https://microsoft.github.io/FLAML/docs/Examples/AutoML-NLP#a-simple-token-classification-example"
automl._feature_names_in_ = (
automl._X_train_all.columns.to_list()
if hasattr(automl._X_train_all, "columns")
else None
)
automl._sample_weight_full = state.fit_kwargs.get(
"sample_weight"
) # NOTE: _validate_data is before kwargs is updated to fit_kwargs_by_estimator
if X_val is not None and y_val is not None:
assert (
isinstance(X_val, np.ndarray)
or issparse(X_val)
or isinstance(X_val, pd.DataFrame)
), (
"X_val must be None, a numpy array, a pandas dataframe, "
"or Scipy sparse matrix."
)
assert isinstance(y_val, np.ndarray) or isinstance(
y_val, pd.Series
), "y_val must be None, a numpy array or a pandas series."
assert X_val.size != 0 and y_val.size != 0, (
"Validation data are expected to be nonempty. "
"Use None for X_val and y_val if no validation data."
)
if isinstance(y_val, np.ndarray):
y_val = y_val.flatten()
assert (
X_val.shape[0] == y_val.shape[0]
), "# rows in X_val must match length of y_val."
if automl._transformer:
state.X_val = automl._transformer.transform(X_val)
else:
state.X_val = X_val
# If it's NLG_TASKS, y_val is a pandas series containing the output sequence tokens,
# so we cannot use label_transformer.transform to process it
if automl._label_transformer:
state.y_val = automl._label_transformer.transform(y_val)
else:
state.y_val = y_val
else:
state.X_val = state.y_val = None
if groups is not None and len(groups) != automl._nrow:
# groups is given as group counts
state.groups = np.concatenate([[i] * c for i, c in enumerate(groups)])
assert (
len(state.groups) == automl._nrow
), "the sum of group counts must match the number of examples"
state.groups_val = (
np.concatenate([[i] * c for i, c in enumerate(groups_val)])
if groups_val is not None
else None
)
else:
state.groups_val = groups_val
state.groups = groups
@staticmethod
def _validate_ts_data(
dataframe,
y_train_all=None,
):
assert (
dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
if y_train_all is not None:
y_df = (
pd.DataFrame(y_train_all)
if isinstance(y_train_all, pd.Series)
else pd.DataFrame(y_train_all, columns=["labels"])
)
dataframe = dataframe.join(y_df)
duplicates = dataframe.duplicated()
if any(duplicates):
logger.warning(
"Duplicate timestamp values found in timestamp column. "
f"\n{dataframe.loc[duplicates, dataframe][dataframe.columns[0]]}"
)
dataframe = dataframe.drop_duplicates()
logger.warning("Removed duplicate rows based on all columns")
assert (
dataframe[[dataframe.columns[0]]].duplicated() is None
), "Duplicate timestamp values with different values for other columns."
ts_series = pd.to_datetime(dataframe[dataframe.columns[0]])
inferred_freq = pd.infer_freq(ts_series)
if inferred_freq is None:
logger.warning(
"Missing timestamps detected. To avoid error with estimators, set estimator list to ['prophet']. "
)
if y_train_all is not None:
return dataframe.iloc[:, :-1], dataframe.iloc[:, -1]
return dataframe
    def prepare_data(
        self,
        state,
        X_train_all,
        y_train_all,
        auto_augment,
        eval_method,
        split_type,
        split_ratio,
        n_splits,
        data_is_df,
        sample_weight_full,
    ) -> int:
        """Prepare train/validation splits (holdout) or a cross-validation splitter.

        Optionally augments rare classes, shuffles, builds the holdout split per
        `split_type`, and stores everything on `state` (`X_train`, `y_train`,
        `X_val`, `y_val`, `data_size`, `kf`, group bookkeeping).
        NOTE(review): despite the `-> int` annotation this method returns None
        on every path — presumably inherited from the Task ABC; confirm.
        """
        X_val, y_val = state.X_val, state.y_val
        # CSR gives efficient row slicing for the splits below.
        if issparse(X_val):
            X_val = X_val.tocsr()
        if issparse(X_train_all):
            X_train_all = X_train_all.tocsr()
        if (
            self.is_classification()
            and auto_augment
            and state.fit_kwargs.get("sample_weight")
            is None  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            and split_type in ["stratified", "uniform"]
            and not self.is_token_classification()
        ):
            # logger.info(f"label {pd.unique(y_train_all)}")
            label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes: duplicate their rows until each class has
            # at least `rare_threshld` examples.
            rare_threshld = 20
            rare = counts < rare_threshld
            rare_label, rare_counts = label_set[rare], counts[rare]
            for i, label in enumerate(rare_label):
                count = rare_count = rare_counts[i]
                rare_index = y_train_all == label
                # n is fixed to the original length so each pass duplicates the
                # original rare rows, not previously appended copies.
                n = len(y_train_all)
                while count < rare_threshld:
                    if data_is_df:
                        X_train_all = concat(
                            X_train_all, X_train_all.iloc[:n].loc[rare_index]
                        )
                    else:
                        X_train_all = concat(
                            X_train_all, X_train_all[:n][rare_index, :]
                        )
                    if isinstance(y_train_all, pd.Series):
                        y_train_all = concat(
                            y_train_all, y_train_all.iloc[:n].loc[rare_index]
                        )
                    else:
                        y_train_all = np.concatenate(
                            [y_train_all, y_train_all[:n][rare_index]]
                        )
                    count += rare_count
                logger.info(f"class {label} augmented from {rare_count} to {count}")
        SHUFFLE_SPLIT_TYPES = ["uniform", "stratified"]
        if split_type in SHUFFLE_SPLIT_TYPES:
            # Shuffle data (and sample weights in lockstep, if provided).
            if sample_weight_full is not None:
                X_train_all, y_train_all, state.sample_weight_all = shuffle(
                    X_train_all,
                    y_train_all,
                    sample_weight_full,
                    random_state=RANDOM_SEED,
                )
                state.fit_kwargs[
                    "sample_weight"
                ] = (
                    state.sample_weight_all
                )  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if isinstance(state.sample_weight_all, pd.Series):
                    state.sample_weight_all.reset_index(drop=True, inplace=True)
            else:
                X_train_all, y_train_all = shuffle(
                    X_train_all, y_train_all, random_state=RANDOM_SEED
                )
            if data_is_df:
                # Re-index so positional and label indexing agree after shuffling.
                X_train_all.reset_index(drop=True, inplace=True)
                if isinstance(y_train_all, pd.Series):
                    y_train_all.reset_index(drop=True, inplace=True)
        X_train, y_train = X_train_all, y_train_all
        state.groups_all = state.groups
        if X_val is None and eval_method == "holdout":
            # if eval_method = holdout, make holdout data
            if split_type == "time":
                if self.is_ts_forecast():
                    period = state.fit_kwargs[
                        "period"
                    ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                    if self.is_ts_forecastpanel():
                        # Panel forecasting: normalize time_idx, sort by group
                        # ids + timestamp, then cut the last `period` steps off
                        # as validation.
                        X_train_all["time_idx"] -= X_train_all["time_idx"].min()
                        X_train_all["time_idx"] = X_train_all["time_idx"].astype("int")
                        ids = state.fit_kwargs["group_ids"].copy()
                        ids.append(TS_TIMESTAMP_COL)
                        ids.append("time_idx")
                        y_train_all = pd.DataFrame(y_train_all)
                        y_train_all[ids] = X_train_all[ids]
                        X_train_all = X_train_all.sort_values(ids)
                        y_train_all = y_train_all.sort_values(ids)
                        training_cutoff = X_train_all["time_idx"].max() - period
                        X_train = X_train_all[lambda x: x.time_idx <= training_cutoff]
                        y_train = y_train_all[
                            lambda x: x.time_idx <= training_cutoff
                        ].drop(columns=ids)
                        X_val = X_train_all[lambda x: x.time_idx > training_cutoff]
                        y_val = y_train_all[
                            lambda x: x.time_idx > training_cutoff
                        ].drop(columns=ids)
                    else:
                        # Univariate forecast: last `period` rows are validation.
                        num_samples = X_train_all.shape[0]
                        assert (
                            period < num_samples
                        ), f"period={period}>#examples={num_samples}"
                        split_idx = num_samples - period
                        X_train = X_train_all[:split_idx]
                        y_train = y_train_all[:split_idx]
                        X_val = X_train_all[split_idx:]
                        y_val = y_train_all[split_idx:]
                else:
                    # Non-forecast "time" split: chronological (no shuffling).
                    if (
                        "sample_weight" in state.fit_kwargs
                    ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        (
                            X_train,
                            X_val,
                            y_train,
                            y_val,
                            state.fit_kwargs[
                                "sample_weight"
                            ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                            state.weight_val,
                        ) = train_test_split(
                            X_train_all,
                            y_train_all,
                            state.fit_kwargs[
                                "sample_weight"
                            ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                            test_size=split_ratio,
                            shuffle=False,
                        )
                    else:
                        X_train, X_val, y_train, y_val = train_test_split(
                            X_train_all,
                            y_train_all,
                            test_size=split_ratio,
                            shuffle=False,
                        )
            elif split_type == "group":
                gss = GroupShuffleSplit(
                    n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED
                )
                for train_idx, val_idx in gss.split(
                    X_train_all, y_train_all, state.groups_all
                ):
                    if data_is_df:
                        X_train = X_train_all.iloc[train_idx]
                        X_val = X_train_all.iloc[val_idx]
                    else:
                        X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
                    y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
                    state.groups = state.groups_all[train_idx]
                    state.groups_val = state.groups_all[val_idx]
            elif self.is_classification():
                # for classification, make sure the labels are complete in both
                # training and validation data
                label_set, first = np.unique(y_train_all, return_index=True)
                rest = []
                last = 0
                first.sort()
                # `first` holds the first occurrence of each label; everything
                # else goes into `rest` and is split normally, then the `first`
                # rows are forced back into the training fold.
                for i in range(len(first)):
                    rest.extend(range(last, first[i]))
                    last = first[i] + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if data_is_df else X_train_all[first]
                X_rest = X_train_all.iloc[rest] if data_is_df else X_train_all[rest]
                y_rest = y_train_all[rest]
                stratify = y_rest if split_type == "stratified" else None
                if (
                    "sample_weight" in state.fit_kwargs
                ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        weight_train,
                        weight_val,
                    ) = train_test_split(
                        X_rest,
                        y_rest,
                        state.fit_kwargs["sample_weight"][
                            rest
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        test_size=split_ratio,
                        stratify=stratify,
                        random_state=RANDOM_SEED,
                    )
                    weight1 = state.fit_kwargs["sample_weight"][
                        first
                    ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                    state.weight_val = concat(weight1, weight_val)
                    state.fit_kwargs[
                        "sample_weight"
                    ] = concat(  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        weight1, weight_train
                    )
                else:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_rest,
                        y_rest,
                        test_size=split_ratio,
                        stratify=stratify,
                        random_state=RANDOM_SEED,
                    )
                # Prepend the held-out first occurrences so every label appears
                # in training; label_set aligns with `first` row-for-row.
                X_train = concat(X_first, X_train)
                y_train = (
                    concat(label_set, y_train)
                    if data_is_df
                    else np.concatenate([label_set, y_train])
                )
                X_val = concat(X_first, X_val)
                y_val = (
                    concat(label_set, y_val)
                    if data_is_df
                    else np.concatenate([label_set, y_val])
                )
            elif self.is_regression():
                if (
                    "sample_weight" in state.fit_kwargs
                ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        state.weight_val,
                    ) = train_test_split(
                        X_train_all,
                        y_train_all,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        test_size=split_ratio,
                        random_state=RANDOM_SEED,
                    )
                else:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train_all,
                        y_train_all,
                        test_size=split_ratio,
                        random_state=RANDOM_SEED,
                    )
        state.data_size = X_train.shape
        state.X_train, state.y_train = X_train, y_train
        state.X_val, state.y_val = X_val, y_val
        state.X_train_all = X_train_all
        state.y_train_all = y_train_all
        if eval_method == "holdout":
            # Holdout evaluation: no cross-validation splitter needed.
            state.kf = None
            return
        # Below: choose the cross-validation splitter for eval_method == "cv".
        if split_type == "group":
            # logger.info("Using GroupKFold")
            assert (
                len(state.groups_all) == y_train_all.size
            ), "the length of groups must match the number of examples"
            assert (
                len(np.unique(state.groups_all)) >= n_splits
            ), "the number of groups must be equal or larger than n_splits"
            state.kf = GroupKFold(n_splits)
        elif split_type == "stratified":
            # logger.info("Using StratifiedKFold")
            assert y_train_all.size >= n_splits, (
                f"{n_splits}-fold cross validation"
                f" requires input data with at least {n_splits} examples."
            )
            # Stricter bound (subsumes the one above); kept for its r2 message.
            assert y_train_all.size >= 2 * n_splits, (
                f"{n_splits}-fold cross validation with metric=r2 "
                f"requires input data with at least {n_splits*2} examples."
            )
            state.kf = RepeatedStratifiedKFold(
                n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED
            )
        elif split_type == "time":
            # logger.info("Using TimeSeriesSplit")
            if self.is_ts_forecast() and not self.is_ts_forecastpanel():
                period = state.fit_kwargs[
                    "period"
                ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if period * (n_splits + 1) > y_train_all.size:
                    # Shrink n_splits so each test fold of `period` rows fits.
                    n_splits = int(y_train_all.size / period - 1)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {3 * period} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
            elif self.is_ts_forecastpanel():
                # Panel data: each time step contributes one row per group.
                n_groups = X_train.groupby(state.fit_kwargs.get("group_ids")).ngroups
                period = state.fit_kwargs.get("period")
                state.kf = TimeSeriesSplit(
                    n_splits=n_splits, test_size=period * n_groups
                )
            else:
                state.kf = TimeSeriesSplit(n_splits=n_splits)
        elif isinstance(split_type, str):
            # logger.info("Using RepeatedKFold")
            state.kf = RepeatedKFold(
                n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED
            )
        else:
            # logger.info("Using splitter object")
            state.kf = split_type
        if isinstance(state.kf, (GroupKFold, StratifiedGroupKFold)):
            # self._split_type is either "group", a GroupKFold object, or a StratifiedGroupKFold object
            state.kf.groups = state.groups_all
def decide_split_type(
self,
split_type,
y_train_all,
fit_kwargs,
groups=None,
) -> str:
if self.name == "classification":
self.name = get_classification_objective(len(np.unique(y_train_all)))
if not isinstance(split_type, str):
assert hasattr(split_type, "split") and hasattr(
split_type, "get_n_splits"
), "split_type must be a string or a splitter object with split and get_n_splits methods."
assert (
not isinstance(split_type, GroupKFold) or groups is not None
), "GroupKFold requires groups to be provided."
return split_type
elif self.is_ts_forecast():
assert split_type in ["auto", "time"]
assert isinstance(
fit_kwargs.get("period"),
int, # NOTE: _decide_split_type is before kwargs is updated to fit_kwargs_by_estimator
), f"missing a required integer 'period' for '{TS_FORECAST}' task."
if fit_kwargs.get("group_ids"):
# TODO (MARK) This will likely not play well with the task class
self.name = TS_FORECASTPANEL
assert isinstance(
fit_kwargs.get("group_ids"), list
), f"missing a required List[str] 'group_ids' for '{TS_FORECASTPANEL}' task."
return "time"
elif self.is_classification():
assert split_type in ["auto", "stratified", "uniform", "time", "group"]
return (
split_type
if split_type != "auto"
else groups is None and "stratified" or "group"
)
elif self.is_regression():
assert split_type in ["auto", "uniform", "time", "group"]
return split_type if split_type != "auto" else "uniform"
elif self.is_rank():
assert groups is not None, "groups must be specified for ranking task."
assert split_type in ["auto", "group"]
return "group"
elif self.is_nlg():
assert split_type in ["auto", "uniform", "time", "group"]
return split_type if split_type != "auto" else "uniform"
def preprocess(self, X, transformer=None):
if isinstance(X, List):
try:
if isinstance(X[0], List):
X = [x for x in zip(*X)]
X = pd.DataFrame(
dict(
[
(transformer._str_columns[idx], X[idx])
if isinstance(X[0], List)
else (transformer._str_columns[idx], [X[idx]])
for idx in range(len(X))
]
)
)
except IndexError:
raise IndexError(
"Test data contains more columns than training data, exiting"
)
elif isinstance(X, int):
return X
elif issparse(X):
X = X.tocsr()
if self.is_ts_forecast():
X = pd.DataFrame(X)
if transformer:
X = transformer.transform(X)
return X
    def evaluate_model_CV(
        self,
        config: dict,
        estimator: EstimatorSubclass,
        X_train_all,
        y_train_all,
        budget,
        kf,
        eval_metric,
        best_val_loss,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs: Optional[dict] = None,
        free_mem_ratio=0,
    ):
        """Evaluate a hyperparameter `config` with cross-validation.

        Trains `estimator` on each fold produced by `kf`, scores it with
        `get_val_loss`, and aggregates the per-fold results via
        `cv_score_agg_func` (default: `default_cv_score_agg_func`). Stops early
        once `budget` seconds have elapsed.

        Returns:
            Tuple of (val_loss, metric, total train_time, mean pred_time).

        NOTE(review): `best_val_loss` is accepted but unused in this body —
        presumably kept for signature compatibility; confirm against callers.
        """
        if fit_kwargs is None:
            fit_kwargs = {}
        if cv_score_agg_func is None:
            cv_score_agg_func = default_cv_score_agg_func
        start_time = time.time()
        val_loss_folds = []
        log_metric_folds = []
        metric = None
        train_time = pred_time = 0
        total_fold_num = 0
        n = kf.get_n_splits()
        X_train_split, y_train_split = X_train_all, y_train_all
        if self.is_classification():
            labels = np.unique(y_train_all)
        else:
            labels = fit_kwargs.get(
                "label_list"
            )  # pass the label list on to compute the evaluation metric
        groups = None
        # Whether to permute each fold's training indices; time-series folds
        # must stay ordered. (This local intentionally shadows sklearn's
        # `shuffle` import within this method.)
        shuffle = getattr(kf, "shuffle", not self.is_ts_forecast())
        # Turn the splitter object into an iterator of (train, val) index pairs.
        if isinstance(kf, RepeatedStratifiedKFold):
            kf = kf.split(X_train_split, y_train_split)
        elif isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
            # `groups` was attached to the splitter in prepare_data.
            groups = kf.groups
            kf = kf.split(X_train_split, y_train_split, groups)
            shuffle = False
        elif isinstance(kf, TimeSeriesSplit):
            kf = kf.split(X_train_split, y_train_split)
        else:
            kf = kf.split(X_train_split)
        rng = np.random.RandomState(2020)
        # `budget and budget / n` keeps None/0 budgets falsy.
        budget_per_train = budget and budget / n
        if "sample_weight" in fit_kwargs:
            weight = fit_kwargs["sample_weight"]
            weight_val = None
        else:
            weight = weight_val = None
        for train_index, val_index in kf:
            if shuffle:
                train_index = rng.permutation(train_index)
            if isinstance(X_train_all, pd.DataFrame):
                X_train = X_train_split.iloc[train_index]
                X_val = X_train_split.iloc[val_index]
            else:
                X_train, X_val = X_train_split[train_index], X_train_split[val_index]
            y_train, y_val = y_train_split[train_index], y_train_split[val_index]
            # Free the previous fold's fitted model before re-training.
            estimator.cleanup()
            # Slice per-fold sample weights / groups into fit_kwargs in place;
            # the full weight vector is restored after the fold (below).
            if weight is not None:
                fit_kwargs["sample_weight"], weight_val = (
                    weight[train_index],
                    weight[val_index],
                )
            if groups is not None:
                fit_kwargs["groups"] = (
                    groups[train_index]
                    if isinstance(groups, np.ndarray)
                    else groups.iloc[train_index]
                )
                groups_val = (
                    groups[val_index]
                    if isinstance(groups, np.ndarray)
                    else groups.iloc[val_index]
                )
            else:
                groups_val = None
            val_loss_i, metric_i, train_time_i, pred_time_i = get_val_loss(
                config,
                estimator,
                X_train,
                y_train,
                X_val,
                y_val,
                weight_val,
                groups_val,
                eval_metric,
                self,
                labels,
                budget_per_train,
                log_training_metric=log_training_metric,
                fit_kwargs=fit_kwargs,
                free_mem_ratio=free_mem_ratio,
            )
            # Per-fold intermediate results are not aggregatable; drop them.
            if isinstance(metric_i, dict) and "intermediate_results" in metric_i.keys():
                del metric_i["intermediate_results"]
            if weight is not None:
                fit_kwargs["sample_weight"] = weight
            total_fold_num += 1
            val_loss_folds.append(val_loss_i)
            log_metric_folds.append(metric_i)
            train_time += train_time_i
            pred_time += pred_time_i
            # Stop early when the time budget is exhausted.
            if budget and time.time() - start_time >= budget:
                break
        val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
        # Average prediction time over the folds actually evaluated.
        n = total_fold_num
        pred_time /= n
        return val_loss, metric, train_time, pred_time
def default_estimator_list(self, estimator_list: List[str]) -> List[str]:
if "auto" != estimator_list:
return estimator_list
if self.is_rank():
estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"]
elif self.is_nlp():
estimator_list = ["transformer"]
elif self.is_ts_forecastpanel():
estimator_list = ["tft"]
else:
try:
import catboost
estimator_list = [
"lgbm",
"rf",
"catboost",
"xgboost",
"extra_tree",
"xgb_limitdepth",
]
except ImportError:
estimator_list = [
"lgbm",
"rf",
"xgboost",
"extra_tree",
"xgb_limitdepth",
]
if self.is_ts_forecast():
# catboost is removed because it has a `name` parameter, making it incompatible with hcrystalball
if "catboost" in estimator_list:
estimator_list.remove("catboost")
if self.is_ts_forecastregression():
try:
import prophet
estimator_list += ["prophet", "arima", "sarimax"]
except ImportError:
estimator_list += ["arima", "sarimax"]
elif not self.is_regression():
estimator_list += ["lrl1"]
return estimator_list
def default_metric(self, metric: str) -> str:
if "auto" != metric:
return metric
if self.is_nlp():
from flaml.automl.nlp.utils import (
load_default_huggingface_metric_for_task,
)
return load_default_huggingface_metric_for_task(self.name)
elif self.is_binary():
return "roc_auc"
elif self.is_multiclass():
return "log_loss"
elif self.is_ts_forecast():
return "mape"
elif self.is_rank():
return "ndcg"
else:
return "r2"