Mirror of https://github.com/microsoft/autogen.git, synced 2025-10-21 13:00:29 +00:00
Forecast (#162)

* added 'forecast' task with estimators ['fbprophet', 'arima', 'sarimax']
* update setup.py
* add TimeSeriesSplit to 'regression' and 'classification' task
* add 'time' split_type for 'classification' and 'regression' task

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>

* feature importance
* variable name
* Update test/test_split.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* Update test/test_forecast.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* prophet installation fail in windows
* upload flaml_forecast.ipynb

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>
This commit is contained in:
parent: 6270353458
commit: 3d0a3d26a2
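A minimal, self-contained usage sketch of the new 'forecast' task, condensed from test/test_forecast.py in this commit (monthly CO2 data with a 12-month horizon; the variable names are illustrative):

# minimal sketch of the new 'forecast' task, condensed from test/test_forecast.py
import statsmodels.api as sm
from flaml import AutoML

data = sm.datasets.co2.load_pandas().data['co2'].resample('MS').mean()
df = data.fillna(data.bfill()).to_frame().reset_index()
df = df.rename(columns={'index': 'ds', 'co2': 'y'})   # 'ds' = timestamp, 'y' = value
train_df, X_test = df[:-12], df[-12:]['ds'].to_frame()

automl = AutoML()
automl.fit(dataframe=train_df, task='forecast', metric='mape',
           eval_method='holdout', split_type='time',
           time_budget=60, period=12, freq='M')
y_pred = automl.predict(X_test)  # forecasts for the held-out 12 months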
.github/workflows/python-package.yml (vendored): 2 changes

@@ -41,7 +41,7 @@ jobs:
       - name: If linux or mac, install ray
         if: (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') && matrix.python-version != '3.9'
         run: |
-          pip install -e .[ray]
+          pip install -e .[ray,forecast]
           pip install 'tensorboardX<=2.2'
       - name: Lint with flake8
         run: |
flaml/automl.py: 160 changes

@@ -10,7 +10,7 @@ from functools import partial
 import numpy as np
 from scipy.sparse import issparse
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
-    RepeatedKFold, GroupKFold
+    RepeatedKFold, GroupKFold, TimeSeriesSplit
 from sklearn.utils import shuffle
 import pandas as pd
 
@@ -25,6 +25,7 @@ from . import tune
 from .training_log import training_log_reader, training_log_writer
 
 import logging
+
 logger = logging.getLogger(__name__)
 logger_formatter = logging.Formatter(
     '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
@@ -360,11 +361,15 @@ class AutoML:
             return self._trained_estimator.classes_.tolist()
         return None
 
-    def predict(self, X_test):
+    def predict(self, X_test, freq=None):
         '''Predict label from features.
 
         Args:
-            X_test: A numpy array of featurized instances, shape n * m.
+            X_test: A numpy array of featurized instances, shape n * m,
+                or a pandas dataframe with one column with timestamp values
+                for 'forecasting' task.
+            freq: str or pandas offset, default=None | The frequency of the
+                time-series.
 
         Returns:
             A numpy array of shape n * 1 - - each element is a predicted class
@@ -375,8 +380,14 @@ class AutoML:
                 "No estimator is trained. Please run fit with enough budget.")
             return None
         X_test = self._preprocess(X_test)
-        y_pred = self._trained_estimator.predict(X_test)
-        if y_pred.ndim > 1:
+        if self._state.task == 'forecast':
+            X_test_df = pd.DataFrame(X_test)
+            X_test_col = list(X_test.columns)[0]
+            X_test_df = X_test_df.rename(columns={X_test_col: 'ds'})
+            y_pred = self._trained_estimator.predict(X_test_df, freq=freq)
+        else:
+            y_pred = self._trained_estimator.predict(X_test)
+        if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
             y_pred = y_pred.flatten()
         if self._label_transformer:
             return self._label_transformer.inverse_transform(pd.Series(
@@ -408,6 +419,25 @@ class AutoML:
 
     def _validate_data(self, X_train_all, y_train_all, dataframe, label,
                        X_val=None, y_val=None):
+        if self._state.task == 'forecast':
+            if dataframe is not None and label is not None:
+                dataframe = dataframe.copy()
+                dataframe = dataframe.rename(columns={label[0]: 'ds', label[1]: 'y'})
+            elif dataframe is not None:
+                if ('ds' not in dataframe) or ('y' not in dataframe):
+                    raise ValueError(
+                        'For forecasting task, Dataframe must have columns "ds" and "y" '
+                        'with the dates and values respectively.'
+                    )
+            elif (X_train_all is not None) and (y_train_all is not None):
+                dataframe = pd.DataFrame(X_train_all)
+                time_col = list(dataframe.columns)[0]
+                dataframe = dataframe.rename(columns={time_col: 'ds'})
+                dataframe['y'] = pd.Series(y_train_all)
+                X_train_all = None
+                y_train_all = None
+                label = 'y'
+
         if X_train_all is not None and y_train_all is not None:
             if not (isinstance(X_train_all, np.ndarray) or issparse(X_train_all)
                     or isinstance(X_train_all, pd.DataFrame)):
@@ -440,7 +470,7 @@ class AutoML:
         else:
             raise ValueError(
                 "either X_train+y_train or dataframe+label are required")
-        if issparse(X_train_all):
+        if issparse(X_train_all) or self._state.task == 'forecast':
             self._transformer = self._label_transformer = False
             self._X_train_all, self._y_train_all = X, y
         else:
@@ -482,7 +512,8 @@ class AutoML:
     def _prepare_data(self,
                       eval_method,
                       split_ratio,
-                      n_splits):
+                      n_splits,
+                      period=None):
         X_val, y_val = self._state.X_val, self._state.y_val
         if issparse(X_val):
             X_val = X_val.tocsr()
@@ -490,8 +521,9 @@ class AutoML:
             self._X_train_all, self._y_train_all
         if issparse(X_train_all):
             X_train_all = X_train_all.tocsr()
-        if self._state.task != 'regression' and self._state.fit_kwargs.get(
-                'sample_weight') is None:
+        if (self._state.task == 'binary:logistic' or self._state.task == 'multi:softmax') \
+                and self._state.fit_kwargs.get('sample_weight') is None \
+                and self._split_type != 'time':
            # logger.info(f"label {pd.unique(y_train_all)}")
             label_set, counts = np.unique(y_train_all, return_counts=True)
             # augment rare classes
@@ -518,19 +550,21 @@ class AutoML:
                     count += rare_count
                     logger.info(
                         f"class {label} augmented from {rare_count} to {count}")
-        if 'sample_weight' in self._state.fit_kwargs:
-            X_train_all, y_train_all, self._state.fit_kwargs[
-                'sample_weight'] = shuffle(
+        SHUFFLE_SPLIT_TYPES = ['uniform', 'stratified']
+        if self._split_type in SHUFFLE_SPLIT_TYPES:
+            if 'sample_weight' in self._state.fit_kwargs:
+                X_train_all, y_train_all, self._state.fit_kwargs[
+                    'sample_weight'] = shuffle(
                     X_train_all, y_train_all,
                     self._state.fit_kwargs['sample_weight'],
                     random_state=RANDOM_SEED)
             elif hasattr(self._state, 'groups') and self._state.groups is not None:
                 X_train_all, y_train_all, self._state.groups = shuffle(
                     X_train_all, y_train_all, self._state.groups,
                     random_state=RANDOM_SEED)
             else:
                 X_train_all, y_train_all = shuffle(
                     X_train_all, y_train_all, random_state=RANDOM_SEED)
         if self._df:
             X_train_all.reset_index(drop=True, inplace=True)
             if isinstance(y_train_all, pd.Series):
@@ -539,7 +573,31 @@ class AutoML:
         X_train, y_train = X_train_all, y_train_all
         if X_val is None:
             # if eval_method = holdout, make holdout data
-            if self._state.task != 'regression' and eval_method == 'holdout':
+            if eval_method == 'holdout' and self._split_type == 'time':
+                if 'period' in self._state.fit_kwargs:
+                    num_samples = X_train_all.shape[0]
+                    split_idx = num_samples - self._state.fit_kwargs.get('period')
+                    X_train = X_train_all[:split_idx]
+                    y_train = y_train_all[:split_idx]
+                    X_val = X_train_all[split_idx:]
+                    y_val = y_train_all[split_idx:]
+                else:
+                    if 'sample_weight' in self._state.fit_kwargs:
+                        X_train, X_val, y_train, y_val, self._state.fit_kwargs[
+                            'sample_weight'], self._state.weight_val = \
+                            train_test_split(
+                                X_train_all,
+                                y_train_all,
+                                self._state.fit_kwargs['sample_weight'],
+                                test_size=split_ratio,
+                                shuffle=False)
+                    else:
+                        X_train, X_val, y_train, y_val = train_test_split(
+                            X_train_all,
+                            y_train_all,
+                            test_size=split_ratio,
+                            shuffle=False)
+            elif self._state.task != 'regression' and eval_method == 'holdout':
                 # for classification, make sure the labels are complete in both
                 # training and validation data
                 label_set, first = np.unique(y_train_all, return_index=True)
@@ -624,6 +682,13 @@ class AutoML:
                     f"requires input data with at least {n_splits*2} examples.")
             self._state.kf = RepeatedStratifiedKFold(
                 n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
+        elif self._split_type == "time":
+            logger.info("Using TimeSeriesSplit")
+            if self._state.task == 'forecast':
+                self._state.kf = TimeSeriesSplit(
+                    n_splits=n_splits, test_size=self._state.fit_kwargs.get('period'))
+            else:
+                self._state.kf = TimeSeriesSplit(n_splits=n_splits)
         else:
             logger.info("Using RepeatedKFold")
             self._state.kf = RepeatedKFold(
@@ -762,10 +827,15 @@ class AutoML:
         if self._state.task == 'classification':
             self._state.task = get_classification_objective(
                 len(np.unique(self._y_train_all)))
-            assert split_type in ["stratified", "uniform"]
+            assert split_type in ["stratified", "uniform", "time"]
             self._split_type = split_type
-        else:
-            self._split_type = "uniform"
+        elif self._state.task == 'regression':
+            if split_type in ["uniform", "time"]:
+                self._split_type = split_type
+            else:
+                self._split_type = "uniform"
+        elif self._state.task == 'forecast':
+            self._split_type = "time"
         if record_id >= 0:
             eval_method = 'cv'
         elif eval_method == 'auto':
@@ -1011,15 +1081,22 @@ class AutoML:
         Args:
             X_train: A numpy array or a pandas dataframe of training data in
                 shape (n, m)
+                For 'forecast' task, X_train should be timestamp
             y_train: A numpy array or a pandas series of labels in shape (n,)
+                For 'forecast' task, y_train should be value
             dataframe: A dataframe of training data including label column
-            label: A str of the label column name
+                For 'forecast' task, dataframe must be specified and should
+                have two columns: timestamp and value
+            label: A str of the label column name for 'classification' or
+                'regression' task or a tuple of strings for timestamp and
+                value columns for 'forecasting' task
                 Note: If X_train and y_train are provided,
                 dataframe and label are ignored;
                 If not, dataframe and label must be provided.
             metric: A string of the metric name or a function,
                 e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
-                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2'
+                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mape', 'mae', 'mse', 'r2'
+                for 'forecast' task, use 'mape'
                 if passing a customized metric function, the function needs to
                 have the follwing signature:
@@ -1034,7 +1111,7 @@ class AutoML:
                 which returns a float number as the minimization objective,
                 and a tuple of floats or a dictionary as the metrics to log
             task: A string of the task type, e.g.,
-                'classification', 'regression'
+                'classification', 'regression', 'forecast'
             n_jobs: An integer of the number of threads for training
             log_file_name: A string of the log file name
             estimator_list: A list of strings for estimator names, or 'auto'
@@ -1085,7 +1162,8 @@ class AutoML:
                 hyperparamter configurations for the corresponding estimators.
             seed: int or None, default=None | The random seed for np.random.
             **fit_kwargs: Other key word arguments to pass to fit() function of
-                the searched learners, such as sample_weight.
+                the searched learners, such as sample_weight. Include period as
+                a key word argument for 'forecast' task.
         '''
         self._start_time_flag = time.time()
         self._state.task = task
@@ -1093,6 +1171,7 @@ class AutoML:
         self._state.fit_kwargs = fit_kwargs
         self._state.weight_val = sample_weight_val
         self._state.groups = groups
+
         self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
         self._search_states = {}  # key: estimator name; value: SearchState
         self._random = np.random.RandomState(RANDOM_SEED)
@@ -1106,10 +1185,19 @@ class AutoML:
         if self._state.task == 'classification':
             self._state.task = get_classification_objective(
                 len(np.unique(self._y_train_all)))
-            assert split_type in ["stratified", "uniform"]
+            assert split_type in ["stratified", "uniform", "time"]
             self._split_type = split_type
-        else:
-            self._split_type = "uniform"
+        elif self._state.task == 'regression':
+            if split_type in ["uniform", "time"]:
+                self._split_type = split_type
+            else:
+                self._split_type = "uniform"
+        elif self._state.task == 'forecast':
+            if split_type is not None and split_type != 'time':
+                raise ValueError("split_type must be 'time' when task is 'forecast'. ")
+            self._split_type = "time"
+        if self._state.task == 'forecast' and self._state.fit_kwargs.get('period') is None:
+            raise TypeError("missing 1 required argument for 'forecast' task: 'period'. ")
         if eval_method == 'auto' or self._state.X_val is not None:
             eval_method = self._decide_eval_method(time_budget)
         self._state.eval_method = eval_method
@@ -1122,7 +1210,11 @@ class AutoML:
 
         self._retrain_full = retrain_full and (
             eval_method == 'holdout' and self._state.X_val is None)
-        self._prepare_data(eval_method, split_ratio, n_splits)
+        if self._state.task != 'forecast':
+            self._prepare_data(eval_method, split_ratio, n_splits)
+        else:
+            self._prepare_data(eval_method, split_ratio, n_splits,
+                               period=self._state.fit_kwargs.get('period'))
         self._sample = sample and eval_method != 'cv' and (
             MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
         if 'auto' == metric:
@@ -1130,6 +1222,8 @@ class AutoML:
                 metric = 'roc_auc'
             elif 'multi' in self._state.task:
                 metric = 'log_loss'
+            elif self._state.task == 'forecast':
+                metric = 'mape'
             else:
                 metric = 'r2'
         self._state.metric = metric
@@ -1146,6 +1240,8 @@ class AutoML:
             estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
             if 'regression' != self._state.task:
                 estimator_list += ['lrl1']
+            if self._state.task == 'forecast':
+                estimator_list = ['fbprophet', 'arima', 'sarimax']
         for estimator_name in estimator_list:
             if estimator_name not in self._state.learner_classes:
                 self.add_learner(
@@ -1237,7 +1333,7 @@ class AutoML:
         elif 'bs' == self._hpo_method:
             from flaml import BlendSearch as SearchAlgo
         elif 'cfocat' == self._hpo_method:
-            from flaml import CFOCat as SearchAlgo
+            from flaml.searcher.cfo_cat import CFOCat as SearchAlgo
         else:
             raise NotImplementedError(
                 f"hpo_method={self._hpo_method} is not recognized. "
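The 'time' holdout branch in _prepare_data above reserves the trailing `period` rows for validation instead of sampling randomly; a small standalone sketch of that arithmetic (the sizes are illustrative):

# sketch of the period-based time holdout: the last `period` rows become
# the validation set, with no shuffling
import numpy as np

num_samples, period = 526, 12
X_all = np.arange(num_samples)
split_idx = num_samples - period          # 514
X_train, X_val = X_all[:split_idx], X_all[split_idx:]
assert len(X_val) == period and X_val[0] > X_train[-1]  # strictly later samples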
flaml/data.py

@@ -120,7 +120,7 @@ def get_output_from_log(filename, time_budget):
         time_budget: A float of the time budget in seconds
 
     Returns:
-        training_time_list: A list of the finished time of each logged iter
+        search_time_list: A list of the finished time of each logged iter
         best_error_list:
             A list of the best validation error after each logged iter
         error_list: A list of the validation error of each logged iter
@@ -132,9 +132,8 @@ def get_output_from_log(filename, time_budget):
     best_config = None
     best_learner = None
     best_val_loss = float('+inf')
-    training_duration = 0.0
 
-    training_time_list = []
+    search_time_list = []
     config_list = []
     best_error_list = []
     error_list = []
@@ -143,7 +142,6 @@ def get_output_from_log(filename, time_budget):
     with training_log_reader(filename) as reader:
         for record in reader.records():
             time_used = record.total_search_time
-            training_duration = time_used
             val_loss = record.validation_loss
             config = record.config
             learner = record.learner.split('_')[0]
@@ -156,7 +154,7 @@ def get_output_from_log(filename, time_budget):
                 best_config = config
                 best_learner = learner
             best_config_list.append(best_config)
-            training_time_list.append(training_duration)
+            search_time_list.append(time_used)
             best_error_list.append(best_val_loss)
             logged_metric_list.append(train_loss)
             error_list.append(val_loss)
@@ -166,7 +164,7 @@ def get_output_from_log(filename, time_budget):
                 "Best Learner": best_learner,
                 "Best Hyper-parameters": best_config})
 
-    return (training_time_list, best_error_list, error_list, config_list,
+    return (search_time_list, best_error_list, error_list, config_list,
             logged_metric_list)
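With the rename, callers now unpack the first return value as search time rather than training duration; a usage sketch mirroring test/test_forecast.py below (the log file name and budget come from that test):

from flaml.data import get_output_from_log

(search_time_list, best_error_list, error_list,
 config_list, metric_list) = get_output_from_log(
    filename='CO2_forecast.log', time_budget=60)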
flaml/ml.py: 38 changes

@@ -9,12 +9,12 @@ import numpy as np
 import pandas as pd
 from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
     accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
-    f1_score
-from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold
+    f1_score, mean_absolute_percentage_error
+from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
 from .model import (
     XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
     LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
-    ExtraTreeEstimator, KNeighborsEstimator)
+    ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
 
 import logging
 logger = logging.getLogger(__name__)
@@ -42,6 +42,12 @@ def get_estimator_class(task, estimator_name):
         estimator_class = ExtraTreeEstimator
     elif 'kneighbor' == estimator_name:
         estimator_class = KNeighborsEstimator
+    elif 'prophet' in estimator_name:
+        estimator_class = FBProphet
+    elif estimator_name == 'arima':
+        estimator_class = ARIMA
+    elif estimator_name == 'sarimax':
+        estimator_class = SARIMAX
     else:
         raise ValueError(
             estimator_name + ' is not a built-in learner. '
@@ -57,7 +63,7 @@ def sklearn_metric_loss_score(
     Args:
         metric_name: A string of the metric name, one of
             'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
-            'roc_auc_ovo', 'log_loss', 'f1', 'ap', 'micro_f1', 'macro_f1'
+            'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'micro_f1', 'macro_f1'
         y_predict: A 1d or 2d numpy array of the predictions which can be
             used to calculate the metric. E.g., 2d for log_loss and 1d
             for others.
@@ -95,6 +101,9 @@ def sklearn_metric_loss_score(
     elif 'log_loss' in metric_name:
         score = log_loss(
             y_true, y_predict, labels=labels, sample_weight=sample_weight)
+    elif 'mape' in metric_name:
+        score = mean_absolute_percentage_error(
+            y_true, y_predict)
     elif 'micro_f1' in metric_name:
         score = 1 - f1_score(
             y_true, y_predict, sample_weight=sample_weight, average='micro')
@@ -111,18 +120,20 @@ def sklearn_metric_loss_score(
             metric_name + ' is not a built-in metric, '
             'currently built-in metrics are: '
             'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
-            'log_loss, f1, micro_f1, macro_f1, ap. '
+            'log_loss, mape, f1, micro_f1, macro_f1, ap. '
             'please pass a customized metric function to AutoML.fit(metric=func)')
     return score
 
 
-def get_y_pred(estimator, X, eval_metric, obj):
+def get_y_pred(estimator, X, eval_metric, obj, freq=None):
     if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
         y_pred_classes = estimator.predict_proba(X)
         y_pred = y_pred_classes[
             :, 1] if y_pred_classes.ndim > 1 else y_pred_classes
     elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
         y_pred = estimator.predict_proba(X)
+    elif eval_metric == 'mape':
+        y_pred = estimator.predict(X, freq=freq)
     else:
         y_pred = estimator.predict(X)
     return y_pred
@@ -201,15 +212,21 @@ def evaluate_model_CV(
     valid_fold_num = total_fold_num = 0
     n = kf.get_n_splits()
     X_train_split, y_train_split = X_train_all, y_train_all
-    if task == 'regression':
-        labels = None
-    else:
+    if task == 'binary:logistics' or task == 'multi:softmax':
         labels = np.unique(y_train_all)
+    else:
+        labels = None
 
     if isinstance(kf, RepeatedStratifiedKFold):
         kf = kf.split(X_train_split, y_train_split)
     elif isinstance(kf, GroupKFold):
         kf = kf.split(X_train_split, y_train_split, kf.groups)
+    elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
+        y_train_all = pd.DataFrame(y_train_all, columns=['y'])
+        train = X_train_all.join(y_train_all)
+        kf = kf.split(train)
+    elif isinstance(kf, TimeSeriesSplit):
+        kf = kf.split(X_train_split, y_train_split)
     else:
         kf = kf.split(X_train_split)
     rng = np.random.RandomState(2020)
@@ -221,7 +238,8 @@ def evaluate_model_CV(
     else:
         weight = weight_val = None
     for train_index, val_index in kf:
-        train_index = rng.permutation(train_index)
+        if not isinstance(kf, TimeSeriesSplit):
+            train_index = rng.permutation(train_index)
         if isinstance(X_train_all, pd.DataFrame):
             X_train, X_val = X_train_split.iloc[
                 train_index], X_train_split.iloc[val_index]
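The new 'mape' branch delegates to scikit-learn's mean_absolute_percentage_error, which was added in scikit-learn 0.24; a quick check of what the metric computes:

import numpy as np
from sklearn.metrics import mean_absolute_percentage_error  # sklearn >= 0.24

y_true = np.array([100.0, 200.0, 300.0])
y_pred = np.array([110.0, 190.0, 330.0])
# mean(|y_true - y_pred| / |y_true|) = (0.1 + 0.05 + 0.1) / 3
print(mean_absolute_percentage_error(y_true, y_pred))  # ~0.0833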
flaml/model.py: 221 changes

@@ -15,6 +15,7 @@ import pandas as pd
 from . import tune
 
 import logging
+
 logger = logging.getLogger(__name__)
 
 
@@ -635,7 +636,6 @@ class LRL2Classifier(SKLearnEstimator):
 
 
 class CatBoostEstimator(BaseEstimator):
-
     _time_per_iter = None
     _train_size = 0
 
@@ -834,3 +834,222 @@ class KNeighborsEstimator(BaseEstimator):
             X = X.drop(cat_columns, axis=1)
             X = X.to_numpy()
         return X
+
+
+class FBProphet(BaseEstimator):
+    @classmethod
+    def search_space(cls, **params):
+        space = {
+            'changepoint_prior_scale': {
+                'domain': tune.loguniform(lower=0.001, upper=1000),
+                'init_value': 0.01,
+                'low_cost_init_value': 0.001,
+            },
+            'seasonality_prior_scale': {
+                'domain': tune.loguniform(lower=0.01, upper=100),
+                'init_value': 1,
+            },
+            'holidays_prior_scale': {
+                'domain': tune.loguniform(lower=0.01, upper=100),
+                'init_value': 1,
+            },
+            'seasonality_mode': {
+                'domain': tune.choice(['additive', 'multiplicative']),
+                'init_value': 'multiplicative',
+            }
+        }
+        return space
+
+    def fit(self, X_train, y_train, budget=None, **kwargs):
+        y_train = pd.DataFrame(y_train, columns=['y'])
+        train_df = X_train.join(y_train)
+
+        if ('ds' not in train_df) or ('y' not in train_df):
+            raise ValueError(
+                'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
+                'values respectively.'
+            )
+
+        if 'n_jobs' in self.params:
+            self.params.pop('n_jobs')
+
+        from prophet import Prophet
+
+        current_time = time.time()
+        model = Prophet(**self.params).fit(train_df)
+        train_time = time.time() - current_time
+        self._model = model
+        return train_time
+
+    def predict(self, X_test, freq=None):
+        if self._model is not None:
+            if isinstance(X_test, int) and freq is not None:
+                future = self._model.make_future_dataframe(periods=X_test, freq=freq)
+                forecast = self._model.predict(future)
+            elif isinstance(X_test, pd.DataFrame):
+                forecast = self._model.predict(X_test)
+            else:
+                raise ValueError(
+                    "either X_test(pd.Dataframe with dates for predictions, column ds) or"
+                    "X_test(int number of periods)+freq are required.")
+            return forecast['yhat']
+        else:
+            return np.ones(X_test.shape[0])
+
+
+class ARIMA(BaseEstimator):
+    @classmethod
+    def search_space(cls, **params):
+        space = {
+            'p': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 2,
+                'low_cost_init_value': 0,
+            },
+            'd': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 2,
+                'low_cost_init_value': 0,
+            },
+            'q': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 2,
+                'low_cost_init_value': 0,
+            }
+        }
+        return space
+
+    def fit(self, X_train, y_train, budget=None, **kwargs):
+        y_train = pd.DataFrame(y_train, columns=['y'])
+        train_df = X_train.join(y_train)
+
+        if ('ds' not in train_df) or ('y' not in train_df):
+            raise ValueError(
+                'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
+                'values respectively.'
+            )
+
+        train_df.index = pd.to_datetime(train_df['ds'])
+        train_df = train_df.drop('ds', axis=1)
+
+        if 'n_jobs' in self.params:
+            self.params.pop('n_jobs')
+
+        from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
+        import warnings
+        warnings.filterwarnings("ignore")
+
+        current_time = time.time()
+        model = ARIMA_estimator(train_df,
+                                order=(self.params['p'], self.params['d'], self.params['q']),
+                                enforce_stationarity=False,
+                                enforce_invertibility=False)
+        model = model.fit()
+        train_time = time.time() - current_time
+        self._model = model
+        return train_time
+
+    def predict(self, X_test, freq=None):
+        if self._model is not None:
+            if isinstance(X_test, int) and freq is not None:
+                forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
+            elif isinstance(X_test, pd.DataFrame):
+                start_date = X_test.iloc[0, 0]
+                end_date = X_test.iloc[-1, 0]
+                forecast = self._model.predict(start=start_date, end=end_date)
+            else:
+                raise ValueError(
+                    "either X_test(pd.Dataframe with dates for predictions, column ds) or"
+                    "X_test(int number of periods)+freq are required.")
+            return forecast
+        else:
+            return np.ones(X_test.shape[0])
+
+
+class SARIMAX(BaseEstimator):
+    @classmethod
+    def search_space(cls, **params):
+        space = {
+            'p': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 2,
+                'low_cost_init_value': 0,
+            },
+            'd': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 2,
+                'low_cost_init_value': 0,
+            },
+            'q': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 2,
+                'low_cost_init_value': 0,
+            },
+            'P': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 1,
+                'low_cost_init_value': 0,
+            },
+            'D': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 1,
+                'low_cost_init_value': 0,
+            },
+            'Q': {
+                'domain': tune.quniform(lower=0, upper=10, q=1),
+                'init_value': 1,
+                'low_cost_init_value': 0,
+            },
+            's': {
+                'domain': tune.choice([1, 4, 6, 12]),
+                'init_value': 12,
+            }
+        }
+        return space
+
+    def fit(self, X_train, y_train, budget=None, **kwargs):
+        y_train = pd.DataFrame(y_train, columns=['y'])
+        train_df = X_train.join(y_train)
+
+        if ('ds' not in train_df) or ('y' not in train_df):
+            raise ValueError(
+                'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
+                'values respectively.'
+            )
+
+        train_df.index = pd.to_datetime(train_df['ds'])
+        train_df = train_df.drop('ds', axis=1)
+
+        if 'n_jobs' in self.params:
+            self.params.pop('n_jobs')
+
+        from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator
+
+        current_time = time.time()
+        model = SARIMAX_estimator(train_df,
+                                  order=(self.params['p'], self.params['d'], self.params['q']),
+                                  seasonality_order=(self.params['P'], self.params['D'], self.params['Q'], self.params['s']),
+                                  enforce_stationarity=False,
+                                  enforce_invertibility=False)
+        model = model.fit()
+        train_time = time.time() - current_time
+        self._model = model
+        return train_time
+
+    def predict(self, X_test, freq=None):
+        if self._model is not None:
+            if isinstance(X_test, int) and freq is not None:
+                forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
+            elif isinstance(X_test, pd.DataFrame):
+                start_date = X_test.iloc[0, 0]
+                end_date = X_test.iloc[-1, 0]
+                forecast = self._model.predict(start=start_date, end=end_date)
+            else:
+                raise ValueError(
+                    "either X_test(pd.Dataframe with dates for predictions, column ds)"
+                    "or X_test(int number of periods)+freq are required.")
+            return forecast
+        else:
+            return np.ones(X_test.shape[0])
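These estimators are cross-validated with the TimeSeriesSplit that _prepare_data builds as TimeSeriesSplit(n_splits, test_size=period), so each validation fold is exactly `period` consecutive trailing samples; a standalone sketch (test_size requires scikit-learn >= 0.24, sizes illustrative):

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(60).reshape(-1, 1)
tscv = TimeSeriesSplit(n_splits=3, test_size=12)
for train_idx, val_idx in tscv.split(X):
    # each fold trains on everything before a 12-sample trailing window
    print(len(train_idx), val_idx[0], val_idx[-1] + 1)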
flaml/searcher/blendsearch.py

@@ -165,7 +165,8 @@ class BlendSearch(Searcher):
             min_resource, max_resource, reduction_factor, self.cost_attr, seed)
         self._is_ls_ever_converged = False
         self._subspace = {}  # the subspace for each trial id
-        self._init_search()
+        if space:
+            self._init_search()
 
     def set_search_properties(self,
                               metric: Optional[str] = None,
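The `if space:` guard lets BlendSearch be constructed without a search space, deferring _init_search() until the space is supplied later; that is the path the new test/tune/example.py below exercises:

# sketch (assumes ray[tune] is installed): no space at construction time,
# so _init_search() runs only after tune.run() passes in the config
from ray.tune.suggest.flaml import BlendSearch

algo = BlendSearch()  # see test/tune/example.py below for the full run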
flaml/version.py

@@ -1 +1 @@
-__version__ = "0.5.12"
+__version__ = "0.5.13"
notebook/flaml_forecast.ipynb (new file): 1089 lines
(File diff suppressed because one or more lines are too long.)
setup.py: 5 changes

@@ -56,6 +56,7 @@ setuptools.setup(
             "torch==1.8.1",
             "datasets==1.4.1",
             "azure-storage-blob",
+            "statsmodels>=0.12.2"
         ],
         "blendsearch": [
             "optuna==2.8.0"
@@ -79,6 +80,10 @@ setuptools.setup(
             "datasets==1.4.1",
             "tensorboardX<=2.2",
             "torch"
+        ],
+        "forecast": [
+            "prophet>=1.0.1",
+            "statsmodels>=0.12.2"
         ]
     },
     classifiers=[
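The new extra pairs with the CI change above: a source checkout installs it via pip install -e .[ray,forecast], and a released package would install it via pip install "flaml[forecast]", pulling in prophet>=1.0.1 and statsmodels>=0.12.2.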
test/test_forecast.py (new file): 119 lines

@@ -0,0 +1,119 @@
+def test_forecast_automl_df(budget=5):
+    # using dataframe
+    import statsmodels.api as sm
+    data = sm.datasets.co2.load_pandas()
+    data = data.data
+    data = data['co2'].resample('MS').mean()
+    data = data.fillna(data.bfill())
+    data = data.to_frame().reset_index()
+    data = data.rename(columns={'index': 'ds', 'co2': 'y'})
+    num_samples = data.shape[0]
+    time_horizon = 12
+    split_idx = num_samples - time_horizon
+    X_train = data[:split_idx]
+    X_test = data[split_idx:]['ds'].to_frame()
+    y_test = data[split_idx:]['y'].to_frame()
+    ''' import AutoML class from flaml package '''
+    from flaml import AutoML
+    automl = AutoML()
+    settings = {
+        "time_budget": budget,  # total running time in seconds
+        "metric": 'mape',  # primary metric
+        "task": 'forecast',  # task type
+        "log_file_name": 'CO2_forecast.log',  # flaml log file
+        "eval_method": "holdout",
+        "split_type": 'time'
+    }
+    '''The main flaml automl API'''
+    try:
+        automl.fit(dataframe=X_train, **settings, period=time_horizon, freq='M')
+    except ImportError:
+        automl.fit(dataframe=X_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
+    ''' retrieve best config and best learner'''
+    print('Best ML leaner:', automl.best_estimator)
+    print('Best hyperparmeter config:', automl.best_config)
+    print(f'Best mape on validation data: {automl.best_loss}')
+    print(f'Training duration of best run: {automl.best_config_train_time}s')
+    print(automl.model.estimator)
+    ''' pickle and save the automl object '''
+    import pickle
+    with open('automl.pkl', 'wb') as f:
+        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
+    ''' compute predictions of testing dataset '''
+    y_pred = automl.predict(X_test)
+    print('Predicted labels', y_pred)
+    print('True labels', y_test)
+    ''' compute different metric values on testing dataset'''
+    from flaml.ml import sklearn_metric_loss_score
+    print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
+    from flaml.data import get_output_from_log
+    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
+        get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
+    for config in config_history:
+        print(config)
+    print(automl.prune_attr)
+    print(automl.max_resource)
+    print(automl.min_resource)
+
+
+def test_forecast_automl_Xy(budget=5):
+    # using X_train and y_train
+    import statsmodels.api as sm
+    data = sm.datasets.co2.load_pandas()
+    data = data.data
+    data = data['co2'].resample('MS').mean()
+    data = data.fillna(data.bfill())
+    data = data.to_frame().reset_index()
+    num_samples = data.shape[0]
+    time_horizon = 12
+    split_idx = num_samples - time_horizon
+    X_train = data[:split_idx]['index'].to_frame()
+    y_train = data[:split_idx]['co2']
+    X_test = data[split_idx:]['index'].to_frame()
+    y_test = data[split_idx:]['co2'].to_frame()
+    ''' import AutoML class from flaml package '''
+    from flaml import AutoML
+    automl = AutoML()
+    settings = {
+        "time_budget": budget,  # total running time in seconds
+        "metric": 'mape',  # primary metric
+        "task": 'forecast',  # task type
+        "log_file_name": 'CO2_forecast.log',  # flaml log file
+        "eval_method": "holdout",
+        "split_type": 'time'
+    }
+    '''The main flaml automl API'''
+    try:
+        automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon, freq='M')
+    except ImportError:
+        automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
+    ''' retrieve best config and best learner'''
+    print('Best ML leaner:', automl.best_estimator)
+    print('Best hyperparmeter config:', automl.best_config)
+    print(f'Best mape on validation data: {automl.best_loss}')
+    print(f'Training duration of best run: {automl.best_config_train_time}s')
+    print(automl.model.estimator)
+    ''' pickle and save the automl object '''
+    import pickle
+    with open('automl.pkl', 'wb') as f:
+        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
+    ''' compute predictions of testing dataset '''
+    y_pred = automl.predict(X_test)
+    print('Predicted labels', y_pred)
+    print('True labels', y_test)
+    ''' compute different metric values on testing dataset'''
+    from flaml.ml import sklearn_metric_loss_score
+    print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
+    from flaml.data import get_output_from_log
+    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
+        get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
+    for config in config_history:
+        print(config)
+    print(automl.prune_attr)
+    print(automl.max_resource)
+    print(automl.min_resource)
+
+
+if __name__ == "__main__":
+    test_forecast_automl_df(60)
+    test_forecast_automl_Xy(60)
test/test_split.py

@@ -6,10 +6,12 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 
 
-dataset = "credit"
+dataset = "credit-g"
 
 
 def _test(split_type):
+    from sklearn.externals._arff import ArffException
+
     automl = AutoML()
 
     automl_settings = {
@@ -22,9 +24,17 @@ def _test(split_type):
         "split_type": split_type,
     }
 
-    X, y = fetch_openml(name=dataset, return_X_y=True)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
-                                                        random_state=42)
+    try:
+        X, y = fetch_openml(name=dataset, return_X_y=True)
+    except (ArffException, ValueError):
+        from sklearn.datasets import load_wine
+        X, y = load_wine(return_X_y=True)
+    if split_type != 'time':
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
+                                                            random_state=42)
+    else:
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
+                                                            shuffle=False)
     automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
 
     pred = automl.predict(X_test)
@@ -37,6 +47,10 @@ def _test_uniform():
     _test(split_type="uniform")
 
 
+def test_time():
+    _test(split_type="time")
+
+
 def test_groups():
     from sklearn.externals._arff import ArffException
     try:
test/tune/example.py (new file): 52 lines

@@ -0,0 +1,52 @@
+import time
+
+
+def evaluation_fn(step, width, height):
+    return (0.1 + width * step / 100)**(-1) + height * 0.1
+
+
+def easy_objective(config):
+    from ray import tune
+    # Hyperparameters
+    width, height = config["width"], config["height"]
+
+    for step in range(config["steps"]):
+        # Iterative training function - can be any arbitrary training procedure
+        intermediate_score = evaluation_fn(step, width, height)
+        # Feed the score back back to Tune.
+        tune.report(iterations=step, mean_loss=intermediate_score)
+        time.sleep(0.1)
+
+
+def test_blendsearch_tune(smoke_test=True):
+    try:
+        from ray import tune
+        from ray.tune.suggest import ConcurrencyLimiter
+        from ray.tune.schedulers import AsyncHyperBandScheduler
+        from ray.tune.suggest.flaml import BlendSearch
+    except ImportError:
+        print('ray[tune] is not installed, skipping test')
+        return
+    algo = BlendSearch()
+    algo = ConcurrencyLimiter(algo, max_concurrent=4)
+    scheduler = AsyncHyperBandScheduler()
+    analysis = tune.run(
+        easy_objective,
+        metric="mean_loss",
+        mode="min",
+        search_alg=algo,
+        scheduler=scheduler,
+        num_samples=10 if smoke_test else 100,
+        config={
+            "steps": 100,
+            "width": tune.uniform(0, 20),
+            "height": tune.uniform(-100, 100),
+            # This is an ignored parameter.
+            "activation": tune.choice(["relu", "tanh"])
+        })
+
+    print("Best hyperparameters found were: ", analysis.best_config)
+
+
+if __name__ == "__main__":
+    test_blendsearch_tune(False)