Mirror of https://github.com/microsoft/autogen.git (synced 2025-10-20 12:29:37 +00:00)
Forecast (#162)

* added 'forecast' task with estimators ['fbprophet', 'arima', 'sarimax']
* update setup.py
* add TimeSeriesSplit to 'regression' and 'classification' task
* add 'time' split_type for 'classification' and 'regression' task

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>

* feature importance
* variable name
* Update test/test_split.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* Update test/test_forecast.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* prophet installation fail in windows
* upload flaml_forecast.ipynb

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>
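For context, a minimal end-to-end sketch of the new 'forecast' task, adapted from test/test_forecast.py added in this commit (the CO2 dataset, 60-second budget, and 12-month horizon are simply the values the test uses):

```python
import statsmodels.api as sm
from flaml import AutoML

# Monthly CO2 series reshaped into the two required columns: 'ds' (timestamp) and 'y' (value).
data = sm.datasets.co2.load_pandas().data['co2'].resample('MS').mean()
data = data.fillna(data.bfill()).to_frame().reset_index()
data = data.rename(columns={'index': 'ds', 'co2': 'y'})

time_horizon = 12                        # forecast the last 12 months
train = data[:-time_horizon]
X_test = data[-time_horizon:]['ds'].to_frame()

automl = AutoML()
automl.fit(dataframe=train, task='forecast', metric='mape',
           eval_method='holdout', split_type='time',
           time_budget=60, period=time_horizon, freq='M')
y_pred = automl.predict(X_test)          # one prediction per future timestamp
```

If prophet is not installed (for example on Windows, as noted in the commit message), the test falls back to estimator_list=['arima', 'sarimax'].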
This commit is contained in: parent 6270353458, commit 3d0a3d26a2
2 .github/workflows/python-package.yml (vendored)
@@ -41,7 +41,7 @@ jobs:
- name: If linux or mac, install ray
if: (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') && matrix.python-version != '3.9'
run: |
pip install -e .[ray]
pip install -e .[ray,forecast]
pip install 'tensorboardX<=2.2'
- name: Lint with flake8
run: |
160 flaml/automl.py
@@ -10,7 +10,7 @@ from functools import partial
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
RepeatedKFold, GroupKFold
RepeatedKFold, GroupKFold, TimeSeriesSplit
from sklearn.utils import shuffle
import pandas as pd

@@ -25,6 +25,7 @@ from . import tune
from .training_log import training_log_reader, training_log_writer

import logging

logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
'[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
@@ -360,11 +361,15 @@ class AutoML:
return self._trained_estimator.classes_.tolist()
return None

def predict(self, X_test):
def predict(self, X_test, freq=None):
'''Predict label from features.

Args:
X_test: A numpy array of featurized instances, shape n * m.
X_test: A numpy array of featurized instances, shape n * m,
or a pandas dataframe with one column with timestamp values
for 'forecasting' task.
freq: str or pandas offset, default=None | The frequency of the
time-series.

Returns:
A numpy array of shape n * 1 - - each element is a predicted class
@@ -375,8 +380,14 @@ class AutoML:
"No estimator is trained. Please run fit with enough budget.")
return None
X_test = self._preprocess(X_test)
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1:
if self._state.task == 'forecast':
X_test_df = pd.DataFrame(X_test)
X_test_col = list(X_test.columns)[0]
X_test_df = X_test_df.rename(columns={X_test_col: 'ds'})
y_pred = self._trained_estimator.predict(X_test_df, freq=freq)
else:
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
y_pred = y_pred.flatten()
if self._label_transformer:
return self._label_transformer.inverse_transform(pd.Series(
@@ -408,6 +419,25 @@ class AutoML:

def _validate_data(self, X_train_all, y_train_all, dataframe, label,
X_val=None, y_val=None):
if self._state.task == 'forecast':
if dataframe is not None and label is not None:
dataframe = dataframe.copy()
dataframe = dataframe.rename(columns={label[0]: 'ds', label[1]: 'y'})
elif dataframe is not None:
if ('ds' not in dataframe) or ('y' not in dataframe):
raise ValueError(
'For forecasting task, Dataframe must have columns "ds" and "y" '
'with the dates and values respectively.'
)
elif (X_train_all is not None) and (y_train_all is not None):
dataframe = pd.DataFrame(X_train_all)
time_col = list(dataframe.columns)[0]
dataframe = dataframe.rename(columns={time_col: 'ds'})
dataframe['y'] = pd.Series(y_train_all)
X_train_all = None
y_train_all = None
label = 'y'

if X_train_all is not None and y_train_all is not None:
if not (isinstance(X_train_all, np.ndarray) or issparse(X_train_all)
or isinstance(X_train_all, pd.DataFrame)):
@@ -440,7 +470,7 @@ class AutoML:
else:
raise ValueError(
"either X_train+y_train or dataframe+label are required")
if issparse(X_train_all):
if issparse(X_train_all) or self._state.task == 'forecast':
self._transformer = self._label_transformer = False
self._X_train_all, self._y_train_all = X, y
else:
@@ -482,7 +512,8 @@ class AutoML:
def _prepare_data(self,
eval_method,
split_ratio,
n_splits):
n_splits,
period=None):
X_val, y_val = self._state.X_val, self._state.y_val
if issparse(X_val):
X_val = X_val.tocsr()
@@ -490,8 +521,9 @@ class AutoML:
self._X_train_all, self._y_train_all
if issparse(X_train_all):
X_train_all = X_train_all.tocsr()
if self._state.task != 'regression' and self._state.fit_kwargs.get(
'sample_weight') is None:
if (self._state.task == 'binary:logistic' or self._state.task == 'multi:softmax') \
and self._state.fit_kwargs.get('sample_weight') is None \
and self._split_type != 'time':
# logger.info(f"label {pd.unique(y_train_all)}")
label_set, counts = np.unique(y_train_all, return_counts=True)
# augment rare classes
@@ -518,19 +550,21 @@ class AutoML:
count += rare_count
logger.info(
f"class {label} augmented from {rare_count} to {count}")
if 'sample_weight' in self._state.fit_kwargs:
X_train_all, y_train_all, self._state.fit_kwargs[
'sample_weight'] = shuffle(
SHUFFLE_SPLIT_TYPES = ['uniform', 'stratified']
if self._split_type in SHUFFLE_SPLIT_TYPES:
if 'sample_weight' in self._state.fit_kwargs:
X_train_all, y_train_all, self._state.fit_kwargs[
'sample_weight'] = shuffle(
X_train_all, y_train_all,
self._state.fit_kwargs['sample_weight'],
random_state=RANDOM_SEED)
elif hasattr(self._state, 'groups') and self._state.groups is not None:
X_train_all, y_train_all, self._state.groups = shuffle(
X_train_all, y_train_all, self._state.groups,
random_state=RANDOM_SEED)
else:
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=RANDOM_SEED)
elif hasattr(self._state, 'groups') and self._state.groups is not None:
X_train_all, y_train_all, self._state.groups = shuffle(
X_train_all, y_train_all, self._state.groups,
random_state=RANDOM_SEED)
else:
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=RANDOM_SEED)
if self._df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
@@ -539,7 +573,31 @@ class AutoML:
X_train, y_train = X_train_all, y_train_all
if X_val is None:
# if eval_method = holdout, make holdout data
if self._state.task != 'regression' and eval_method == 'holdout':
if eval_method == 'holdout' and self._split_type == 'time':
if 'period' in self._state.fit_kwargs:
num_samples = X_train_all.shape[0]
split_idx = num_samples - self._state.fit_kwargs.get('period')
X_train = X_train_all[:split_idx]
y_train = y_train_all[:split_idx]
X_val = X_train_all[split_idx:]
y_val = y_train_all[split_idx:]
else:
if 'sample_weight' in self._state.fit_kwargs:
X_train, X_val, y_train, y_val, self._state.fit_kwargs[
'sample_weight'], self._state.weight_val = \
train_test_split(
X_train_all,
y_train_all,
self._state.fit_kwargs['sample_weight'],
test_size=split_ratio,
shuffle=False)
else:
X_train, X_val, y_train, y_val = train_test_split(
X_train_all,
y_train_all,
test_size=split_ratio,
shuffle=False)
elif self._state.task != 'regression' and eval_method == 'holdout':
# for classification, make sure the labels are complete in both
# training and validation data
label_set, first = np.unique(y_train_all, return_index=True)
@@ -624,6 +682,13 @@ class AutoML:
f"requires input data with at least {n_splits*2} examples.")
self._state.kf = RepeatedStratifiedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
elif self._split_type == "time":
logger.info("Using TimeSeriesSplit")
if self._state.task == 'forecast':
self._state.kf = TimeSeriesSplit(
n_splits=n_splits, test_size=self._state.fit_kwargs.get('period'))
else:
self._state.kf = TimeSeriesSplit(n_splits=n_splits)
else:
logger.info("Using RepeatedKFold")
self._state.kf = RepeatedKFold(
@@ -762,10 +827,15 @@ class AutoML:
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in ["stratified", "uniform"]
assert split_type in ["stratified", "uniform", "time"]
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'regression':
if split_type in ["uniform", "time"]:
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'forecast':
self._split_type = "time"
if record_id >= 0:
eval_method = 'cv'
elif eval_method == 'auto':
@@ -1011,15 +1081,22 @@ class AutoML:
Args:
X_train: A numpy array or a pandas dataframe of training data in
shape (n, m)
For 'forecast' task, X_train should be timestamp
y_train: A numpy array or a pandas series of labels in shape (n,)
For 'forecast' task, y_train should be value
dataframe: A dataframe of training data including label column
label: A str of the label column name
For 'forecast' task, dataframe must be specified and should
have two columns: timestamp and value
label: A str of the label column name for 'classification' or
'regression' task or a tuple of strings for timestamp and
value columns for 'forecasting' task
Note: If X_train and y_train are provided,
dataframe and label are ignored;
If not, dataframe and label must be provided.
metric: A string of the metric name or a function,
e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2'
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mape', 'mae', 'mse', 'r2'
for 'forecast' task, use 'mape'
if passing a customized metric function, the function needs to
have the follwing signature:

@@ -1034,7 +1111,7 @@ class AutoML:
which returns a float number as the minimization objective,
and a tuple of floats or a dictionary as the metrics to log
task: A string of the task type, e.g.,
'classification', 'regression'
'classification', 'regression', 'forecast'
n_jobs: An integer of the number of threads for training
log_file_name: A string of the log file name
estimator_list: A list of strings for estimator names, or 'auto'
@@ -1085,7 +1162,8 @@ class AutoML:
hyperparamter configurations for the corresponding estimators.
seed: int or None, default=None | The random seed for np.random.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight.
the searched learners, such as sample_weight. Include period as
a key word argument for 'forecast' task.
'''
self._start_time_flag = time.time()
self._state.task = task
@@ -1093,6 +1171,7 @@ class AutoML:
self._state.fit_kwargs = fit_kwargs
self._state.weight_val = sample_weight_val
self._state.groups = groups

self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
self._search_states = {} # key: estimator name; value: SearchState
self._random = np.random.RandomState(RANDOM_SEED)
@@ -1106,10 +1185,19 @@ class AutoML:
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in ["stratified", "uniform"]
assert split_type in ["stratified", "uniform", "time"]
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'regression':
if split_type in ["uniform", "time"]:
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'forecast':
if split_type is not None and split_type != 'time':
raise ValueError("split_type must be 'time' when task is 'forecast'. ")
self._split_type = "time"
if self._state.task == 'forecast' and self._state.fit_kwargs.get('period') is None:
raise TypeError("missing 1 required argument for 'forecast' task: 'period'. ")
if eval_method == 'auto' or self._state.X_val is not None:
eval_method = self._decide_eval_method(time_budget)
self._state.eval_method = eval_method
@@ -1122,7 +1210,11 @@ class AutoML:

self._retrain_full = retrain_full and (
eval_method == 'holdout' and self._state.X_val is None)
self._prepare_data(eval_method, split_ratio, n_splits)
if self._state.task != 'forecast':
self._prepare_data(eval_method, split_ratio, n_splits)
else:
self._prepare_data(eval_method, split_ratio, n_splits,
period=self._state.fit_kwargs.get('period'))
self._sample = sample and eval_method != 'cv' and (
MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
if 'auto' == metric:
@@ -1130,6 +1222,8 @@ class AutoML:
metric = 'roc_auc'
elif 'multi' in self._state.task:
metric = 'log_loss'
elif self._state.task == 'forecast':
metric = 'mape'
else:
metric = 'r2'
self._state.metric = metric
@@ -1146,6 +1240,8 @@ class AutoML:
estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
if 'regression' != self._state.task:
estimator_list += ['lrl1']
if self._state.task == 'forecast':
estimator_list = ['fbprophet', 'arima', 'sarimax']
for estimator_name in estimator_list:
if estimator_name not in self._state.learner_classes:
self.add_learner(
@@ -1237,7 +1333,7 @@ class AutoML:
elif 'bs' == self._hpo_method:
from flaml import BlendSearch as SearchAlgo
elif 'cfocat' == self._hpo_method:
from flaml import CFOCat as SearchAlgo
from flaml.searcher.cfo_cat import CFOCat as SearchAlgo
else:
raise NotImplementedError(
f"hpo_method={self._hpo_method} is not recognized. "
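Besides a ready-made 'ds'/'y' dataframe, the _validate_data and fit() docstring changes above accept forecast input as X_train/y_train or as a dataframe plus a (timestamp column, value column) tuple label. A minimal sketch of both forms; the toy series and the column names 'month'/'co2_level' are illustrative, not part of the commit:

```python
import numpy as np
import pandas as pd
from flaml import AutoML

# Toy monthly series standing in for real data.
ts = pd.date_range('2000-01-01', periods=60, freq='MS')
vals = pd.Series(np.sin(np.arange(60) / 6.0) + 10)

# Form 1: X_train holds the timestamps and y_train the values;
# _validate_data renames the single column to 'ds' and attaches 'y'.
automl = AutoML()
automl.fit(X_train=pd.DataFrame({'month': ts}), y_train=vals,
           task='forecast', metric='mape', eval_method='holdout',
           split_type='time', time_budget=15, period=12, freq='M')

# Form 2: a dataframe plus a (timestamp_column, value_column) tuple as label;
# the tuple is mapped onto 'ds'/'y' by _validate_data.
df = pd.DataFrame({'month': ts, 'co2_level': vals})
automl2 = AutoML()
automl2.fit(dataframe=df, label=('month', 'co2_level'),
            task='forecast', metric='mape', eval_method='holdout',
            split_type='time', time_budget=15, period=12, freq='M')
```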
@@ -120,7 +120,7 @@ def get_output_from_log(filename, time_budget):
time_budget: A float of the time budget in seconds

Returns:
training_time_list: A list of the finished time of each logged iter
search_time_list: A list of the finished time of each logged iter
best_error_list:
A list of the best validation error after each logged iter
error_list: A list of the validation error of each logged iter
@@ -132,9 +132,8 @@ def get_output_from_log(filename, time_budget):
best_config = None
best_learner = None
best_val_loss = float('+inf')
training_duration = 0.0

training_time_list = []
search_time_list = []
config_list = []
best_error_list = []
error_list = []
@@ -143,7 +142,6 @@ def get_output_from_log(filename, time_budget):
with training_log_reader(filename) as reader:
for record in reader.records():
time_used = record.total_search_time
training_duration = time_used
val_loss = record.validation_loss
config = record.config
learner = record.learner.split('_')[0]
@@ -156,7 +154,7 @@ def get_output_from_log(filename, time_budget):
best_config = config
best_learner = learner
best_config_list.append(best_config)
training_time_list.append(training_duration)
search_time_list.append(time_used)
best_error_list.append(best_val_loss)
logged_metric_list.append(train_loss)
error_list.append(val_loss)
@@ -166,7 +164,7 @@ def get_output_from_log(filename, time_budget):
"Best Learner": best_learner,
"Best Hyper-parameters": best_config})

return (training_time_list, best_error_list, error_list, config_list,
return (search_time_list, best_error_list, error_list, config_list,
logged_metric_list)
38 flaml/ml.py
@@ -9,12 +9,12 @@ import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
f1_score
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold
f1_score, mean_absolute_percentage_error
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
ExtraTreeEstimator, KNeighborsEstimator)
ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)

import logging
logger = logging.getLogger(__name__)
@@ -42,6 +42,12 @@ def get_estimator_class(task, estimator_name):
estimator_class = ExtraTreeEstimator
elif 'kneighbor' == estimator_name:
estimator_class = KNeighborsEstimator
elif 'prophet' in estimator_name:
estimator_class = FBProphet
elif estimator_name == 'arima':
estimator_class = ARIMA
elif estimator_name == 'sarimax':
estimator_class = SARIMAX
else:
raise ValueError(
estimator_name + ' is not a built-in learner. '
@@ -57,7 +63,7 @@ def sklearn_metric_loss_score(
Args:
metric_name: A string of the metric name, one of
'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
'roc_auc_ovo', 'log_loss', 'f1', 'ap', 'micro_f1', 'macro_f1'
'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'micro_f1', 'macro_f1'
y_predict: A 1d or 2d numpy array of the predictions which can be
used to calculate the metric. E.g., 2d for log_loss and 1d
for others.
@@ -95,6 +101,9 @@ def sklearn_metric_loss_score(
elif 'log_loss' in metric_name:
score = log_loss(
y_true, y_predict, labels=labels, sample_weight=sample_weight)
elif 'mape' in metric_name:
score = mean_absolute_percentage_error(
y_true, y_predict)
elif 'micro_f1' in metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average='micro')
@@ -111,18 +120,20 @@ def sklearn_metric_loss_score(
metric_name + ' is not a built-in metric, '
'currently built-in metrics are: '
'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
'log_loss, f1, micro_f1, macro_f1, ap. '
'log_loss, mape, f1, micro_f1, macro_f1, ap. '
'please pass a customized metric function to AutoML.fit(metric=func)')
return score


def get_y_pred(estimator, X, eval_metric, obj):
def get_y_pred(estimator, X, eval_metric, obj, freq=None):
if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
y_pred_classes = estimator.predict_proba(X)
y_pred = y_pred_classes[
:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
y_pred = estimator.predict_proba(X)
elif eval_metric == 'mape':
y_pred = estimator.predict(X, freq=freq)
else:
y_pred = estimator.predict(X)
return y_pred
@@ -201,15 +212,21 @@ def evaluate_model_CV(
valid_fold_num = total_fold_num = 0
n = kf.get_n_splits()
X_train_split, y_train_split = X_train_all, y_train_all
if task == 'regression':
labels = None
else:
if task == 'binary:logistics' or task == 'multi:softmax':
labels = np.unique(y_train_all)
else:
labels = None

if isinstance(kf, RepeatedStratifiedKFold):
kf = kf.split(X_train_split, y_train_split)
elif isinstance(kf, GroupKFold):
kf = kf.split(X_train_split, y_train_split, kf.groups)
elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
y_train_all = pd.DataFrame(y_train_all, columns=['y'])
train = X_train_all.join(y_train_all)
kf = kf.split(train)
elif isinstance(kf, TimeSeriesSplit):
kf = kf.split(X_train_split, y_train_split)
else:
kf = kf.split(X_train_split)
rng = np.random.RandomState(2020)
@@ -221,7 +238,8 @@ def evaluate_model_CV(
else:
weight = weight_val = None
for train_index, val_index in kf:
train_index = rng.permutation(train_index)
if not isinstance(kf, TimeSeriesSplit):
train_index = rng.permutation(train_index)
if isinstance(X_train_all, pd.DataFrame):
X_train, X_val = X_train_split.iloc[
train_index], X_train_split.iloc[val_index]
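The changes above register 'mape' as a built-in loss via sklearn's mean_absolute_percentage_error and let get_y_pred pass freq through to forecast models. A quick sketch of the loss value alone (the numbers are illustrative):

```python
from flaml.ml import sklearn_metric_loss_score

y_true = [100.0, 110.0, 120.0]
y_pred = [90.0, 115.5, 120.0]

# Lower is better: mean(|y_true - y_pred| / |y_true|)
# = (0.10 + 0.05 + 0.00) / 3 = 0.05
print(sklearn_metric_loss_score('mape', y_pred, y_true))
```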
221 flaml/model.py
@@ -15,6 +15,7 @@ import pandas as pd
from . import tune

import logging

logger = logging.getLogger(__name__)

@@ -635,7 +636,6 @@ class LRL2Classifier(SKLearnEstimator):


class CatBoostEstimator(BaseEstimator):

_time_per_iter = None
_train_size = 0

@@ -834,3 +834,222 @@ class KNeighborsEstimator(BaseEstimator):
X = X.drop(cat_columns, axis=1)
X = X.to_numpy()
return X


class FBProphet(BaseEstimator):
@classmethod
def search_space(cls, **params):
space = {
'changepoint_prior_scale': {
'domain': tune.loguniform(lower=0.001, upper=1000),
'init_value': 0.01,
'low_cost_init_value': 0.001,
},
'seasonality_prior_scale': {
'domain': tune.loguniform(lower=0.01, upper=100),
'init_value': 1,
},
'holidays_prior_scale': {
'domain': tune.loguniform(lower=0.01, upper=100),
'init_value': 1,
},
'seasonality_mode': {
'domain': tune.choice(['additive', 'multiplicative']),
'init_value': 'multiplicative',
}
}
return space

def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)

if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)

if 'n_jobs' in self.params:
self.params.pop('n_jobs')

from prophet import Prophet

current_time = time.time()
model = Prophet(**self.params).fit(train_df)
train_time = time.time() - current_time
self._model = model
return train_time

def predict(self, X_test, freq=None):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
future = self._model.make_future_dataframe(periods=X_test, freq=freq)
forecast = self._model.predict(future)
elif isinstance(X_test, pd.DataFrame):
forecast = self._model.predict(X_test)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds) or"
"X_test(int number of periods)+freq are required.")
return forecast['yhat']
else:
return np.ones(X_test.shape[0])


class ARIMA(BaseEstimator):
@classmethod
def search_space(cls, **params):
space = {
'p': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'd': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'q': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
}
}
return space

def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)

if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)

train_df.index = pd.to_datetime(train_df['ds'])
train_df = train_df.drop('ds', axis=1)

if 'n_jobs' in self.params:
self.params.pop('n_jobs')

from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
import warnings
warnings.filterwarnings("ignore")

current_time = time.time()
model = ARIMA_estimator(train_df,
order=(self.params['p'], self.params['d'], self.params['q']),
enforce_stationarity=False,
enforce_invertibility=False)

model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time

def predict(self, X_test, freq=None):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
elif isinstance(X_test, pd.DataFrame):
start_date = X_test.iloc[0, 0]
end_date = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start_date, end=end_date)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds) or"
"X_test(int number of periods)+freq are required.")
return forecast
else:
return np.ones(X_test.shape[0])


class SARIMAX(BaseEstimator):
@classmethod
def search_space(cls, **params):
space = {
'p': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'd': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'q': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'P': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 1,
'low_cost_init_value': 0,
},
'D': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 1,
'low_cost_init_value': 0,
},
'Q': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 1,
'low_cost_init_value': 0,
},
's': {
'domain': tune.choice([1, 4, 6, 12]),
'init_value': 12,
}
}
return space

def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)

if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)

train_df.index = pd.to_datetime(train_df['ds'])
train_df = train_df.drop('ds', axis=1)

if 'n_jobs' in self.params:
self.params.pop('n_jobs')

from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator

current_time = time.time()
model = SARIMAX_estimator(train_df,
order=(self.params['p'], self.params['d'], self.params['q']),
seasonality_order=(self.params['P'], self.params['D'], self.params['Q'], self.params['s']),
enforce_stationarity=False,
enforce_invertibility=False)

model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time

def predict(self, X_test, freq=None):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
elif isinstance(X_test, pd.DataFrame):
start_date = X_test.iloc[0, 0]
end_date = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start_date, end=end_date)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds)"
"or X_test(int number of periods)+freq are required.")
return forecast
else:
return np.ones(X_test.shape[0])
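FBProphet, ARIMA and SARIMAX above are mapped to the learner names 'fbprophet', 'arima' and 'sarimax' by get_estimator_class in flaml/ml.py and together form the default estimator list for the 'forecast' task. A small sketch of restricting the search to the statsmodels-based learners, mirroring the ImportError fallback in test/test_forecast.py; the toy data and budget are illustrative:

```python
import numpy as np
import pandas as pd
from flaml import AutoML

# Toy monthly frame with the required 'ds'/'y' columns.
df = pd.DataFrame({'ds': pd.date_range('2000-01-01', periods=60, freq='MS'),
                   'y': np.sin(np.arange(60) / 6.0) + 10})

automl = AutoML()
automl.fit(dataframe=df, task='forecast', metric='mape',
           estimator_list=['arima', 'sarimax'],   # skip prophet if it is unavailable
           eval_method='holdout', split_type='time',
           time_budget=15, period=12, freq='M')
```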
@@ -165,7 +165,8 @@ class BlendSearch(Searcher):
min_resource, max_resource, reduction_factor, self.cost_attr, seed)
self._is_ls_ever_converged = False
self._subspace = {} # the subspace for each trial id
self._init_search()
if space:
self._init_search()

def set_search_properties(self,
metric: Optional[str] = None,
@@ -1 +1 @@
__version__ = "0.5.12"
__version__ = "0.5.13"
1089 notebook/flaml_forecast.ipynb (new file)
File diff suppressed because one or more lines are too long
5 setup.py
@@ -56,6 +56,7 @@ setuptools.setup(
"torch==1.8.1",
"datasets==1.4.1",
"azure-storage-blob",
"statsmodels>=0.12.2"
],
"blendsearch": [
"optuna==2.8.0"
@@ -79,6 +80,10 @@ setuptools.setup(
"datasets==1.4.1",
"tensorboardX<=2.2",
"torch"
],
"forecast": [
"prophet>=1.0.1",
"statsmodels>=0.12.2"
]
},
classifiers=[
119 test/test_forecast.py (new file)
@@ -0,0 +1,119 @@
def test_forecast_automl_df(budget=5):
# using dataframe
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas()
data = data.data
data = data['co2'].resample('MS').mean()
data = data.fillna(data.bfill())
data = data.to_frame().reset_index()
data = data.rename(columns={'index': 'ds', 'co2': 'y'})
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_train = data[:split_idx]
X_test = data[split_idx:]['ds'].to_frame()
y_test = data[split_idx:]['y'].to_frame()
''' import AutoML class from flaml package '''
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": 'mape', # primary metric
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"split_type": 'time'
}
'''The main flaml automl API'''
try:
automl.fit(dataframe=X_train, **settings, period=time_horizon, freq='M')
except ImportError:
automl.fit(dataframe=X_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
''' retrieve best config and best learner'''
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print(f'Best mape on validation data: {automl.best_loss}')
print(f'Training duration of best run: {automl.best_config_train_time}s')
print(automl.model.estimator)
''' pickle and save the automl object '''
import pickle
with open('automl.pkl', 'wb') as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
''' compute predictions of testing dataset '''
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
for config in config_history:
print(config)
print(automl.prune_attr)
print(automl.max_resource)
print(automl.min_resource)


def test_forecast_automl_Xy(budget=5):
# using X_train and y_train
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas()
data = data.data
data = data['co2'].resample('MS').mean()
data = data.fillna(data.bfill())
data = data.to_frame().reset_index()
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_train = data[:split_idx]['index'].to_frame()
y_train = data[:split_idx]['co2']
X_test = data[split_idx:]['index'].to_frame()
y_test = data[split_idx:]['co2'].to_frame()
''' import AutoML class from flaml package '''
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": 'mape', # primary metric
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"split_type": 'time'
}
'''The main flaml automl API'''
try:
automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon, freq='M')
except ImportError:
automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
''' retrieve best config and best learner'''
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print(f'Best mape on validation data: {automl.best_loss}')
print(f'Training duration of best run: {automl.best_config_train_time}s')
print(automl.model.estimator)
''' pickle and save the automl object '''
import pickle
with open('automl.pkl', 'wb') as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
''' compute predictions of testing dataset '''
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
for config in config_history:
print(config)
print(automl.prune_attr)
print(automl.max_resource)
print(automl.min_resource)


if __name__ == "__main__":
test_forecast_automl_df(60)
test_forecast_automl_Xy(60)
@@ -6,10 +6,12 @@ from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


dataset = "credit"
dataset = "credit-g"


def _test(split_type):
from sklearn.externals._arff import ArffException

automl = AutoML()

automl_settings = {
@@ -22,9 +24,17 @@ def _test(split_type):
"split_type": split_type,
}

X, y = fetch_openml(name=dataset, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
except (ArffException, ValueError):
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
if split_type != 'time':
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
else:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
shuffle=False)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

pred = automl.predict(X_test)
@@ -37,6 +47,10 @@ def _test_uniform():
_test(split_type="uniform")


def test_time():
_test(split_type="time")


def test_groups():
from sklearn.externals._arff import ArffException
try:
52 test/tune/example.py (new file)
@@ -0,0 +1,52 @@
import time


def evaluation_fn(step, width, height):
return (0.1 + width * step / 100)**(-1) + height * 0.1


def easy_objective(config):
from ray import tune
# Hyperparameters
width, height = config["width"], config["height"]

for step in range(config["steps"]):
# Iterative training function - can be any arbitrary training procedure
intermediate_score = evaluation_fn(step, width, height)
# Feed the score back back to Tune.
tune.report(iterations=step, mean_loss=intermediate_score)
time.sleep(0.1)


def test_blendsearch_tune(smoke_test=True):
try:
from ray import tune
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.flaml import BlendSearch
except ImportError:
print('ray[tune] is not installed, skipping test')
return
algo = BlendSearch()
algo = ConcurrencyLimiter(algo, max_concurrent=4)
scheduler = AsyncHyperBandScheduler()
analysis = tune.run(
easy_objective,
metric="mean_loss",
mode="min",
search_alg=algo,
scheduler=scheduler,
num_samples=10 if smoke_test else 100,
config={
"steps": 100,
"width": tune.uniform(0, 20),
"height": tune.uniform(-100, 100),
# This is an ignored parameter.
"activation": tune.choice(["relu", "tanh"])
})

print("Best hyperparameters found were: ", analysis.best_config)


if __name__ == "__main__":
test_blendsearch_tune(False)