package name in setup (#198)

* package name

* learning to rank example: close #200

* try import prophet #201
Chi Wang 2021-09-11 21:19:18 -07:00 committed by GitHub
parent 8f9f08cebc
commit f4529dfe89
8 changed files with 941 additions and 648 deletions

README.md

@@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.
 ## Examples
-- A basic classification example.
+* A basic classification example.
 ```python
 from flaml import AutoML
@@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
 print(automl.model)
 ```
-- A basic regression example.
+* A basic regression example.
 ```python
 from flaml import AutoML
@@ -123,7 +123,7 @@ print(automl.predict(X_train))
 print(automl.model)
 ```
-- Time series forecasting.
+* Time series forecasting.
 ```python
 # pip install flaml[forecast]
@@ -141,14 +141,15 @@ automl.fit(X_train=X_train[:72], # a single column of timestamp
 print(automl.predict(X_train[72:]))
 ```
-- Learning to rank.
+* Learning to rank.
 ```python
 from sklearn.datasets import fetch_openml
 from flaml import AutoML
-X, y = fetch_openml(name="credit-g", return_X_y=True)
+X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False)
+y_train = y_train.cat.codes
 # not a real learning to rank dataset
-groups = [200] * 4 + [100] * 2,  # group counts
+groups = [200] * 4 + [100] * 2  # group counts
 automl = AutoML()
 automl.fit(
     X_train, y_train, groups=groups,
@@ -207,17 +208,21 @@ pip install -e .[test,notebook]
 ```
 ### Docker
 We provide a simple [Dockerfile](https://github.com/microsoft/FLAML/blob/main/Dockerfile).
-```
+```bash
 docker build git://github.com/microsoft/FLAML -t flaml-dev
 docker run -it flaml-dev
 ```
 ### Develop in Remote Container
 If you use vscode, you can open the FLAML folder in a [Container](https://code.visualstudio.com/docs/remote/containers).
-We have provided the configuration in (.devcontainer)[(https://github.com/microsoft/FLAML/blob/main/.devcontainer)].
+We have provided the configuration in [.devcontainer](https://github.com/microsoft/FLAML/blob/main/.devcontainer).
 ### Pre-commit
 Run `pre-commit install` to install pre-commit into your git hooks. Before you commit, run
 `pre-commit run` to check if you meet the pre-commit requirements. If you use Windows (without WSL) and can't commit after installing pre-commit, you can run `pre-commit uninstall` to uninstall the hook. In WSL or Linux this is supposed to work.
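Note: the learning-to-rank hunk above is cut off inside the `automl.fit(` call. For orientation, a self-contained sketch of how the full example plausibly reads after this commit (the `time_budget` value is an assumption; the `task="rank"` name comes from the automl.py hunk below):

```python
from sklearn.datasets import fetch_openml
from flaml import AutoML

X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False)
# not a real learning-to-rank dataset: the 1000 rows are treated as 6 queries
groups = [200] * 4 + [100] * 2  # group counts
automl = AutoML()
automl.fit(
    X_train, y_train, groups=groups,
    task="rank",  # built-in ranking task
    time_budget=10,  # seconds (assumed value)
)
```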

flaml/automl.py

@@ -1474,7 +1474,12 @@ class AutoML:
         if "auto" == estimator_list:
             if self._state.task == "forecast":
-                estimator_list = ["fbprophet", "arima", "sarimax"]
+                try:
+                    import prophet
+
+                    estimator_list = ["prophet", "arima", "sarimax"]
+                except ImportError:
+                    estimator_list = ["arima", "sarimax"]
             elif self._state.task == "rank":
                 estimator_list = ["lgbm", "xgboost"]
             else:
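Note: the hunk above replaces the hard dependency on fbprophet with a guarded import. A standalone sketch of the same optional-dependency pattern (the module names match the diff; the surrounding scaffolding is illustrative only):

```python
# Probe for the optional prophet package; fall back to the
# statsmodels-backed estimators when it is absent.
try:
    import prophet  # noqa: F401  (imported only to test availability)

    estimator_list = ["prophet", "arima", "sarimax"]
except ImportError:
    estimator_list = ["arima", "sarimax"]

print("forecast estimators:", estimator_list)
```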

flaml/data.py

@@ -1,7 +1,7 @@
-'''!
+"""!
  * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License.
-'''
+"""
 import numpy as np
 from scipy.sparse import vstack, issparse
@@ -11,9 +11,10 @@ from .training_log import training_log_reader
 from datetime import datetime


-def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
-                        dataset_format='dataframe'):
-    '''Load dataset from open ML.
+def load_openml_dataset(
+    dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"
+):
+    """Load dataset from open ML.

     If the file is not cached locally, download it from open ML.
@@ -30,41 +31,43 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
         X_test: Test data
         y_train: A series or array of labels for training data
         y_test: A series or array of labels for test data
-    '''
+    """
     import os
     import openml
     import pickle
     from sklearn.model_selection import train_test_split
-    filename = 'openml_ds' + str(dataset_id) + '.pkl'
+
+    filename = "openml_ds" + str(dataset_id) + ".pkl"
     filepath = os.path.join(data_dir, filename)
     if os.path.isfile(filepath):
-        print('load dataset from', filepath)
-        with open(filepath, 'rb') as f:
+        print("load dataset from", filepath)
+        with open(filepath, "rb") as f:
             dataset = pickle.load(f)
     else:
-        print('download dataset from openml')
+        print("download dataset from openml")
         dataset = openml.datasets.get_dataset(dataset_id)
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
-        with open(filepath, 'wb') as f:
+        with open(filepath, "wb") as f:
             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
-    print('Dataset name:', dataset.name)
-    X, y, * \
-        __ = dataset.get_data(
-            target=dataset.default_target_attribute, dataset_format=dataset_format)
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, random_state=random_state)
+    print("Dataset name:", dataset.name)
+    X, y, *__ = dataset.get_data(
+        target=dataset.default_target_attribute, dataset_format=dataset_format
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
     print(
-        'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format(
-            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
+        "X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}".format(
+            X_train.shape,
+            y_train.shape,
+            X_test.shape,
+            y_test.shape,
         )
     )
     return X_train, X_test, y_train, y_test


 def load_openml_task(task_id, data_dir):
-    '''Load task from open ML.
+    """Load task from open ML.

     Use the first fold of the task.
     If the file is not cached locally, download it from open ML.
@@ -78,21 +81,22 @@ def load_openml_task(task_id, data_dir):
         X_test: A dataframe of test data
         y_train: A series of labels for training data
         y_test: A series of labels for test data
-    '''
+    """
     import os
     import openml
     import pickle
+
     task = openml.tasks.get_task(task_id)
-    filename = 'openml_task' + str(task_id) + '.pkl'
+    filename = "openml_task" + str(task_id) + ".pkl"
     filepath = os.path.join(data_dir, filename)
     if os.path.isfile(filepath):
-        print('load dataset from', filepath)
-        with open(filepath, 'rb') as f:
+        print("load dataset from", filepath)
+        with open(filepath, "rb") as f:
             dataset = pickle.load(f)
     else:
-        print('download dataset from openml')
+        print("download dataset from openml")
         dataset = task.get_dataset()
-        with open(filepath, 'wb') as f:
+        with open(filepath, "wb") as f:
             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
     X, y, _, _ = dataset.get_data(task.target_name)
     train_indices, test_indices = task.get_train_test_split_indices(
@@ -105,15 +109,18 @@ def load_openml_task(task_id, data_dir):
     X_test = X.iloc[test_indices]
     y_test = y[test_indices]
     print(
-        'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format(
-            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
+        "X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}".format(
+            X_train.shape,
+            y_train.shape,
+            X_test.shape,
+            y_test.shape,
         )
     )
     return X_train, X_test, y_train, y_test


 def get_output_from_log(filename, time_budget):
-    '''Get output from log file
+    """Get output from log file

     Args:
         filename: A string of the log file name
@@ -127,11 +134,11 @@ def get_output_from_log(filename, time_budget):
         config_list:
             A list of the estimator, sample size and config of each logged iter
         logged_metric_list: A list of the logged metric of each logged iter
-    '''
+    """
     best_config = None
     best_learner = None
-    best_val_loss = float('+inf')
+    best_val_loss = float("+inf")

     search_time_list = []
     config_list = []
@@ -144,7 +151,7 @@ def get_output_from_log(filename, time_budget):
             time_used = record.wall_clock_time
             val_loss = record.validation_loss
             config = record.config
-            learner = record.learner.split('_')[0]
+            learner = record.learner.split("_")[0]
             sample_size = record.sample_size
             metric = record.logged_metric
@@ -158,27 +165,34 @@ def get_output_from_log(filename, time_budget):
                 best_error_list.append(best_val_loss)
                 logged_metric_list.append(metric)
                 error_list.append(val_loss)
-                config_list.append({"Current Learner": learner,
-                                    "Current Sample": sample_size,
-                                    "Current Hyper-parameters": record.config,
-                                    "Best Learner": best_learner,
-                                    "Best Hyper-parameters": best_config})
+                config_list.append(
+                    {
+                        "Current Learner": learner,
+                        "Current Sample": sample_size,
+                        "Current Hyper-parameters": record.config,
+                        "Best Learner": best_learner,
+                        "Best Hyper-parameters": best_config,
+                    }
+                )

-    return (search_time_list, best_error_list, error_list, config_list,
-            logged_metric_list)
+    return (
+        search_time_list,
+        best_error_list,
+        error_list,
+        config_list,
+        logged_metric_list,
+    )


 def concat(X1, X2):
-    '''concatenate two matrices vertically
-    '''
+    """concatenate two matrices vertically"""
     if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series):
         df = pd.concat([X1, X2], sort=False)
         df.reset_index(drop=True, inplace=True)
         if isinstance(X1, pd.DataFrame):
-            cat_columns = X1.select_dtypes(
-                include='category').columns
+            cat_columns = X1.select_dtypes(include="category").columns
             if len(cat_columns):
-                df[cat_columns] = df[cat_columns].astype('category')
+                df[cat_columns] = df[cat_columns].astype("category")
         return df
     if issparse(X1):
         return vstack((X1, X2))
@@ -187,8 +201,7 @@ def concat(X1, X2):


 class DataTransformer:
-    '''transform X, y
-    '''
+    """transform X, y"""

     def fit_transform(self, X, y, task):
         if isinstance(X, pd.DataFrame):
@@ -198,19 +211,25 @@ class DataTransformer:
             drop = False
             for column in X.columns:
                 # sklearn\utils\validation.py needs int/float values
-                if X[column].dtype.name in ('object', 'category'):
-                    if X[column].nunique() == 1 or X[column].nunique(
-                            dropna=True) == n - X[column].isnull().sum():
+                if X[column].dtype.name in ("object", "category"):
+                    if (
+                        X[column].nunique() == 1
+                        or X[column].nunique(dropna=True)
+                        == n - X[column].isnull().sum()
+                    ):
                         X.drop(columns=column, inplace=True)
                         drop = True
-                    elif X[column].dtype.name == 'category':
+                    elif X[column].dtype.name == "category":
                         current_categories = X[column].cat.categories
-                        if '__NAN__' not in current_categories:
-                            X[column] = X[column].cat.add_categories(
-                                '__NAN__').fillna('__NAN__')
+                        if "__NAN__" not in current_categories:
+                            X[column] = (
+                                X[column]
+                                .cat.add_categories("__NAN__")
+                                .fillna("__NAN__")
+                            )
                         cat_columns.append(column)
                     else:
-                        X[column] = X[column].fillna('__NAN__')
+                        X[column] = X[column].fillna("__NAN__")
                         cat_columns.append(column)
                 else:
                     # print(X[column].dtype.name)
@@ -218,17 +237,27 @@ class DataTransformer:
                         X.drop(columns=column, inplace=True)
                         drop = True
                     else:
-                        if X[column].dtype.name == 'datetime64[ns]':
+                        if X[column].dtype.name == "datetime64[ns]":
                             tmp_dt = X[column].dt
-                            new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month,
-                                                f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour,
-                                                f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second,
-                                                f'dayofweek_{column}': tmp_dt.dayofweek,
-                                                f'dayofyear_{column}': tmp_dt.dayofyear,
-                                                f'quarter_{column}': tmp_dt.quarter}
+                            new_columns_dict = {
+                                f"year_{column}": tmp_dt.year,
+                                f"month_{column}": tmp_dt.month,
+                                f"day_{column}": tmp_dt.day,
+                                f"hour_{column}": tmp_dt.hour,
+                                f"minute_{column}": tmp_dt.minute,
+                                f"second_{column}": tmp_dt.second,
+                                f"dayofweek_{column}": tmp_dt.dayofweek,
+                                f"dayofyear_{column}": tmp_dt.dayofyear,
+                                f"quarter_{column}": tmp_dt.quarter,
+                            }
                             for new_col_name in new_columns_dict.keys():
-                                if new_col_name not in X.columns and \
-                                        new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2:
+                                if (
+                                    new_col_name not in X.columns
+                                    and new_columns_dict.get(new_col_name).nunique(
+                                        dropna=False
+                                    )
+                                    >= 2
+                                ):
                                     X[new_col_name] = new_columns_dict.get(new_col_name)
                                     num_columns.append(new_col_name)
                             X[column] = X[column].map(datetime.toordinal)
@@ -239,11 +268,12 @@ class DataTransformer:
                             num_columns.append(column)
             X = X[cat_columns + num_columns]
             if cat_columns:
-                X[cat_columns] = X[cat_columns].astype('category')
+                X[cat_columns] = X[cat_columns].astype("category")
             if num_columns:
                 X_num = X[num_columns]
                 if np.issubdtype(X_num.columns.dtype, np.integer) and (
-                    drop or min(X_num.columns) != 0
+                    drop
+                    or min(X_num.columns) != 0
                     or max(X_num.columns) != X_num.shape[1] - 1
                 ):
                     X_num.columns = range(X_num.shape[1])
@@ -252,17 +282,31 @@ class DataTransformer:
                     drop = False
                 from sklearn.impute import SimpleImputer
                 from sklearn.compose import ColumnTransformer
-                self.transformer = ColumnTransformer([(
-                    'continuous',
-                    SimpleImputer(missing_values=np.nan, strategy='median'),
-                    X_num.columns)])
+
+                self.transformer = ColumnTransformer(
+                    [
+                        (
+                            "continuous",
+                            SimpleImputer(missing_values=np.nan, strategy="median"),
+                            X_num.columns,
+                        )
+                    ]
+                )
                 X[num_columns] = self.transformer.fit_transform(X_num)
-        self._cat_columns, self._num_columns, self._datetime_columns = \
-            cat_columns, num_columns, datetime_columns
+        self._cat_columns, self._num_columns, self._datetime_columns = (
+            cat_columns,
+            num_columns,
+            datetime_columns,
+        )
         self._drop = drop

-        if task in ('binary', 'multi', 'classification'):
+        if task in (
+            "binary",
+            "multi",
+            "classification",
+        ) or not pd.api.types.is_numeric_dtype(y):
             from sklearn.preprocessing import LabelEncoder
+
             self.label_transformer = LabelEncoder()
             y = self.label_transformer.fit_transform(y)
         else:
@@ -272,34 +316,46 @@ class DataTransformer:
     def transform(self, X):
         X = X.copy()
         if isinstance(X, pd.DataFrame):
-            cat_columns, num_columns, datetime_columns = self._cat_columns, \
-                self._num_columns, self._datetime_columns
+            cat_columns, num_columns, datetime_columns = (
+                self._cat_columns,
+                self._num_columns,
+                self._datetime_columns,
+            )
             if datetime_columns:
                 for column in datetime_columns:
                     tmp_dt = X[column].dt
-                    new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month,
-                                        f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour,
-                                        f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second,
-                                        f'dayofweek_{column}': tmp_dt.dayofweek,
-                                        f'dayofyear_{column}': tmp_dt.dayofyear,
-                                        f'quarter_{column}': tmp_dt.quarter}
+                    new_columns_dict = {
+                        f"year_{column}": tmp_dt.year,
+                        f"month_{column}": tmp_dt.month,
+                        f"day_{column}": tmp_dt.day,
+                        f"hour_{column}": tmp_dt.hour,
+                        f"minute_{column}": tmp_dt.minute,
+                        f"second_{column}": tmp_dt.second,
+                        f"dayofweek_{column}": tmp_dt.dayofweek,
+                        f"dayofyear_{column}": tmp_dt.dayofyear,
+                        f"quarter_{column}": tmp_dt.quarter,
+                    }
                     for new_col_name in new_columns_dict.keys():
-                        if new_col_name not in X.columns and \
-                                new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2:
+                        if (
+                            new_col_name not in X.columns
+                            and new_columns_dict.get(new_col_name).nunique(dropna=False)
+                            >= 2
+                        ):
                             X[new_col_name] = new_columns_dict.get(new_col_name)
                     X[column] = X[column].map(datetime.toordinal)
                     del tmp_dt
             X = X[cat_columns + num_columns].copy()
             for column in cat_columns:
-                if X[column].dtype.name == 'object':
-                    X[column] = X[column].fillna('__NAN__')
-                elif X[column].dtype.name == 'category':
+                if X[column].dtype.name == "object":
+                    X[column] = X[column].fillna("__NAN__")
+                elif X[column].dtype.name == "category":
                     current_categories = X[column].cat.categories
-                    if '__NAN__' not in current_categories:
-                        X[column] = X[column].cat.add_categories(
-                            '__NAN__').fillna('__NAN__')
+                    if "__NAN__" not in current_categories:
+                        X[column] = (
+                            X[column].cat.add_categories("__NAN__").fillna("__NAN__")
+                        )
             if cat_columns:
-                X[cat_columns] = X[cat_columns].astype('category')
+                X[cat_columns] = X[cat_columns].astype("category")
             if num_columns:
                 X_num = X[num_columns].fillna(np.nan)
                 if self._drop:
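Note: the `DataTransformer` hunks above are formatting-only except for the label-encoder condition, which now also fires when y is non-numeric. For intuition about what the datetime branch computes, a small plain-pandas demo of the same expansion (a standalone sketch, not FLAML's API):

```python
from datetime import datetime

import pandas as pd

X = pd.DataFrame({"ds": pd.to_datetime(["2021-09-11 21:19:18", "2021-12-31 00:00:00"])})
tmp_dt = X["ds"].dt
new_columns = {
    "year_ds": tmp_dt.year,
    "month_ds": tmp_dt.month,
    "dayofweek_ds": tmp_dt.dayofweek,
}
for name, values in new_columns.items():
    if values.nunique(dropna=False) >= 2:  # keep only informative parts
        X[name] = values
X["ds"] = X["ds"].map(datetime.toordinal)  # original column becomes an ordinal
print(X)
```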

flaml/ml.py

@@ -1,65 +1,90 @@
-'''!
+"""!
- * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
+ * Copyright (c) Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License.
-'''
+"""
 import time
 import numpy as np
 import pandas as pd
-from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
-    accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
-    f1_score, mean_absolute_percentage_error, ndcg_score
+from sklearn.metrics import (
+    mean_squared_error,
+    r2_score,
+    roc_auc_score,
+    accuracy_score,
+    mean_absolute_error,
+    log_loss,
+    average_precision_score,
+    f1_score,
+    mean_absolute_percentage_error,
+    ndcg_score,
+)
 from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
 from .model import (
-    XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
-    LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
-    ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
+    XGBoostEstimator,
+    XGBoostSklearnEstimator,
+    RandomForestEstimator,
+    LGBMEstimator,
+    LRL1Classifier,
+    LRL2Classifier,
+    CatBoostEstimator,
+    ExtraTreeEstimator,
+    KNeighborsEstimator,
+    Prophet,
+    ARIMA,
+    SARIMAX,
+)
 from .data import group_counts

 import logging

 logger = logging.getLogger(__name__)


 def get_estimator_class(task, estimator_name):
-    ''' when adding a new learner, need to add an elif branch '''
+    """when adding a new learner, need to add an elif branch"""

-    if 'xgboost' == estimator_name:
-        if 'regression' == task:
+    if "xgboost" == estimator_name:
+        if "regression" == task:
             estimator_class = XGBoostEstimator
         else:
             estimator_class = XGBoostSklearnEstimator
-    elif 'rf' == estimator_name:
+    elif "rf" == estimator_name:
         estimator_class = RandomForestEstimator
-    elif 'lgbm' == estimator_name:
+    elif "lgbm" == estimator_name:
         estimator_class = LGBMEstimator
-    elif 'lrl1' == estimator_name:
+    elif "lrl1" == estimator_name:
         estimator_class = LRL1Classifier
-    elif 'lrl2' == estimator_name:
+    elif "lrl2" == estimator_name:
         estimator_class = LRL2Classifier
-    elif 'catboost' == estimator_name:
+    elif "catboost" == estimator_name:
         estimator_class = CatBoostEstimator
-    elif 'extra_tree' == estimator_name:
+    elif "extra_tree" == estimator_name:
         estimator_class = ExtraTreeEstimator
-    elif 'kneighbor' == estimator_name:
+    elif "kneighbor" == estimator_name:
         estimator_class = KNeighborsEstimator
-    elif 'prophet' in estimator_name:
-        estimator_class = FBProphet
-    elif estimator_name == 'arima':
+    elif "prophet" in estimator_name:
+        estimator_class = Prophet
+    elif estimator_name == "arima":
         estimator_class = ARIMA
-    elif estimator_name == 'sarimax':
+    elif estimator_name == "sarimax":
         estimator_class = SARIMAX
     else:
         raise ValueError(
-            estimator_name + ' is not a built-in learner. '
-            'Please use AutoML.add_learner() to add a customized learner.')
+            estimator_name + " is not a built-in learner. "
+            "Please use AutoML.add_learner() to add a customized learner."
+        )
     return estimator_class


 def sklearn_metric_loss_score(
-    metric_name, y_predict, y_true, labels=None, sample_weight=None,
+    metric_name,
+    y_predict,
+    y_true,
+    labels=None,
+    sample_weight=None,
     groups=None,
 ):
-    '''Loss using the specified metric
+    """Loss using the specified metric

     Args:
         metric_name: A string of the metric name, one of
@@ -76,60 +101,63 @@ def sklearn_metric_loss_score(
     Returns:
         score: A float number of the loss, the lower the better.
-    '''
+    """
     metric_name = metric_name.lower()
-    if 'r2' == metric_name:
+    if "r2" == metric_name:
         score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'rmse':
-        score = np.sqrt(mean_squared_error(
-            y_true, y_predict, sample_weight=sample_weight))
-    elif metric_name == 'mae':
-        score = mean_absolute_error(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'mse':
-        score = mean_squared_error(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'accuracy':
-        score = 1.0 - accuracy_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'roc_auc':
-        score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'roc_auc_ovr':
+    elif metric_name == "rmse":
+        score = np.sqrt(
+            mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
+        )
+    elif metric_name == "mae":
+        score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "mse":
+        score = mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "accuracy":
+        score = 1.0 - accuracy_score(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "roc_auc":
+        score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "roc_auc_ovr":
         score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight, multi_class='ovr')
-    elif metric_name == 'roc_auc_ovo':
+            y_true, y_predict, sample_weight=sample_weight, multi_class="ovr"
+        )
+    elif metric_name == "roc_auc_ovo":
         score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight, multi_class='ovo')
-    elif 'log_loss' == metric_name:
-        score = log_loss(
-            y_true, y_predict, labels=labels, sample_weight=sample_weight)
-    elif 'mape' == metric_name:
+            y_true, y_predict, sample_weight=sample_weight, multi_class="ovo"
+        )
+    elif "log_loss" == metric_name:
+        score = log_loss(y_true, y_predict, labels=labels, sample_weight=sample_weight)
+    elif "mape" == metric_name:
         try:
-            score = mean_absolute_percentage_error(
-                y_true, y_predict)
+            score = mean_absolute_percentage_error(y_true, y_predict)
         except ValueError:
             return np.inf
-    elif 'micro_f1' == metric_name:
+    elif "micro_f1" == metric_name:
         score = 1 - f1_score(
-            y_true, y_predict, sample_weight=sample_weight, average='micro')
-    elif 'macro_f1' == metric_name:
+            y_true, y_predict, sample_weight=sample_weight, average="micro"
+        )
+    elif "macro_f1" == metric_name:
         score = 1 - f1_score(
-            y_true, y_predict, sample_weight=sample_weight, average='macro')
-    elif 'f1' == metric_name:
+            y_true, y_predict, sample_weight=sample_weight, average="macro"
+        )
+    elif "f1" == metric_name:
         score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
-    elif 'ap' == metric_name:
+    elif "ap" == metric_name:
         score = 1 - average_precision_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif 'ndcg' in metric_name:
-        if '@' in metric_name:
-            k = int(metric_name.split('@', 1)[-1])
+            y_true, y_predict, sample_weight=sample_weight
+        )
+    elif "ndcg" in metric_name:
+        if "@" in metric_name:
+            k = int(metric_name.split("@", 1)[-1])
             counts = group_counts(groups)
             score = 0
             psum = 0
             for c in counts:
-                score -= ndcg_score(np.asarray([y_true[psum:psum + c]]),
-                                    np.asarray([y_predict[psum:psum + c]]), k=k)
+                score -= ndcg_score(
+                    np.asarray([y_true[psum : psum + c]]),
+                    np.asarray([y_predict[psum : psum + c]]),
+                    k=k,
+                )
                 psum += c
             score /= len(counts)
             score += 1
@@ -137,56 +165,96 @@ def sklearn_metric_loss_score(
             score = 1 - ndcg_score([y_true], [y_predict])
     else:
         raise ValueError(
-            metric_name + ' is not a built-in metric, '
-            'currently built-in metrics are: '
-            'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
-            'log_loss, mape, f1, micro_f1, macro_f1, ap. '
-            'please pass a customized metric function to AutoML.fit(metric=func)')
+            metric_name + " is not a built-in metric, "
+            "currently built-in metrics are: "
+            "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,"
+            "log_loss, mape, f1, micro_f1, macro_f1, ap. "
+            "please pass a customized metric function to AutoML.fit(metric=func)"
+        )
     return score


 def get_y_pred(estimator, X, eval_metric, obj):
-    if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
+    if eval_metric in ["roc_auc", "ap"] and "binary" in obj:
         y_pred_classes = estimator.predict_proba(X)
-        y_pred = y_pred_classes[
-            :, 1] if y_pred_classes.ndim > 1 else y_pred_classes
-    elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
+        y_pred = y_pred_classes[:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
+    elif eval_metric in ["log_loss", "roc_auc", "roc_auc_ovr", "roc_auc_ovo"]:
         y_pred = estimator.predict_proba(X)
     else:
         y_pred = estimator.predict(X)
     return y_pred


-def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_test,
-                    groups_test, eval_metric, obj, labels=None,
-                    log_training_metric=False, fit_kwargs={}):
+def _eval_estimator(
+    config,
+    estimator,
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    weight_test,
+    groups_test,
+    eval_metric,
+    obj,
+    labels=None,
+    log_training_metric=False,
+    fit_kwargs={},
+):
     if isinstance(eval_metric, str):
         pred_start = time.time()
         test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
         pred_time = (time.time() - pred_start) / X_test.shape[0]
-        test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
-                                              labels, weight_test, groups_test)
+        test_loss = sklearn_metric_loss_score(
+            eval_metric, test_pred_y, y_test, labels, weight_test, groups_test
+        )
         metric_for_logging = {}
         if log_training_metric:
             train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
-            metric_for_logging['train_loss'] = sklearn_metric_loss_score(
-                eval_metric, train_pred_y, y_train, labels,
-                fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
+            metric_for_logging["train_loss"] = sklearn_metric_loss_score(
+                eval_metric,
+                train_pred_y,
+                y_train,
+                labels,
+                fit_kwargs.get("sample_weight"),
+                fit_kwargs.get("groups"),
+            )
     else:  # customized metric function
         test_loss, metric_for_logging = eval_metric(
-            X_test, y_test, estimator, labels, X_train, y_train, weight_test,
-            fit_kwargs.get('sample_weight'), config, groups_test,
-            fit_kwargs.get('groups'))
+            X_test,
+            y_test,
+            estimator,
+            labels,
+            X_train,
+            y_train,
+            weight_test,
+            fit_kwargs.get("sample_weight"),
+            config,
+            groups_test,
+            fit_kwargs.get("groups"),
+        )
         if isinstance(metric_for_logging, dict):
-            pred_time = metric_for_logging.get('pred_time', 0)
+            pred_time = metric_for_logging.get("pred_time", 0)
         test_pred_y = None
         # eval_metric may return test_pred_y but not necessarily. Setting None for now.
     return test_loss, metric_for_logging, pred_time, test_pred_y


-def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
-                  groups_test, eval_metric, obj, labels=None, budget=None,
-                  log_training_metric=False, fit_kwargs={}):
+def get_test_loss(
+    config,
+    estimator,
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    weight_test,
+    groups_test,
+    eval_metric,
+    obj,
+    labels=None,
+    budget=None,
+    log_training_metric=False,
+    fit_kwargs={},
+):

     start = time.time()
     # if groups_test is not None:
@@ -195,16 +263,37 @@ def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
     #     fit_kwargs['y_val'] = y_test
     estimator.fit(X_train, y_train, budget, **fit_kwargs)
     test_loss, metric_for_logging, pred_time, _ = _eval_estimator(
-        config, estimator, X_train, y_train, X_test, y_test,
-        weight_test, groups_test, eval_metric, obj,
-        labels, log_training_metric, fit_kwargs)
+        config,
+        estimator,
+        X_train,
+        y_train,
+        X_test,
+        y_test,
+        weight_test,
+        groups_test,
+        eval_metric,
+        obj,
+        labels,
+        log_training_metric,
+        fit_kwargs,
+    )
     train_time = time.time() - start
     return test_loss, metric_for_logging, train_time, pred_time


-def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
-                      task, eval_metric, best_val_loss,
-                      log_training_metric=False, fit_kwargs={}):
+def evaluate_model_CV(
+    config,
+    estimator,
+    X_train_all,
+    y_train_all,
+    budget,
+    kf,
+    task,
+    eval_metric,
+    best_val_loss,
+    log_training_metric=False,
+    fit_kwargs={},
+):
     start_time = time.time()
     total_val_loss = 0
     total_metric = None
@@ -213,7 +302,7 @@ def evaluate_model_CV(
     valid_fold_num = total_fold_num = 0
     n = kf.get_n_splits()
     X_train_split, y_train_split = X_train_all, y_train_all
-    if task in ('binary', 'multi'):
+    if task in ("binary", "multi"):
         labels = np.unique(y_train_all)
     else:
         labels = None
@@ -225,8 +314,8 @@ def evaluate_model_CV(
         groups = kf.groups
         kf = kf.split(X_train_split, y_train_split, groups)
         shuffle = False
-    elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
-        y_train_all = pd.DataFrame(y_train_all, columns=['y'])
+    elif isinstance(kf, TimeSeriesSplit) and task == "forecast":
+        y_train_all = pd.DataFrame(y_train_all, columns=["y"])
         train = X_train_all.join(y_train_all)
         kf = kf.split(train)
         shuffle = False
@@ -237,8 +326,8 @@ def evaluate_model_CV(
     rng = np.random.RandomState(2020)
     val_loss_list = []
     budget_per_train = budget / n
-    if 'sample_weight' in fit_kwargs:
-        weight = fit_kwargs['sample_weight']
+    if "sample_weight" in fit_kwargs:
+        weight = fit_kwargs["sample_weight"]
         weight_val = None
     else:
         weight = weight_val = None
@@ -246,37 +335,48 @@ def evaluate_model_CV(
         if shuffle:
            train_index = rng.permutation(train_index)
         if isinstance(X_train_all, pd.DataFrame):
-            X_train, X_val = X_train_split.iloc[
-                train_index], X_train_split.iloc[val_index]
+            X_train = X_train_split.iloc[train_index]
+            X_val = X_train_split.iloc[val_index]
         else:
-            X_train, X_val = X_train_split[
-                train_index], X_train_split[val_index]
+            X_train, X_val = X_train_split[train_index], X_train_split[val_index]
         y_train, y_val = y_train_split[train_index], y_train_split[val_index]
         estimator.cleanup()
         if weight is not None:
-            fit_kwargs['sample_weight'], weight_val = weight[
-                train_index], weight[val_index]
+            fit_kwargs["sample_weight"], weight_val = (
+                weight[train_index],
+                weight[val_index],
+            )
         if groups is not None:
-            fit_kwargs['groups'] = groups[train_index]
+            fit_kwargs["groups"] = groups[train_index]
             groups_val = groups[val_index]
         else:
             groups_val = None
         val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
-            config, estimator, X_train, y_train, X_val, y_val, weight_val,
-            groups_val, eval_metric, task, labels, budget_per_train,
-            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
+            config,
+            estimator,
+            X_train,
+            y_train,
+            X_val,
+            y_val,
+            weight_val,
+            groups_val,
+            eval_metric,
+            task,
+            labels,
+            budget_per_train,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
         if weight is not None:
-            fit_kwargs['sample_weight'] = weight
+            fit_kwargs["sample_weight"] = weight
         valid_fold_num += 1
         total_fold_num += 1
         total_val_loss += val_loss_i
         if log_training_metric or not isinstance(eval_metric, str):
             if isinstance(total_metric, list):
-                total_metric = [
-                    total_metric[i] + v for i, v in enumerate(metric_i)]
+                total_metric = [total_metric[i] + v for i, v in enumerate(metric_i)]
             elif isinstance(total_metric, dict):
-                total_metric = {
-                    k: total_metric[k] + v for k, v in metric_i.items()}
+                total_metric = {k: total_metric[k] + v for k, v in metric_i.items()}
             elif total_metric is not None:
                 total_metric += metric_i
             else:
@@ -307,35 +407,73 @@ def evaluate_model_CV(


 def compute_estimator(
-    X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf,
-    config_dic, task, estimator_name, eval_method, eval_metric,
-    best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
-    fit_kwargs={}
+    X_train,
+    y_train,
+    X_val,
+    y_val,
+    weight_val,
+    groups_val,
+    budget,
+    kf,
+    config_dic,
+    task,
+    estimator_name,
+    eval_method,
+    eval_metric,
+    best_val_loss=np.Inf,
+    n_jobs=1,
+    estimator_class=None,
+    log_training_metric=False,
+    fit_kwargs={},
 ):
-    estimator_class = estimator_class or get_estimator_class(
-        task, estimator_name)
-    estimator = estimator_class(
-        **config_dic, task=task, n_jobs=n_jobs)
-    if 'holdout' in eval_method:
+    estimator_class = estimator_class or get_estimator_class(task, estimator_name)
+    estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
+    if "holdout" in eval_method:
         val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
-            config_dic, estimator, X_train, y_train, X_val, y_val, weight_val,
-            groups_val, eval_metric, task, budget=budget,
-            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
+            config_dic,
+            estimator,
+            X_train,
+            y_train,
+            X_val,
+            y_val,
+            weight_val,
+            groups_val,
+            eval_metric,
+            task,
+            budget=budget,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
     else:
         val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
-            config_dic, estimator, X_train, y_train, budget, kf, task,
-            eval_metric, best_val_loss, log_training_metric=log_training_metric,
-            fit_kwargs=fit_kwargs)
+            config_dic,
+            estimator,
+            X_train,
+            y_train,
+            budget,
+            kf,
+            task,
+            eval_metric,
+            best_val_loss,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
     return estimator, val_loss, metric_for_logging, train_time, pred_time


 def train_estimator(
-    X_train, y_train, config_dic, task,
-    estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={}
+    X_train,
+    y_train,
+    config_dic,
+    task,
+    estimator_name,
+    n_jobs=1,
+    estimator_class=None,
+    budget=None,
+    fit_kwargs={},
 ):
     start_time = time.time()
-    estimator_class = estimator_class or get_estimator_class(
-        task, estimator_name)
+    estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
     if X_train is not None:
         train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
@@ -347,14 +485,14 @@ def train_estimator(

 def get_classification_objective(num_labels: int) -> str:
     if num_labels == 2:
-        objective_name = 'binary'
+        objective_name = "binary"
     else:
-        objective_name = 'multi'
+        objective_name = "multi"
     return objective_name


 def norm_confusion_matrix(y_true, y_pred):
-    '''normalized confusion matrix
+    """normalized confusion matrix

     Args:
         estimator: A multi-class classification estimator
@@ -363,15 +501,16 @@ def norm_confusion_matrix(y_true, y_pred):
     Returns:
         A normalized confusion matrix
-    '''
+    """
     from sklearn.metrics import confusion_matrix
+
     conf_mat = confusion_matrix(y_true, y_pred)
-    norm_conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]
+    norm_conf_mat = conf_mat.astype("float") / conf_mat.sum(axis=1)[:, np.newaxis]
     return norm_conf_mat


 def multi_class_curves(y_true, y_pred_proba, curve_func):
-    '''Binarize the data for multi-class tasks and produce ROC or precision-recall curves
+    """Binarize the data for multi-class tasks and produce ROC or precision-recall curves

     Args:
         y_true: A numpy array or a pandas series of true labels
@@ -384,8 +523,9 @@ def multi_class_curves(y_true, y_pred_proba, curve_func):
         curve_x[0] is an 1D array of the x coordinates of class 0
         The second dictionary curve_y stores the y coordinates of each curve, e.g.,
         curve_y[0] is an 1D array of the y coordinates of class 0
-    '''
+    """
     from sklearn.preprocessing import label_binarize
+
     classes = np.unique(y_true)
     y_true_binary = label_binarize(y_true, classes=classes)
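Note: most of the ml.py diff is mechanical (black formatting, single to double quotes); the two substantive changes are the estimator class being imported as `Prophet` instead of `FBProphet` and the reflowed grouped `ndcg@k` loop. A self-contained sketch of that grouped NDCG loss on invented toy data (it mirrors the loop in the hunk above):

```python
import numpy as np
from sklearn.metrics import ndcg_score

y_true = np.array([3, 2, 1, 0, 2, 1], dtype=float)  # relevance labels
y_pred = np.array([0.9, 0.2, 0.8, 0.1, 0.3, 0.7])   # model scores
counts = [4, 2]  # sizes of the two query groups, as group_counts would return
k, psum, score = 2, 0, 0.0
for c in counts:
    # one NDCG@k per query group, accumulated as a negative sum
    score -= ndcg_score(
        np.asarray([y_true[psum : psum + c]]),
        np.asarray([y_pred[psum : psum + c]]),
        k=k,
    )
    psum += c
score = score / len(counts) + 1  # mean NDCG@k turned into a loss in [0, 1]
print(score)
```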

Most likely flaml/model.py, given the FBProphet to Prophet rename imported in ml.py: file diff suppressed because it is too large.

flaml/version.py

@@ -1 +1 @@
-__version__ = "0.6.3"
+__version__ = "0.6.4"

setup.py

@@ -32,7 +32,7 @@ setuptools.setup(
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/microsoft/FLAML",
-    packages=setuptools.find_packages(),
+    packages=setuptools.find_packages(include=["flaml*"]),
     install_requires=install_requires,
     extras_require={
         "notebook": [

test/test_forecast.py

@@ -30,9 +30,11 @@ def test_forecast_automl(budget=5):
     }
     """The main flaml automl API"""
     try:
+        import prophet
+
         automl.fit(dataframe=df, **settings, period=time_horizon)
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl.fit(
             dataframe=df,
             **settings,
@@ -79,7 +81,7 @@ def test_forecast_automl(budget=5):
     try:
         automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl.fit(
             X_train=X_train,
             y_train=y_train,
@@ -94,6 +96,8 @@ def test_numpy():
     y_train = np.random.random(size=72)
     automl = AutoML()
     try:
+        import prophet
+
         automl.fit(
             X_train=X_train[:60],  # a single column of timestamp
             y_train=y_train,  # value for each timestamp
@@ -105,9 +109,9 @@ def test_numpy():
         print(automl.predict(X_train[60:]))
         print(automl.predict(12))
     except ValueError:
-        print("ValueError for FBProphet is raised as expected.")
+        print("ValueError for prophet is raised as expected.")
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl = AutoML()
         automl.fit(
             X_train=X_train[:72],  # a single column of timestamp
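Note: the diff continues past this excerpt with the same guarded-import and renamed-print pattern. For context, a minimal end-to-end sketch of the forecast API these tests exercise, shaped after `test_numpy` (the budget, horizon, and array slicing here are assumptions, not the test's exact values):

```python
import numpy as np
from flaml import AutoML

X_train = np.arange("2017-01", "2023-01", dtype="datetime64[M]")  # 72 timestamps
y_train = np.random.random(size=72)
automl = AutoML()
automl.fit(
    X_train=X_train[:60],  # a single column of timestamp
    y_train=y_train[:60],  # value for each timestamp
    period=12,             # forecast horizon (assumed)
    task="forecast",
    time_budget=3,         # seconds (assumed)
)
print(automl.predict(X_train[60:]))
```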