package name in setup (#198)

* package name

* learning to rank example: close #200

* try import prophet #201
Chi Wang 2021-09-11 21:19:18 -07:00 committed by GitHub
parent 8f9f08cebc
commit f4529dfe89
8 changed files with 941 additions and 648 deletions

README.md

@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.
## Examples
- A basic classification example.
* A basic classification example.
```python
from flaml import AutoML
@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
print(automl.model)
```
- A basic regression example.
* A basic regression example.
```python
from flaml import AutoML
@ -123,7 +123,7 @@ print(automl.predict(X_train))
print(automl.model)
```
- Time series forecasting.
* Time series forecasting.
```python
# pip install flaml[forecast]
@ -141,14 +141,15 @@ automl.fit(X_train=X_train[:72], # a single column of timestamp
print(automl.predict(X_train[72:]))
```
- Learning to rank.
* Learning to rank.
```python
from sklearn.datasets import fetch_openml
from flaml import AutoML
X, y = fetch_openml(name="credit-g", return_X_y=True)
X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False)
y_train = y_train.cat.codes
# not a real learning-to-rank dataset
groups = [200] * 4 + [100] * 2, # group counts
groups = [200] * 4 + [100] * 2 # group counts
automl = AutoML()
automl.fit(
X_train, y_train, groups=groups,
@ -207,17 +208,21 @@ pip install -e .[test,notebook]
```
### Docker
We provide a simple [Dockerfile](https://github.com/microsoft/FLAML/blob/main/Dockerfile).
```
```bash
docker build git://github.com/microsoft/FLAML -t flaml-dev
docker run -it flaml-dev
```
### Develop in Remote Container
If you use VS Code, you can open the FLAML folder in a [Container](https://code.visualstudio.com/docs/remote/containers).
We have provided the configuration in (.devcontainer)[(https://github.com/microsoft/FLAML/blob/main/.devcontainer)].
We have provided the configuration in [.devcontainer](https://github.com/microsoft/FLAML/blob/main/.devcontainer).
### Pre-commit
Run `pre-commit install` to install pre-commit into your git hooks. Before you commit, run
`pre-commit run` to check whether you meet the pre-commit requirements. If you use Windows (without WSL) and can't commit after installing pre-commit, run `pre-commit uninstall` to remove the hook; in WSL or Linux the hook is expected to work.
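As a quick reference, the full hook lifecycle looks like this; `pre-commit run --all-files` is a standard pre-commit flag added here for convenience, not taken from the text above:
```bash
pre-commit install          # register the hook once per clone
pre-commit run              # check the files staged for commit
pre-commit run --all-files  # optionally check the entire tree
pre-commit uninstall        # remove the hook (e.g. on Windows without WSL)
```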

flaml/automl.py

@ -1474,7 +1474,12 @@ class AutoML:
if "auto" == estimator_list:
if self._state.task == "forecast":
estimator_list = ["fbprophet", "arima", "sarimax"]
try:
import prophet
estimator_list = ["prophet", "arima", "sarimax"]
except ImportError:
estimator_list = ["arima", "sarimax"]
elif self._state.task == "rank":
estimator_list = ["lgbm", "xgboost"]
else:
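This hunk is the change behind "try import prophet #201": `prophet` only enters the default forecast estimator list when it can actually be imported. A minimal standalone sketch of the same guarded-import pattern; the helper name is illustrative, not part of FLAML's API:
```python
def default_forecast_estimators():
    # Fall back to a smaller search space when the optional dependency
    # (installed via `pip install flaml[forecast]`) is unavailable.
    try:
        import prophet  # noqa: F401

        return ["prophet", "arima", "sarimax"]
    except ImportError:
        return ["arima", "sarimax"]


print(default_forecast_estimators())
```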

flaml/data.py

@ -1,7 +1,7 @@
'''!
"""!
* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
"""
import numpy as np
from scipy.sparse import vstack, issparse
@ -11,9 +11,10 @@ from .training_log import training_log_reader
from datetime import datetime
def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
dataset_format='dataframe'):
'''Load dataset from open ML.
def load_openml_dataset(
dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"
):
"""Load dataset from open ML.
If the file is not cached locally, download it from open ML.
@ -30,41 +31,43 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
X_test: Test data
y_train: A series or array of labels for training data
y_test: A series or array of labels for test data
'''
"""
import os
import openml
import pickle
from sklearn.model_selection import train_test_split
filename = 'openml_ds' + str(dataset_id) + '.pkl'
filename = "openml_ds" + str(dataset_id) + ".pkl"
filepath = os.path.join(data_dir, filename)
if os.path.isfile(filepath):
print('load dataset from', filepath)
with open(filepath, 'rb') as f:
print("load dataset from", filepath)
with open(filepath, "rb") as f:
dataset = pickle.load(f)
else:
print('download dataset from openml')
print("download dataset from openml")
dataset = openml.datasets.get_dataset(dataset_id)
if not os.path.exists(data_dir):
os.makedirs(data_dir)
with open(filepath, 'wb') as f:
with open(filepath, "wb") as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
print('Dataset name:', dataset.name)
X, y, * \
__ = dataset.get_data(
target=dataset.default_target_attribute, dataset_format=dataset_format)
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=random_state)
print("Dataset name:", dataset.name)
X, y, *__ = dataset.get_data(
target=dataset.default_target_attribute, dataset_format=dataset_format
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
print(
'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format(
X_train.shape, y_train.shape, X_test.shape, y_test.shape,
"X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}".format(
X_train.shape,
y_train.shape,
X_test.shape,
y_test.shape,
)
)
return X_train, X_test, y_train, y_test
def load_openml_task(task_id, data_dir):
'''Load task from open ML.
"""Load task from open ML.
Use the first fold of the task.
If the file is not cached locally, download it from open ML.
@ -78,21 +81,22 @@ def load_openml_task(task_id, data_dir):
X_test: A dataframe of test data
y_train: A series of labels for training data
y_test: A series of labels for test data
'''
"""
import os
import openml
import pickle
task = openml.tasks.get_task(task_id)
filename = 'openml_task' + str(task_id) + '.pkl'
filename = "openml_task" + str(task_id) + ".pkl"
filepath = os.path.join(data_dir, filename)
if os.path.isfile(filepath):
print('load dataset from', filepath)
with open(filepath, 'rb') as f:
print("load dataset from", filepath)
with open(filepath, "rb") as f:
dataset = pickle.load(f)
else:
print('download dataset from openml')
print("download dataset from openml")
dataset = task.get_dataset()
with open(filepath, 'wb') as f:
with open(filepath, "wb") as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
X, y, _, _ = dataset.get_data(task.target_name)
train_indices, test_indices = task.get_train_test_split_indices(
@ -105,15 +109,18 @@ def load_openml_task(task_id, data_dir):
X_test = X.iloc[test_indices]
y_test = y[test_indices]
print(
'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format(
X_train.shape, y_train.shape, X_test.shape, y_test.shape,
"X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}".format(
X_train.shape,
y_train.shape,
X_test.shape,
y_test.shape,
)
)
return X_train, X_test, y_train, y_test
def get_output_from_log(filename, time_budget):
'''Get output from log file
"""Get output from log file
Args:
filename: A string of the log file name
@ -127,11 +134,11 @@ def get_output_from_log(filename, time_budget):
config_list:
A list of the estimator, sample size and config of each logged iter
logged_metric_list: A list of the logged metric of each logged iter
'''
"""
best_config = None
best_learner = None
best_val_loss = float('+inf')
best_val_loss = float("+inf")
search_time_list = []
config_list = []
@ -144,7 +151,7 @@ def get_output_from_log(filename, time_budget):
time_used = record.wall_clock_time
val_loss = record.validation_loss
config = record.config
learner = record.learner.split('_')[0]
learner = record.learner.split("_")[0]
sample_size = record.sample_size
metric = record.logged_metric
@ -158,27 +165,34 @@ def get_output_from_log(filename, time_budget):
best_error_list.append(best_val_loss)
logged_metric_list.append(metric)
error_list.append(val_loss)
config_list.append({"Current Learner": learner,
config_list.append(
{
"Current Learner": learner,
"Current Sample": sample_size,
"Current Hyper-parameters": record.config,
"Best Learner": best_learner,
"Best Hyper-parameters": best_config})
"Best Hyper-parameters": best_config,
}
)
return (search_time_list, best_error_list, error_list, config_list,
logged_metric_list)
return (
search_time_list,
best_error_list,
error_list,
config_list,
logged_metric_list,
)
def concat(X1, X2):
'''concatenate two matrices vertically
'''
"""concatenate two matrices vertically"""
if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series):
df = pd.concat([X1, X2], sort=False)
df.reset_index(drop=True, inplace=True)
if isinstance(X1, pd.DataFrame):
cat_columns = X1.select_dtypes(
include='category').columns
cat_columns = X1.select_dtypes(include="category").columns
if len(cat_columns):
df[cat_columns] = df[cat_columns].astype('category')
df[cat_columns] = df[cat_columns].astype("category")
return df
if issparse(X1):
return vstack((X1, X2))
@ -187,8 +201,7 @@ def concat(X1, X2):
class DataTransformer:
'''transform X, y
'''
"""transform X, y"""
def fit_transform(self, X, y, task):
if isinstance(X, pd.DataFrame):
@ -198,19 +211,25 @@ class DataTransformer:
drop = False
for column in X.columns:
# sklearn\utils\validation.py needs int/float values
if X[column].dtype.name in ('object', 'category'):
if X[column].nunique() == 1 or X[column].nunique(
dropna=True) == n - X[column].isnull().sum():
if X[column].dtype.name in ("object", "category"):
if (
X[column].nunique() == 1
or X[column].nunique(dropna=True)
== n - X[column].isnull().sum()
):
X.drop(columns=column, inplace=True)
drop = True
elif X[column].dtype.name == 'category':
elif X[column].dtype.name == "category":
current_categories = X[column].cat.categories
if '__NAN__' not in current_categories:
X[column] = X[column].cat.add_categories(
'__NAN__').fillna('__NAN__')
if "__NAN__" not in current_categories:
X[column] = (
X[column]
.cat.add_categories("__NAN__")
.fillna("__NAN__")
)
cat_columns.append(column)
else:
X[column] = X[column].fillna('__NAN__')
X[column] = X[column].fillna("__NAN__")
cat_columns.append(column)
else:
# print(X[column].dtype.name)
@ -218,17 +237,27 @@ class DataTransformer:
X.drop(columns=column, inplace=True)
drop = True
else:
if X[column].dtype.name == 'datetime64[ns]':
if X[column].dtype.name == "datetime64[ns]":
tmp_dt = X[column].dt
new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month,
f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour,
f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second,
f'dayofweek_{column}': tmp_dt.dayofweek,
f'dayofyear_{column}': tmp_dt.dayofyear,
f'quarter_{column}': tmp_dt.quarter}
new_columns_dict = {
f"year_{column}": tmp_dt.year,
f"month_{column}": tmp_dt.month,
f"day_{column}": tmp_dt.day,
f"hour_{column}": tmp_dt.hour,
f"minute_{column}": tmp_dt.minute,
f"second_{column}": tmp_dt.second,
f"dayofweek_{column}": tmp_dt.dayofweek,
f"dayofyear_{column}": tmp_dt.dayofyear,
f"quarter_{column}": tmp_dt.quarter,
}
for new_col_name in new_columns_dict.keys():
if new_col_name not in X.columns and \
new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2:
if (
new_col_name not in X.columns
and new_columns_dict.get(new_col_name).nunique(
dropna=False
)
>= 2
):
X[new_col_name] = new_columns_dict.get(new_col_name)
num_columns.append(new_col_name)
X[column] = X[column].map(datetime.toordinal)
@ -239,11 +268,12 @@ class DataTransformer:
num_columns.append(column)
X = X[cat_columns + num_columns]
if cat_columns:
X[cat_columns] = X[cat_columns].astype('category')
X[cat_columns] = X[cat_columns].astype("category")
if num_columns:
X_num = X[num_columns]
if np.issubdtype(X_num.columns.dtype, np.integer) and (
drop or min(X_num.columns) != 0
drop
or min(X_num.columns) != 0
or max(X_num.columns) != X_num.shape[1] - 1
):
X_num.columns = range(X_num.shape[1])
@ -252,17 +282,31 @@ class DataTransformer:
drop = False
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
self.transformer = ColumnTransformer([(
'continuous',
SimpleImputer(missing_values=np.nan, strategy='median'),
X_num.columns)])
self.transformer = ColumnTransformer(
[
(
"continuous",
SimpleImputer(missing_values=np.nan, strategy="median"),
X_num.columns,
)
]
)
X[num_columns] = self.transformer.fit_transform(X_num)
self._cat_columns, self._num_columns, self._datetime_columns = \
cat_columns, num_columns, datetime_columns
self._cat_columns, self._num_columns, self._datetime_columns = (
cat_columns,
num_columns,
datetime_columns,
)
self._drop = drop
if task in ('binary', 'multi', 'classification'):
if task in (
"binary",
"multi",
"classification",
) or not pd.api.types.is_numeric_dtype(y):
from sklearn.preprocessing import LabelEncoder
self.label_transformer = LabelEncoder()
y = self.label_transformer.fit_transform(y)
else:
@ -272,34 +316,46 @@ class DataTransformer:
def transform(self, X):
X = X.copy()
if isinstance(X, pd.DataFrame):
cat_columns, num_columns, datetime_columns = self._cat_columns, \
self._num_columns, self._datetime_columns
cat_columns, num_columns, datetime_columns = (
self._cat_columns,
self._num_columns,
self._datetime_columns,
)
if datetime_columns:
for column in datetime_columns:
tmp_dt = X[column].dt
new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month,
f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour,
f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second,
f'dayofweek_{column}': tmp_dt.dayofweek,
f'dayofyear_{column}': tmp_dt.dayofyear,
f'quarter_{column}': tmp_dt.quarter}
new_columns_dict = {
f"year_{column}": tmp_dt.year,
f"month_{column}": tmp_dt.month,
f"day_{column}": tmp_dt.day,
f"hour_{column}": tmp_dt.hour,
f"minute_{column}": tmp_dt.minute,
f"second_{column}": tmp_dt.second,
f"dayofweek_{column}": tmp_dt.dayofweek,
f"dayofyear_{column}": tmp_dt.dayofyear,
f"quarter_{column}": tmp_dt.quarter,
}
for new_col_name in new_columns_dict.keys():
if new_col_name not in X.columns and \
new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2:
if (
new_col_name not in X.columns
and new_columns_dict.get(new_col_name).nunique(dropna=False)
>= 2
):
X[new_col_name] = new_columns_dict.get(new_col_name)
X[column] = X[column].map(datetime.toordinal)
del tmp_dt
X = X[cat_columns + num_columns].copy()
for column in cat_columns:
if X[column].dtype.name == 'object':
X[column] = X[column].fillna('__NAN__')
elif X[column].dtype.name == 'category':
if X[column].dtype.name == "object":
X[column] = X[column].fillna("__NAN__")
elif X[column].dtype.name == "category":
current_categories = X[column].cat.categories
if '__NAN__' not in current_categories:
X[column] = X[column].cat.add_categories(
'__NAN__').fillna('__NAN__')
if "__NAN__" not in current_categories:
X[column] = (
X[column].cat.add_categories("__NAN__").fillna("__NAN__")
)
if cat_columns:
X[cat_columns] = X[cat_columns].astype('category')
X[cat_columns] = X[cat_columns].astype("category")
if num_columns:
X_num = X[num_columns].fillna(np.nan)
if self._drop:
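Before moving on, a minimal usage sketch of the `load_openml_dataset` helper reformatted above; the dataset id is illustrative, and the `openml` package must be installed:
```python
from flaml.data import load_openml_dataset

# The first call downloads the dataset from OpenML and caches it
# (e.g. ./openml_ds1169.pkl); subsequent calls load the local pickle.
X_train, X_test, y_train, y_test = load_openml_dataset(
    dataset_id=1169, data_dir="./"
)
print(X_train.shape, y_train.shape)
```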

flaml/ml.py

@ -1,65 +1,90 @@
'''!
* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
"""!
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
'''
"""
import time
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
f1_score, mean_absolute_percentage_error, ndcg_score
from sklearn.metrics import (
mean_squared_error,
r2_score,
roc_auc_score,
accuracy_score,
mean_absolute_error,
log_loss,
average_precision_score,
f1_score,
mean_absolute_percentage_error,
ndcg_score,
)
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
XGBoostEstimator,
XGBoostSklearnEstimator,
RandomForestEstimator,
LGBMEstimator,
LRL1Classifier,
LRL2Classifier,
CatBoostEstimator,
ExtraTreeEstimator,
KNeighborsEstimator,
Prophet,
ARIMA,
SARIMAX,
)
from .data import group_counts
import logging
logger = logging.getLogger(__name__)
def get_estimator_class(task, estimator_name):
''' when adding a new learner, need to add an elif branch '''
"""when adding a new learner, need to add an elif branch"""
if 'xgboost' == estimator_name:
if 'regression' == task:
if "xgboost" == estimator_name:
if "regression" == task:
estimator_class = XGBoostEstimator
else:
estimator_class = XGBoostSklearnEstimator
elif 'rf' == estimator_name:
elif "rf" == estimator_name:
estimator_class = RandomForestEstimator
elif 'lgbm' == estimator_name:
elif "lgbm" == estimator_name:
estimator_class = LGBMEstimator
elif 'lrl1' == estimator_name:
elif "lrl1" == estimator_name:
estimator_class = LRL1Classifier
elif 'lrl2' == estimator_name:
elif "lrl2" == estimator_name:
estimator_class = LRL2Classifier
elif 'catboost' == estimator_name:
elif "catboost" == estimator_name:
estimator_class = CatBoostEstimator
elif 'extra_tree' == estimator_name:
elif "extra_tree" == estimator_name:
estimator_class = ExtraTreeEstimator
elif 'kneighbor' == estimator_name:
elif "kneighbor" == estimator_name:
estimator_class = KNeighborsEstimator
elif 'prophet' in estimator_name:
estimator_class = FBProphet
elif estimator_name == 'arima':
elif "prophet" in estimator_name:
estimator_class = Prophet
elif estimator_name == "arima":
estimator_class = ARIMA
elif estimator_name == 'sarimax':
elif estimator_name == "sarimax":
estimator_class = SARIMAX
else:
raise ValueError(
estimator_name + ' is not a built-in learner. '
'Please use AutoML.add_learner() to add a customized learner.')
estimator_name + " is not a built-in learner. "
"Please use AutoML.add_learner() to add a customized learner."
)
return estimator_class
def sklearn_metric_loss_score(
metric_name, y_predict, y_true, labels=None, sample_weight=None,
metric_name,
y_predict,
y_true,
labels=None,
sample_weight=None,
groups=None,
):
'''Loss using the specified metric
"""Loss using the specified metric
Args:
metric_name: A string of the metric name, one of
@ -76,60 +101,63 @@ def sklearn_metric_loss_score(
Returns:
score: A float number of the loss, the lower the better.
'''
"""
metric_name = metric_name.lower()
if 'r2' == metric_name:
if "r2" == metric_name:
score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'rmse':
score = np.sqrt(mean_squared_error(
y_true, y_predict, sample_weight=sample_weight))
elif metric_name == 'mae':
score = mean_absolute_error(
y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'mse':
score = mean_squared_error(
y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'accuracy':
score = 1.0 - accuracy_score(
y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'roc_auc':
elif metric_name == "rmse":
score = np.sqrt(
mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
)
elif metric_name == "mae":
score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == "mse":
score = mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == "accuracy":
score = 1.0 - accuracy_score(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == "roc_auc":
score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == "roc_auc_ovr":
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'roc_auc_ovr':
y_true, y_predict, sample_weight=sample_weight, multi_class="ovr"
)
elif metric_name == "roc_auc_ovo":
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight, multi_class='ovr')
elif metric_name == 'roc_auc_ovo':
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight, multi_class='ovo')
elif 'log_loss' == metric_name:
score = log_loss(
y_true, y_predict, labels=labels, sample_weight=sample_weight)
elif 'mape' == metric_name:
y_true, y_predict, sample_weight=sample_weight, multi_class="ovo"
)
elif "log_loss" == metric_name:
score = log_loss(y_true, y_predict, labels=labels, sample_weight=sample_weight)
elif "mape" == metric_name:
try:
score = mean_absolute_percentage_error(
y_true, y_predict)
score = mean_absolute_percentage_error(y_true, y_predict)
except ValueError:
return np.inf
elif 'micro_f1' == metric_name:
elif "micro_f1" == metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average='micro')
elif 'macro_f1' == metric_name:
y_true, y_predict, sample_weight=sample_weight, average="micro"
)
elif "macro_f1" == metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average='macro')
elif 'f1' == metric_name:
y_true, y_predict, sample_weight=sample_weight, average="macro"
)
elif "f1" == metric_name:
score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
elif 'ap' == metric_name:
elif "ap" == metric_name:
score = 1 - average_precision_score(
y_true, y_predict, sample_weight=sample_weight)
elif 'ndcg' in metric_name:
if '@' in metric_name:
k = int(metric_name.split('@', 1)[-1])
y_true, y_predict, sample_weight=sample_weight
)
elif "ndcg" in metric_name:
if "@" in metric_name:
k = int(metric_name.split("@", 1)[-1])
counts = group_counts(groups)
score = 0
psum = 0
for c in counts:
score -= ndcg_score(np.asarray([y_true[psum:psum + c]]),
np.asarray([y_predict[psum:psum + c]]), k=k)
score -= ndcg_score(
np.asarray([y_true[psum : psum + c]]),
np.asarray([y_predict[psum : psum + c]]),
k=k,
)
psum += c
score /= len(counts)
score += 1
@ -137,56 +165,96 @@ def sklearn_metric_loss_score(
score = 1 - ndcg_score([y_true], [y_predict])
else:
raise ValueError(
metric_name + ' is not a built-in metric, '
'currently built-in metrics are: '
'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
'log_loss, mape, f1, micro_f1, macro_f1, ap. '
'please pass a customized metric function to AutoML.fit(metric=func)')
metric_name + " is not a built-in metric, "
"currently built-in metrics are: "
"r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,"
"log_loss, mape, f1, micro_f1, macro_f1, ap. "
"please pass a customized metric function to AutoML.fit(metric=func)"
)
return score
def get_y_pred(estimator, X, eval_metric, obj):
if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
if eval_metric in ["roc_auc", "ap"] and "binary" in obj:
y_pred_classes = estimator.predict_proba(X)
y_pred = y_pred_classes[
:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
y_pred = y_pred_classes[:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
elif eval_metric in ["log_loss", "roc_auc", "roc_auc_ovr", "roc_auc_ovo"]:
y_pred = estimator.predict_proba(X)
else:
y_pred = estimator.predict(X)
return y_pred
def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_test,
groups_test, eval_metric, obj, labels=None,
log_training_metric=False, fit_kwargs={}):
def _eval_estimator(
config,
estimator,
X_train,
y_train,
X_test,
y_test,
weight_test,
groups_test,
eval_metric,
obj,
labels=None,
log_training_metric=False,
fit_kwargs={},
):
if isinstance(eval_metric, str):
pred_start = time.time()
test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
pred_time = (time.time() - pred_start) / X_test.shape[0]
test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
labels, weight_test, groups_test)
test_loss = sklearn_metric_loss_score(
eval_metric, test_pred_y, y_test, labels, weight_test, groups_test
)
metric_for_logging = {}
if log_training_metric:
train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
metric_for_logging['train_loss'] = sklearn_metric_loss_score(
eval_metric, train_pred_y, y_train, labels,
fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
metric_for_logging["train_loss"] = sklearn_metric_loss_score(
eval_metric,
train_pred_y,
y_train,
labels,
fit_kwargs.get("sample_weight"),
fit_kwargs.get("groups"),
)
else: # customized metric function
test_loss, metric_for_logging = eval_metric(
X_test, y_test, estimator, labels, X_train, y_train, weight_test,
fit_kwargs.get('sample_weight'), config, groups_test,
fit_kwargs.get('groups'))
X_test,
y_test,
estimator,
labels,
X_train,
y_train,
weight_test,
fit_kwargs.get("sample_weight"),
config,
groups_test,
fit_kwargs.get("groups"),
)
if isinstance(metric_for_logging, dict):
pred_time = metric_for_logging.get('pred_time', 0)
pred_time = metric_for_logging.get("pred_time", 0)
test_pred_y = None
# eval_metric may return test_pred_y but not necessarily. Setting None for now.
return test_loss, metric_for_logging, pred_time, test_pred_y
def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
groups_test, eval_metric, obj, labels=None, budget=None,
log_training_metric=False, fit_kwargs={}):
def get_test_loss(
config,
estimator,
X_train,
y_train,
X_test,
y_test,
weight_test,
groups_test,
eval_metric,
obj,
labels=None,
budget=None,
log_training_metric=False,
fit_kwargs={},
):
start = time.time()
# if groups_test is not None:
@ -195,16 +263,37 @@ def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_te
# fit_kwargs['y_val'] = y_test
estimator.fit(X_train, y_train, budget, **fit_kwargs)
test_loss, metric_for_logging, pred_time, _ = _eval_estimator(
config, estimator, X_train, y_train, X_test, y_test,
weight_test, groups_test, eval_metric, obj,
labels, log_training_metric, fit_kwargs)
config,
estimator,
X_train,
y_train,
X_test,
y_test,
weight_test,
groups_test,
eval_metric,
obj,
labels,
log_training_metric,
fit_kwargs,
)
train_time = time.time() - start
return test_loss, metric_for_logging, train_time, pred_time
def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
task, eval_metric, best_val_loss,
log_training_metric=False, fit_kwargs={}):
def evaluate_model_CV(
config,
estimator,
X_train_all,
y_train_all,
budget,
kf,
task,
eval_metric,
best_val_loss,
log_training_metric=False,
fit_kwargs={},
):
start_time = time.time()
total_val_loss = 0
total_metric = None
@ -213,7 +302,7 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
valid_fold_num = total_fold_num = 0
n = kf.get_n_splits()
X_train_split, y_train_split = X_train_all, y_train_all
if task in ('binary', 'multi'):
if task in ("binary", "multi"):
labels = np.unique(y_train_all)
else:
labels = None
@ -225,8 +314,8 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
groups = kf.groups
kf = kf.split(X_train_split, y_train_split, groups)
shuffle = False
elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
y_train_all = pd.DataFrame(y_train_all, columns=['y'])
elif isinstance(kf, TimeSeriesSplit) and task == "forecast":
y_train_all = pd.DataFrame(y_train_all, columns=["y"])
train = X_train_all.join(y_train_all)
kf = kf.split(train)
shuffle = False
@ -237,8 +326,8 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
rng = np.random.RandomState(2020)
val_loss_list = []
budget_per_train = budget / n
if 'sample_weight' in fit_kwargs:
weight = fit_kwargs['sample_weight']
if "sample_weight" in fit_kwargs:
weight = fit_kwargs["sample_weight"]
weight_val = None
else:
weight = weight_val = None
@ -246,37 +335,48 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
if shuffle:
train_index = rng.permutation(train_index)
if isinstance(X_train_all, pd.DataFrame):
X_train, X_val = X_train_split.iloc[
train_index], X_train_split.iloc[val_index]
X_train = X_train_split.iloc[train_index]
X_val = X_train_split.iloc[val_index]
else:
X_train, X_val = X_train_split[
train_index], X_train_split[val_index]
X_train, X_val = X_train_split[train_index], X_train_split[val_index]
y_train, y_val = y_train_split[train_index], y_train_split[val_index]
estimator.cleanup()
if weight is not None:
fit_kwargs['sample_weight'], weight_val = weight[
train_index], weight[val_index]
fit_kwargs["sample_weight"], weight_val = (
weight[train_index],
weight[val_index],
)
if groups is not None:
fit_kwargs['groups'] = groups[train_index]
fit_kwargs["groups"] = groups[train_index]
groups_val = groups[val_index]
else:
groups_val = None
val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
config, estimator, X_train, y_train, X_val, y_val, weight_val,
groups_val, eval_metric, task, labels, budget_per_train,
log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
config,
estimator,
X_train,
y_train,
X_val,
y_val,
weight_val,
groups_val,
eval_metric,
task,
labels,
budget_per_train,
log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs,
)
if weight is not None:
fit_kwargs['sample_weight'] = weight
fit_kwargs["sample_weight"] = weight
valid_fold_num += 1
total_fold_num += 1
total_val_loss += val_loss_i
if log_training_metric or not isinstance(eval_metric, str):
if isinstance(total_metric, list):
total_metric = [
total_metric[i] + v for i, v in enumerate(metric_i)]
total_metric = [total_metric[i] + v for i, v in enumerate(metric_i)]
elif isinstance(total_metric, dict):
total_metric = {
k: total_metric[k] + v for k, v in metric_i.items()}
total_metric = {k: total_metric[k] + v for k, v in metric_i.items()}
elif total_metric is not None:
total_metric += metric_i
else:
@ -307,35 +407,73 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
def compute_estimator(
X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf,
config_dic, task, estimator_name, eval_method, eval_metric,
best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
fit_kwargs={}
X_train,
y_train,
X_val,
y_val,
weight_val,
groups_val,
budget,
kf,
config_dic,
task,
estimator_name,
eval_method,
eval_metric,
best_val_loss=np.Inf,
n_jobs=1,
estimator_class=None,
log_training_metric=False,
fit_kwargs={},
):
estimator_class = estimator_class or get_estimator_class(
task, estimator_name)
estimator = estimator_class(
**config_dic, task=task, n_jobs=n_jobs)
if 'holdout' in eval_method:
estimator_class = estimator_class or get_estimator_class(task, estimator_name)
estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
if "holdout" in eval_method:
val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
config_dic, estimator, X_train, y_train, X_val, y_val, weight_val,
groups_val, eval_metric, task, budget=budget,
log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
config_dic,
estimator,
X_train,
y_train,
X_val,
y_val,
weight_val,
groups_val,
eval_metric,
task,
budget=budget,
log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs,
)
else:
val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
config_dic, estimator, X_train, y_train, budget, kf, task,
eval_metric, best_val_loss, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
config_dic,
estimator,
X_train,
y_train,
budget,
kf,
task,
eval_metric,
best_val_loss,
log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs,
)
return estimator, val_loss, metric_for_logging, train_time, pred_time
def train_estimator(
X_train, y_train, config_dic, task,
estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={}
X_train,
y_train,
config_dic,
task,
estimator_name,
n_jobs=1,
estimator_class=None,
budget=None,
fit_kwargs={},
):
start_time = time.time()
estimator_class = estimator_class or get_estimator_class(
task, estimator_name)
estimator_class = estimator_class or get_estimator_class(task, estimator_name)
estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
if X_train is not None:
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
@ -347,14 +485,14 @@ def train_estimator(
def get_classification_objective(num_labels: int) -> str:
if num_labels == 2:
objective_name = 'binary'
objective_name = "binary"
else:
objective_name = 'multi'
objective_name = "multi"
return objective_name
def norm_confusion_matrix(y_true, y_pred):
'''normalized confusion matrix
"""normalized confusion matrix
Args:
estimator: A multi-class classification estimator
@ -363,15 +501,16 @@ def norm_confusion_matrix(y_true, y_pred):
Returns:
A normalized confusion matrix
'''
"""
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_true, y_pred)
norm_conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]
norm_conf_mat = conf_mat.astype("float") / conf_mat.sum(axis=1)[:, np.newaxis]
return norm_conf_mat
def multi_class_curves(y_true, y_pred_proba, curve_func):
'''Binarize the data for multi-class tasks and produce ROC or precision-recall curves
"""Binarize the data for multi-class tasks and produce ROC or precision-recall curves
Args:
y_true: A numpy array or a pandas series of true labels
@ -384,8 +523,9 @@ def multi_class_curves(y_true, y_pred_proba, curve_func):
curve_x[0] is an 1D array of the x coordinates of class 0
The second dictionary curve_y stores the y coordinates of each curve, e.g.,
curve_y[0] is an 1D array of the y coordinates of class 0
'''
"""
from sklearn.preprocessing import label_binarize
classes = np.unique(y_true)
y_true_binary = label_binarize(y_true, classes=classes)
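A note on the convention visible throughout `sklearn_metric_loss_score` above: every built-in metric is returned as a loss (lower is better), so score-type metrics such as `r2` and `accuracy` come back as `1 - score`. A small sketch with illustrative values:
```python
import numpy as np
from flaml.ml import sklearn_metric_loss_score

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])

# Error metrics are returned as-is ...
print(sklearn_metric_loss_score("rmse", y_pred, y_true))
# ... while score metrics are flipped into losses (1 - r2 here).
print(sklearn_metric_loss_score("r2", y_pred, y_true))
```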

flaml/model.py (file diff suppressed because it is too large)

flaml/version.py

@ -1 +1 @@
__version__ = "0.6.3"
__version__ = "0.6.4"

setup.py

@ -32,7 +32,7 @@ setuptools.setup(
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/microsoft/FLAML",
packages=setuptools.find_packages(),
packages=setuptools.find_packages(include=["flaml*"]),
install_requires=install_requires,
extras_require={
"notebook": [

test/test_forecast.py

@ -30,9 +30,11 @@ def test_forecast_automl(budget=5):
}
"""The main flaml automl API"""
try:
import prophet
automl.fit(dataframe=df, **settings, period=time_horizon)
except ImportError:
print("not using FBProphet due to ImportError")
print("not using prophet due to ImportError")
automl.fit(
dataframe=df,
**settings,
@ -79,7 +81,7 @@ def test_forecast_automl(budget=5):
try:
automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
except ImportError:
print("not using FBProphet due to ImportError")
print("not using prophet due to ImportError")
automl.fit(
X_train=X_train,
y_train=y_train,
@ -94,6 +96,8 @@ def test_numpy():
y_train = np.random.random(size=72)
automl = AutoML()
try:
import prophet
automl.fit(
X_train=X_train[:60], # a single column of timestamp
y_train=y_train, # value for each timestamp
@ -105,9 +109,9 @@ def test_numpy():
print(automl.predict(X_train[60:]))
print(automl.predict(12))
except ValueError:
print("ValueError for FBProphet is raised as expected.")
print("ValueError for prophet is raised as expected.")
except ImportError:
print("not using FBProphet due to ImportError")
print("not using prophet due to ImportError")
automl = AutoML()
automl.fit(
X_train=X_train[:72], # a single column of timestamp