Mirror of https://github.com/microsoft/autogen.git, synced 2025-09-03 21:37:17 +00:00

package name in setup (#198)

* package name
* learning to rank example: close #200
* try import prophet #201

parent 8f9f08cebc
commit f4529dfe89

README.md (21 lines changed)
@@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.

 ## Examples

-- A basic classification example.
+* A basic classification example.

 ```python
 from flaml import AutoML
@@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
 print(automl.model)
 ```

-- A basic regression example.
+* A basic regression example.

 ```python
 from flaml import AutoML
@@ -123,7 +123,7 @@ print(automl.predict(X_train))
 print(automl.model)
 ```

-- Time series forecasting.
+* Time series forecasting.

 ```python
 # pip install flaml[forecast]
@@ -141,14 +141,15 @@ automl.fit(X_train=X_train[:72],  # a single column of timestamp
 print(automl.predict(X_train[72:]))
 ```

-- Learning to rank.
+* Learning to rank.

 ```python
 from sklearn.datasets import fetch_openml
 from flaml import AutoML
-X, y = fetch_openml(name="credit-g", return_X_y=True)
+X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False)
+y_train = y_train.cat.codes
 # not a real learning to rank dataset
-groups = [200] * 4 + [100] * 2, # group counts
+groups = [200] * 4 + [100] * 2  # group counts
 automl = AutoML()
 automl.fit(
     X_train, y_train, groups=groups,
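The fit call above is cut off by the diff view. As a point of reference only, a minimal sketch of how such a ranking fit is typically completed; the `task` and `time_budget` argument values here are invented for illustration, not taken from this commit:

```python
automl.fit(
    X_train, y_train, groups=groups,
    task="rank",      # assumed task name for learning to rank
    time_budget=10,   # illustrative budget in seconds
)
```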
@@ -207,17 +208,21 @@ pip install -e .[test,notebook]
 ```

 ### Docker

 We provide a simple [Dockerfile](https://github.com/microsoft/FLAML/blob/main/Dockerfile).
-```
+
+```bash
 docker build git://github.com/microsoft/FLAML -t flaml-dev
 docker run -it flaml-dev
 ```

 ### Develop in Remote Container

 If you use vscode, you can open the FLAML folder in a [Container](https://code.visualstudio.com/docs/remote/containers).
-We have provided the configuration in (.devcontainer)[(https://github.com/microsoft/FLAML/blob/main/.devcontainer)].
+We have provided the configuration in [.devcontainer](https://github.com/microsoft/FLAML/blob/main/.devcontainer).

 ### Pre-commit

 Run `pre-commit install` to install pre-commit into your git hooks. Before you commit, run
 `pre-commit run` to check if you meet the pre-commit requirements. If you use Windows (without WSL) and can't commit after installing pre-commit, you can run `pre-commit uninstall` to uninstall the hook. In WSL or Linux this is supposed to work.
flaml/automl.py

@@ -1474,7 +1474,12 @@ class AutoML:

         if "auto" == estimator_list:
             if self._state.task == "forecast":
-                estimator_list = ["fbprophet", "arima", "sarimax"]
+                try:
+                    import prophet
+
+                    estimator_list = ["prophet", "arima", "sarimax"]
+                except ImportError:
+                    estimator_list = ["arima", "sarimax"]
             elif self._state.task == "rank":
                 estimator_list = ["lgbm", "xgboost"]
             else:
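The hunk above replaces a hard dependency on fbprophet with an import guard. A minimal self-contained sketch of the same optional-dependency pattern (the function name is ours, for illustration only):

```python
def default_forecast_estimators():
    # Prefer the prophet-backed estimator when the package is installed;
    # otherwise fall back to the statsmodels-based ARIMA/SARIMAX learners.
    try:
        import prophet  # noqa: F401

        return ["prophet", "arima", "sarimax"]
    except ImportError:
        return ["arima", "sarimax"]


print(default_forecast_estimators())
```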
flaml/data.py (228 lines changed)

@@ -1,7 +1,7 @@
-'''!
+"""!
  * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License.
-'''
+"""

 import numpy as np
 from scipy.sparse import vstack, issparse
@@ -11,9 +11,10 @@ from .training_log import training_log_reader
 from datetime import datetime


-def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
-                        dataset_format='dataframe'):
-    '''Load dataset from open ML.
+def load_openml_dataset(
+    dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"
+):
+    """Load dataset from open ML.

     If the file is not cached locally, download it from open ML.

@@ -30,41 +31,43 @@ def load_openml_dataset(
         X_test: Test data
         y_train: A series or array of labels for training data
         y_test: A series or array of labels for test data
-    '''
+    """
     import os
     import openml
     import pickle
     from sklearn.model_selection import train_test_split

-    filename = 'openml_ds' + str(dataset_id) + '.pkl'
+    filename = "openml_ds" + str(dataset_id) + ".pkl"
     filepath = os.path.join(data_dir, filename)
     if os.path.isfile(filepath):
-        print('load dataset from', filepath)
-        with open(filepath, 'rb') as f:
+        print("load dataset from", filepath)
+        with open(filepath, "rb") as f:
             dataset = pickle.load(f)
     else:
-        print('download dataset from openml')
+        print("download dataset from openml")
         dataset = openml.datasets.get_dataset(dataset_id)
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
-        with open(filepath, 'wb') as f:
+        with open(filepath, "wb") as f:
             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
-    print('Dataset name:', dataset.name)
-    X, y, * \
-        __ = dataset.get_data(
-            target=dataset.default_target_attribute, dataset_format=dataset_format)
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, random_state=random_state)
+    print("Dataset name:", dataset.name)
+    X, y, *__ = dataset.get_data(
+        target=dataset.default_target_attribute, dataset_format=dataset_format
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
     print(
-        'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format(
-            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
+        "X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}".format(
+            X_train.shape,
+            y_train.shape,
+            X_test.shape,
+            y_test.shape,
         )
     )
     return X_train, X_test, y_train, y_test

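For context, a typical call to this helper after the reformatting; the dataset id and directory are illustrative values, not part of the commit:

```python
from flaml.data import load_openml_dataset

# downloads the dataset on first use and caches it as a pickle in data_dir
X_train, X_test, y_train, y_test = load_openml_dataset(
    dataset_id=1169, data_dir="./data"
)
```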
 def load_openml_task(task_id, data_dir):
-    '''Load task from open ML.
+    """Load task from open ML.

     Use the first fold of the task.
     If the file is not cached locally, download it from open ML.

@@ -78,21 +81,22 @@ def load_openml_task(task_id, data_dir):
         X_test: A dataframe of test data
         y_train: A series of labels for training data
         y_test: A series of labels for test data
-    '''
+    """
     import os
     import openml
     import pickle

     task = openml.tasks.get_task(task_id)
-    filename = 'openml_task' + str(task_id) + '.pkl'
+    filename = "openml_task" + str(task_id) + ".pkl"
     filepath = os.path.join(data_dir, filename)
     if os.path.isfile(filepath):
-        print('load dataset from', filepath)
-        with open(filepath, 'rb') as f:
+        print("load dataset from", filepath)
+        with open(filepath, "rb") as f:
             dataset = pickle.load(f)
     else:
-        print('download dataset from openml')
+        print("download dataset from openml")
         dataset = task.get_dataset()
-        with open(filepath, 'wb') as f:
+        with open(filepath, "wb") as f:
             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
     X, y, _, _ = dataset.get_data(task.target_name)
     train_indices, test_indices = task.get_train_test_split_indices(
@@ -105,15 +109,18 @@ def load_openml_task(task_id, data_dir):
     X_test = X.iloc[test_indices]
     y_test = y[test_indices]
     print(
-        'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format(
-            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
+        "X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}".format(
+            X_train.shape,
+            y_train.shape,
+            X_test.shape,
+            y_test.shape,
         )
     )
     return X_train, X_test, y_train, y_test


 def get_output_from_log(filename, time_budget):
-    '''Get output from log file
+    """Get output from log file

     Args:
         filename: A string of the log file name
@@ -127,11 +134,11 @@ def get_output_from_log(filename, time_budget):
         config_list:
             A list of the estimator, sample size and config of each logged iter
         logged_metric_list: A list of the logged metric of each logged iter
-    '''
+    """

     best_config = None
     best_learner = None
-    best_val_loss = float('+inf')
+    best_val_loss = float("+inf")

     search_time_list = []
     config_list = []
@@ -144,7 +151,7 @@ def get_output_from_log(filename, time_budget):
             time_used = record.wall_clock_time
             val_loss = record.validation_loss
             config = record.config
-            learner = record.learner.split('_')[0]
+            learner = record.learner.split("_")[0]
             sample_size = record.sample_size
             metric = record.logged_metric

@@ -158,27 +165,34 @@ def get_output_from_log(filename, time_budget):
                 best_error_list.append(best_val_loss)
             logged_metric_list.append(metric)
             error_list.append(val_loss)
-            config_list.append({"Current Learner": learner,
-                                "Current Sample": sample_size,
-                                "Current Hyper-parameters": record.config,
-                                "Best Learner": best_learner,
-                                "Best Hyper-parameters": best_config})
+            config_list.append(
+                {
+                    "Current Learner": learner,
+                    "Current Sample": sample_size,
+                    "Current Hyper-parameters": record.config,
+                    "Best Learner": best_learner,
+                    "Best Hyper-parameters": best_config,
+                }
+            )

-    return (search_time_list, best_error_list, error_list, config_list,
-            logged_metric_list)
+    return (
+        search_time_list,
+        best_error_list,
+        error_list,
+        config_list,
+        logged_metric_list,
+    )

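A sketch of how the reshaped return value is typically consumed; the log file name and budget below are illustrative:

```python
from flaml.data import get_output_from_log

(
    search_time_list,
    best_error_list,
    error_list,
    config_list,
    logged_metric_list,
) = get_output_from_log("automl.log", time_budget=60)
# e.g. plot best_error_list against search_time_list to inspect convergence
```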
 def concat(X1, X2):
-    '''concatenate two matrices vertically
-    '''
+    """concatenate two matrices vertically"""
     if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series):
         df = pd.concat([X1, X2], sort=False)
         df.reset_index(drop=True, inplace=True)
         if isinstance(X1, pd.DataFrame):
-            cat_columns = X1.select_dtypes(
-                include='category').columns
+            cat_columns = X1.select_dtypes(include="category").columns
             if len(cat_columns):
-                df[cat_columns] = df[cat_columns].astype('category')
+                df[cat_columns] = df[cat_columns].astype("category")
         return df
     if issparse(X1):
         return vstack((X1, X2))
@@ -187,8 +201,7 @@ def concat(X1, X2):


 class DataTransformer:
-    '''transform X, y
-    '''
+    """transform X, y"""

     def fit_transform(self, X, y, task):
         if isinstance(X, pd.DataFrame):
@@ -198,19 +211,25 @@ class DataTransformer:
             drop = False
             for column in X.columns:
                 # sklearn\utils\validation.py needs int/float values
-                if X[column].dtype.name in ('object', 'category'):
-                    if X[column].nunique() == 1 or X[column].nunique(
-                            dropna=True) == n - X[column].isnull().sum():
+                if X[column].dtype.name in ("object", "category"):
+                    if (
+                        X[column].nunique() == 1
+                        or X[column].nunique(dropna=True)
+                        == n - X[column].isnull().sum()
+                    ):
                         X.drop(columns=column, inplace=True)
                         drop = True
-                    elif X[column].dtype.name == 'category':
+                    elif X[column].dtype.name == "category":
                         current_categories = X[column].cat.categories
-                        if '__NAN__' not in current_categories:
-                            X[column] = X[column].cat.add_categories(
-                                '__NAN__').fillna('__NAN__')
+                        if "__NAN__" not in current_categories:
+                            X[column] = (
+                                X[column]
+                                .cat.add_categories("__NAN__")
+                                .fillna("__NAN__")
+                            )
                         cat_columns.append(column)
                     else:
-                        X[column] = X[column].fillna('__NAN__')
+                        X[column] = X[column].fillna("__NAN__")
                         cat_columns.append(column)
                 else:
                     # print(X[column].dtype.name)
@@ -218,17 +237,27 @@ class DataTransformer:
                     X.drop(columns=column, inplace=True)
                     drop = True
                 else:
-                    if X[column].dtype.name == 'datetime64[ns]':
+                    if X[column].dtype.name == "datetime64[ns]":
                         tmp_dt = X[column].dt
-                        new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month,
-                                            f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour,
-                                            f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second,
-                                            f'dayofweek_{column}': tmp_dt.dayofweek,
-                                            f'dayofyear_{column}': tmp_dt.dayofyear,
-                                            f'quarter_{column}': tmp_dt.quarter}
+                        new_columns_dict = {
+                            f"year_{column}": tmp_dt.year,
+                            f"month_{column}": tmp_dt.month,
+                            f"day_{column}": tmp_dt.day,
+                            f"hour_{column}": tmp_dt.hour,
+                            f"minute_{column}": tmp_dt.minute,
+                            f"second_{column}": tmp_dt.second,
+                            f"dayofweek_{column}": tmp_dt.dayofweek,
+                            f"dayofyear_{column}": tmp_dt.dayofyear,
+                            f"quarter_{column}": tmp_dt.quarter,
+                        }
                         for new_col_name in new_columns_dict.keys():
-                            if new_col_name not in X.columns and \
-                                    new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2:
+                            if (
+                                new_col_name not in X.columns
+                                and new_columns_dict.get(new_col_name).nunique(
+                                    dropna=False
+                                )
+                                >= 2
+                            ):
                                 X[new_col_name] = new_columns_dict.get(new_col_name)
                                 num_columns.append(new_col_name)
                         X[column] = X[column].map(datetime.toordinal)
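A small standalone illustration of the datetime expansion performed above; the data is made up for the example:

```python
import pandas as pd
from datetime import datetime

X = pd.DataFrame(
    {"ts": pd.to_datetime(["2021-01-01 10:30", "2021-06-15 22:05", "2021-12-31 03:45"])}
)
tmp_dt = X["ts"].dt
# two of the derived columns; the code above creates year/month/day/... analogously
X["month_ts"] = tmp_dt.month
X["hour_ts"] = tmp_dt.hour
# the original datetime column is then mapped to its ordinal value
X["ts"] = X["ts"].map(datetime.toordinal)
print(X)
```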
@@ -239,11 +268,12 @@ class DataTransformer:
                     num_columns.append(column)
             X = X[cat_columns + num_columns]
             if cat_columns:
-                X[cat_columns] = X[cat_columns].astype('category')
+                X[cat_columns] = X[cat_columns].astype("category")
             if num_columns:
                 X_num = X[num_columns]
                 if np.issubdtype(X_num.columns.dtype, np.integer) and (
-                    drop or min(X_num.columns) != 0
+                    drop
+                    or min(X_num.columns) != 0
                     or max(X_num.columns) != X_num.shape[1] - 1
                 ):
                     X_num.columns = range(X_num.shape[1])
@@ -252,17 +282,31 @@ class DataTransformer:
                     drop = False
                 from sklearn.impute import SimpleImputer
                 from sklearn.compose import ColumnTransformer
-                self.transformer = ColumnTransformer([(
-                    'continuous',
-                    SimpleImputer(missing_values=np.nan, strategy='median'),
-                    X_num.columns)])
+
+                self.transformer = ColumnTransformer(
+                    [
+                        (
+                            "continuous",
+                            SimpleImputer(missing_values=np.nan, strategy="median"),
+                            X_num.columns,
+                        )
+                    ]
+                )
                 X[num_columns] = self.transformer.fit_transform(X_num)
-            self._cat_columns, self._num_columns, self._datetime_columns = \
-                cat_columns, num_columns, datetime_columns
+            self._cat_columns, self._num_columns, self._datetime_columns = (
+                cat_columns,
+                num_columns,
+                datetime_columns,
+            )
             self._drop = drop

-        if task in ('binary', 'multi', 'classification'):
+        if task in (
+            "binary",
+            "multi",
+            "classification",
+        ) or not pd.api.types.is_numeric_dtype(y):
             from sklearn.preprocessing import LabelEncoder
+
             self.label_transformer = LabelEncoder()
             y = self.label_transformer.fit_transform(y)
         else:
@@ -272,34 +316,46 @@ class DataTransformer:
     def transform(self, X):
         X = X.copy()
         if isinstance(X, pd.DataFrame):
-            cat_columns, num_columns, datetime_columns = self._cat_columns, \
-                self._num_columns, self._datetime_columns
+            cat_columns, num_columns, datetime_columns = (
+                self._cat_columns,
+                self._num_columns,
+                self._datetime_columns,
+            )
             if datetime_columns:
                 for column in datetime_columns:
                     tmp_dt = X[column].dt
-                    new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month,
-                                        f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour,
-                                        f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second,
-                                        f'dayofweek_{column}': tmp_dt.dayofweek,
-                                        f'dayofyear_{column}': tmp_dt.dayofyear,
-                                        f'quarter_{column}': tmp_dt.quarter}
+                    new_columns_dict = {
+                        f"year_{column}": tmp_dt.year,
+                        f"month_{column}": tmp_dt.month,
+                        f"day_{column}": tmp_dt.day,
+                        f"hour_{column}": tmp_dt.hour,
+                        f"minute_{column}": tmp_dt.minute,
+                        f"second_{column}": tmp_dt.second,
+                        f"dayofweek_{column}": tmp_dt.dayofweek,
+                        f"dayofyear_{column}": tmp_dt.dayofyear,
+                        f"quarter_{column}": tmp_dt.quarter,
+                    }
                     for new_col_name in new_columns_dict.keys():
-                        if new_col_name not in X.columns and \
-                                new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2:
+                        if (
+                            new_col_name not in X.columns
+                            and new_columns_dict.get(new_col_name).nunique(dropna=False)
+                            >= 2
+                        ):
                             X[new_col_name] = new_columns_dict.get(new_col_name)
                     X[column] = X[column].map(datetime.toordinal)
                 del tmp_dt
             X = X[cat_columns + num_columns].copy()
             for column in cat_columns:
-                if X[column].dtype.name == 'object':
-                    X[column] = X[column].fillna('__NAN__')
-                elif X[column].dtype.name == 'category':
+                if X[column].dtype.name == "object":
+                    X[column] = X[column].fillna("__NAN__")
+                elif X[column].dtype.name == "category":
                     current_categories = X[column].cat.categories
-                    if '__NAN__' not in current_categories:
-                        X[column] = X[column].cat.add_categories(
-                            '__NAN__').fillna('__NAN__')
+                    if "__NAN__" not in current_categories:
+                        X[column] = (
+                            X[column].cat.add_categories("__NAN__").fillna("__NAN__")
+                        )
             if cat_columns:
-                X[cat_columns] = X[cat_columns].astype('category')
+                X[cat_columns] = X[cat_columns].astype("category")
             if num_columns:
                 X_num = X[num_columns].fillna(np.nan)
                 if self._drop:
flaml/ml.py (424 lines changed)

@@ -1,65 +1,90 @@
-'''!
- * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
+"""!
+ * Copyright (c) Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License.
-'''
+"""

 import time
 import numpy as np
 import pandas as pd
-from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
-    accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
-    f1_score, mean_absolute_percentage_error, ndcg_score
+from sklearn.metrics import (
+    mean_squared_error,
+    r2_score,
+    roc_auc_score,
+    accuracy_score,
+    mean_absolute_error,
+    log_loss,
+    average_precision_score,
+    f1_score,
+    mean_absolute_percentage_error,
+    ndcg_score,
+)
 from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
 from .model import (
-    XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
-    LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
-    ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
+    XGBoostEstimator,
+    XGBoostSklearnEstimator,
+    RandomForestEstimator,
+    LGBMEstimator,
+    LRL1Classifier,
+    LRL2Classifier,
+    CatBoostEstimator,
+    ExtraTreeEstimator,
+    KNeighborsEstimator,
+    Prophet,
+    ARIMA,
+    SARIMAX,
+)
 from .data import group_counts

 import logging

 logger = logging.getLogger(__name__)


 def get_estimator_class(task, estimator_name):
-    ''' when adding a new learner, need to add an elif branch '''
+    """when adding a new learner, need to add an elif branch"""

-    if 'xgboost' == estimator_name:
-        if 'regression' == task:
+    if "xgboost" == estimator_name:
+        if "regression" == task:
             estimator_class = XGBoostEstimator
         else:
             estimator_class = XGBoostSklearnEstimator
-    elif 'rf' == estimator_name:
+    elif "rf" == estimator_name:
         estimator_class = RandomForestEstimator
-    elif 'lgbm' == estimator_name:
+    elif "lgbm" == estimator_name:
         estimator_class = LGBMEstimator
-    elif 'lrl1' == estimator_name:
+    elif "lrl1" == estimator_name:
         estimator_class = LRL1Classifier
-    elif 'lrl2' == estimator_name:
+    elif "lrl2" == estimator_name:
         estimator_class = LRL2Classifier
-    elif 'catboost' == estimator_name:
+    elif "catboost" == estimator_name:
         estimator_class = CatBoostEstimator
-    elif 'extra_tree' == estimator_name:
+    elif "extra_tree" == estimator_name:
         estimator_class = ExtraTreeEstimator
-    elif 'kneighbor' == estimator_name:
+    elif "kneighbor" == estimator_name:
         estimator_class = KNeighborsEstimator
-    elif 'prophet' in estimator_name:
-        estimator_class = FBProphet
-    elif estimator_name == 'arima':
+    elif "prophet" in estimator_name:
+        estimator_class = Prophet
+    elif estimator_name == "arima":
         estimator_class = ARIMA
-    elif estimator_name == 'sarimax':
+    elif estimator_name == "sarimax":
         estimator_class = SARIMAX
     else:
         raise ValueError(
-            estimator_name + ' is not a built-in learner. '
-            'Please use AutoML.add_learner() to add a customized learner.')
+            estimator_name + " is not a built-in learner. "
+            "Please use AutoML.add_learner() to add a customized learner."
+        )
     return estimator_class


 def sklearn_metric_loss_score(
-    metric_name, y_predict, y_true, labels=None, sample_weight=None,
+    metric_name,
+    y_predict,
+    y_true,
+    labels=None,
+    sample_weight=None,
     groups=None,
 ):
-    '''Loss using the specified metric
+    """Loss using the specified metric

     Args:
         metric_name: A string of the metric name, one of
@@ -76,60 +101,63 @@ def sklearn_metric_loss_score(

     Returns:
         score: A float number of the loss, the lower the better.
-    '''
+    """
     metric_name = metric_name.lower()
-    if 'r2' == metric_name:
+    if "r2" == metric_name:
         score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'rmse':
-        score = np.sqrt(mean_squared_error(
-            y_true, y_predict, sample_weight=sample_weight))
-    elif metric_name == 'mae':
-        score = mean_absolute_error(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'mse':
-        score = mean_squared_error(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'accuracy':
-        score = 1.0 - accuracy_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'roc_auc':
-        score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'roc_auc_ovr':
-        score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight, multi_class='ovr')
-    elif metric_name == 'roc_auc_ovo':
-        score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight, multi_class='ovo')
-    elif 'log_loss' == metric_name:
-        score = log_loss(
-            y_true, y_predict, labels=labels, sample_weight=sample_weight)
-    elif 'mape' == metric_name:
+    elif metric_name == "rmse":
+        score = np.sqrt(
+            mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
+        )
+    elif metric_name == "mae":
+        score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "mse":
+        score = mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "accuracy":
+        score = 1.0 - accuracy_score(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "roc_auc":
+        score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "roc_auc_ovr":
+        score = 1.0 - roc_auc_score(
+            y_true, y_predict, sample_weight=sample_weight, multi_class="ovr"
+        )
+    elif metric_name == "roc_auc_ovo":
+        score = 1.0 - roc_auc_score(
+            y_true, y_predict, sample_weight=sample_weight, multi_class="ovo"
+        )
+    elif "log_loss" == metric_name:
+        score = log_loss(y_true, y_predict, labels=labels, sample_weight=sample_weight)
+    elif "mape" == metric_name:
         try:
-            score = mean_absolute_percentage_error(
-                y_true, y_predict)
+            score = mean_absolute_percentage_error(y_true, y_predict)
         except ValueError:
             return np.inf
-    elif 'micro_f1' == metric_name:
+    elif "micro_f1" == metric_name:
         score = 1 - f1_score(
-            y_true, y_predict, sample_weight=sample_weight, average='micro')
-    elif 'macro_f1' == metric_name:
+            y_true, y_predict, sample_weight=sample_weight, average="micro"
+        )
+    elif "macro_f1" == metric_name:
         score = 1 - f1_score(
-            y_true, y_predict, sample_weight=sample_weight, average='macro')
-    elif 'f1' == metric_name:
+            y_true, y_predict, sample_weight=sample_weight, average="macro"
+        )
+    elif "f1" == metric_name:
         score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
-    elif 'ap' == metric_name:
+    elif "ap" == metric_name:
         score = 1 - average_precision_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif 'ndcg' in metric_name:
-        if '@' in metric_name:
-            k = int(metric_name.split('@', 1)[-1])
+            y_true, y_predict, sample_weight=sample_weight
+        )
+    elif "ndcg" in metric_name:
+        if "@" in metric_name:
+            k = int(metric_name.split("@", 1)[-1])
             counts = group_counts(groups)
             score = 0
             psum = 0
             for c in counts:
-                score -= ndcg_score(np.asarray([y_true[psum:psum + c]]),
-                                    np.asarray([y_predict[psum:psum + c]]), k=k)
+                score -= ndcg_score(
+                    np.asarray([y_true[psum : psum + c]]),
+                    np.asarray([y_predict[psum : psum + c]]),
+                    k=k,
+                )
                 psum += c
             score /= len(counts)
             score += 1
@@ -137,56 +165,96 @@ def sklearn_metric_loss_score(
             score = 1 - ndcg_score([y_true], [y_predict])
     else:
         raise ValueError(
-            metric_name + ' is not a built-in metric, '
-            'currently built-in metrics are: '
-            'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
-            'log_loss, mape, f1, micro_f1, macro_f1, ap. '
-            'please pass a customized metric function to AutoML.fit(metric=func)')
+            metric_name + " is not a built-in metric, "
+            "currently built-in metrics are: "
+            "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,"
+            "log_loss, mape, f1, micro_f1, macro_f1, ap. "
+            "please pass a customized metric function to AutoML.fit(metric=func)"
+        )
     return score

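A worked example of the grouped ndcg@k loss computed in the branch above; the data and group sizes are invented for illustration:

```python
import numpy as np
from sklearn.metrics import ndcg_score

y_true = np.array([3, 2, 1, 0, 1, 0])                # graded relevance labels
y_predict = np.array([0.9, 0.8, 0.1, 0.2, 0.7, 0.3])  # predicted scores
counts = [4, 2]  # two query groups, as group_counts would return

score = 0
psum = 0
for c in counts:
    # ndcg_score expects 2D inputs: one row per query group
    score -= ndcg_score(
        np.asarray([y_true[psum : psum + c]]),
        np.asarray([y_predict[psum : psum + c]]),
        k=2,
    )
    psum += c
score /= len(counts)
score += 1  # loss = 1 - average ndcg@2 over groups
print(score)
```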
 def get_y_pred(estimator, X, eval_metric, obj):
-    if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
+    if eval_metric in ["roc_auc", "ap"] and "binary" in obj:
         y_pred_classes = estimator.predict_proba(X)
-        y_pred = y_pred_classes[
-            :, 1] if y_pred_classes.ndim > 1 else y_pred_classes
-    elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
+        y_pred = y_pred_classes[:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
+    elif eval_metric in ["log_loss", "roc_auc", "roc_auc_ovr", "roc_auc_ovo"]:
         y_pred = estimator.predict_proba(X)
     else:
         y_pred = estimator.predict(X)
     return y_pred

-def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_test,
-                    groups_test, eval_metric, obj, labels=None,
-                    log_training_metric=False, fit_kwargs={}):
+def _eval_estimator(
+    config,
+    estimator,
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    weight_test,
+    groups_test,
+    eval_metric,
+    obj,
+    labels=None,
+    log_training_metric=False,
+    fit_kwargs={},
+):
     if isinstance(eval_metric, str):
         pred_start = time.time()
         test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
         pred_time = (time.time() - pred_start) / X_test.shape[0]
-        test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
-                                              labels, weight_test, groups_test)
+        test_loss = sklearn_metric_loss_score(
+            eval_metric, test_pred_y, y_test, labels, weight_test, groups_test
+        )
         metric_for_logging = {}
         if log_training_metric:
             train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
-            metric_for_logging['train_loss'] = sklearn_metric_loss_score(
-                eval_metric, train_pred_y, y_train, labels,
-                fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
+            metric_for_logging["train_loss"] = sklearn_metric_loss_score(
+                eval_metric,
+                train_pred_y,
+                y_train,
+                labels,
+                fit_kwargs.get("sample_weight"),
+                fit_kwargs.get("groups"),
+            )
     else:  # customized metric function
         test_loss, metric_for_logging = eval_metric(
-            X_test, y_test, estimator, labels, X_train, y_train, weight_test,
-            fit_kwargs.get('sample_weight'), config, groups_test,
-            fit_kwargs.get('groups'))
+            X_test,
+            y_test,
+            estimator,
+            labels,
+            X_train,
+            y_train,
+            weight_test,
+            fit_kwargs.get("sample_weight"),
+            config,
+            groups_test,
+            fit_kwargs.get("groups"),
+        )
         if isinstance(metric_for_logging, dict):
-            pred_time = metric_for_logging.get('pred_time', 0)
+            pred_time = metric_for_logging.get("pred_time", 0)
         test_pred_y = None
         # eval_metric may return test_pred_y but not necessarily. Setting None for now.
     return test_loss, metric_for_logging, pred_time, test_pred_y


-def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
-                  groups_test, eval_metric, obj, labels=None, budget=None,
-                  log_training_metric=False, fit_kwargs={}):
+def get_test_loss(
+    config,
+    estimator,
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    weight_test,
+    groups_test,
+    eval_metric,
+    obj,
+    labels=None,
+    budget=None,
+    log_training_metric=False,
+    fit_kwargs={},
+):

     start = time.time()
     # if groups_test is not None:
@@ -195,16 +263,37 @@ def get_test_loss(
     #     fit_kwargs['y_val'] = y_test
     estimator.fit(X_train, y_train, budget, **fit_kwargs)
     test_loss, metric_for_logging, pred_time, _ = _eval_estimator(
-        config, estimator, X_train, y_train, X_test, y_test,
-        weight_test, groups_test, eval_metric, obj,
-        labels, log_training_metric, fit_kwargs)
+        config,
+        estimator,
+        X_train,
+        y_train,
+        X_test,
+        y_test,
+        weight_test,
+        groups_test,
+        eval_metric,
+        obj,
+        labels,
+        log_training_metric,
+        fit_kwargs,
+    )
     train_time = time.time() - start
     return test_loss, metric_for_logging, train_time, pred_time


-def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
-                      task, eval_metric, best_val_loss,
-                      log_training_metric=False, fit_kwargs={}):
+def evaluate_model_CV(
+    config,
+    estimator,
+    X_train_all,
+    y_train_all,
+    budget,
+    kf,
+    task,
+    eval_metric,
+    best_val_loss,
+    log_training_metric=False,
+    fit_kwargs={},
+):
     start_time = time.time()
     total_val_loss = 0
     total_metric = None
@@ -213,7 +302,7 @@ def evaluate_model_CV(
     valid_fold_num = total_fold_num = 0
     n = kf.get_n_splits()
     X_train_split, y_train_split = X_train_all, y_train_all
-    if task in ('binary', 'multi'):
+    if task in ("binary", "multi"):
         labels = np.unique(y_train_all)
     else:
         labels = None
@@ -225,8 +314,8 @@ def evaluate_model_CV(
         groups = kf.groups
         kf = kf.split(X_train_split, y_train_split, groups)
         shuffle = False
-    elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
-        y_train_all = pd.DataFrame(y_train_all, columns=['y'])
+    elif isinstance(kf, TimeSeriesSplit) and task == "forecast":
+        y_train_all = pd.DataFrame(y_train_all, columns=["y"])
         train = X_train_all.join(y_train_all)
         kf = kf.split(train)
         shuffle = False
@@ -237,8 +326,8 @@ def evaluate_model_CV(
     rng = np.random.RandomState(2020)
     val_loss_list = []
     budget_per_train = budget / n
-    if 'sample_weight' in fit_kwargs:
-        weight = fit_kwargs['sample_weight']
+    if "sample_weight" in fit_kwargs:
+        weight = fit_kwargs["sample_weight"]
         weight_val = None
     else:
         weight = weight_val = None
@@ -246,37 +335,48 @@ def evaluate_model_CV(
         if shuffle:
             train_index = rng.permutation(train_index)
         if isinstance(X_train_all, pd.DataFrame):
-            X_train, X_val = X_train_split.iloc[
-                train_index], X_train_split.iloc[val_index]
+            X_train = X_train_split.iloc[train_index]
+            X_val = X_train_split.iloc[val_index]
         else:
-            X_train, X_val = X_train_split[
-                train_index], X_train_split[val_index]
+            X_train, X_val = X_train_split[train_index], X_train_split[val_index]
         y_train, y_val = y_train_split[train_index], y_train_split[val_index]
         estimator.cleanup()
         if weight is not None:
-            fit_kwargs['sample_weight'], weight_val = weight[
-                train_index], weight[val_index]
+            fit_kwargs["sample_weight"], weight_val = (
+                weight[train_index],
+                weight[val_index],
+            )
         if groups is not None:
-            fit_kwargs['groups'] = groups[train_index]
+            fit_kwargs["groups"] = groups[train_index]
             groups_val = groups[val_index]
         else:
             groups_val = None
         val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
-            config, estimator, X_train, y_train, X_val, y_val, weight_val,
-            groups_val, eval_metric, task, labels, budget_per_train,
-            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
+            config,
+            estimator,
+            X_train,
+            y_train,
+            X_val,
+            y_val,
+            weight_val,
+            groups_val,
+            eval_metric,
+            task,
+            labels,
+            budget_per_train,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
         if weight is not None:
-            fit_kwargs['sample_weight'] = weight
+            fit_kwargs["sample_weight"] = weight
         valid_fold_num += 1
         total_fold_num += 1
         total_val_loss += val_loss_i
         if log_training_metric or not isinstance(eval_metric, str):
             if isinstance(total_metric, list):
-                total_metric = [
-                    total_metric[i] + v for i, v in enumerate(metric_i)]
+                total_metric = [total_metric[i] + v for i, v in enumerate(metric_i)]
             elif isinstance(total_metric, dict):
-                total_metric = {
-                    k: total_metric[k] + v for k, v in metric_i.items()}
+                total_metric = {k: total_metric[k] + v for k, v in metric_i.items()}
             elif total_metric is not None:
                 total_metric += metric_i
             else:
@@ -307,35 +407,73 @@ def evaluate_model_CV(


 def compute_estimator(
-    X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf,
-    config_dic, task, estimator_name, eval_method, eval_metric,
-    best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
-    fit_kwargs={}
+    X_train,
+    y_train,
+    X_val,
+    y_val,
+    weight_val,
+    groups_val,
+    budget,
+    kf,
+    config_dic,
+    task,
+    estimator_name,
+    eval_method,
+    eval_metric,
+    best_val_loss=np.Inf,
+    n_jobs=1,
+    estimator_class=None,
+    log_training_metric=False,
+    fit_kwargs={},
 ):
-    estimator_class = estimator_class or get_estimator_class(
-        task, estimator_name)
-    estimator = estimator_class(
-        **config_dic, task=task, n_jobs=n_jobs)
-    if 'holdout' in eval_method:
+    estimator_class = estimator_class or get_estimator_class(task, estimator_name)
+    estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
+    if "holdout" in eval_method:
         val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
-            config_dic, estimator, X_train, y_train, X_val, y_val, weight_val,
-            groups_val, eval_metric, task, budget=budget,
-            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
+            config_dic,
+            estimator,
+            X_train,
+            y_train,
+            X_val,
+            y_val,
+            weight_val,
+            groups_val,
+            eval_metric,
+            task,
+            budget=budget,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
     else:
         val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
-            config_dic, estimator, X_train, y_train, budget, kf, task,
-            eval_metric, best_val_loss, log_training_metric=log_training_metric,
-            fit_kwargs=fit_kwargs)
+            config_dic,
+            estimator,
+            X_train,
+            y_train,
+            budget,
+            kf,
+            task,
+            eval_metric,
+            best_val_loss,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
     return estimator, val_loss, metric_for_logging, train_time, pred_time


 def train_estimator(
-    X_train, y_train, config_dic, task,
-    estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={}
+    X_train,
+    y_train,
+    config_dic,
+    task,
+    estimator_name,
+    n_jobs=1,
+    estimator_class=None,
+    budget=None,
+    fit_kwargs={},
 ):
     start_time = time.time()
-    estimator_class = estimator_class or get_estimator_class(
-        task, estimator_name)
+    estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
     if X_train is not None:
         train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
@@ -347,14 +485,14 @@ def train_estimator(

 def get_classification_objective(num_labels: int) -> str:
     if num_labels == 2:
-        objective_name = 'binary'
+        objective_name = "binary"
     else:
-        objective_name = 'multi'
+        objective_name = "multi"
     return objective_name


 def norm_confusion_matrix(y_true, y_pred):
-    '''normalized confusion matrix
+    """normalized confusion matrix

     Args:
         estimator: A multi-class classification estimator
@@ -363,15 +501,16 @@ def norm_confusion_matrix(y_true, y_pred):

     Returns:
         A normalized confusion matrix
-    '''
+    """
     from sklearn.metrics import confusion_matrix

     conf_mat = confusion_matrix(y_true, y_pred)
-    norm_conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]
+    norm_conf_mat = conf_mat.astype("float") / conf_mat.sum(axis=1)[:, np.newaxis]
     return norm_conf_mat


 def multi_class_curves(y_true, y_pred_proba, curve_func):
-    '''Binarize the data for multi-class tasks and produce ROC or precision-recall curves
+    """Binarize the data for multi-class tasks and produce ROC or precision-recall curves

     Args:
         y_true: A numpy array or a pandas series of true labels
@@ -384,8 +523,9 @@ def multi_class_curves(y_true, y_pred_proba, curve_func):
         curve_x[0] is an 1D array of the x coordinates of class 0
         The second dictionary curve_y stores the y coordinates of each curve, e.g.,
         curve_y[0] is an 1D array of the y coordinates of class 0
-    '''
+    """
     from sklearn.preprocessing import label_binarize

     classes = np.unique(y_true)
     y_true_binary = label_binarize(y_true, classes=classes)
flaml/model.py (873 lines changed)

File diff suppressed because it is too large.
flaml/version.py

@@ -1 +1 @@
-__version__ = "0.6.3"
+__version__ = "0.6.4"
setup.py (2 lines changed)

@@ -32,7 +32,7 @@ setuptools.setup(
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/microsoft/FLAML",
-    packages=setuptools.find_packages(),
+    packages=setuptools.find_packages(include=["flaml*"]),
     install_requires=install_requires,
     extras_require={
         "notebook": [
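The change above narrows what gets shipped in the distribution: without the include filter, find_packages() can also pick up sibling packages such as test directories. A quick way to see the difference from a repo checkout; the exact output depends on the working tree:

```python
import setuptools

print(setuptools.find_packages())                    # may include test/ and other packages
print(setuptools.find_packages(include=["flaml*"]))  # only flaml and its subpackages
```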
test/test_forecast.py

@@ -30,9 +30,11 @@ def test_forecast_automl(budget=5):
     }
     """The main flaml automl API"""
     try:
+        import prophet
+
         automl.fit(dataframe=df, **settings, period=time_horizon)
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl.fit(
             dataframe=df,
             **settings,
@@ -79,7 +81,7 @@ def test_forecast_automl(budget=5):
     try:
         automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl.fit(
             X_train=X_train,
             y_train=y_train,
@@ -94,6 +96,8 @@ def test_numpy():
     y_train = np.random.random(size=72)
     automl = AutoML()
     try:
+        import prophet
+
         automl.fit(
             X_train=X_train[:60],  # a single column of timestamp
             y_train=y_train,  # value for each timestamp
@@ -105,9 +109,9 @@ def test_numpy():
         print(automl.predict(X_train[60:]))
         print(automl.predict(12))
     except ValueError:
-        print("ValueError for FBProphet is raised as expected.")
+        print("ValueError for prophet is raised as expected.")
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl = AutoML()
         automl.fit(
             X_train=X_train[:72],  # a single column of timestamp