Mirror of https://github.com/microsoft/autogen.git, synced 2025-09-03 21:37:17 +00:00
package name in setup (#198)
* package name
* learning to rank example: close #200
* try import prophet #201
This commit is contained in: parent 8f9f08cebc, commit f4529dfe89
README.md (21 lines changed)
@@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.

 ## Examples

-- A basic classification example.
+* A basic classification example.

 ```python
 from flaml import AutoML
@@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
 print(automl.model)
 ```

-- A basic regression example.
+* A basic regression example.

 ```python
 from flaml import AutoML
@@ -123,7 +123,7 @@ print(automl.predict(X_train))
 print(automl.model)
 ```

-- Time series forecasting.
+* Time series forecasting.

 ```python
 # pip install flaml[forecast]
@@ -141,14 +141,15 @@ automl.fit(X_train=X_train[:72],  # a single column of timestamp
 print(automl.predict(X_train[72:]))
 ```

-- Learning to rank.
+* Learning to rank.

 ```python
 from sklearn.datasets import fetch_openml
 from flaml import AutoML
-X, y = fetch_openml(name="credit-g", return_X_y=True)
+X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False)
+y_train = y_train.cat.codes
 # not a real learning to rank dataaset
-groups = [200] * 4 + [100] * 2,  # group counts
+groups = [200] * 4 + [100] * 2  # group counts
 automl = AutoML()
 automl.fit(
     X_train, y_train, groups=groups,
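For context, the corrected README snippet assembles into a script like the following. This is a hedged, runnable sketch rather than the commit's exact example: the `pd.Series(...).cat.codes` conversion and the `task="rank"`/`time_budget` arguments are illustrative additions, and, as the README's own comment says, credit-g is not a real ranking dataset and the group sizes are made up (they sum to its 1000 rows).

```python
from sklearn.datasets import fetch_openml
import pandas as pd
from flaml import AutoML

X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False)
# labels arrive as strings; map them to integer relevance codes for the ranker
y_train = pd.Series(y_train).astype("category").cat.codes

groups = [200] * 4 + [100] * 2  # query group sizes; must sum to len(X_train)
automl = AutoML()
automl.fit(
    X_train, y_train, groups=groups,
    task="rank", time_budget=10,  # seconds; illustrative budget
)
```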
@@ -207,17 +208,21 @@ pip install -e .[test,notebook]
 ```

 ### Docker

 We provide a simple [Dockerfile](https://github.com/microsoft/FLAML/blob/main/Dockerfile).
-```
+
+```bash
 docker build git://github.com/microsoft/FLAML -t flaml-dev
 docker run -it flaml-dev
 ```

 ### Develop in Remote Container

 If you use vscode, you can open the FLAML folder in a [Container](https://code.visualstudio.com/docs/remote/containers).
-We have provided the configuration in (.devcontainer)[(https://github.com/microsoft/FLAML/blob/main/.devcontainer)].
+We have provided the configuration in [.devcontainer]((https://github.com/microsoft/FLAML/blob/main/.devcontainer)).

 ### Pre-commit

 Run `pre-commit install` to install pre-commit into your git hooks. Before you commit, run
 `pre-commit run` to check if you meet the pre-commit requirements. If you use Windows (without WSL) and can't commit after installing pre-commit, you can run `pre-commit uninstall` to uninstall the hook. In WSL or Linux this is supposed to work.

flaml/automl.py

@@ -1474,7 +1474,12 @@ class AutoML:

         if "auto" == estimator_list:
             if self._state.task == "forecast":
-                estimator_list = ["fbprophet", "arima", "sarimax"]
+                try:
+                    import prophet
+
+                    estimator_list = ["prophet", "arima", "sarimax"]
+                except ImportError:
+                    estimator_list = ["arima", "sarimax"]
             elif self._state.task == "rank":
                 estimator_list = ["lgbm", "xgboost"]
             else:
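This hunk is the "#201 try import prophet" change: instead of unconditionally listing a learner whose package may not be installed, the default estimator list is chosen behind a guarded import. A minimal sketch of that pattern (the function name below is hypothetical, not FLAML's code):

```python
def default_forecast_estimators():
    """Pick forecasting learners based on which optional packages import.

    A sketch of the guarded-import pattern used in the hunk above.
    """
    try:
        import prophet  # optional dependency: pip install flaml[forecast]

        return ["prophet", "arima", "sarimax"]
    except ImportError:
        # degrade gracefully instead of failing later at fit() time
        return ["arima", "sarimax"]
```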
234
flaml/data.py
234
flaml/data.py
@@ -1,7 +1,7 @@
-'''!
+"""!
  * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License.
-'''
+"""

 import numpy as np
 from scipy.sparse import vstack, issparse
@@ -11,9 +11,10 @@ from .training_log import training_log_reader
 from datetime import datetime


-def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
-                        dataset_format='dataframe'):
-    '''Load dataset from open ML.
+def load_openml_dataset(
+    dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"
+):
+    """Load dataset from open ML.

     If the file is not cached locally, download it from open ML.

@@ -30,41 +31,43 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
         X_test: Test data
         y_train: A series or array of labels for training data
         y_test: A series or array of labels for test data
-    '''
+    """
     import os
     import openml
     import pickle
     from sklearn.model_selection import train_test_split

-    filename = 'openml_ds' + str(dataset_id) + '.pkl'
+    filename = "openml_ds" + str(dataset_id) + ".pkl"
     filepath = os.path.join(data_dir, filename)
     if os.path.isfile(filepath):
-        print('load dataset from', filepath)
-        with open(filepath, 'rb') as f:
+        print("load dataset from", filepath)
+        with open(filepath, "rb") as f:
             dataset = pickle.load(f)
     else:
-        print('download dataset from openml')
+        print("download dataset from openml")
         dataset = openml.datasets.get_dataset(dataset_id)
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
-        with open(filepath, 'wb') as f:
+        with open(filepath, "wb") as f:
             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
-    print('Dataset name:', dataset.name)
-    X, y, * \
-        __ = dataset.get_data(
-            target=dataset.default_target_attribute, dataset_format=dataset_format)
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, random_state=random_state)
+    print("Dataset name:", dataset.name)
+    X, y, *__ = dataset.get_data(
+        target=dataset.default_target_attribute, dataset_format=dataset_format
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
     print(
-        'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format(
-            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
+        "X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}".format(
+            X_train.shape,
+            y_train.shape,
+            X_test.shape,
+            y_test.shape,
         )
     )
     return X_train, X_test, y_train, y_test


 def load_openml_task(task_id, data_dir):
-    '''Load task from open ML.
+    """Load task from open ML.

     Use the first fold of the task.
     If the file is not cached locally, download it from open ML.
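For orientation, this helper only changed formatting here, not behavior. A typical call looks like the sketch below; the dataset id and cache directory are arbitrary examples, not part of the commit:

```python
from flaml.data import load_openml_dataset

# downloads once from OpenML, then reuses the local pickle cache under ./data/
X_train, X_test, y_train, y_test = load_openml_dataset(
    dataset_id=1169, data_dir="./data/"
)
print(X_train.shape, y_train.shape)
```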
@@ -78,21 +81,22 @@ def load_openml_task(task_id, data_dir):
         X_test: A dataframe of test data
         y_train: A series of labels for training data
         y_test: A series of labels for test data
-    '''
+    """
     import os
     import openml
     import pickle

     task = openml.tasks.get_task(task_id)
-    filename = 'openml_task' + str(task_id) + '.pkl'
+    filename = "openml_task" + str(task_id) + ".pkl"
     filepath = os.path.join(data_dir, filename)
     if os.path.isfile(filepath):
-        print('load dataset from', filepath)
-        with open(filepath, 'rb') as f:
+        print("load dataset from", filepath)
+        with open(filepath, "rb") as f:
             dataset = pickle.load(f)
     else:
-        print('download dataset from openml')
+        print("download dataset from openml")
         dataset = task.get_dataset()
-        with open(filepath, 'wb') as f:
+        with open(filepath, "wb") as f:
             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
     X, y, _, _ = dataset.get_data(task.target_name)
     train_indices, test_indices = task.get_train_test_split_indices(
@@ -105,15 +109,18 @@ def load_openml_task(task_id, data_dir):
     X_test = X.iloc[test_indices]
     y_test = y[test_indices]
     print(
-        'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format(
-            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
+        "X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}".format(
+            X_train.shape,
+            y_train.shape,
+            X_test.shape,
+            y_test.shape,
         )
     )
     return X_train, X_test, y_train, y_test


 def get_output_from_log(filename, time_budget):
-    '''Get output from log file
+    """Get output from log file

     Args:
         filename: A string of the log file name
@@ -127,11 +134,11 @@ def get_output_from_log(filename, time_budget):
         config_list:
             A list of the estimator, sample size and config of each logged iter
         logged_metric_list: A list of the logged metric of each logged iter
-    '''
+    """

     best_config = None
     best_learner = None
-    best_val_loss = float('+inf')
+    best_val_loss = float("+inf")

     search_time_list = []
     config_list = []
@@ -144,7 +151,7 @@ def get_output_from_log(filename, time_budget):
             time_used = record.wall_clock_time
             val_loss = record.validation_loss
             config = record.config
-            learner = record.learner.split('_')[0]
+            learner = record.learner.split("_")[0]
             sample_size = record.sample_size
             metric = record.logged_metric

@@ -158,27 +165,34 @@ def get_output_from_log(filename, time_budget):
                 best_error_list.append(best_val_loss)
             logged_metric_list.append(metric)
             error_list.append(val_loss)
-            config_list.append({"Current Learner": learner,
-                                "Current Sample": sample_size,
-                                "Current Hyper-parameters": record.config,
-                                "Best Learner": best_learner,
-                                "Best Hyper-parameters": best_config})
+            config_list.append(
+                {
+                    "Current Learner": learner,
+                    "Current Sample": sample_size,
+                    "Current Hyper-parameters": record.config,
+                    "Best Learner": best_learner,
+                    "Best Hyper-parameters": best_config,
+                }
+            )

-    return (search_time_list, best_error_list, error_list, config_list,
-            logged_metric_list)
+    return (
+        search_time_list,
+        best_error_list,
+        error_list,
+        config_list,
+        logged_metric_list,
+    )


 def concat(X1, X2):
-    '''concatenate two matrices vertically
-    '''
+    """concatenate two matrices vertically"""
     if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series):
         df = pd.concat([X1, X2], sort=False)
         df.reset_index(drop=True, inplace=True)
         if isinstance(X1, pd.DataFrame):
-            cat_columns = X1.select_dtypes(
-                include='category').columns
+            cat_columns = X1.select_dtypes(include="category").columns
             if len(cat_columns):
-                df[cat_columns] = df[cat_columns].astype('category')
+                df[cat_columns] = df[cat_columns].astype("category")
         return df
     if issparse(X1):
         return vstack((X1, X2))
@@ -187,8 +201,7 @@ def concat(X1, X2):


 class DataTransformer:
-    '''transform X, y
-    '''
+    """transform X, y"""

     def fit_transform(self, X, y, task):
         if isinstance(X, pd.DataFrame):
@@ -198,19 +211,25 @@ class DataTransformer:
             drop = False
             for column in X.columns:
                 # sklearn\utils\validation.py needs int/float values
-                if X[column].dtype.name in ('object', 'category'):
-                    if X[column].nunique() == 1 or X[column].nunique(
-                            dropna=True) == n - X[column].isnull().sum():
+                if X[column].dtype.name in ("object", "category"):
+                    if (
+                        X[column].nunique() == 1
+                        or X[column].nunique(dropna=True)
+                        == n - X[column].isnull().sum()
+                    ):
                         X.drop(columns=column, inplace=True)
                         drop = True
-                    elif X[column].dtype.name == 'category':
+                    elif X[column].dtype.name == "category":
                         current_categories = X[column].cat.categories
-                        if '__NAN__' not in current_categories:
-                            X[column] = X[column].cat.add_categories(
-                                '__NAN__').fillna('__NAN__')
+                        if "__NAN__" not in current_categories:
+                            X[column] = (
+                                X[column]
+                                .cat.add_categories("__NAN__")
+                                .fillna("__NAN__")
+                            )
                         cat_columns.append(column)
                     else:
-                        X[column] = X[column].fillna('__NAN__')
+                        X[column] = X[column].fillna("__NAN__")
                         cat_columns.append(column)
                 else:
                     # print(X[column].dtype.name)
@@ -218,17 +237,27 @@ class DataTransformer:
                     X.drop(columns=column, inplace=True)
                     drop = True
                 else:
-                    if X[column].dtype.name == 'datetime64[ns]':
+                    if X[column].dtype.name == "datetime64[ns]":
                         tmp_dt = X[column].dt
-                        new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month,
-                                            f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour,
-                                            f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second,
-                                            f'dayofweek_{column}': tmp_dt.dayofweek,
-                                            f'dayofyear_{column}': tmp_dt.dayofyear,
-                                            f'quarter_{column}': tmp_dt.quarter}
+                        new_columns_dict = {
+                            f"year_{column}": tmp_dt.year,
+                            f"month_{column}": tmp_dt.month,
+                            f"day_{column}": tmp_dt.day,
+                            f"hour_{column}": tmp_dt.hour,
+                            f"minute_{column}": tmp_dt.minute,
+                            f"second_{column}": tmp_dt.second,
+                            f"dayofweek_{column}": tmp_dt.dayofweek,
+                            f"dayofyear_{column}": tmp_dt.dayofyear,
+                            f"quarter_{column}": tmp_dt.quarter,
+                        }
                         for new_col_name in new_columns_dict.keys():
-                            if new_col_name not in X.columns and \
-                                    new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2:
+                            if (
+                                new_col_name not in X.columns
+                                and new_columns_dict.get(new_col_name).nunique(
+                                    dropna=False
+                                )
+                                >= 2
+                            ):
                                 X[new_col_name] = new_columns_dict.get(new_col_name)
                                 num_columns.append(new_col_name)
                         X[column] = X[column].map(datetime.toordinal)
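The reformatted block above expands each datetime column into calendar features and then ordinal-encodes the original column. A standalone sketch of the same idea, on toy data with pandas only (the three features shown are a subset of the nine the transformer generates):

```python
import pandas as pd
from datetime import datetime

df = pd.DataFrame({"ts": pd.date_range("2021-01-01", periods=4, freq="12H")})
tmp_dt = df["ts"].dt
# same naming scheme the transformer uses for a column called "ts"
df["year_ts"], df["month_ts"], df["hour_ts"] = tmp_dt.year, tmp_dt.month, tmp_dt.hour
df["ts"] = df["ts"].map(datetime.toordinal)  # original column becomes an ordinal int
print(df)
```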
@@ -239,11 +268,12 @@ class DataTransformer:
                             num_columns.append(column)
             X = X[cat_columns + num_columns]
             if cat_columns:
-                X[cat_columns] = X[cat_columns].astype('category')
+                X[cat_columns] = X[cat_columns].astype("category")
             if num_columns:
                 X_num = X[num_columns]
                 if np.issubdtype(X_num.columns.dtype, np.integer) and (
-                    drop or min(X_num.columns) != 0
+                    drop
+                    or min(X_num.columns) != 0
                     or max(X_num.columns) != X_num.shape[1] - 1
                 ):
                     X_num.columns = range(X_num.shape[1])
@@ -252,17 +282,31 @@ class DataTransformer:
                     drop = False
                 from sklearn.impute import SimpleImputer
                 from sklearn.compose import ColumnTransformer
-                self.transformer = ColumnTransformer([(
-                    'continuous',
-                    SimpleImputer(missing_values=np.nan, strategy='median'),
-                    X_num.columns)])
+
+                self.transformer = ColumnTransformer(
+                    [
+                        (
+                            "continuous",
+                            SimpleImputer(missing_values=np.nan, strategy="median"),
+                            X_num.columns,
+                        )
+                    ]
+                )
                 X[num_columns] = self.transformer.fit_transform(X_num)
-            self._cat_columns, self._num_columns, self._datetime_columns = \
-                cat_columns, num_columns, datetime_columns
+            self._cat_columns, self._num_columns, self._datetime_columns = (
+                cat_columns,
+                num_columns,
+                datetime_columns,
+            )
             self._drop = drop

-        if task in ('binary', 'multi', 'classification'):
+        if task in (
+            "binary",
+            "multi",
+            "classification",
+        ) or not pd.api.types.is_numeric_dtype(y):
            from sklearn.preprocessing import LabelEncoder
+
            self.label_transformer = LabelEncoder()
            y = self.label_transformer.fit_transform(y)
        else:
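Beyond the Black reformatting, this hunk carries a behavior change: the added `or not pd.api.types.is_numeric_dtype(y)` label-encodes string targets even outside classification tasks (e.g. for "rank"). A minimal illustration of what that branch does:

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

y = pd.Series(["bad", "good", "good", "bad"])
if not pd.api.types.is_numeric_dtype(y):  # the condition added in this hunk
    y = LabelEncoder().fit_transform(y)
print(y)  # [0 1 1 0]: integer codes a ranker or regressor can consume
```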
@@ -272,34 +316,46 @@ class DataTransformer:
     def transform(self, X):
         X = X.copy()
         if isinstance(X, pd.DataFrame):
-            cat_columns, num_columns, datetime_columns = self._cat_columns, \
-                self._num_columns, self._datetime_columns
+            cat_columns, num_columns, datetime_columns = (
+                self._cat_columns,
+                self._num_columns,
+                self._datetime_columns,
+            )
             if datetime_columns:
                 for column in datetime_columns:
                     tmp_dt = X[column].dt
-                    new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month,
-                                        f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour,
-                                        f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second,
-                                        f'dayofweek_{column}': tmp_dt.dayofweek,
-                                        f'dayofyear_{column}': tmp_dt.dayofyear,
-                                        f'quarter_{column}': tmp_dt.quarter}
+                    new_columns_dict = {
+                        f"year_{column}": tmp_dt.year,
+                        f"month_{column}": tmp_dt.month,
+                        f"day_{column}": tmp_dt.day,
+                        f"hour_{column}": tmp_dt.hour,
+                        f"minute_{column}": tmp_dt.minute,
+                        f"second_{column}": tmp_dt.second,
+                        f"dayofweek_{column}": tmp_dt.dayofweek,
+                        f"dayofyear_{column}": tmp_dt.dayofyear,
+                        f"quarter_{column}": tmp_dt.quarter,
+                    }
                     for new_col_name in new_columns_dict.keys():
-                        if new_col_name not in X.columns and \
-                                new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2:
+                        if (
+                            new_col_name not in X.columns
+                            and new_columns_dict.get(new_col_name).nunique(dropna=False)
+                            >= 2
+                        ):
                             X[new_col_name] = new_columns_dict.get(new_col_name)
                     X[column] = X[column].map(datetime.toordinal)
                     del tmp_dt
             X = X[cat_columns + num_columns].copy()
             for column in cat_columns:
-                if X[column].dtype.name == 'object':
-                    X[column] = X[column].fillna('__NAN__')
-                elif X[column].dtype.name == 'category':
+                if X[column].dtype.name == "object":
+                    X[column] = X[column].fillna("__NAN__")
+                elif X[column].dtype.name == "category":
                     current_categories = X[column].cat.categories
-                    if '__NAN__' not in current_categories:
-                        X[column] = X[column].cat.add_categories(
-                            '__NAN__').fillna('__NAN__')
+                    if "__NAN__" not in current_categories:
+                        X[column] = (
+                            X[column].cat.add_categories("__NAN__").fillna("__NAN__")
+                        )
             if cat_columns:
-                X[cat_columns] = X[cat_columns].astype('category')
+                X[cat_columns] = X[cat_columns].astype("category")
             if num_columns:
                 X_num = X[num_columns].fillna(np.nan)
                 if self._drop:
flaml/ml.py (424 lines changed)
@@ -1,65 +1,90 @@
-'''!
- * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
+"""!
+ * Copyright (c) Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License.
-'''
+"""

 import time
 import numpy as np
 import pandas as pd
-from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
-    accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
-    f1_score, mean_absolute_percentage_error, ndcg_score
+from sklearn.metrics import (
+    mean_squared_error,
+    r2_score,
+    roc_auc_score,
+    accuracy_score,
+    mean_absolute_error,
+    log_loss,
+    average_precision_score,
+    f1_score,
+    mean_absolute_percentage_error,
+    ndcg_score,
+)
 from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
 from .model import (
-    XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
-    LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
-    ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
+    XGBoostEstimator,
+    XGBoostSklearnEstimator,
+    RandomForestEstimator,
+    LGBMEstimator,
+    LRL1Classifier,
+    LRL2Classifier,
+    CatBoostEstimator,
+    ExtraTreeEstimator,
+    KNeighborsEstimator,
+    Prophet,
+    ARIMA,
+    SARIMAX,
+)
 from .data import group_counts

 import logging

 logger = logging.getLogger(__name__)


 def get_estimator_class(task, estimator_name):
-    ''' when adding a new learner, need to add an elif branch '''
+    """when adding a new learner, need to add an elif branch"""

-    if 'xgboost' == estimator_name:
-        if 'regression' == task:
+    if "xgboost" == estimator_name:
+        if "regression" == task:
             estimator_class = XGBoostEstimator
         else:
             estimator_class = XGBoostSklearnEstimator
-    elif 'rf' == estimator_name:
+    elif "rf" == estimator_name:
         estimator_class = RandomForestEstimator
-    elif 'lgbm' == estimator_name:
+    elif "lgbm" == estimator_name:
         estimator_class = LGBMEstimator
-    elif 'lrl1' == estimator_name:
+    elif "lrl1" == estimator_name:
         estimator_class = LRL1Classifier
-    elif 'lrl2' == estimator_name:
+    elif "lrl2" == estimator_name:
         estimator_class = LRL2Classifier
-    elif 'catboost' == estimator_name:
+    elif "catboost" == estimator_name:
         estimator_class = CatBoostEstimator
-    elif 'extra_tree' == estimator_name:
+    elif "extra_tree" == estimator_name:
         estimator_class = ExtraTreeEstimator
-    elif 'kneighbor' == estimator_name:
+    elif "kneighbor" == estimator_name:
         estimator_class = KNeighborsEstimator
-    elif 'prophet' in estimator_name:
-        estimator_class = FBProphet
-    elif estimator_name == 'arima':
+    elif "prophet" in estimator_name:
+        estimator_class = Prophet
+    elif estimator_name == "arima":
         estimator_class = ARIMA
-    elif estimator_name == 'sarimax':
+    elif estimator_name == "sarimax":
         estimator_class = SARIMAX
     else:
         raise ValueError(
-            estimator_name + ' is not a built-in learner. '
-            'Please use AutoML.add_learner() to add a customized learner.')
+            estimator_name + " is not a built-in learner. "
+            "Please use AutoML.add_learner() to add a customized learner."
+        )
     return estimator_class


 def sklearn_metric_loss_score(
-    metric_name, y_predict, y_true, labels=None, sample_weight=None,
+    metric_name,
+    y_predict,
+    y_true,
+    labels=None,
+    sample_weight=None,
     groups=None,
 ):
-    '''Loss using the specified metric
+    """Loss using the specified metric

     Args:
         metric_name: A string of the metric name, one of
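A small usage sketch of the function whose signature was just reflowed: every built-in metric is reported as a loss, so score-type metrics come back as `1 - score`. The toy labels below are illustrative:

```python
from flaml.ml import sklearn_metric_loss_score

y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]
# accuracy is 0.75, so the reported loss is 1 - 0.75 = 0.25
print(sklearn_metric_loss_score("accuracy", y_pred, y_true))  # 0.25
```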
@@ -76,60 +101,63 @@ def sklearn_metric_loss_score(

     Returns:
         score: A float number of the loss, the lower the better.
-    '''
+    """
     metric_name = metric_name.lower()
-    if 'r2' == metric_name:
+    if "r2" == metric_name:
         score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'rmse':
-        score = np.sqrt(mean_squared_error(
-            y_true, y_predict, sample_weight=sample_weight))
-    elif metric_name == 'mae':
-        score = mean_absolute_error(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'mse':
-        score = mean_squared_error(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'accuracy':
-        score = 1.0 - accuracy_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'roc_auc':
-        score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif metric_name == 'roc_auc_ovr':
-        score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight, multi_class='ovr')
-    elif metric_name == 'roc_auc_ovo':
-        score = 1.0 - roc_auc_score(
-            y_true, y_predict, sample_weight=sample_weight, multi_class='ovo')
-    elif 'log_loss' == metric_name:
-        score = log_loss(
-            y_true, y_predict, labels=labels, sample_weight=sample_weight)
-    elif 'mape' == metric_name:
+    elif metric_name == "rmse":
+        score = np.sqrt(
+            mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
+        )
+    elif metric_name == "mae":
+        score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "mse":
+        score = mean_squared_error(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "accuracy":
+        score = 1.0 - accuracy_score(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "roc_auc":
+        score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight)
+    elif metric_name == "roc_auc_ovr":
+        score = 1.0 - roc_auc_score(
+            y_true, y_predict, sample_weight=sample_weight, multi_class="ovr"
+        )
+    elif metric_name == "roc_auc_ovo":
+        score = 1.0 - roc_auc_score(
+            y_true, y_predict, sample_weight=sample_weight, multi_class="ovo"
+        )
+    elif "log_loss" == metric_name:
+        score = log_loss(y_true, y_predict, labels=labels, sample_weight=sample_weight)
+    elif "mape" == metric_name:
         try:
-            score = mean_absolute_percentage_error(
-                y_true, y_predict)
+            score = mean_absolute_percentage_error(y_true, y_predict)
         except ValueError:
             return np.inf
-    elif 'micro_f1' == metric_name:
+    elif "micro_f1" == metric_name:
         score = 1 - f1_score(
-            y_true, y_predict, sample_weight=sample_weight, average='micro')
-    elif 'macro_f1' == metric_name:
+            y_true, y_predict, sample_weight=sample_weight, average="micro"
+        )
+    elif "macro_f1" == metric_name:
         score = 1 - f1_score(
-            y_true, y_predict, sample_weight=sample_weight, average='macro')
-    elif 'f1' == metric_name:
+            y_true, y_predict, sample_weight=sample_weight, average="macro"
+        )
+    elif "f1" == metric_name:
         score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
-    elif 'ap' == metric_name:
+    elif "ap" == metric_name:
         score = 1 - average_precision_score(
-            y_true, y_predict, sample_weight=sample_weight)
-    elif 'ndcg' in metric_name:
-        if '@' in metric_name:
-            k = int(metric_name.split('@', 1)[-1])
+            y_true, y_predict, sample_weight=sample_weight
+        )
+    elif "ndcg" in metric_name:
+        if "@" in metric_name:
+            k = int(metric_name.split("@", 1)[-1])
             counts = group_counts(groups)
             score = 0
             psum = 0
             for c in counts:
-                score -= ndcg_score(np.asarray([y_true[psum:psum + c]]),
-                                    np.asarray([y_predict[psum:psum + c]]), k=k)
+                score -= ndcg_score(
+                    np.asarray([y_true[psum : psum + c]]),
+                    np.asarray([y_predict[psum : psum + c]]),
+                    k=k,
+                )
                 psum += c
             score /= len(counts)
             score += 1
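The "ndcg@k" branch above averages per-query NDCG over a flattened group layout and converts it into a loss. A self-contained sketch of that loop, where the explicit `counts` list stands in for `flaml.data.group_counts(groups)` and the relevance values are illustrative:

```python
import numpy as np
from sklearn.metrics import ndcg_score

y_true = np.array([2, 1, 0, 1, 0, 2])            # relevance labels, two queries
y_pred = np.array([0.9, 0.3, 0.1, 0.2, 0.1, 0.8])
counts, k = [3, 3], 2                             # two groups of 3 docs, NDCG@2

score, psum = 0.0, 0
for c in counts:                                  # one ndcg_score call per query group
    score -= ndcg_score(
        np.asarray([y_true[psum : psum + c]]),
        np.asarray([y_pred[psum : psum + c]]),
        k=k,
    )
    psum += c
score /= len(counts)
score += 1                                        # mean NDCG becomes a loss in [0, 1]
print(score)
```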
@@ -137,56 +165,96 @@ def sklearn_metric_loss_score(
             score = 1 - ndcg_score([y_true], [y_predict])
     else:
         raise ValueError(
-            metric_name + ' is not a built-in metric, '
-            'currently built-in metrics are: '
-            'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
-            'log_loss, mape, f1, micro_f1, macro_f1, ap. '
-            'please pass a customized metric function to AutoML.fit(metric=func)')
+            metric_name + " is not a built-in metric, "
+            "currently built-in metrics are: "
+            "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,"
+            "log_loss, mape, f1, micro_f1, macro_f1, ap. "
+            "please pass a customized metric function to AutoML.fit(metric=func)"
+        )
     return score


 def get_y_pred(estimator, X, eval_metric, obj):
-    if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
+    if eval_metric in ["roc_auc", "ap"] and "binary" in obj:
         y_pred_classes = estimator.predict_proba(X)
-        y_pred = y_pred_classes[
-            :, 1] if y_pred_classes.ndim > 1 else y_pred_classes
-    elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
+        y_pred = y_pred_classes[:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
+    elif eval_metric in ["log_loss", "roc_auc", "roc_auc_ovr", "roc_auc_ovo"]:
         y_pred = estimator.predict_proba(X)
     else:
         y_pred = estimator.predict(X)
     return y_pred


-def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_test,
-                    groups_test, eval_metric, obj, labels=None,
-                    log_training_metric=False, fit_kwargs={}):
+def _eval_estimator(
+    config,
+    estimator,
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    weight_test,
+    groups_test,
+    eval_metric,
+    obj,
+    labels=None,
+    log_training_metric=False,
+    fit_kwargs={},
+):
     if isinstance(eval_metric, str):
         pred_start = time.time()
         test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
         pred_time = (time.time() - pred_start) / X_test.shape[0]
-        test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
-                                              labels, weight_test, groups_test)
+        test_loss = sklearn_metric_loss_score(
+            eval_metric, test_pred_y, y_test, labels, weight_test, groups_test
+        )
         metric_for_logging = {}
         if log_training_metric:
             train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
-            metric_for_logging['train_loss'] = sklearn_metric_loss_score(
-                eval_metric, train_pred_y, y_train, labels,
-                fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
+            metric_for_logging["train_loss"] = sklearn_metric_loss_score(
+                eval_metric,
+                train_pred_y,
+                y_train,
+                labels,
+                fit_kwargs.get("sample_weight"),
+                fit_kwargs.get("groups"),
+            )
     else:  # customized metric function
         test_loss, metric_for_logging = eval_metric(
-            X_test, y_test, estimator, labels, X_train, y_train, weight_test,
-            fit_kwargs.get('sample_weight'), config, groups_test,
-            fit_kwargs.get('groups'))
+            X_test,
+            y_test,
+            estimator,
+            labels,
+            X_train,
+            y_train,
+            weight_test,
+            fit_kwargs.get("sample_weight"),
+            config,
+            groups_test,
+            fit_kwargs.get("groups"),
+        )
         if isinstance(metric_for_logging, dict):
-            pred_time = metric_for_logging.get('pred_time', 0)
+            pred_time = metric_for_logging.get("pred_time", 0)
         test_pred_y = None
         # eval_metric may return test_pred_y but not necessarily. Setting None for now.
     return test_loss, metric_for_logging, pred_time, test_pred_y


-def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
-                  groups_test, eval_metric, obj, labels=None, budget=None,
-                  log_training_metric=False, fit_kwargs={}):
+def get_test_loss(
+    config,
+    estimator,
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    weight_test,
+    groups_test,
+    eval_metric,
+    obj,
+    labels=None,
+    budget=None,
+    log_training_metric=False,
+    fit_kwargs={},
+):

     start = time.time()
     # if groups_test is not None:
@@ -195,16 +263,37 @@ def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_te
     # fit_kwargs['y_val'] = y_test
     estimator.fit(X_train, y_train, budget, **fit_kwargs)
     test_loss, metric_for_logging, pred_time, _ = _eval_estimator(
-        config, estimator, X_train, y_train, X_test, y_test,
-        weight_test, groups_test, eval_metric, obj,
-        labels, log_training_metric, fit_kwargs)
+        config,
+        estimator,
+        X_train,
+        y_train,
+        X_test,
+        y_test,
+        weight_test,
+        groups_test,
+        eval_metric,
+        obj,
+        labels,
+        log_training_metric,
+        fit_kwargs,
+    )
     train_time = time.time() - start
     return test_loss, metric_for_logging, train_time, pred_time


-def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
-                      task, eval_metric, best_val_loss,
-                      log_training_metric=False, fit_kwargs={}):
+def evaluate_model_CV(
+    config,
+    estimator,
+    X_train_all,
+    y_train_all,
+    budget,
+    kf,
+    task,
+    eval_metric,
+    best_val_loss,
+    log_training_metric=False,
+    fit_kwargs={},
+):
     start_time = time.time()
     total_val_loss = 0
     total_metric = None
@@ -213,7 +302,7 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
     valid_fold_num = total_fold_num = 0
     n = kf.get_n_splits()
     X_train_split, y_train_split = X_train_all, y_train_all
-    if task in ('binary', 'multi'):
+    if task in ("binary", "multi"):
         labels = np.unique(y_train_all)
     else:
         labels = None
@@ -225,8 +314,8 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
         groups = kf.groups
         kf = kf.split(X_train_split, y_train_split, groups)
         shuffle = False
-    elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
-        y_train_all = pd.DataFrame(y_train_all, columns=['y'])
+    elif isinstance(kf, TimeSeriesSplit) and task == "forecast":
+        y_train_all = pd.DataFrame(y_train_all, columns=["y"])
         train = X_train_all.join(y_train_all)
         kf = kf.split(train)
         shuffle = False
@@ -237,8 +326,8 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
     rng = np.random.RandomState(2020)
     val_loss_list = []
     budget_per_train = budget / n
-    if 'sample_weight' in fit_kwargs:
-        weight = fit_kwargs['sample_weight']
+    if "sample_weight" in fit_kwargs:
+        weight = fit_kwargs["sample_weight"]
         weight_val = None
     else:
         weight = weight_val = None
@@ -246,37 +335,48 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
         if shuffle:
             train_index = rng.permutation(train_index)
         if isinstance(X_train_all, pd.DataFrame):
-            X_train, X_val = X_train_split.iloc[
-                train_index], X_train_split.iloc[val_index]
+            X_train = X_train_split.iloc[train_index]
+            X_val = X_train_split.iloc[val_index]
         else:
-            X_train, X_val = X_train_split[
-                train_index], X_train_split[val_index]
+            X_train, X_val = X_train_split[train_index], X_train_split[val_index]
         y_train, y_val = y_train_split[train_index], y_train_split[val_index]
         estimator.cleanup()
         if weight is not None:
-            fit_kwargs['sample_weight'], weight_val = weight[
-                train_index], weight[val_index]
+            fit_kwargs["sample_weight"], weight_val = (
+                weight[train_index],
+                weight[val_index],
+            )
         if groups is not None:
-            fit_kwargs['groups'] = groups[train_index]
+            fit_kwargs["groups"] = groups[train_index]
             groups_val = groups[val_index]
         else:
             groups_val = None
         val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
-            config, estimator, X_train, y_train, X_val, y_val, weight_val,
-            groups_val, eval_metric, task, labels, budget_per_train,
-            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
+            config,
+            estimator,
+            X_train,
+            y_train,
+            X_val,
+            y_val,
+            weight_val,
+            groups_val,
+            eval_metric,
+            task,
+            labels,
+            budget_per_train,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
         if weight is not None:
-            fit_kwargs['sample_weight'] = weight
+            fit_kwargs["sample_weight"] = weight
         valid_fold_num += 1
         total_fold_num += 1
         total_val_loss += val_loss_i
         if log_training_metric or not isinstance(eval_metric, str):
             if isinstance(total_metric, list):
-                total_metric = [
-                    total_metric[i] + v for i, v in enumerate(metric_i)]
+                total_metric = [total_metric[i] + v for i, v in enumerate(metric_i)]
             elif isinstance(total_metric, dict):
-                total_metric = {
-                    k: total_metric[k] + v for k, v in metric_i.items()}
+                total_metric = {k: total_metric[k] + v for k, v in metric_i.items()}
             elif total_metric is not None:
                 total_metric += metric_i
             else:
@@ -307,35 +407,73 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,


 def compute_estimator(
-    X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf,
-    config_dic, task, estimator_name, eval_method, eval_metric,
-    best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
-    fit_kwargs={}
+    X_train,
+    y_train,
+    X_val,
+    y_val,
+    weight_val,
+    groups_val,
+    budget,
+    kf,
+    config_dic,
+    task,
+    estimator_name,
+    eval_method,
+    eval_metric,
+    best_val_loss=np.Inf,
+    n_jobs=1,
+    estimator_class=None,
+    log_training_metric=False,
+    fit_kwargs={},
 ):
-    estimator_class = estimator_class or get_estimator_class(
-        task, estimator_name)
-    estimator = estimator_class(
-        **config_dic, task=task, n_jobs=n_jobs)
-    if 'holdout' in eval_method:
+    estimator_class = estimator_class or get_estimator_class(task, estimator_name)
+    estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
+    if "holdout" in eval_method:
         val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
-            config_dic, estimator, X_train, y_train, X_val, y_val, weight_val,
-            groups_val, eval_metric, task, budget=budget,
-            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
+            config_dic,
+            estimator,
+            X_train,
+            y_train,
+            X_val,
+            y_val,
+            weight_val,
+            groups_val,
+            eval_metric,
+            task,
+            budget=budget,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
     else:
         val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
-            config_dic, estimator, X_train, y_train, budget, kf, task,
-            eval_metric, best_val_loss, log_training_metric=log_training_metric,
-            fit_kwargs=fit_kwargs)
+            config_dic,
+            estimator,
+            X_train,
+            y_train,
+            budget,
+            kf,
+            task,
+            eval_metric,
+            best_val_loss,
+            log_training_metric=log_training_metric,
+            fit_kwargs=fit_kwargs,
+        )
     return estimator, val_loss, metric_for_logging, train_time, pred_time


 def train_estimator(
-    X_train, y_train, config_dic, task,
-    estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={}
+    X_train,
+    y_train,
+    config_dic,
+    task,
+    estimator_name,
+    n_jobs=1,
+    estimator_class=None,
+    budget=None,
+    fit_kwargs={},
 ):
     start_time = time.time()
-    estimator_class = estimator_class or get_estimator_class(
-        task, estimator_name)
+    estimator_class = estimator_class or get_estimator_class(task, estimator_name)
     estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
     if X_train is not None:
         train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
@@ -347,14 +485,14 @@ def train_estimator(

 def get_classification_objective(num_labels: int) -> str:
     if num_labels == 2:
-        objective_name = 'binary'
+        objective_name = "binary"
     else:
-        objective_name = 'multi'
+        objective_name = "multi"
     return objective_name


 def norm_confusion_matrix(y_true, y_pred):
-    '''normalized confusion matrix
+    """normalized confusion matrix

     Args:
         estimator: A multi-class classification estimator
@@ -363,15 +501,16 @@ def norm_confusion_matrix(y_true, y_pred):

     Returns:
         A normalized confusion matrix
-    '''
+    """
     from sklearn.metrics import confusion_matrix

     conf_mat = confusion_matrix(y_true, y_pred)
-    norm_conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]
+    norm_conf_mat = conf_mat.astype("float") / conf_mat.sum(axis=1)[:, np.newaxis]
     return norm_conf_mat


 def multi_class_curves(y_true, y_pred_proba, curve_func):
-    '''Binarize the data for multi-class tasks and produce ROC or precision-recall curves
+    """Binarize the data for multi-class tasks and produce ROC or precision-recall curves

     Args:
         y_true: A numpy array or a pandas series of true labels
@@ -384,8 +523,9 @@ def multi_class_curves(y_true, y_pred_proba, curve_func):
         curve_x[0] is an 1D array of the x coordinates of class 0
         The second dictionary curve_y stores the y coordinates of each curve, e.g.,
         curve_y[0] is an 1D array of the y coordinates of class 0
-    '''
+    """
     from sklearn.preprocessing import label_binarize
+
     classes = np.unique(y_true)
     y_true_binary = label_binarize(y_true, classes=classes)

flaml/model.py (887 lines changed): file diff suppressed because it is too large.
flaml/version.py

@@ -1 +1 @@
-__version__ = "0.6.3"
+__version__ = "0.6.4"
setup.py (2 lines changed)
@@ -32,7 +32,7 @@ setuptools.setup(
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/microsoft/FLAML",
-    packages=setuptools.find_packages(),
+    packages=setuptools.find_packages(include=["flaml*"]),
     install_requires=install_requires,
     extras_require={
         "notebook": [
test/test_forecast.py

@@ -30,9 +30,11 @@ def test_forecast_automl(budget=5):
     }
     """The main flaml automl API"""
     try:
+        import prophet
+
         automl.fit(dataframe=df, **settings, period=time_horizon)
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl.fit(
             dataframe=df,
             **settings,
@@ -79,7 +81,7 @@ def test_forecast_automl(budget=5):
     try:
         automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl.fit(
             X_train=X_train,
             y_train=y_train,
@@ -94,6 +96,8 @@ def test_numpy():
     y_train = np.random.random(size=72)
     automl = AutoML()
     try:
+        import prophet
+
         automl.fit(
             X_train=X_train[:60],  # a single column of timestamp
             y_train=y_train,  # value for each timestamp
@@ -105,9 +109,9 @@ def test_numpy():
         print(automl.predict(X_train[60:]))
         print(automl.predict(12))
     except ValueError:
-        print("ValueError for FBProphet is raised as expected.")
+        print("ValueError for prophet is raised as expected.")
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl = AutoML()
         automl.fit(
             X_train=X_train[:72],  # a single column of timestamp