diff --git a/README.md b/README.md
index 4732e4b48..5937a76bb 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.
 
 ## Examples
 
-- A basic classification example.
+* A basic classification example.
 
 ```python
 from flaml import AutoML
@@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
 print(automl.model)
 ```
 
-- A basic regression example.
+* A basic regression example.
 
 ```python
 from flaml import AutoML
@@ -123,7 +123,7 @@ print(automl.predict(X_train))
 print(automl.model)
 ```
 
-- Time series forecasting.
+* Time series forecasting.
 
 ```python
 # pip install flaml[forecast]
@@ -141,14 +141,15 @@ automl.fit(X_train=X_train[:72],  # a single column of timestamp
 print(automl.predict(X_train[72:]))
 ```
 
-- Learning to rank.
+* Learning to rank.
 
 ```python
 from sklearn.datasets import fetch_openml
 from flaml import AutoML
-X, y = fetch_openml(name="credit-g", return_X_y=True)
+X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=True)
+y_train = y_train.cat.codes  # not a real learning to rank dataset
 
-groups = [200] * 4 + [100] * 2,  # group counts
+groups = [200] * 4 + [100] * 2  # group counts
 automl = AutoML()
 automl.fit(
     X_train, y_train, groups=groups,
@@ -207,17 +208,21 @@ pip install -e .[test,notebook]
 ```
 
 ### Docker
+
 We provide a simple [Dockerfile](https://github.com/microsoft/FLAML/blob/main/Dockerfile).
-```
+
+```bash
 docker build git://github.com/microsoft/FLAML -t flaml-dev
 docker run -it flaml-dev
 ```
 
 ### Develop in Remote Container
+
 If you use vscode, you can open the FLAML folder in a [Container](https://code.visualstudio.com/docs/remote/containers).
-We have provided the configuration in (.devcontainer)[(https://github.com/microsoft/FLAML/blob/main/.devcontainer)].
+We have provided the configuration in [.devcontainer](https://github.com/microsoft/FLAML/blob/main/.devcontainer).
 
 ### Pre-commit
+
 Run `pre-commit install` to install pre-commit into your git hooks. Before you commit, run
 `pre-commit run` to check if you meet the pre-commit requirements. If you use Windows (without WSL) and can't commit after installing pre-commit,
 you can run `pre-commit uninstall` to uninstall the hook. In WSL or Linux this is supposed to work.
diff --git a/flaml/automl.py b/flaml/automl.py
index 621af4200..4371edd7b 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -1474,7 +1474,12 @@ class AutoML:
 
         if "auto" == estimator_list:
             if self._state.task == "forecast":
-                estimator_list = ["fbprophet", "arima", "sarimax"]
+                try:
+                    import prophet
+
+                    estimator_list = ["prophet", "arima", "sarimax"]
+                except ImportError:
+                    estimator_list = ["arima", "sarimax"]
             elif self._state.task == "rank":
                 estimator_list = ["lgbm", "xgboost"]
             else:
diff --git a/flaml/data.py b/flaml/data.py
index d4b48f619..cf278c2d3 100644
--- a/flaml/data.py
+++ b/flaml/data.py
@@ -1,7 +1,7 @@
-'''!
+"""!
  * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
  * Licensed under the MIT License.
-'''
+"""
 
 import numpy as np
 from scipy.sparse import vstack, issparse
@@ -11,9 +11,10 @@ from .training_log import training_log_reader
 from datetime import datetime
 
 
-def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
-                        dataset_format='dataframe'):
-    '''Load dataset from open ML.
+def load_openml_dataset(
+    dataset_id, data_dir=None, random_state=0, dataset_format="dataframe"
+):
+    """Load dataset from open ML.
 
     If the file is not cached locally, download it from open ML.
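For context on the `flaml/automl.py` hunk above: with `estimator_list="auto"` and `task="forecast"`, FLAML now searches over `["prophet", "arima", "sarimax"]` only when the `prophet` package is importable, and falls back to `["arima", "sarimax"]` otherwise. A minimal sketch of pinning the fallback list explicitly; the synthetic series, the column name, and the horizon are illustrative, not from this PR:

```python
import numpy as np
import pandas as pd
from flaml import AutoML

# Illustrative monthly series: a single timestamp column plus one value per row,
# mirroring the README forecasting example above.
X_train = pd.DataFrame({"timestamp": pd.date_range("2015-01-01", periods=72, freq="M")})
y_train = pd.Series(np.sin(np.arange(72) / 6.0))

automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    period=12,  # time horizon to forecast
    task="forecast",
    time_budget=15,  # in seconds
    estimator_list=["arima", "sarimax"],  # pin the fallback list even when prophet is installed
)
```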
@@ -30,41 +31,43 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, X_test: Test data y_train: A series or array of labels for training data y_test: A series or array of labels for test data - ''' + """ import os import openml import pickle from sklearn.model_selection import train_test_split - filename = 'openml_ds' + str(dataset_id) + '.pkl' + filename = "openml_ds" + str(dataset_id) + ".pkl" filepath = os.path.join(data_dir, filename) if os.path.isfile(filepath): - print('load dataset from', filepath) - with open(filepath, 'rb') as f: + print("load dataset from", filepath) + with open(filepath, "rb") as f: dataset = pickle.load(f) else: - print('download dataset from openml') + print("download dataset from openml") dataset = openml.datasets.get_dataset(dataset_id) if not os.path.exists(data_dir): os.makedirs(data_dir) - with open(filepath, 'wb') as f: + with open(filepath, "wb") as f: pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) - print('Dataset name:', dataset.name) - X, y, * \ - __ = dataset.get_data( - target=dataset.default_target_attribute, dataset_format=dataset_format) - X_train, X_test, y_train, y_test = train_test_split( - X, y, random_state=random_state) + print("Dataset name:", dataset.name) + X, y, *__ = dataset.get_data( + target=dataset.default_target_attribute, dataset_format=dataset_format + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) print( - 'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format( - X_train.shape, y_train.shape, X_test.shape, y_test.shape, + "X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}".format( + X_train.shape, + y_train.shape, + X_test.shape, + y_test.shape, ) ) return X_train, X_test, y_train, y_test def load_openml_task(task_id, data_dir): - '''Load task from open ML. + """Load task from open ML. Use the first fold of the task. If the file is not cached locally, download it from open ML. 
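A usage sketch for the reformatted `load_openml_dataset` above. The dataset id is illustrative (1169 is the Airlines dataset used in the FLAML notebooks); note that a `data_dir` must be passed, since the cache path is joined before the file check:

```python
from flaml.data import load_openml_dataset

# The first call downloads from OpenML and pickles to ./openml_ds1169.pkl;
# later calls load the cached file instead.
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./")
print(X_train.shape, y_train.shape)
```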
@@ -78,21 +81,22 @@ def load_openml_task(task_id, data_dir): X_test: A dataframe of test data y_train: A series of labels for training data y_test: A series of labels for test data - ''' + """ import os import openml import pickle + task = openml.tasks.get_task(task_id) - filename = 'openml_task' + str(task_id) + '.pkl' + filename = "openml_task" + str(task_id) + ".pkl" filepath = os.path.join(data_dir, filename) if os.path.isfile(filepath): - print('load dataset from', filepath) - with open(filepath, 'rb') as f: + print("load dataset from", filepath) + with open(filepath, "rb") as f: dataset = pickle.load(f) else: - print('download dataset from openml') + print("download dataset from openml") dataset = task.get_dataset() - with open(filepath, 'wb') as f: + with open(filepath, "wb") as f: pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) X, y, _, _ = dataset.get_data(task.target_name) train_indices, test_indices = task.get_train_test_split_indices( @@ -105,15 +109,18 @@ def load_openml_task(task_id, data_dir): X_test = X.iloc[test_indices] y_test = y[test_indices] print( - 'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format( - X_train.shape, y_train.shape, X_test.shape, y_test.shape, + "X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}".format( + X_train.shape, + y_train.shape, + X_test.shape, + y_test.shape, ) ) return X_train, X_test, y_train, y_test def get_output_from_log(filename, time_budget): - '''Get output from log file + """Get output from log file Args: filename: A string of the log file name @@ -127,11 +134,11 @@ def get_output_from_log(filename, time_budget): config_list: A list of the estimator, sample size and config of each logged iter logged_metric_list: A list of the logged metric of each logged iter - ''' + """ best_config = None best_learner = None - best_val_loss = float('+inf') + best_val_loss = float("+inf") search_time_list = [] config_list = [] @@ -144,7 +151,7 @@ def get_output_from_log(filename, time_budget): time_used = record.wall_clock_time val_loss = record.validation_loss config = record.config - learner = record.learner.split('_')[0] + learner = record.learner.split("_")[0] sample_size = record.sample_size metric = record.logged_metric @@ -158,27 +165,34 @@ def get_output_from_log(filename, time_budget): best_error_list.append(best_val_loss) logged_metric_list.append(metric) error_list.append(val_loss) - config_list.append({"Current Learner": learner, - "Current Sample": sample_size, - "Current Hyper-parameters": record.config, - "Best Learner": best_learner, - "Best Hyper-parameters": best_config}) + config_list.append( + { + "Current Learner": learner, + "Current Sample": sample_size, + "Current Hyper-parameters": record.config, + "Best Learner": best_learner, + "Best Hyper-parameters": best_config, + } + ) - return (search_time_list, best_error_list, error_list, config_list, - logged_metric_list) + return ( + search_time_list, + best_error_list, + error_list, + config_list, + logged_metric_list, + ) def concat(X1, X2): - '''concatenate two matrices vertically - ''' + """concatenate two matrices vertically""" if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series): df = pd.concat([X1, X2], sort=False) df.reset_index(drop=True, inplace=True) if isinstance(X1, pd.DataFrame): - cat_columns = X1.select_dtypes( - include='category').columns + cat_columns = X1.select_dtypes(include="category").columns if len(cat_columns): - df[cat_columns] = df[cat_columns].astype('category') + df[cat_columns] = 
df[cat_columns].astype("category") return df if issparse(X1): return vstack((X1, X2)) @@ -187,8 +201,7 @@ def concat(X1, X2): class DataTransformer: - '''transform X, y - ''' + """transform X, y""" def fit_transform(self, X, y, task): if isinstance(X, pd.DataFrame): @@ -198,19 +211,25 @@ class DataTransformer: drop = False for column in X.columns: # sklearn\utils\validation.py needs int/float values - if X[column].dtype.name in ('object', 'category'): - if X[column].nunique() == 1 or X[column].nunique( - dropna=True) == n - X[column].isnull().sum(): + if X[column].dtype.name in ("object", "category"): + if ( + X[column].nunique() == 1 + or X[column].nunique(dropna=True) + == n - X[column].isnull().sum() + ): X.drop(columns=column, inplace=True) drop = True - elif X[column].dtype.name == 'category': + elif X[column].dtype.name == "category": current_categories = X[column].cat.categories - if '__NAN__' not in current_categories: - X[column] = X[column].cat.add_categories( - '__NAN__').fillna('__NAN__') + if "__NAN__" not in current_categories: + X[column] = ( + X[column] + .cat.add_categories("__NAN__") + .fillna("__NAN__") + ) cat_columns.append(column) else: - X[column] = X[column].fillna('__NAN__') + X[column] = X[column].fillna("__NAN__") cat_columns.append(column) else: # print(X[column].dtype.name) @@ -218,17 +237,27 @@ class DataTransformer: X.drop(columns=column, inplace=True) drop = True else: - if X[column].dtype.name == 'datetime64[ns]': + if X[column].dtype.name == "datetime64[ns]": tmp_dt = X[column].dt - new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month, - f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour, - f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second, - f'dayofweek_{column}': tmp_dt.dayofweek, - f'dayofyear_{column}': tmp_dt.dayofyear, - f'quarter_{column}': tmp_dt.quarter} + new_columns_dict = { + f"year_{column}": tmp_dt.year, + f"month_{column}": tmp_dt.month, + f"day_{column}": tmp_dt.day, + f"hour_{column}": tmp_dt.hour, + f"minute_{column}": tmp_dt.minute, + f"second_{column}": tmp_dt.second, + f"dayofweek_{column}": tmp_dt.dayofweek, + f"dayofyear_{column}": tmp_dt.dayofyear, + f"quarter_{column}": tmp_dt.quarter, + } for new_col_name in new_columns_dict.keys(): - if new_col_name not in X.columns and \ - new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2: + if ( + new_col_name not in X.columns + and new_columns_dict.get(new_col_name).nunique( + dropna=False + ) + >= 2 + ): X[new_col_name] = new_columns_dict.get(new_col_name) num_columns.append(new_col_name) X[column] = X[column].map(datetime.toordinal) @@ -239,11 +268,12 @@ class DataTransformer: num_columns.append(column) X = X[cat_columns + num_columns] if cat_columns: - X[cat_columns] = X[cat_columns].astype('category') + X[cat_columns] = X[cat_columns].astype("category") if num_columns: X_num = X[num_columns] if np.issubdtype(X_num.columns.dtype, np.integer) and ( - drop or min(X_num.columns) != 0 + drop + or min(X_num.columns) != 0 or max(X_num.columns) != X_num.shape[1] - 1 ): X_num.columns = range(X_num.shape[1]) @@ -252,17 +282,31 @@ class DataTransformer: drop = False from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer - self.transformer = ColumnTransformer([( - 'continuous', - SimpleImputer(missing_values=np.nan, strategy='median'), - X_num.columns)]) + + self.transformer = ColumnTransformer( + [ + ( + "continuous", + SimpleImputer(missing_values=np.nan, strategy="median"), + X_num.columns, + ) + ] 
+ ) X[num_columns] = self.transformer.fit_transform(X_num) - self._cat_columns, self._num_columns, self._datetime_columns = \ - cat_columns, num_columns, datetime_columns + self._cat_columns, self._num_columns, self._datetime_columns = ( + cat_columns, + num_columns, + datetime_columns, + ) self._drop = drop - if task in ('binary', 'multi', 'classification'): + if task in ( + "binary", + "multi", + "classification", + ) or not pd.api.types.is_numeric_dtype(y): from sklearn.preprocessing import LabelEncoder + self.label_transformer = LabelEncoder() y = self.label_transformer.fit_transform(y) else: @@ -272,34 +316,46 @@ class DataTransformer: def transform(self, X): X = X.copy() if isinstance(X, pd.DataFrame): - cat_columns, num_columns, datetime_columns = self._cat_columns, \ - self._num_columns, self._datetime_columns + cat_columns, num_columns, datetime_columns = ( + self._cat_columns, + self._num_columns, + self._datetime_columns, + ) if datetime_columns: for column in datetime_columns: tmp_dt = X[column].dt - new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month, - f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour, - f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second, - f'dayofweek_{column}': tmp_dt.dayofweek, - f'dayofyear_{column}': tmp_dt.dayofyear, - f'quarter_{column}': tmp_dt.quarter} + new_columns_dict = { + f"year_{column}": tmp_dt.year, + f"month_{column}": tmp_dt.month, + f"day_{column}": tmp_dt.day, + f"hour_{column}": tmp_dt.hour, + f"minute_{column}": tmp_dt.minute, + f"second_{column}": tmp_dt.second, + f"dayofweek_{column}": tmp_dt.dayofweek, + f"dayofyear_{column}": tmp_dt.dayofyear, + f"quarter_{column}": tmp_dt.quarter, + } for new_col_name in new_columns_dict.keys(): - if new_col_name not in X.columns and \ - new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2: + if ( + new_col_name not in X.columns + and new_columns_dict.get(new_col_name).nunique(dropna=False) + >= 2 + ): X[new_col_name] = new_columns_dict.get(new_col_name) X[column] = X[column].map(datetime.toordinal) del tmp_dt X = X[cat_columns + num_columns].copy() for column in cat_columns: - if X[column].dtype.name == 'object': - X[column] = X[column].fillna('__NAN__') - elif X[column].dtype.name == 'category': + if X[column].dtype.name == "object": + X[column] = X[column].fillna("__NAN__") + elif X[column].dtype.name == "category": current_categories = X[column].cat.categories - if '__NAN__' not in current_categories: - X[column] = X[column].cat.add_categories( - '__NAN__').fillna('__NAN__') + if "__NAN__" not in current_categories: + X[column] = ( + X[column].cat.add_categories("__NAN__").fillna("__NAN__") + ) if cat_columns: - X[cat_columns] = X[cat_columns].astype('category') + X[cat_columns] = X[cat_columns].astype("category") if num_columns: X_num = X[num_columns].fillna(np.nan) if self._drop: diff --git a/flaml/ml.py b/flaml/ml.py index 91d747e8a..edad16bab 100644 --- a/flaml/ml.py +++ b/flaml/ml.py @@ -1,65 +1,90 @@ -'''! - * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. +"""! + * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. 
-''' +""" import time import numpy as np import pandas as pd -from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \ - accuracy_score, mean_absolute_error, log_loss, average_precision_score, \ - f1_score, mean_absolute_percentage_error, ndcg_score +from sklearn.metrics import ( + mean_squared_error, + r2_score, + roc_auc_score, + accuracy_score, + mean_absolute_error, + log_loss, + average_precision_score, + f1_score, + mean_absolute_percentage_error, + ndcg_score, +) from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit from .model import ( - XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator, - LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator, - ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX) + XGBoostEstimator, + XGBoostSklearnEstimator, + RandomForestEstimator, + LGBMEstimator, + LRL1Classifier, + LRL2Classifier, + CatBoostEstimator, + ExtraTreeEstimator, + KNeighborsEstimator, + Prophet, + ARIMA, + SARIMAX, +) from .data import group_counts import logging + logger = logging.getLogger(__name__) def get_estimator_class(task, estimator_name): - ''' when adding a new learner, need to add an elif branch ''' + """when adding a new learner, need to add an elif branch""" - if 'xgboost' == estimator_name: - if 'regression' == task: + if "xgboost" == estimator_name: + if "regression" == task: estimator_class = XGBoostEstimator else: estimator_class = XGBoostSklearnEstimator - elif 'rf' == estimator_name: + elif "rf" == estimator_name: estimator_class = RandomForestEstimator - elif 'lgbm' == estimator_name: + elif "lgbm" == estimator_name: estimator_class = LGBMEstimator - elif 'lrl1' == estimator_name: + elif "lrl1" == estimator_name: estimator_class = LRL1Classifier - elif 'lrl2' == estimator_name: + elif "lrl2" == estimator_name: estimator_class = LRL2Classifier - elif 'catboost' == estimator_name: + elif "catboost" == estimator_name: estimator_class = CatBoostEstimator - elif 'extra_tree' == estimator_name: + elif "extra_tree" == estimator_name: estimator_class = ExtraTreeEstimator - elif 'kneighbor' == estimator_name: + elif "kneighbor" == estimator_name: estimator_class = KNeighborsEstimator - elif 'prophet' in estimator_name: - estimator_class = FBProphet - elif estimator_name == 'arima': + elif "prophet" in estimator_name: + estimator_class = Prophet + elif estimator_name == "arima": estimator_class = ARIMA - elif estimator_name == 'sarimax': + elif estimator_name == "sarimax": estimator_class = SARIMAX else: raise ValueError( - estimator_name + ' is not a built-in learner. ' - 'Please use AutoML.add_learner() to add a customized learner.') + estimator_name + " is not a built-in learner. " + "Please use AutoML.add_learner() to add a customized learner." + ) return estimator_class def sklearn_metric_loss_score( - metric_name, y_predict, y_true, labels=None, sample_weight=None, + metric_name, + y_predict, + y_true, + labels=None, + sample_weight=None, groups=None, ): - '''Loss using the specified metric + """Loss using the specified metric Args: metric_name: A string of the metric name, one of @@ -76,60 +101,63 @@ def sklearn_metric_loss_score( Returns: score: A float number of the loss, the lower the better. 
- ''' + """ metric_name = metric_name.lower() - if 'r2' == metric_name: + if "r2" == metric_name: score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight) - elif metric_name == 'rmse': - score = np.sqrt(mean_squared_error( - y_true, y_predict, sample_weight=sample_weight)) - elif metric_name == 'mae': - score = mean_absolute_error( - y_true, y_predict, sample_weight=sample_weight) - elif metric_name == 'mse': - score = mean_squared_error( - y_true, y_predict, sample_weight=sample_weight) - elif metric_name == 'accuracy': - score = 1.0 - accuracy_score( - y_true, y_predict, sample_weight=sample_weight) - elif metric_name == 'roc_auc': + elif metric_name == "rmse": + score = np.sqrt( + mean_squared_error(y_true, y_predict, sample_weight=sample_weight) + ) + elif metric_name == "mae": + score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight) + elif metric_name == "mse": + score = mean_squared_error(y_true, y_predict, sample_weight=sample_weight) + elif metric_name == "accuracy": + score = 1.0 - accuracy_score(y_true, y_predict, sample_weight=sample_weight) + elif metric_name == "roc_auc": + score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight) + elif metric_name == "roc_auc_ovr": score = 1.0 - roc_auc_score( - y_true, y_predict, sample_weight=sample_weight) - elif metric_name == 'roc_auc_ovr': + y_true, y_predict, sample_weight=sample_weight, multi_class="ovr" + ) + elif metric_name == "roc_auc_ovo": score = 1.0 - roc_auc_score( - y_true, y_predict, sample_weight=sample_weight, multi_class='ovr') - elif metric_name == 'roc_auc_ovo': - score = 1.0 - roc_auc_score( - y_true, y_predict, sample_weight=sample_weight, multi_class='ovo') - elif 'log_loss' == metric_name: - score = log_loss( - y_true, y_predict, labels=labels, sample_weight=sample_weight) - elif 'mape' == metric_name: + y_true, y_predict, sample_weight=sample_weight, multi_class="ovo" + ) + elif "log_loss" == metric_name: + score = log_loss(y_true, y_predict, labels=labels, sample_weight=sample_weight) + elif "mape" == metric_name: try: - score = mean_absolute_percentage_error( - y_true, y_predict) + score = mean_absolute_percentage_error(y_true, y_predict) except ValueError: return np.inf - elif 'micro_f1' == metric_name: + elif "micro_f1" == metric_name: score = 1 - f1_score( - y_true, y_predict, sample_weight=sample_weight, average='micro') - elif 'macro_f1' == metric_name: + y_true, y_predict, sample_weight=sample_weight, average="micro" + ) + elif "macro_f1" == metric_name: score = 1 - f1_score( - y_true, y_predict, sample_weight=sample_weight, average='macro') - elif 'f1' == metric_name: + y_true, y_predict, sample_weight=sample_weight, average="macro" + ) + elif "f1" == metric_name: score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight) - elif 'ap' == metric_name: + elif "ap" == metric_name: score = 1 - average_precision_score( - y_true, y_predict, sample_weight=sample_weight) - elif 'ndcg' in metric_name: - if '@' in metric_name: - k = int(metric_name.split('@', 1)[-1]) + y_true, y_predict, sample_weight=sample_weight + ) + elif "ndcg" in metric_name: + if "@" in metric_name: + k = int(metric_name.split("@", 1)[-1]) counts = group_counts(groups) score = 0 psum = 0 for c in counts: - score -= ndcg_score(np.asarray([y_true[psum:psum + c]]), - np.asarray([y_predict[psum:psum + c]]), k=k) + score -= ndcg_score( + np.asarray([y_true[psum : psum + c]]), + np.asarray([y_predict[psum : psum + c]]), + k=k, + ) psum += c score /= len(counts) score += 1 @@ -137,56 
+165,96 @@ def sklearn_metric_loss_score(
             score = 1 - ndcg_score([y_true], [y_predict])
     else:
         raise ValueError(
-            metric_name + ' is not a built-in metric, '
-            'currently built-in metrics are: '
-            'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
-            'log_loss, mape, f1, micro_f1, macro_f1, ap. '
-            'please pass a customized metric function to AutoML.fit(metric=func)')
+            metric_name + " is not a built-in metric, "
+            "currently built-in metrics are: "
+            "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo, "
+            "log_loss, mape, f1, micro_f1, macro_f1, ap. "
+            "please pass a customized metric function to AutoML.fit(metric=func)"
+        )
     return score
 
 
 def get_y_pred(estimator, X, eval_metric, obj):
-    if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
+    if eval_metric in ["roc_auc", "ap"] and "binary" in obj:
         y_pred_classes = estimator.predict_proba(X)
-        y_pred = y_pred_classes[
-            :, 1] if y_pred_classes.ndim > 1 else y_pred_classes
-    elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
+        y_pred = y_pred_classes[:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
+    elif eval_metric in ["log_loss", "roc_auc", "roc_auc_ovr", "roc_auc_ovo"]:
         y_pred = estimator.predict_proba(X)
     else:
         y_pred = estimator.predict(X)
     return y_pred
 
 
-def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_test,
-                    groups_test, eval_metric, obj, labels=None,
-                    log_training_metric=False, fit_kwargs={}):
+def _eval_estimator(
+    config,
+    estimator,
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    weight_test,
+    groups_test,
+    eval_metric,
+    obj,
+    labels=None,
+    log_training_metric=False,
+    fit_kwargs={},
+):
     if isinstance(eval_metric, str):
         pred_start = time.time()
         test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
         pred_time = (time.time() - pred_start) / X_test.shape[0]
-        test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
-                                              labels, weight_test, groups_test)
+        test_loss = sklearn_metric_loss_score(
+            eval_metric, test_pred_y, y_test, labels, weight_test, groups_test
+        )
         metric_for_logging = {}
         if log_training_metric:
             train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
-            metric_for_logging['train_loss'] = sklearn_metric_loss_score(
-                eval_metric, train_pred_y, y_train, labels,
-                fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
+            metric_for_logging["train_loss"] = sklearn_metric_loss_score(
+                eval_metric,
+                train_pred_y,
+                y_train,
+                labels,
+                fit_kwargs.get("sample_weight"),
+                fit_kwargs.get("groups"),
+            )
     else:  # customized metric function
         test_loss, metric_for_logging = eval_metric(
-            X_test, y_test, estimator, labels, X_train, y_train, weight_test,
-            fit_kwargs.get('sample_weight'), config, groups_test,
-            fit_kwargs.get('groups'))
+            X_test,
+            y_test,
+            estimator,
+            labels,
+            X_train,
+            y_train,
+            weight_test,
+            fit_kwargs.get("sample_weight"),
+            config,
+            groups_test,
+            fit_kwargs.get("groups"),
+        )
         if isinstance(metric_for_logging, dict):
-            pred_time = metric_for_logging.get('pred_time', 0)
+            pred_time = metric_for_logging.get("pred_time", 0)
         test_pred_y = None  # eval_metric may return test_pred_y but not necessarily. Setting None for now.
return test_loss, metric_for_logging, pred_time, test_pred_y -def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test, - groups_test, eval_metric, obj, labels=None, budget=None, - log_training_metric=False, fit_kwargs={}): +def get_test_loss( + config, + estimator, + X_train, + y_train, + X_test, + y_test, + weight_test, + groups_test, + eval_metric, + obj, + labels=None, + budget=None, + log_training_metric=False, + fit_kwargs={}, +): start = time.time() # if groups_test is not None: @@ -195,16 +263,37 @@ def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_te # fit_kwargs['y_val'] = y_test estimator.fit(X_train, y_train, budget, **fit_kwargs) test_loss, metric_for_logging, pred_time, _ = _eval_estimator( - config, estimator, X_train, y_train, X_test, y_test, - weight_test, groups_test, eval_metric, obj, - labels, log_training_metric, fit_kwargs) + config, + estimator, + X_train, + y_train, + X_test, + y_test, + weight_test, + groups_test, + eval_metric, + obj, + labels, + log_training_metric, + fit_kwargs, + ) train_time = time.time() - start return test_loss, metric_for_logging, train_time, pred_time -def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, - task, eval_metric, best_val_loss, - log_training_metric=False, fit_kwargs={}): +def evaluate_model_CV( + config, + estimator, + X_train_all, + y_train_all, + budget, + kf, + task, + eval_metric, + best_val_loss, + log_training_metric=False, + fit_kwargs={}, +): start_time = time.time() total_val_loss = 0 total_metric = None @@ -213,7 +302,7 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, valid_fold_num = total_fold_num = 0 n = kf.get_n_splits() X_train_split, y_train_split = X_train_all, y_train_all - if task in ('binary', 'multi'): + if task in ("binary", "multi"): labels = np.unique(y_train_all) else: labels = None @@ -225,8 +314,8 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, groups = kf.groups kf = kf.split(X_train_split, y_train_split, groups) shuffle = False - elif isinstance(kf, TimeSeriesSplit) and task == 'forecast': - y_train_all = pd.DataFrame(y_train_all, columns=['y']) + elif isinstance(kf, TimeSeriesSplit) and task == "forecast": + y_train_all = pd.DataFrame(y_train_all, columns=["y"]) train = X_train_all.join(y_train_all) kf = kf.split(train) shuffle = False @@ -237,8 +326,8 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, rng = np.random.RandomState(2020) val_loss_list = [] budget_per_train = budget / n - if 'sample_weight' in fit_kwargs: - weight = fit_kwargs['sample_weight'] + if "sample_weight" in fit_kwargs: + weight = fit_kwargs["sample_weight"] weight_val = None else: weight = weight_val = None @@ -246,37 +335,48 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, if shuffle: train_index = rng.permutation(train_index) if isinstance(X_train_all, pd.DataFrame): - X_train, X_val = X_train_split.iloc[ - train_index], X_train_split.iloc[val_index] + X_train = X_train_split.iloc[train_index] + X_val = X_train_split.iloc[val_index] else: - X_train, X_val = X_train_split[ - train_index], X_train_split[val_index] + X_train, X_val = X_train_split[train_index], X_train_split[val_index] y_train, y_val = y_train_split[train_index], y_train_split[val_index] estimator.cleanup() if weight is not None: - fit_kwargs['sample_weight'], weight_val = weight[ - train_index], weight[val_index] + fit_kwargs["sample_weight"], 
weight_val = ( + weight[train_index], + weight[val_index], + ) if groups is not None: - fit_kwargs['groups'] = groups[train_index] + fit_kwargs["groups"] = groups[train_index] groups_val = groups[val_index] else: groups_val = None val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss( - config, estimator, X_train, y_train, X_val, y_val, weight_val, - groups_val, eval_metric, task, labels, budget_per_train, - log_training_metric=log_training_metric, fit_kwargs=fit_kwargs) + config, + estimator, + X_train, + y_train, + X_val, + y_val, + weight_val, + groups_val, + eval_metric, + task, + labels, + budget_per_train, + log_training_metric=log_training_metric, + fit_kwargs=fit_kwargs, + ) if weight is not None: - fit_kwargs['sample_weight'] = weight + fit_kwargs["sample_weight"] = weight valid_fold_num += 1 total_fold_num += 1 total_val_loss += val_loss_i if log_training_metric or not isinstance(eval_metric, str): if isinstance(total_metric, list): - total_metric = [ - total_metric[i] + v for i, v in enumerate(metric_i)] + total_metric = [total_metric[i] + v for i, v in enumerate(metric_i)] elif isinstance(total_metric, dict): - total_metric = { - k: total_metric[k] + v for k, v in metric_i.items()} + total_metric = {k: total_metric[k] + v for k, v in metric_i.items()} elif total_metric is not None: total_metric += metric_i else: @@ -307,35 +407,73 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, def compute_estimator( - X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf, - config_dic, task, estimator_name, eval_method, eval_metric, - best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False, - fit_kwargs={} + X_train, + y_train, + X_val, + y_val, + weight_val, + groups_val, + budget, + kf, + config_dic, + task, + estimator_name, + eval_method, + eval_metric, + best_val_loss=np.Inf, + n_jobs=1, + estimator_class=None, + log_training_metric=False, + fit_kwargs={}, ): - estimator_class = estimator_class or get_estimator_class( - task, estimator_name) - estimator = estimator_class( - **config_dic, task=task, n_jobs=n_jobs) - if 'holdout' in eval_method: + estimator_class = estimator_class or get_estimator_class(task, estimator_name) + estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs) + if "holdout" in eval_method: val_loss, metric_for_logging, train_time, pred_time = get_test_loss( - config_dic, estimator, X_train, y_train, X_val, y_val, weight_val, - groups_val, eval_metric, task, budget=budget, - log_training_metric=log_training_metric, fit_kwargs=fit_kwargs) + config_dic, + estimator, + X_train, + y_train, + X_val, + y_val, + weight_val, + groups_val, + eval_metric, + task, + budget=budget, + log_training_metric=log_training_metric, + fit_kwargs=fit_kwargs, + ) else: val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV( - config_dic, estimator, X_train, y_train, budget, kf, task, - eval_metric, best_val_loss, log_training_metric=log_training_metric, - fit_kwargs=fit_kwargs) + config_dic, + estimator, + X_train, + y_train, + budget, + kf, + task, + eval_metric, + best_val_loss, + log_training_metric=log_training_metric, + fit_kwargs=fit_kwargs, + ) return estimator, val_loss, metric_for_logging, train_time, pred_time def train_estimator( - X_train, y_train, config_dic, task, - estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={} + X_train, + y_train, + config_dic, + task, + estimator_name, + n_jobs=1, + estimator_class=None, + budget=None, + fit_kwargs={}, ): 
start_time = time.time() - estimator_class = estimator_class or get_estimator_class( - task, estimator_name) + estimator_class = estimator_class or get_estimator_class(task, estimator_name) estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs) if X_train is not None: train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs) @@ -347,14 +485,14 @@ def train_estimator( def get_classification_objective(num_labels: int) -> str: if num_labels == 2: - objective_name = 'binary' + objective_name = "binary" else: - objective_name = 'multi' + objective_name = "multi" return objective_name def norm_confusion_matrix(y_true, y_pred): - '''normalized confusion matrix + """normalized confusion matrix Args: estimator: A multi-class classification estimator @@ -363,15 +501,16 @@ def norm_confusion_matrix(y_true, y_pred): Returns: A normalized confusion matrix - ''' + """ from sklearn.metrics import confusion_matrix + conf_mat = confusion_matrix(y_true, y_pred) - norm_conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis] + norm_conf_mat = conf_mat.astype("float") / conf_mat.sum(axis=1)[:, np.newaxis] return norm_conf_mat def multi_class_curves(y_true, y_pred_proba, curve_func): - '''Binarize the data for multi-class tasks and produce ROC or precision-recall curves + """Binarize the data for multi-class tasks and produce ROC or precision-recall curves Args: y_true: A numpy array or a pandas series of true labels @@ -384,8 +523,9 @@ def multi_class_curves(y_true, y_pred_proba, curve_func): curve_x[0] is an 1D array of the x coordinates of class 0 The second dictionary curve_y stores the y coordinates of each curve, e.g., curve_y[0] is an 1D array of the y coordinates of class 0 - ''' + """ from sklearn.preprocessing import label_binarize + classes = np.unique(y_true) y_true_binary = label_binarize(y_true, classes=classes) diff --git a/flaml/model.py b/flaml/model.py index 619c20147..bf1092b27 100644 --- a/flaml/model.py +++ b/flaml/model.py @@ -1,7 +1,7 @@ -'''! - * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. +"""! + * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. 
-''' +""" import numpy as np import xgboost as xgb @@ -21,39 +21,40 @@ logger = logging.getLogger(__name__) class BaseEstimator: - '''The abstract class for all learners + """The abstract class for all learners Typical example: XGBoostEstimator: for regression XGBoostSklearnEstimator: for classification LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier: for both regression and classification - ''' + """ - def __init__(self, task='binary', **params): - '''Constructor + def __init__(self, task="binary", **params): + """Constructor Args: task: A string of the task type, one of 'binary', 'multi', 'regression', 'rank', 'forecast' n_jobs: An integer of the number of parallel threads params: A dictionary of the hyperparameter names and values - ''' + """ self.params = params self.estimator_class = self._model = None self._task = task - if '_estimator_type' in params: - self._estimator_type = params['_estimator_type'] - del self.params['_estimator_type'] + if "_estimator_type" in params: + self._estimator_type = params["_estimator_type"] + del self.params["_estimator_type"] else: - self._estimator_type = "classifier" if task in ( - 'binary', 'multi') else "regressor" + self._estimator_type = ( + "classifier" if task in ("binary", "multi") else "regressor" + ) def get_params(self, deep=False): params = self.params.copy() params["task"] = self._task - if hasattr(self, '_estimator_type'): - params['_estimator_type'] = self._estimator_type + if hasattr(self, "_estimator_type"): + params["_estimator_type"] = self._estimator_type return params @property @@ -66,14 +67,12 @@ class BaseEstimator: @property def model(self): - '''Trained model after fit() is called, or None before fit() is called - ''' + """Trained model after fit() is called, or None before fit() is called""" return self._model @property def estimator(self): - '''Trained model after fit() is called, or None before fit() is called - ''' + """Trained model after fit() is called, or None before fit() is called""" return self._model def _preprocess(self, X): @@ -82,10 +81,10 @@ class BaseEstimator: def _fit(self, X_train, y_train, **kwargs): current_time = time.time() - if 'groups' in kwargs: + if "groups" in kwargs: kwargs = kwargs.copy() - if self._task == 'rank': - kwargs['group'] = group_counts(kwargs['groups']) + if self._task == "rank": + kwargs["group"] = group_counts(kwargs["groups"]) # groups_val = kwargs.get('groups_val') # if groups_val is not None: # kwargs['eval_group'] = [group_counts(groups_val)] @@ -93,7 +92,7 @@ class BaseEstimator: # (kwargs['X_val'], kwargs['y_val'])] # kwargs['verbose'] = False # del kwargs['groups_val'], kwargs['X_val'], kwargs['y_val'] - del kwargs['groups'] + del kwargs["groups"] X_train = self._preprocess(X_train) model = self.estimator_class(**self.params) model.fit(X_train, y_train, **kwargs) @@ -102,7 +101,7 @@ class BaseEstimator: return train_time def fit(self, X_train, y_train, budget=None, **kwargs): - '''Train the model from given training data + """Train the model from given training data Args: X_train: A numpy array of training data in shape n*m @@ -111,11 +110,11 @@ class BaseEstimator: Returns: train_time: A float of the training time in seconds - ''' + """ return self._fit(X_train, y_train, **kwargs) def predict(self, X_test): - '''Predict label from features + """Predict label from features Args: X_test: A numpy array of featurized instances, shape n*m @@ -123,7 +122,7 @@ class BaseEstimator: Returns: A numpy array of shape n*1. 
Each element is the label for a instance
-        '''
+        """
         if self._model is not None:
             X_test = self._preprocess(X_test)
             return self._model.predict(X_test)
@@ -131,7 +130,7 @@ class BaseEstimator:
             return np.ones(X_test.shape[0])
 
     def predict_proba(self, X_test):
-        '''Predict the probability of each class from features
+        """Predict the probability of each class from features
 
         Only works for classification problems
 
@@ -143,9 +142,11 @@ class BaseEstimator:
             A numpy array of shape n*c. c is the # classes
             Each element at (i,j) is the probability for instance i to be in
                 class j
-        '''
-        assert self._task in ('binary', 'multi'), (
-            'predict_prob() only for classification task.')
+        """
+        assert self._task in (
+            "binary",
+            "multi",
+        ), "predict_proba() only for classification task."
         X_test = self._preprocess(X_test)
         return self._model.predict_proba(X_test)
 
@@ -154,7 +155,7 @@
 
     @classmethod
     def search_space(cls, **params):
-        '''[required method] search space
+        """[required method] search space
 
         Returns:
             A dictionary of the search space.
             Each key is the name of a hyperparameter, and value is a dict with
                 its domain and init_value (optional), cat_hp_cost (optional)
                 e.g.,
                 {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}
-        '''
+        """
         return {}
 
     @classmethod
     def size(cls, config: dict) -> float:
-        '''[optional method] memory size of the estimator in bytes
+        """[optional method] memory size of the estimator in bytes
 
         Args:
             config - the dict of the hyperparameter config
 
         Returns:
             A float of the memory size required by the estimator to train the
             given config
-        '''
+        """
         return 1.0
 
     @classmethod
     def cost_relative2lgbm(cls) -> float:
-        '''[optional method] relative cost compared to lightgbm'''
+        """[optional method] relative cost compared to lightgbm"""
         return 1.0
 
     @classmethod
     def init(cls):
-        '''[optional method] initialize the class'''
+        """[optional method] initialize the class"""
         pass
 
 
 class SKLearnEstimator(BaseEstimator):
-
-    def __init__(self, task='binary', **params):
+    def __init__(self, task="binary", **params):
         super().__init__(task, **params)
 
     def _preprocess(self, X):
         if isinstance(X, pd.DataFrame):
-            cat_columns = X.select_dtypes(include=['category']).columns
+            cat_columns = X.select_dtypes(include=["category"]).columns
             if not cat_columns.empty:
                 X = X.copy()
                 X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
-        elif isinstance(X, np.ndarray) and X.dtype.kind not in 'buif':
+        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
             # numpy array is not of numeric dtype
             X = pd.DataFrame(X)
             for col in X.columns:
                 if isinstance(X[col][0], str):
-                    X[col] = X[col].astype('category').cat.codes
+                    X[col] = X[col].astype("category").cat.codes
             X = X.to_numpy()
         return X
 
 
 class LGBMEstimator(BaseEstimator):
-
     @classmethod
     def search_space(cls, data_size, **params):
         upper = min(32768, int(data_size))
         return {
-            'n_estimators': {
-                'domain': tune.lograndint(lower=4, upper=upper),
-                'init_value': 4,
-                'low_cost_init_value': 4,
+            "n_estimators": {
+                "domain": tune.lograndint(lower=4, upper=upper),
+                "init_value": 4,
+                "low_cost_init_value": 4,
             },
-            'num_leaves': {
-                'domain': tune.lograndint(lower=4, upper=upper),
-                'init_value': 4,
-                'low_cost_init_value': 4,
+            "num_leaves": {
+                "domain": tune.lograndint(lower=4, upper=upper),
+                "init_value": 4,
+                "low_cost_init_value": 4,
             },
-            'min_child_samples': {
-                'domain': tune.lograndint(lower=2, upper=2**7 + 1),
-                'init_value': 20,
+            "min_child_samples": {
+                "domain": tune.lograndint(lower=2, upper=2 ** 7 + 1),
+
"init_value": 20, }, - 'learning_rate': { - 'domain': tune.loguniform(lower=1 / 1024, upper=1.0), - 'init_value': 0.1, + "learning_rate": { + "domain": tune.loguniform(lower=1 / 1024, upper=1.0), + "init_value": 0.1, }, # 'subsample': { # 'domain': tune.uniform(lower=0.1, upper=1.0), # 'init_value': 1.0, # }, - 'log_max_bin': { - 'domain': tune.lograndint(lower=3, upper=11), - 'init_value': 8, + "log_max_bin": { # log transformed with base 2 + "domain": tune.lograndint(lower=3, upper=11), + "init_value": 8, }, - 'colsample_bytree': { - 'domain': tune.uniform(lower=0.01, upper=1.0), - 'init_value': 1.0, + "colsample_bytree": { + "domain": tune.uniform(lower=0.01, upper=1.0), + "init_value": 1.0, }, - 'reg_alpha': { - 'domain': tune.loguniform(lower=1 / 1024, upper=1024), - 'init_value': 1 / 1024, + "reg_alpha": { + "domain": tune.loguniform(lower=1 / 1024, upper=1024), + "init_value": 1 / 1024, }, - 'reg_lambda': { - 'domain': tune.loguniform(lower=1 / 1024, upper=1024), - 'init_value': 1.0, + "reg_lambda": { + "domain": tune.loguniform(lower=1 / 1024, upper=1024), + "init_value": 1.0, }, } @classmethod def size(cls, config): - num_leaves = int(round(config.get('num_leaves') or config['max_leaves'])) - n_estimators = int(round(config['n_estimators'])) + num_leaves = int(round(config.get("num_leaves") or config["max_leaves"])) + n_estimators = int(round(config["n_estimators"])) return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8 - def __init__(self, task='binary', log_max_bin=8, **params): + def __init__(self, task="binary", log_max_bin=8, **params): super().__init__(task, **params) if "objective" not in self.params: # Default: ‘regression’ for LGBMRegressor, # ‘binary’ or ‘multiclass’ for LGBMClassifier - objective = 'regression' - if 'binary' in task: - objective = 'binary' - elif 'multi' in task: - objective = 'multiclass' - elif 'rank' == task: - objective = 'lambdarank' + objective = "regression" + if "binary" in task: + objective = "binary" + elif "multi" in task: + objective = "multiclass" + elif "rank" == task: + objective = "lambdarank" self.params["objective"] = objective if "n_estimators" in self.params: self.params["n_estimators"] = int(round(self.params["n_estimators"])) if "num_leaves" in self.params: self.params["num_leaves"] = int(round(self.params["num_leaves"])) if "min_child_samples" in self.params: - self.params["min_child_samples"] = int(round(self.params["min_child_samples"])) + self.params["min_child_samples"] = int( + round(self.params["min_child_samples"]) + ) if "max_bin" not in self.params: - self.params['max_bin'] = 1 << int(round(log_max_bin)) - 1 + self.params["max_bin"] = 1 << int(round(log_max_bin)) - 1 if "verbose" not in self.params: - self.params['verbose'] = -1 + self.params["verbose"] = -1 # if "subsample_freq" not in self.params: # self.params['subsample_freq'] = 1 - if 'regression' == task: + if "regression" == task: self.estimator_class = LGBMRegressor - elif 'rank' == task: + elif "rank" == task: self.estimator_class = LGBMRanker else: self.estimator_class = LGBMClassifier @@ -297,23 +298,27 @@ class LGBMEstimator(BaseEstimator): self._train_size = 0 def _preprocess(self, X): - if not isinstance(X, pd.DataFrame) and issparse(X) and np.issubdtype( - X.dtype, np.integer): + if ( + not isinstance(X, pd.DataFrame) + and issparse(X) + and np.issubdtype(X.dtype, np.integer) + ): X = X.astype(float) - elif isinstance(X, np.ndarray) and X.dtype.kind not in 'buif': + elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif": # numpy 
array is not of numeric dtype X = pd.DataFrame(X) for col in X.columns: if isinstance(X[col][0], str): - X[col] = X[col].astype('category').cat.codes + X[col] = X[col].astype("category").cat.codes X = X.to_numpy() return X def fit(self, X_train, y_train, budget=None, **kwargs): start_time = time.time() n_iter = self.params["n_estimators"] - if (not self._time_per_iter or abs( - self._train_size - X_train.shape[0]) > 4) and budget is not None: + if ( + not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4 + ) and budget is not None: self.params["n_estimators"] = 1 self._t1 = self._fit(X_train, y_train, **kwargs) if self._t1 >= budget: @@ -321,18 +326,25 @@ class LGBMEstimator(BaseEstimator): return self._t1 self.params["n_estimators"] = 4 self._t2 = self._fit(X_train, y_train, **kwargs) - self._time_per_iter = (self._t2 - self._t1) / ( - self.params["n_estimators"] - 1) if self._t2 > self._t1 \ - else self._t1 if self._t1 else 0.001 + self._time_per_iter = ( + (self._t2 - self._t1) / (self.params["n_estimators"] - 1) + if self._t2 > self._t1 + else self._t1 + if self._t1 + else 0.001 + ) self._train_size = X_train.shape[0] - if self._t1 + self._t2 >= budget or n_iter == self.params[ - "n_estimators"]: + if self._t1 + self._t2 >= budget or n_iter == self.params["n_estimators"]: self.params["n_estimators"] = n_iter return time.time() - start_time if budget is not None: - self.params["n_estimators"] = min(n_iter, int( - (budget - time.time() + start_time - self._t1) - / self._time_per_iter + 1)) + self.params["n_estimators"] = min( + n_iter, + int( + (budget - time.time() + start_time - self._t1) / self._time_per_iter + + 1 + ), + ) if self.params["n_estimators"] > 0: self._fit(X_train, y_train, **kwargs) self.params["n_estimators"] = n_iter @@ -341,49 +353,49 @@ class LGBMEstimator(BaseEstimator): class XGBoostEstimator(SKLearnEstimator): - ''' not using sklearn API, used for regression ''' + """not using sklearn API, used for regression""" @classmethod def search_space(cls, data_size, **params): upper = min(32768, int(data_size)) return { - 'n_estimators': { - 'domain': tune.lograndint(lower=4, upper=upper), - 'init_value': 4, - 'low_cost_init_value': 4, + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=upper), + "init_value": 4, + "low_cost_init_value": 4, }, - 'max_leaves': { - 'domain': tune.lograndint(lower=4, upper=upper), - 'init_value': 4, - 'low_cost_init_value': 4, + "max_leaves": { + "domain": tune.lograndint(lower=4, upper=upper), + "init_value": 4, + "low_cost_init_value": 4, }, - 'min_child_weight': { - 'domain': tune.loguniform(lower=0.001, upper=128), - 'init_value': 1, + "min_child_weight": { + "domain": tune.loguniform(lower=0.001, upper=128), + "init_value": 1, }, - 'learning_rate': { - 'domain': tune.loguniform(lower=1 / 1024, upper=1.0), - 'init_value': 0.1, + "learning_rate": { + "domain": tune.loguniform(lower=1 / 1024, upper=1.0), + "init_value": 0.1, }, - 'subsample': { - 'domain': tune.uniform(lower=0.1, upper=1.0), - 'init_value': 1.0, + "subsample": { + "domain": tune.uniform(lower=0.1, upper=1.0), + "init_value": 1.0, }, - 'colsample_bylevel': { - 'domain': tune.uniform(lower=0.01, upper=1.0), - 'init_value': 1.0, + "colsample_bylevel": { + "domain": tune.uniform(lower=0.01, upper=1.0), + "init_value": 1.0, }, - 'colsample_bytree': { - 'domain': tune.uniform(lower=0.01, upper=1.0), - 'init_value': 1.0, + "colsample_bytree": { + "domain": tune.uniform(lower=0.01, upper=1.0), + "init_value": 1.0, }, - 'reg_alpha': { - 'domain': 
tune.loguniform(lower=1 / 1024, upper=1024), - 'init_value': 1 / 1024, + "reg_alpha": { + "domain": tune.loguniform(lower=1 / 1024, upper=1024), + "init_value": 1 / 1024, }, - 'reg_lambda': { - 'domain': tune.loguniform(lower=1 / 1024, upper=1024), - 'init_value': 1.0, + "reg_lambda": { + "domain": tune.loguniform(lower=1 / 1024, upper=1024), + "init_value": 1.0, }, } @@ -396,59 +408,70 @@ class XGBoostEstimator(SKLearnEstimator): return 1.6 def __init__( - self, task='regression', all_thread=False, n_jobs=1, - n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1, - learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0, - colsample_bytree=1.0, tree_method='auto', **params + self, + task="regression", + all_thread=False, + n_jobs=1, + n_estimators=4, + max_leaves=4, + subsample=1.0, + min_child_weight=1, + learning_rate=0.1, + reg_lambda=1.0, + reg_alpha=0.0, + colsample_bylevel=1.0, + colsample_bytree=1.0, + tree_method="auto", + **params, ): super().__init__(task, **params) self._n_estimators = int(round(n_estimators)) - self.params.update({ - 'max_leaves': int(round(max_leaves)), - 'max_depth': params.get('max_depth', 0), - 'grow_policy': params.get("grow_policy", 'lossguide'), - 'tree_method': tree_method, - 'verbosity': params.get('verbosity', 0), - 'nthread': n_jobs, - 'learning_rate': float(learning_rate), - 'subsample': float(subsample), - 'reg_alpha': float(reg_alpha), - 'reg_lambda': float(reg_lambda), - 'min_child_weight': float(min_child_weight), - 'booster': params.get('booster', 'gbtree'), - 'colsample_bylevel': float(colsample_bylevel), - 'colsample_bytree': float(colsample_bytree), - 'objective': params.get("objective") - }) + self.params.update( + { + "max_leaves": int(round(max_leaves)), + "max_depth": params.get("max_depth", 0), + "grow_policy": params.get("grow_policy", "lossguide"), + "tree_method": tree_method, + "verbosity": params.get("verbosity", 0), + "nthread": n_jobs, + "learning_rate": float(learning_rate), + "subsample": float(subsample), + "reg_alpha": float(reg_alpha), + "reg_lambda": float(reg_lambda), + "min_child_weight": float(min_child_weight), + "booster": params.get("booster", "gbtree"), + "colsample_bylevel": float(colsample_bylevel), + "colsample_bytree": float(colsample_bytree), + "objective": params.get("objective"), + } + ) if all_thread: - del self.params['nthread'] + del self.params["nthread"] def get_params(self, deep=False): params = super().get_params() - params["n_jobs"] = params['nthread'] + params["n_jobs"] = params["nthread"] return params def fit(self, X_train, y_train, budget=None, **kwargs): start_time = time.time() if not issparse(X_train): - self.params['tree_method'] = 'hist' + self.params["tree_method"] = "hist" X_train = self._preprocess(X_train) - if 'sample_weight' in kwargs: - dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs[ - 'sample_weight']) + if "sample_weight" in kwargs: + dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs["sample_weight"]) else: dtrain = xgb.DMatrix(X_train, label=y_train) - objective = self.params.get('objective') + objective = self.params.get("objective") if isinstance(objective, str): obj = None else: obj = objective - if 'objective' in self.params: - del self.params['objective'] - self._model = xgb.train(self.params, dtrain, self._n_estimators, - obj=obj) - self.params['objective'] = objective + if "objective" in self.params: + del self.params["objective"] + self._model = xgb.train(self.params, dtrain, self._n_estimators, obj=obj) + 
self.params["objective"] = objective del dtrain train_time = time.time() - start_time return train_time @@ -461,7 +484,7 @@ class XGBoostEstimator(SKLearnEstimator): class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): - ''' using sklearn API, used for classification ''' + """using sklearn API, used for classification""" @classmethod def search_space(cls, data_size, **params): @@ -472,74 +495,84 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): return XGBoostEstimator.cost_relative2lgbm() def __init__( - self, task='binary', n_jobs=1, - n_estimators=4, max_leaves=4, subsample=1.0, - min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, - colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist', - **params + self, + task="binary", + n_jobs=1, + n_estimators=4, + max_leaves=4, + subsample=1.0, + min_child_weight=1, + learning_rate=0.1, + reg_lambda=1.0, + reg_alpha=0.0, + colsample_bylevel=1.0, + colsample_bytree=1.0, + tree_method="hist", + **params, ): super().__init__(task, **params) - del self.params['objective'] - del self.params['max_bin'] - del self.params['verbose'] - self.params.update({ - "n_estimators": int(round(n_estimators)), - 'max_leaves': int(round(max_leaves)), - 'max_depth': 0, - 'grow_policy': params.get("grow_policy", 'lossguide'), - 'tree_method': tree_method, - 'n_jobs': n_jobs, - 'verbosity': 0, - 'learning_rate': float(learning_rate), - 'subsample': float(subsample), - 'reg_alpha': float(reg_alpha), - 'reg_lambda': float(reg_lambda), - 'min_child_weight': float(min_child_weight), - 'booster': params.get('booster', 'gbtree'), - 'colsample_bylevel': float(colsample_bylevel), - 'colsample_bytree': float(colsample_bytree), - 'use_label_encoder': params.get('use_label_encoder', False), - }) + del self.params["objective"] + del self.params["max_bin"] + del self.params["verbose"] + self.params.update( + { + "n_estimators": int(round(n_estimators)), + "max_leaves": int(round(max_leaves)), + "max_depth": 0, + "grow_policy": params.get("grow_policy", "lossguide"), + "tree_method": tree_method, + "n_jobs": n_jobs, + "verbosity": 0, + "learning_rate": float(learning_rate), + "subsample": float(subsample), + "reg_alpha": float(reg_alpha), + "reg_lambda": float(reg_lambda), + "min_child_weight": float(min_child_weight), + "booster": params.get("booster", "gbtree"), + "colsample_bylevel": float(colsample_bylevel), + "colsample_bytree": float(colsample_bytree), + "use_label_encoder": params.get("use_label_encoder", False), + } + ) self.estimator_class = xgb.XGBRegressor - if 'rank' == task: + if "rank" == task: self.estimator_class = xgb.XGBRanker - elif task in ('binary', 'multi'): + elif task in ("binary", "multi"): self.estimator_class = xgb.XGBClassifier self._time_per_iter = None self._train_size = 0 def fit(self, X_train, y_train, budget=None, **kwargs): if issparse(X_train): - self.params['tree_method'] = 'auto' + self.params["tree_method"] = "auto" return super().fit(X_train, y_train, budget, **kwargs) class RandomForestEstimator(SKLearnEstimator, LGBMEstimator): - @classmethod def search_space(cls, data_size, task, **params): data_size = int(data_size) upper = min(2048, data_size) space = { - 'n_estimators': { - 'domain': tune.lograndint(lower=4, upper=upper), - 'init_value': 4, - 'low_cost_init_value': 4, + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=upper), + "init_value": 4, + "low_cost_init_value": 4, }, - 'max_features': { - 'domain': tune.loguniform(lower=0.1, upper=1.0), - 'init_value': 1.0, + 
"max_features": { + "domain": tune.loguniform(lower=0.1, upper=1.0), + "init_value": 1.0, }, - 'max_leaves': { - 'domain': tune.lograndint(lower=4, upper=min(32768, data_size)), - 'init_value': 4, - 'low_cost_init_value': 4, + "max_leaves": { + "domain": tune.lograndint(lower=4, upper=min(32768, data_size)), + "init_value": 4, + "low_cost_init_value": 4, }, } - if task in ('binary', 'multi'): - space['criterion'] = { - 'domain': tune.choice(['gini', 'entropy']), + if task in ("binary", "multi"): + space["criterion"] = { + "domain": tune.choice(["gini", "entropy"]), # 'init_value': 'gini', } return space @@ -549,24 +582,31 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator): return 2.0 def __init__( - self, task='binary', n_jobs=1, - n_estimators=4, max_features=1.0, criterion='gini', max_leaves=4, - **params + self, + task="binary", + n_jobs=1, + n_estimators=4, + max_features=1.0, + criterion="gini", + max_leaves=4, + **params, ): super().__init__(task, **params) - del self.params['objective'] - del self.params['max_bin'] - self.params.update({ - "n_estimators": int(round(n_estimators)), - "n_jobs": n_jobs, - "verbose": 0, - 'max_features': float(max_features), - "max_leaf_nodes": params.get('max_leaf_nodes', int(round(max_leaves))), - }) + del self.params["objective"] + del self.params["max_bin"] + self.params.update( + { + "n_estimators": int(round(n_estimators)), + "n_jobs": n_jobs, + "verbose": 0, + "max_features": float(max_features), + "max_leaf_nodes": params.get("max_leaf_nodes", int(round(max_leaves))), + } + ) self.estimator_class = RandomForestRegressor - if task in ('binary', 'multi'): + if task in ("binary", "multi"): self.estimator_class = RandomForestClassifier - self.params['criterion'] = criterion + self.params["criterion"] = criterion def get_params(self, deep=False): params = super().get_params() @@ -574,27 +614,25 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator): class ExtraTreeEstimator(RandomForestEstimator): - @classmethod def cost_relative2lgbm(cls): return 1.9 - def __init__(self, task='binary', **params): + def __init__(self, task="binary", **params): super().__init__(task, **params) - if 'regression' in task: + if "regression" in task: self.estimator_class = ExtraTreesRegressor else: self.estimator_class = ExtraTreesClassifier class LRL1Classifier(SKLearnEstimator): - @classmethod def search_space(cls, **params): return { - 'C': { - 'domain': tune.loguniform(lower=0.03125, upper=32768.0), - 'init_value': 1.0, + "C": { + "domain": tune.loguniform(lower=0.03125, upper=32768.0), + "init_value": 1.0, }, } @@ -602,25 +640,25 @@ class LRL1Classifier(SKLearnEstimator): def cost_relative2lgbm(cls): return 160 - def __init__( - self, task='binary', n_jobs=1, tol=0.0001, C=1.0, - **params - ): + def __init__(self, task="binary", n_jobs=1, tol=0.0001, C=1.0, **params): super().__init__(task, **params) - self.params.update({ - 'penalty': params.get("penalty", 'l1'), - 'tol': float(tol), - 'C': float(C), - 'solver': params.get("solver", 'saga'), - 'n_jobs': n_jobs, - }) - assert task in ('binary', 'multi'), ( - 'LogisticRegression for classification task only') + self.params.update( + { + "penalty": params.get("penalty", "l1"), + "tol": float(tol), + "C": float(C), + "solver": params.get("solver", "saga"), + "n_jobs": n_jobs, + } + ) + assert task in ( + "binary", + "multi", + ), "LogisticRegression for classification task only" self.estimator_class = LogisticRegression class LRL2Classifier(SKLearnEstimator): - @classmethod def search_space(cls, 
**params): return LRL1Classifier.search_space(**params) @@ -629,20 +667,21 @@ class LRL2Classifier(SKLearnEstimator): def cost_relative2lgbm(cls): return 25 - def __init__( - self, task='binary', n_jobs=1, tol=0.0001, C=1.0, - **params - ): + def __init__(self, task="binary", n_jobs=1, tol=0.0001, C=1.0, **params): super().__init__(task, **params) - self.params.update({ - 'penalty': params.get("penalty", 'l2'), - 'tol': float(tol), - 'C': float(C), - 'solver': params.get("solver", 'lbfgs'), - 'n_jobs': n_jobs, - }) - assert task in ('binary', 'multi'), ( - 'LogisticRegression for classification task only') + self.params.update( + { + "penalty": params.get("penalty", "l2"), + "tol": float(tol), + "C": float(C), + "solver": params.get("solver", "lbfgs"), + "n_jobs": n_jobs, + } + ) + assert task in ( + "binary", + "multi", + ), "LogisticRegression for classification task only" self.estimator_class = LogisticRegression @@ -654,14 +693,14 @@ class CatBoostEstimator(BaseEstimator): def search_space(cls, data_size, **params): upper = max(min(round(1500000 / data_size), 150), 12) return { - 'early_stopping_rounds': { - 'domain': tune.lograndint(lower=10, upper=upper), - 'init_value': 10, - 'low_cost_init_value': 10, + "early_stopping_rounds": { + "domain": tune.lograndint(lower=10, upper=upper), + "init_value": 10, + "low_cost_init_value": 10, }, - 'learning_rate': { - 'domain': tune.loguniform(lower=.005, upper=.2), - 'init_value': 0.1, + "learning_rate": { + "domain": tune.loguniform(lower=0.005, upper=0.2), + "init_value": 0.1, }, } @@ -682,68 +721,84 @@ class CatBoostEstimator(BaseEstimator): def _preprocess(self, X): if isinstance(X, pd.DataFrame): - cat_columns = X.select_dtypes(include=['category']).columns + cat_columns = X.select_dtypes(include=["category"]).columns if not cat_columns.empty: X = X.copy() X[cat_columns] = X[cat_columns].apply( - lambda x: - x.cat.rename_categories( - [str(c) if isinstance(c, float) else c - for c in x.cat.categories])) - elif isinstance(X, np.ndarray) and X.dtype.kind not in 'buif': + lambda x: x.cat.rename_categories( + [ + str(c) if isinstance(c, float) else c + for c in x.cat.categories + ] + ) + ) + elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif": # numpy array is not of numeric dtype X = pd.DataFrame(X) for col in X.columns: if isinstance(X[col][0], str): - X[col] = X[col].astype('category').cat.codes + X[col] = X[col].astype("category").cat.codes X = X.to_numpy() return X def __init__( - self, task='binary', n_jobs=1, - n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params + self, + task="binary", + n_jobs=1, + n_estimators=8192, + learning_rate=0.1, + early_stopping_rounds=4, + **params, ): super().__init__(task, **params) - self.params.update({ - "early_stopping_rounds": int(round(early_stopping_rounds)), - "n_estimators": n_estimators, - 'learning_rate': learning_rate, - 'thread_count': n_jobs, - 'verbose': params.get('verbose', False), - 'random_seed': params.get("random_seed", 10242048), - }) + self.params.update( + { + "early_stopping_rounds": int(round(early_stopping_rounds)), + "n_estimators": n_estimators, + "learning_rate": learning_rate, + "thread_count": n_jobs, + "verbose": params.get("verbose", False), + "random_seed": params.get("random_seed", 10242048), + } + ) from catboost import CatBoostRegressor + self.estimator_class = CatBoostRegressor - if task in ('binary', 'multi'): + if task in ("binary", "multi"): from catboost import CatBoostClassifier + self.estimator_class = CatBoostClassifier def 
get_params(self, deep=False): params = super().get_params() - params['n_jobs'] = params['thread_count'] + params["n_jobs"] = params["thread_count"] return params def fit(self, X_train, y_train, budget=None, **kwargs): import shutil + start_time = time.time() - train_dir = f'catboost_{str(start_time)}' + train_dir = f"catboost_{str(start_time)}" n_iter = self.params["n_estimators"] X_train = self._preprocess(X_train) if isinstance(X_train, pd.DataFrame): - cat_features = list(X_train.select_dtypes( - include='category').columns) + cat_features = list(X_train.select_dtypes(include="category").columns) else: cat_features = [] # from catboost import CatBoostError # try: - if (not CatBoostEstimator._time_per_iter or abs( - CatBoostEstimator._train_size - len(y_train)) > 4) and budget: + if ( + not CatBoostEstimator._time_per_iter + or abs(CatBoostEstimator._train_size - len(y_train)) > 4 + ) and budget: # measure the time per iteration self.params["n_estimators"] = 1 CatBoostEstimator._smallmodel = self.estimator_class( - train_dir=train_dir, **self.params) + train_dir=train_dir, **self.params + ) CatBoostEstimator._smallmodel.fit( - X_train, y_train, cat_features=cat_features, **kwargs) + X_train, y_train, cat_features=cat_features, **kwargs + ) CatBoostEstimator._t1 = time.time() - start_time if CatBoostEstimator._t1 >= budget: self.params["n_estimators"] = n_iter @@ -752,47 +807,61 @@ class CatBoostEstimator(BaseEstimator): return CatBoostEstimator._t1 self.params["n_estimators"] = 4 CatBoostEstimator._smallmodel = self.estimator_class( - train_dir=train_dir, **self.params) + train_dir=train_dir, **self.params + ) CatBoostEstimator._smallmodel.fit( - X_train, y_train, cat_features=cat_features, **kwargs) + X_train, y_train, cat_features=cat_features, **kwargs + ) CatBoostEstimator._time_per_iter = ( - time.time() - start_time - CatBoostEstimator._t1) / ( - self.params["n_estimators"] - 1) + time.time() - start_time - CatBoostEstimator._t1 + ) / (self.params["n_estimators"] - 1) if CatBoostEstimator._time_per_iter <= 0: CatBoostEstimator._time_per_iter = CatBoostEstimator._t1 CatBoostEstimator._train_size = len(y_train) - if time.time() - start_time >= budget or n_iter == self.params[ - "n_estimators"]: + if ( + time.time() - start_time >= budget + or n_iter == self.params["n_estimators"] + ): self.params["n_estimators"] = n_iter self._model = CatBoostEstimator._smallmodel shutil.rmtree(train_dir, ignore_errors=True) return time.time() - start_time if budget: train_times = 1 - self.params["n_estimators"] = min(n_iter, int( - (budget - time.time() + start_time - CatBoostEstimator._t1) - / train_times / CatBoostEstimator._time_per_iter + 1)) + self.params["n_estimators"] = min( + n_iter, + int( + (budget - time.time() + start_time - CatBoostEstimator._t1) + / train_times + / CatBoostEstimator._time_per_iter + + 1 + ), + ) self._model = CatBoostEstimator._smallmodel if self.params["n_estimators"] > 0: n = max(int(len(y_train) * 0.9), len(y_train) - 1000) X_tr, y_tr = X_train[:n], y_train[:n] - if 'sample_weight' in kwargs: - weight = kwargs['sample_weight'] + if "sample_weight" in kwargs: + weight = kwargs["sample_weight"] if weight is not None: - kwargs['sample_weight'] = weight[:n] + kwargs["sample_weight"] = weight[:n] else: weight = None from catboost import Pool + model = self.estimator_class(train_dir=train_dir, **self.params) model.fit( - X_tr, y_tr, cat_features=cat_features, + X_tr, + y_tr, + cat_features=cat_features, eval_set=Pool( - data=X_train[n:], label=y_train[n:], - 
cat_features=cat_features),
-                **kwargs)  # model.get_best_iteration()
+                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
+                ),
+                **kwargs,
+            )  # model.get_best_iteration()
             shutil.rmtree(train_dir, ignore_errors=True)
             if weight is not None:
-                kwargs['sample_weight'] = weight
+                kwargs["sample_weight"] = weight
             self._model = model
         # except CatBoostError:
         #     self._model = None
@@ -802,15 +871,14 @@ class CatBoostEstimator(BaseEstimator):
 
 
 class KNeighborsEstimator(BaseEstimator):
-
     @classmethod
     def search_space(cls, data_size, **params):
         upper = min(512, int(data_size / 2))
         return {
-            'n_neighbors': {
-                'domain': tune.lograndint(lower=1, upper=upper),
-                'init_value': 5,
-                'low_cost_init_value': 1,
+            "n_neighbors": {
+                "domain": tune.lograndint(lower=1, upper=upper),
+                "init_value": 5,
+                "low_cost_init_value": 1,
             },
         }
 
@@ -818,29 +886,30 @@ class KNeighborsEstimator(BaseEstimator):
     def cost_relative2lgbm(cls):
         return 30
 
-    def __init__(
-        self, task='binary', n_jobs=1, n_neighbors=5, **params
-    ):
+    def __init__(self, task="binary", n_jobs=1, n_neighbors=5, **params):
         super().__init__(task, **params)
-        self.params.update({
-            'n_neighbors': int(round(n_neighbors)),
-            'weights': params.get('weights', 'distance'),
-            'n_jobs': n_jobs,
-        })
+        self.params.update(
+            {
+                "n_neighbors": int(round(n_neighbors)),
+                "weights": params.get("weights", "distance"),
+                "n_jobs": n_jobs,
+            }
+        )
         from sklearn.neighbors import KNeighborsRegressor
+
         self.estimator_class = KNeighborsRegressor
-        if task in ('binary', 'multi'):
+        if task in ("binary", "multi"):
             from sklearn.neighbors import KNeighborsClassifier
+
             self.estimator_class = KNeighborsClassifier
 
     def _preprocess(self, X):
         if isinstance(X, pd.DataFrame):
-            cat_columns = X.select_dtypes(['category']).columns
+            cat_columns = X.select_dtypes(["category"]).columns
             if X.shape[1] == len(cat_columns):
-                raise ValueError(
-                    "kneighbor requires at least one numeric feature")
+                raise ValueError("KNeighbors requires at least one numeric feature")
             X = X.drop(cat_columns, axis=1)
-        elif isinstance(X, np.ndarray) and X.dtype.kind not in 'buif':
+        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
             # drop categorical columns if any
             X = pd.DataFrame(X)
             cat_columns = []
@@ -852,45 +921,47 @@ class KNeighborsEstimator(BaseEstimator):
         return X
 
 
-class FBProphet(BaseEstimator):
+class Prophet(BaseEstimator):
     @classmethod
     def search_space(cls, **params):
         space = {
-            'changepoint_prior_scale': {
-                'domain': tune.loguniform(lower=0.001, upper=1000),
-                'init_value': 0.01,
-                'low_cost_init_value': 0.001,
+            "changepoint_prior_scale": {
+                "domain": tune.loguniform(lower=0.001, upper=1000),
+                "init_value": 0.01,
+                "low_cost_init_value": 0.001,
             },
-            'seasonality_prior_scale': {
-                'domain': tune.loguniform(lower=0.01, upper=100),
-                'init_value': 1,
+            "seasonality_prior_scale": {
+                "domain": tune.loguniform(lower=0.01, upper=100),
+                "init_value": 1,
             },
-            'holidays_prior_scale': {
-                'domain': tune.loguniform(lower=0.01, upper=100),
-                'init_value': 1,
+            "holidays_prior_scale": {
+                "domain": tune.loguniform(lower=0.01, upper=100),
+                "init_value": 1,
+            },
+            "seasonality_mode": {
+                "domain": tune.choice(["additive", "multiplicative"]),
+                "init_value": "multiplicative",
             },
-            'seasonality_mode': {
-                'domain': tune.choice(['additive', 'multiplicative']),
-                'init_value': 'multiplicative',
-            }
         }
         return space
 
-    def __init__(self, task='forecast', **params):
-        if 'n_jobs' in params:
-            params.pop('n_jobs')
+    def __init__(self, task="forecast", **params):
+        if "n_jobs" in params:
+            params.pop("n_jobs")
         super().__init__(task, **params)
 
     def _join(self, X_train, y_train):
-        assert 'ds' in X_train, (
-            'Dataframe for training forecast model must have column'
-            ' "ds" with the dates in X_train.')
-        y_train = pd.DataFrame(y_train, columns=['y'])
+        assert "ds" in X_train, (
+            "Dataframe for training forecast model must have column"
+            ' "ds" with the dates in X_train.'
+        )
+        y_train = pd.DataFrame(y_train, columns=["y"])
         train_df = X_train.join(y_train)
         return train_df
 
     def fit(self, X_train, y_train, budget=None, **kwargs):
         from prophet import Prophet
+
         current_time = time.time()
         train_df = self._join(X_train, y_train)
         model = Prophet(**self.params).fit(train_df)
@@ -902,54 +973,60 @@
         if isinstance(X_test, int):
             raise ValueError(
                 "predict() with steps is only supported for arima/sarimax."
-                " For FBProphet, pass a dataframe with a date colum named ds.")
+                " For Prophet, pass a dataframe with a date column named ds."
+            )
         if self._model is not None:
             forecast = self._model.predict(X_test)
-            return forecast['yhat']
+            return forecast["yhat"]
         else:
             logger.warning(
-                "Estimator is not fit yet. Please run fit() before predict().")
+                "Estimator is not fit yet. Please run fit() before predict()."
+            )
             return np.ones(X_test.shape[0])
 
 
-class ARIMA(FBProphet):
+class ARIMA(Prophet):
     @classmethod
     def search_space(cls, **params):
         space = {
-            'p': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 2,
-                'low_cost_init_value': 0,
+            "p": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 2,
+                "low_cost_init_value": 0,
             },
-            'd': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 2,
-                'low_cost_init_value': 0,
+            "d": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 2,
+                "low_cost_init_value": 0,
+            },
+            "q": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 2,
+                "low_cost_init_value": 0,
             },
-            'q': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 2,
-                'low_cost_init_value': 0,
-            }
         }
         return space
 
     def _join(self, X_train, y_train):
         train_df = super()._join(X_train, y_train)
-        train_df.index = pd.to_datetime(train_df['ds'])
-        train_df = train_df.drop('ds', axis=1)
+        train_df.index = pd.to_datetime(train_df["ds"])
+        train_df = train_df.drop("ds", axis=1)
         return train_df
 
     def fit(self, X_train, y_train, budget=None, **kwargs):
         import warnings
+
         warnings.filterwarnings("ignore")
         from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
+
         current_time = time.time()
         train_df = self._join(X_train, y_train)
         model = ARIMA_estimator(
-            train_df, order=(
-                self.params['p'], self.params['d'], self.params['q']),
-            enforce_stationarity=False, enforce_invertibility=False)
+            train_df,
+            order=(self.params["p"], self.params["d"], self.params["q"]),
+            enforce_stationarity=False,
+            enforce_invertibility=False,
+        )
         model = model.fit()
         train_time = time.time() - current_time
         self._model = model
@@ -966,65 +1043,71 @@ class ARIMA(FBProphet):
         else:
             raise ValueError(
                 "X_test needs to be either a pd.DataFrame with dates as column ds"
-                " or an int number of periods for predict().")
+                " or an int number of periods for predict()."
+            )
             return forecast
         else:
-            return np.ones(X_test if isinstance(X_test, int)
-                           else X_test.shape[0])
+            return np.ones(X_test if isinstance(X_test, int) else X_test.shape[0])
 
 
 class SARIMAX(ARIMA):
     @classmethod
     def search_space(cls, **params):
         space = {
-            'p': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 2,
-                'low_cost_init_value': 0,
+            "p": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 2,
+                "low_cost_init_value": 0,
             },
-            'd': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 2,
-                'low_cost_init_value': 0,
+            "d": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 2,
+                "low_cost_init_value": 0,
             },
-            'q': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 2,
-                'low_cost_init_value': 0,
+            "q": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 2,
+                "low_cost_init_value": 0,
             },
-            'P': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 1,
-                'low_cost_init_value': 0,
+            "P": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 1,
+                "low_cost_init_value": 0,
             },
-            'D': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 1,
-                'low_cost_init_value': 0,
+            "D": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 1,
+                "low_cost_init_value": 0,
             },
-            'Q': {
-                'domain': tune.quniform(lower=0, upper=10, q=1),
-                'init_value': 1,
-                'low_cost_init_value': 0,
+            "Q": {
+                "domain": tune.quniform(lower=0, upper=10, q=1),
+                "init_value": 1,
+                "low_cost_init_value": 0,
+            },
+            "s": {
+                "domain": tune.choice([1, 4, 6, 12]),
+                "init_value": 12,
             },
-            's': {
-                'domain': tune.choice([1, 4, 6, 12]),
-                'init_value': 12,
-            }
         }
         return space
 
     def fit(self, X_train, y_train, budget=None, **kwargs):
         from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator
+
         current_time = time.time()
         train_df = self._join(X_train, y_train)
         model = SARIMAX_estimator(
-            train_df, order=(
-                self.params['p'], self.params['d'], self.params['q']),
+            train_df,
+            order=(self.params["p"], self.params["d"], self.params["q"]),
             seasonal_order=(
-                self.params['P'], self.params['D'], self.params['Q'],
-                self.params['s']),
-            enforce_stationarity=False, enforce_invertibility=False)
+                self.params["P"],
+                self.params["D"],
+                self.params["Q"],
+                self.params["s"],
+            ),
+            enforce_stationarity=False,
+            enforce_invertibility=False,
+        )
         model = model.fit()
         train_time = time.time() - current_time
         self._model = model
diff --git a/flaml/version.py b/flaml/version.py
index 63af88769..364e7baed 100644
--- a/flaml/version.py
+++ b/flaml/version.py
@@ -1 +1 @@
-__version__ = "0.6.3"
+__version__ = "0.6.4"
diff --git a/setup.py b/setup.py
index 73757e85b..f17e675e0 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ setuptools.setup(
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/microsoft/FLAML",
-    packages=setuptools.find_packages(),
+    packages=setuptools.find_packages(include=["flaml*"]),
     install_requires=install_requires,
     extras_require={
         "notebook": [
diff --git a/test/test_forecast.py b/test/test_forecast.py
index 8ca2b6059..0f7642d66 100644
--- a/test/test_forecast.py
+++ b/test/test_forecast.py
@@ -30,9 +30,11 @@ def test_forecast_automl(budget=5):
     }
     """The main flaml automl API"""
    try:
+        import prophet
+
         automl.fit(dataframe=df, **settings, period=time_horizon)
     except ImportError:
-        print("not using FBProphet due to ImportError")
+        print("not using prophet due to ImportError")
         automl.fit(
             dataframe=df,
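+            # prophet is unavailable, so this retry runs with the arima/sarimax estimators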
**settings, @@ -79,7 +81,7 @@ def test_forecast_automl(budget=5): try: automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon) except ImportError: - print("not using FBProphet due to ImportError") + print("not using prophet due to ImportError") automl.fit( X_train=X_train, y_train=y_train, @@ -94,6 +96,8 @@ def test_numpy(): y_train = np.random.random(size=72) automl = AutoML() try: + import prophet + automl.fit( X_train=X_train[:60], # a single column of timestamp y_train=y_train, # value for each timestamp @@ -105,9 +109,9 @@ def test_numpy(): print(automl.predict(X_train[60:])) print(automl.predict(12)) except ValueError: - print("ValueError for FBProphet is raised as expected.") + print("ValueError for prophet is raised as expected.") except ImportError: - print("not using FBProphet due to ImportError") + print("not using prophet due to ImportError") automl = AutoML() automl.fit( X_train=X_train[:72], # a single column of timestamp
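
The forecast estimators above share one data convention: `_join` expects `X_train` to carry a `ds` timestamp column and attaches `y_train` as a `y` column, and while `arima`/`sarimax` accept an int number of periods in `predict()`, `prophet` requires a dataframe of future `ds` values. A minimal sketch of that convention through the public `AutoML` API, as exercised in `test_forecast.py` above (the toy series, frequency, and budget are illustrative, not from this diff):

```python
import numpy as np
import pandas as pd
from flaml import AutoML

# toy monthly series: X holds a single "ds" timestamp column, y the values
X_train = pd.DataFrame({"ds": pd.date_range("2015-01-01", periods=72, freq="MS")})
y_train = pd.Series(np.random.random(72), name="y")

automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="forecast",
    time_budget=10,  # seconds
    period=12,  # forecast horizon used for model selection
)

# predicting from future "ds" values works for every forecast estimator;
# automl.predict(12) would raise ValueError if the best model is prophet
X_test = pd.DataFrame({"ds": pd.date_range("2021-01-01", periods=12, freq="MS")})
print(automl.predict(X_test))
```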