mirror of
				https://github.com/microsoft/autogen.git
				synced 2025-10-31 01:40:58 +00:00 
			
		
		
		
	package name in setup (#198)
* package name * learning to rank example: close #200 * try import prophet #201
This commit is contained in:
		
							parent
							
								
									8f9f08cebc
								
							
						
					
					
						commit
						f4529dfe89
					
				
							
								
								
									
										21
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								README.md
									
									
									
									
									
								
							| @ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni. | ||||
| 
 | ||||
| ## Examples | ||||
| 
 | ||||
| - A basic classification example. | ||||
| * A basic classification example. | ||||
| 
 | ||||
| ```python | ||||
| from flaml import AutoML | ||||
| @ -99,7 +99,7 @@ print(automl.predict_proba(X_train)) | ||||
| print(automl.model) | ||||
| ``` | ||||
| 
 | ||||
| - A basic regression example. | ||||
| * A basic regression example. | ||||
| 
 | ||||
| ```python | ||||
| from flaml import AutoML | ||||
| @ -123,7 +123,7 @@ print(automl.predict(X_train)) | ||||
| print(automl.model) | ||||
| ``` | ||||
| 
 | ||||
| - Time series forecasting. | ||||
| * Time series forecasting. | ||||
| 
 | ||||
| ```python | ||||
| # pip install flaml[forecast] | ||||
| @ -141,14 +141,15 @@ automl.fit(X_train=X_train[:72],  # a single column of timestamp | ||||
| print(automl.predict(X_train[72:])) | ||||
| ``` | ||||
| 
 | ||||
| - Learning to rank. | ||||
| * Learning to rank. | ||||
| 
 | ||||
| ```python | ||||
| from sklearn.datasets import fetch_openml | ||||
| from flaml import AutoML | ||||
| X, y = fetch_openml(name="credit-g", return_X_y=True)   | ||||
| X_train, y_train = fetch_openml(name="credit-g", return_X_y=True, as_frame=False) | ||||
| y_train = y_train.cat.codes | ||||
| # not a real learning to rank dataset | ||||
| groups = [200] * 4 + [100] * 2,    # group counts | ||||
| groups = [200] * 4 + [100] * 2    # group counts | ||||
| automl = AutoML() | ||||
| automl.fit( | ||||
|     X_train, y_train, groups=groups, | ||||
| @ -207,17 +208,21 @@ pip install -e .[test,notebook] | ||||
| ``` | ||||
| 
 | ||||
| ### Docker | ||||
| 
 | ||||
| We provide a simple [Dockerfile](https://github.com/microsoft/FLAML/blob/main/Dockerfile). | ||||
| ``` | ||||
| 
 | ||||
| ```bash | ||||
| docker build git://github.com/microsoft/FLAML -t flaml-dev | ||||
| docker run -it flaml-dev | ||||
| ``` | ||||
| 
 | ||||
| ### Develop in Remote Container | ||||
| 
 | ||||
| If you use vscode, you can open the FLAML folder in a [Container](https://code.visualstudio.com/docs/remote/containers). | ||||
| We have provided the configuration in (.devcontainer)[(https://github.com/microsoft/FLAML/blob/main/.devcontainer)]. | ||||
| We have provided the configuration in [.devcontainer](https://github.com/microsoft/FLAML/blob/main/.devcontainer). | ||||
| 
 | ||||
| ### Pre-commit | ||||
| 
 | ||||
| Run `pre-commit install` to install pre-commit into your git hooks. Before you commit, run | ||||
| `pre-commit run` to check if you meet the pre-commit requirements. If you use Windows (without WSL) and can't commit after installing pre-commit, you can run `pre-commit uninstall` to uninstall the hook. In WSL or Linux this is supposed to work. | ||||
| 
 | ||||
|  | ||||
| @ -1474,7 +1474,12 @@ class AutoML: | ||||
| 
 | ||||
|         if "auto" == estimator_list: | ||||
|             if self._state.task == "forecast": | ||||
|                 estimator_list = ["fbprophet", "arima", "sarimax"] | ||||
|                 try: | ||||
|                     import prophet | ||||
| 
 | ||||
|                     estimator_list = ["prophet", "arima", "sarimax"] | ||||
|                 except ImportError: | ||||
|                     estimator_list = ["arima", "sarimax"] | ||||
|             elif self._state.task == "rank": | ||||
|                 estimator_list = ["lgbm", "xgboost"] | ||||
|             else: | ||||
|  | ||||
							
								
								
									
										228
									
								
								flaml/data.py
									
									
									
									
									
								
							
							
						
						
									
										228
									
								
								flaml/data.py
									
									
									
									
									
								
							| @ -1,7 +1,7 @@ | ||||
| '''! | ||||
| """! | ||||
|  * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. | ||||
|  * Licensed under the MIT License. | ||||
| ''' | ||||
| """ | ||||
| 
 | ||||
| import numpy as np | ||||
| from scipy.sparse import vstack, issparse | ||||
| @ -11,9 +11,10 @@ from .training_log import training_log_reader | ||||
| from datetime import datetime | ||||
| 
 | ||||
| 
 | ||||
| def load_openml_dataset(dataset_id, data_dir=None, random_state=0, | ||||
|                         dataset_format='dataframe'): | ||||
|     '''Load dataset from open ML. | ||||
| def load_openml_dataset( | ||||
|     dataset_id, data_dir=None, random_state=0, dataset_format="dataframe" | ||||
| ): | ||||
|     """Load dataset from open ML. | ||||
| 
 | ||||
|     If the file is not cached locally, download it from open ML. | ||||
| 
 | ||||
| @ -30,41 +31,43 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0, | ||||
|         X_test:  Test data | ||||
|         y_train: A series or array of labels for training data | ||||
|         y_test:  A series or array of labels for test data | ||||
|     ''' | ||||
|     """ | ||||
|     import os | ||||
|     import openml | ||||
|     import pickle | ||||
|     from sklearn.model_selection import train_test_split | ||||
| 
 | ||||
|     filename = 'openml_ds' + str(dataset_id) + '.pkl' | ||||
|     filename = "openml_ds" + str(dataset_id) + ".pkl" | ||||
|     filepath = os.path.join(data_dir, filename) | ||||
|     if os.path.isfile(filepath): | ||||
|         print('load dataset from', filepath) | ||||
|         with open(filepath, 'rb') as f: | ||||
|         print("load dataset from", filepath) | ||||
|         with open(filepath, "rb") as f: | ||||
|             dataset = pickle.load(f) | ||||
|     else: | ||||
|         print('download dataset from openml') | ||||
|         print("download dataset from openml") | ||||
|         dataset = openml.datasets.get_dataset(dataset_id) | ||||
|         if not os.path.exists(data_dir): | ||||
|             os.makedirs(data_dir) | ||||
|         with open(filepath, 'wb') as f: | ||||
|         with open(filepath, "wb") as f: | ||||
|             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) | ||||
|     print('Dataset name:', dataset.name) | ||||
|     X, y, * \ | ||||
|         __ = dataset.get_data( | ||||
|             target=dataset.default_target_attribute, dataset_format=dataset_format) | ||||
|     X_train, X_test, y_train, y_test = train_test_split( | ||||
|         X, y, random_state=random_state) | ||||
|     print("Dataset name:", dataset.name) | ||||
|     X, y, *__ = dataset.get_data( | ||||
|         target=dataset.default_target_attribute, dataset_format=dataset_format | ||||
|     ) | ||||
|     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) | ||||
|     print( | ||||
|         'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format( | ||||
|             X_train.shape, y_train.shape, X_test.shape, y_test.shape, | ||||
|         "X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}".format( | ||||
|             X_train.shape, | ||||
|             y_train.shape, | ||||
|             X_test.shape, | ||||
|             y_test.shape, | ||||
|         ) | ||||
|     ) | ||||
|     return X_train, X_test, y_train, y_test | ||||
| 
 | ||||
| 
 | ||||
| def load_openml_task(task_id, data_dir): | ||||
|     '''Load task from open ML. | ||||
|     """Load task from open ML. | ||||
| 
 | ||||
|     Use the first fold of the task. | ||||
|     If the file is not cached locally, download it from open ML. | ||||
| @ -78,21 +81,22 @@ def load_openml_task(task_id, data_dir): | ||||
|         X_test:  A dataframe of test data | ||||
|         y_train: A series of labels for training data | ||||
|         y_test:  A series of labels for test data | ||||
|     ''' | ||||
|     """ | ||||
|     import os | ||||
|     import openml | ||||
|     import pickle | ||||
| 
 | ||||
|     task = openml.tasks.get_task(task_id) | ||||
|     filename = 'openml_task' + str(task_id) + '.pkl' | ||||
|     filename = "openml_task" + str(task_id) + ".pkl" | ||||
|     filepath = os.path.join(data_dir, filename) | ||||
|     if os.path.isfile(filepath): | ||||
|         print('load dataset from', filepath) | ||||
|         with open(filepath, 'rb') as f: | ||||
|         print("load dataset from", filepath) | ||||
|         with open(filepath, "rb") as f: | ||||
|             dataset = pickle.load(f) | ||||
|     else: | ||||
|         print('download dataset from openml') | ||||
|         print("download dataset from openml") | ||||
|         dataset = task.get_dataset() | ||||
|         with open(filepath, 'wb') as f: | ||||
|         with open(filepath, "wb") as f: | ||||
|             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) | ||||
|     X, y, _, _ = dataset.get_data(task.target_name) | ||||
|     train_indices, test_indices = task.get_train_test_split_indices( | ||||
| @ -105,15 +109,18 @@ def load_openml_task(task_id, data_dir): | ||||
|     X_test = X.iloc[test_indices] | ||||
|     y_test = y[test_indices] | ||||
|     print( | ||||
|         'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format( | ||||
|             X_train.shape, y_train.shape, X_test.shape, y_test.shape, | ||||
|         "X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}".format( | ||||
|             X_train.shape, | ||||
|             y_train.shape, | ||||
|             X_test.shape, | ||||
|             y_test.shape, | ||||
|         ) | ||||
|     ) | ||||
|     return X_train, X_test, y_train, y_test | ||||
| 
 | ||||
| 
 | ||||
| def get_output_from_log(filename, time_budget): | ||||
|     '''Get output from log file | ||||
|     """Get output from log file | ||||
| 
 | ||||
|     Args: | ||||
|         filename: A string of the log file name | ||||
| @ -127,11 +134,11 @@ def get_output_from_log(filename, time_budget): | ||||
|         config_list: | ||||
|             A list of the estimator, sample size and config of each logged iter | ||||
|         logged_metric_list: A list of the logged metric of each logged iter | ||||
|     ''' | ||||
|     """ | ||||
| 
 | ||||
|     best_config = None | ||||
|     best_learner = None | ||||
|     best_val_loss = float('+inf') | ||||
|     best_val_loss = float("+inf") | ||||
| 
 | ||||
|     search_time_list = [] | ||||
|     config_list = [] | ||||
| @ -144,7 +151,7 @@ def get_output_from_log(filename, time_budget): | ||||
|             time_used = record.wall_clock_time | ||||
|             val_loss = record.validation_loss | ||||
|             config = record.config | ||||
|             learner = record.learner.split('_')[0] | ||||
|             learner = record.learner.split("_")[0] | ||||
|             sample_size = record.sample_size | ||||
|             metric = record.logged_metric | ||||
| 
 | ||||
| @ -158,27 +165,34 @@ def get_output_from_log(filename, time_budget): | ||||
|                 best_error_list.append(best_val_loss) | ||||
|                 logged_metric_list.append(metric) | ||||
|                 error_list.append(val_loss) | ||||
|                 config_list.append({"Current Learner": learner, | ||||
|                 config_list.append( | ||||
|                     { | ||||
|                         "Current Learner": learner, | ||||
|                         "Current Sample": sample_size, | ||||
|                         "Current Hyper-parameters": record.config, | ||||
|                         "Best Learner": best_learner, | ||||
|                                     "Best Hyper-parameters": best_config}) | ||||
|                         "Best Hyper-parameters": best_config, | ||||
|                     } | ||||
|                 ) | ||||
| 
 | ||||
|     return (search_time_list, best_error_list, error_list, config_list, | ||||
|             logged_metric_list) | ||||
|     return ( | ||||
|         search_time_list, | ||||
|         best_error_list, | ||||
|         error_list, | ||||
|         config_list, | ||||
|         logged_metric_list, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def concat(X1, X2): | ||||
|     '''concatenate two matrices vertically | ||||
|     ''' | ||||
|     """concatenate two matrices vertically""" | ||||
|     if isinstance(X1, pd.DataFrame) or isinstance(X1, pd.Series): | ||||
|         df = pd.concat([X1, X2], sort=False) | ||||
|         df.reset_index(drop=True, inplace=True) | ||||
|         if isinstance(X1, pd.DataFrame): | ||||
|             cat_columns = X1.select_dtypes( | ||||
|                 include='category').columns | ||||
|             cat_columns = X1.select_dtypes(include="category").columns | ||||
|             if len(cat_columns): | ||||
|                 df[cat_columns] = df[cat_columns].astype('category') | ||||
|                 df[cat_columns] = df[cat_columns].astype("category") | ||||
|         return df | ||||
|     if issparse(X1): | ||||
|         return vstack((X1, X2)) | ||||
| @ -187,8 +201,7 @@ def concat(X1, X2): | ||||
| 
 | ||||
| 
 | ||||
| class DataTransformer: | ||||
|     '''transform X, y | ||||
|     ''' | ||||
|     """transform X, y""" | ||||
| 
 | ||||
|     def fit_transform(self, X, y, task): | ||||
|         if isinstance(X, pd.DataFrame): | ||||
| @ -198,19 +211,25 @@ class DataTransformer: | ||||
|             drop = False | ||||
|             for column in X.columns: | ||||
|                 # sklearn\utils\validation.py needs int/float values | ||||
|                 if X[column].dtype.name in ('object', 'category'): | ||||
|                     if X[column].nunique() == 1 or X[column].nunique( | ||||
|                             dropna=True) == n - X[column].isnull().sum(): | ||||
|                 if X[column].dtype.name in ("object", "category"): | ||||
|                     if ( | ||||
|                         X[column].nunique() == 1 | ||||
|                         or X[column].nunique(dropna=True) | ||||
|                         == n - X[column].isnull().sum() | ||||
|                     ): | ||||
|                         X.drop(columns=column, inplace=True) | ||||
|                         drop = True | ||||
|                     elif X[column].dtype.name == 'category': | ||||
|                     elif X[column].dtype.name == "category": | ||||
|                         current_categories = X[column].cat.categories | ||||
|                         if '__NAN__' not in current_categories: | ||||
|                             X[column] = X[column].cat.add_categories( | ||||
|                                 '__NAN__').fillna('__NAN__') | ||||
|                         if "__NAN__" not in current_categories: | ||||
|                             X[column] = ( | ||||
|                                 X[column] | ||||
|                                 .cat.add_categories("__NAN__") | ||||
|                                 .fillna("__NAN__") | ||||
|                             ) | ||||
|                         cat_columns.append(column) | ||||
|                     else: | ||||
|                         X[column] = X[column].fillna('__NAN__') | ||||
|                         X[column] = X[column].fillna("__NAN__") | ||||
|                         cat_columns.append(column) | ||||
|                 else: | ||||
|                     # print(X[column].dtype.name) | ||||
| @ -218,17 +237,27 @@ class DataTransformer: | ||||
|                         X.drop(columns=column, inplace=True) | ||||
|                         drop = True | ||||
|                     else: | ||||
|                         if X[column].dtype.name == 'datetime64[ns]': | ||||
|                         if X[column].dtype.name == "datetime64[ns]": | ||||
|                             tmp_dt = X[column].dt | ||||
|                             new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month, | ||||
|                                                 f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour, | ||||
|                                                 f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second, | ||||
|                                                 f'dayofweek_{column}': tmp_dt.dayofweek, | ||||
|                                                 f'dayofyear_{column}': tmp_dt.dayofyear, | ||||
|                                                 f'quarter_{column}': tmp_dt.quarter} | ||||
|                             new_columns_dict = { | ||||
|                                 f"year_{column}": tmp_dt.year, | ||||
|                                 f"month_{column}": tmp_dt.month, | ||||
|                                 f"day_{column}": tmp_dt.day, | ||||
|                                 f"hour_{column}": tmp_dt.hour, | ||||
|                                 f"minute_{column}": tmp_dt.minute, | ||||
|                                 f"second_{column}": tmp_dt.second, | ||||
|                                 f"dayofweek_{column}": tmp_dt.dayofweek, | ||||
|                                 f"dayofyear_{column}": tmp_dt.dayofyear, | ||||
|                                 f"quarter_{column}": tmp_dt.quarter, | ||||
|                             } | ||||
|                             for new_col_name in new_columns_dict.keys(): | ||||
|                                 if new_col_name not in X.columns and \ | ||||
|                                         new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2: | ||||
|                                 if ( | ||||
|                                     new_col_name not in X.columns | ||||
|                                     and new_columns_dict.get(new_col_name).nunique( | ||||
|                                         dropna=False | ||||
|                                     ) | ||||
|                                     >= 2 | ||||
|                                 ): | ||||
|                                     X[new_col_name] = new_columns_dict.get(new_col_name) | ||||
|                                     num_columns.append(new_col_name) | ||||
|                             X[column] = X[column].map(datetime.toordinal) | ||||
| @ -239,11 +268,12 @@ class DataTransformer: | ||||
|                             num_columns.append(column) | ||||
|             X = X[cat_columns + num_columns] | ||||
|             if cat_columns: | ||||
|                 X[cat_columns] = X[cat_columns].astype('category') | ||||
|                 X[cat_columns] = X[cat_columns].astype("category") | ||||
|             if num_columns: | ||||
|                 X_num = X[num_columns] | ||||
|                 if np.issubdtype(X_num.columns.dtype, np.integer) and ( | ||||
|                     drop or min(X_num.columns) != 0 | ||||
|                     drop | ||||
|                     or min(X_num.columns) != 0 | ||||
|                     or max(X_num.columns) != X_num.shape[1] - 1 | ||||
|                 ): | ||||
|                     X_num.columns = range(X_num.shape[1]) | ||||
| @ -252,17 +282,31 @@ class DataTransformer: | ||||
|                     drop = False | ||||
|                 from sklearn.impute import SimpleImputer | ||||
|                 from sklearn.compose import ColumnTransformer | ||||
|                 self.transformer = ColumnTransformer([( | ||||
|                     'continuous', | ||||
|                     SimpleImputer(missing_values=np.nan, strategy='median'), | ||||
|                     X_num.columns)]) | ||||
| 
 | ||||
|                 self.transformer = ColumnTransformer( | ||||
|                     [ | ||||
|                         ( | ||||
|                             "continuous", | ||||
|                             SimpleImputer(missing_values=np.nan, strategy="median"), | ||||
|                             X_num.columns, | ||||
|                         ) | ||||
|                     ] | ||||
|                 ) | ||||
|                 X[num_columns] = self.transformer.fit_transform(X_num) | ||||
|             self._cat_columns, self._num_columns, self._datetime_columns = \ | ||||
|                 cat_columns, num_columns, datetime_columns | ||||
|             self._cat_columns, self._num_columns, self._datetime_columns = ( | ||||
|                 cat_columns, | ||||
|                 num_columns, | ||||
|                 datetime_columns, | ||||
|             ) | ||||
|             self._drop = drop | ||||
| 
 | ||||
|         if task in ('binary', 'multi', 'classification'): | ||||
|         if task in ( | ||||
|             "binary", | ||||
|             "multi", | ||||
|             "classification", | ||||
|         ) or not pd.api.types.is_numeric_dtype(y): | ||||
|             from sklearn.preprocessing import LabelEncoder | ||||
| 
 | ||||
|             self.label_transformer = LabelEncoder() | ||||
|             y = self.label_transformer.fit_transform(y) | ||||
|         else: | ||||
| @ -272,34 +316,46 @@ class DataTransformer: | ||||
|     def transform(self, X): | ||||
|         X = X.copy() | ||||
|         if isinstance(X, pd.DataFrame): | ||||
|             cat_columns, num_columns, datetime_columns = self._cat_columns, \ | ||||
|                 self._num_columns, self._datetime_columns | ||||
|             cat_columns, num_columns, datetime_columns = ( | ||||
|                 self._cat_columns, | ||||
|                 self._num_columns, | ||||
|                 self._datetime_columns, | ||||
|             ) | ||||
|             if datetime_columns: | ||||
|                 for column in datetime_columns: | ||||
|                     tmp_dt = X[column].dt | ||||
|                     new_columns_dict = {f'year_{column}': tmp_dt.year, f'month_{column}': tmp_dt.month, | ||||
|                                         f'day_{column}': tmp_dt.day, f'hour_{column}': tmp_dt.hour, | ||||
|                                         f'minute_{column}': tmp_dt.minute, f'second_{column}': tmp_dt.second, | ||||
|                                         f'dayofweek_{column}': tmp_dt.dayofweek, | ||||
|                                         f'dayofyear_{column}': tmp_dt.dayofyear, | ||||
|                                         f'quarter_{column}': tmp_dt.quarter} | ||||
|                     new_columns_dict = { | ||||
|                         f"year_{column}": tmp_dt.year, | ||||
|                         f"month_{column}": tmp_dt.month, | ||||
|                         f"day_{column}": tmp_dt.day, | ||||
|                         f"hour_{column}": tmp_dt.hour, | ||||
|                         f"minute_{column}": tmp_dt.minute, | ||||
|                         f"second_{column}": tmp_dt.second, | ||||
|                         f"dayofweek_{column}": tmp_dt.dayofweek, | ||||
|                         f"dayofyear_{column}": tmp_dt.dayofyear, | ||||
|                         f"quarter_{column}": tmp_dt.quarter, | ||||
|                     } | ||||
|                     for new_col_name in new_columns_dict.keys(): | ||||
|                         if new_col_name not in X.columns and \ | ||||
|                                 new_columns_dict.get(new_col_name).nunique(dropna=False) >= 2: | ||||
|                         if ( | ||||
|                             new_col_name not in X.columns | ||||
|                             and new_columns_dict.get(new_col_name).nunique(dropna=False) | ||||
|                             >= 2 | ||||
|                         ): | ||||
|                             X[new_col_name] = new_columns_dict.get(new_col_name) | ||||
|                     X[column] = X[column].map(datetime.toordinal) | ||||
|                     del tmp_dt | ||||
|             X = X[cat_columns + num_columns].copy() | ||||
|             for column in cat_columns: | ||||
|                 if X[column].dtype.name == 'object': | ||||
|                     X[column] = X[column].fillna('__NAN__') | ||||
|                 elif X[column].dtype.name == 'category': | ||||
|                 if X[column].dtype.name == "object": | ||||
|                     X[column] = X[column].fillna("__NAN__") | ||||
|                 elif X[column].dtype.name == "category": | ||||
|                     current_categories = X[column].cat.categories | ||||
|                     if '__NAN__' not in current_categories: | ||||
|                         X[column] = X[column].cat.add_categories( | ||||
|                             '__NAN__').fillna('__NAN__') | ||||
|                     if "__NAN__" not in current_categories: | ||||
|                         X[column] = ( | ||||
|                             X[column].cat.add_categories("__NAN__").fillna("__NAN__") | ||||
|                         ) | ||||
|             if cat_columns: | ||||
|                 X[cat_columns] = X[cat_columns].astype('category') | ||||
|                 X[cat_columns] = X[cat_columns].astype("category") | ||||
|             if num_columns: | ||||
|                 X_num = X[num_columns].fillna(np.nan) | ||||
|                 if self._drop: | ||||
|  | ||||
							
								
								
									
										424
									
								
								flaml/ml.py
									
									
									
									
									
								
							
							
						
						
									
										424
									
								
								flaml/ml.py
									
									
									
									
									
								
							| @ -1,65 +1,90 @@ | ||||
| '''! | ||||
|  * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved. | ||||
| """! | ||||
|  * Copyright (c) Microsoft Corporation. All rights reserved. | ||||
|  * Licensed under the MIT License. | ||||
| ''' | ||||
| """ | ||||
| 
 | ||||
| import time | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \ | ||||
|     accuracy_score, mean_absolute_error, log_loss, average_precision_score, \ | ||||
|     f1_score, mean_absolute_percentage_error, ndcg_score | ||||
| from sklearn.metrics import ( | ||||
|     mean_squared_error, | ||||
|     r2_score, | ||||
|     roc_auc_score, | ||||
|     accuracy_score, | ||||
|     mean_absolute_error, | ||||
|     log_loss, | ||||
|     average_precision_score, | ||||
|     f1_score, | ||||
|     mean_absolute_percentage_error, | ||||
|     ndcg_score, | ||||
| ) | ||||
| from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit | ||||
| from .model import ( | ||||
|     XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator, | ||||
|     LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator, | ||||
|     ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX) | ||||
|     XGBoostEstimator, | ||||
|     XGBoostSklearnEstimator, | ||||
|     RandomForestEstimator, | ||||
|     LGBMEstimator, | ||||
|     LRL1Classifier, | ||||
|     LRL2Classifier, | ||||
|     CatBoostEstimator, | ||||
|     ExtraTreeEstimator, | ||||
|     KNeighborsEstimator, | ||||
|     Prophet, | ||||
|     ARIMA, | ||||
|     SARIMAX, | ||||
| ) | ||||
| from .data import group_counts | ||||
| 
 | ||||
| import logging | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| 
 | ||||
| def get_estimator_class(task, estimator_name): | ||||
|     ''' when adding a new learner, need to add an elif branch ''' | ||||
|     """when adding a new learner, need to add an elif branch""" | ||||
| 
 | ||||
|     if 'xgboost' == estimator_name: | ||||
|         if 'regression' == task: | ||||
|     if "xgboost" == estimator_name: | ||||
|         if "regression" == task: | ||||
|             estimator_class = XGBoostEstimator | ||||
|         else: | ||||
|             estimator_class = XGBoostSklearnEstimator | ||||
|     elif 'rf' == estimator_name: | ||||
|     elif "rf" == estimator_name: | ||||
|         estimator_class = RandomForestEstimator | ||||
|     elif 'lgbm' == estimator_name: | ||||
|     elif "lgbm" == estimator_name: | ||||
|         estimator_class = LGBMEstimator | ||||
|     elif 'lrl1' == estimator_name: | ||||
|     elif "lrl1" == estimator_name: | ||||
|         estimator_class = LRL1Classifier | ||||
|     elif 'lrl2' == estimator_name: | ||||
|     elif "lrl2" == estimator_name: | ||||
|         estimator_class = LRL2Classifier | ||||
|     elif 'catboost' == estimator_name: | ||||
|     elif "catboost" == estimator_name: | ||||
|         estimator_class = CatBoostEstimator | ||||
|     elif 'extra_tree' == estimator_name: | ||||
|     elif "extra_tree" == estimator_name: | ||||
|         estimator_class = ExtraTreeEstimator | ||||
|     elif 'kneighbor' == estimator_name: | ||||
|     elif "kneighbor" == estimator_name: | ||||
|         estimator_class = KNeighborsEstimator | ||||
|     elif 'prophet' in estimator_name: | ||||
|         estimator_class = FBProphet | ||||
|     elif estimator_name == 'arima': | ||||
|     elif "prophet" in estimator_name: | ||||
|         estimator_class = Prophet | ||||
|     elif estimator_name == "arima": | ||||
|         estimator_class = ARIMA | ||||
|     elif estimator_name == 'sarimax': | ||||
|     elif estimator_name == "sarimax": | ||||
|         estimator_class = SARIMAX | ||||
|     else: | ||||
|         raise ValueError( | ||||
|             estimator_name + ' is not a built-in learner. ' | ||||
|             'Please use AutoML.add_learner() to add a customized learner.') | ||||
|             estimator_name + " is not a built-in learner. " | ||||
|             "Please use AutoML.add_learner() to add a customized learner." | ||||
|         ) | ||||
|     return estimator_class | ||||
| 
 | ||||
| 
 | ||||
| def sklearn_metric_loss_score( | ||||
|     metric_name, y_predict, y_true, labels=None, sample_weight=None, | ||||
|     metric_name, | ||||
|     y_predict, | ||||
|     y_true, | ||||
|     labels=None, | ||||
|     sample_weight=None, | ||||
|     groups=None, | ||||
| ): | ||||
|     '''Loss using the specified metric | ||||
|     """Loss using the specified metric | ||||
| 
 | ||||
|     Args: | ||||
|         metric_name: A string of the metric name, one of | ||||
| @ -76,60 +101,63 @@ def sklearn_metric_loss_score( | ||||
| 
 | ||||
|     Returns: | ||||
|         score: A float number of the loss, the lower the better. | ||||
|     ''' | ||||
|     """ | ||||
|     metric_name = metric_name.lower() | ||||
|     if 'r2' == metric_name: | ||||
|     if "r2" == metric_name: | ||||
|         score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == 'rmse': | ||||
|         score = np.sqrt(mean_squared_error( | ||||
|             y_true, y_predict, sample_weight=sample_weight)) | ||||
|     elif metric_name == 'mae': | ||||
|         score = mean_absolute_error( | ||||
|             y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == 'mse': | ||||
|         score = mean_squared_error( | ||||
|             y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == 'accuracy': | ||||
|         score = 1.0 - accuracy_score( | ||||
|             y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == 'roc_auc': | ||||
|     elif metric_name == "rmse": | ||||
|         score = np.sqrt( | ||||
|             mean_squared_error(y_true, y_predict, sample_weight=sample_weight) | ||||
|         ) | ||||
|     elif metric_name == "mae": | ||||
|         score = mean_absolute_error(y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == "mse": | ||||
|         score = mean_squared_error(y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == "accuracy": | ||||
|         score = 1.0 - accuracy_score(y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == "roc_auc": | ||||
|         score = 1.0 - roc_auc_score(y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == "roc_auc_ovr": | ||||
|         score = 1.0 - roc_auc_score( | ||||
|             y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif metric_name == 'roc_auc_ovr': | ||||
|             y_true, y_predict, sample_weight=sample_weight, multi_class="ovr" | ||||
|         ) | ||||
|     elif metric_name == "roc_auc_ovo": | ||||
|         score = 1.0 - roc_auc_score( | ||||
|             y_true, y_predict, sample_weight=sample_weight, multi_class='ovr') | ||||
|     elif metric_name == 'roc_auc_ovo': | ||||
|         score = 1.0 - roc_auc_score( | ||||
|             y_true, y_predict, sample_weight=sample_weight, multi_class='ovo') | ||||
|     elif 'log_loss' == metric_name: | ||||
|         score = log_loss( | ||||
|             y_true, y_predict, labels=labels, sample_weight=sample_weight) | ||||
|     elif 'mape' == metric_name: | ||||
|             y_true, y_predict, sample_weight=sample_weight, multi_class="ovo" | ||||
|         ) | ||||
|     elif "log_loss" == metric_name: | ||||
|         score = log_loss(y_true, y_predict, labels=labels, sample_weight=sample_weight) | ||||
|     elif "mape" == metric_name: | ||||
|         try: | ||||
|             score = mean_absolute_percentage_error( | ||||
|                 y_true, y_predict) | ||||
|             score = mean_absolute_percentage_error(y_true, y_predict) | ||||
|         except ValueError: | ||||
|             return np.inf | ||||
|     elif 'micro_f1' == metric_name: | ||||
|     elif "micro_f1" == metric_name: | ||||
|         score = 1 - f1_score( | ||||
|             y_true, y_predict, sample_weight=sample_weight, average='micro') | ||||
|     elif 'macro_f1' == metric_name: | ||||
|             y_true, y_predict, sample_weight=sample_weight, average="micro" | ||||
|         ) | ||||
|     elif "macro_f1" == metric_name: | ||||
|         score = 1 - f1_score( | ||||
|             y_true, y_predict, sample_weight=sample_weight, average='macro') | ||||
|     elif 'f1' == metric_name: | ||||
|             y_true, y_predict, sample_weight=sample_weight, average="macro" | ||||
|         ) | ||||
|     elif "f1" == metric_name: | ||||
|         score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif 'ap' == metric_name: | ||||
|     elif "ap" == metric_name: | ||||
|         score = 1 - average_precision_score( | ||||
|             y_true, y_predict, sample_weight=sample_weight) | ||||
|     elif 'ndcg' in metric_name: | ||||
|         if '@' in metric_name: | ||||
|             k = int(metric_name.split('@', 1)[-1]) | ||||
|             y_true, y_predict, sample_weight=sample_weight | ||||
|         ) | ||||
|     elif "ndcg" in metric_name: | ||||
|         if "@" in metric_name: | ||||
|             k = int(metric_name.split("@", 1)[-1]) | ||||
|             counts = group_counts(groups) | ||||
|             score = 0 | ||||
|             psum = 0 | ||||
|             for c in counts: | ||||
|                 score -= ndcg_score(np.asarray([y_true[psum:psum + c]]), | ||||
|                                     np.asarray([y_predict[psum:psum + c]]), k=k) | ||||
|                 score -= ndcg_score( | ||||
|                     np.asarray([y_true[psum : psum + c]]), | ||||
|                     np.asarray([y_predict[psum : psum + c]]), | ||||
|                     k=k, | ||||
|                 ) | ||||
|                 psum += c | ||||
|             score /= len(counts) | ||||
|             score += 1 | ||||
| @ -137,56 +165,96 @@ def sklearn_metric_loss_score( | ||||
|             score = 1 - ndcg_score([y_true], [y_predict]) | ||||
|     else: | ||||
|         raise ValueError( | ||||
|             metric_name + ' is not a built-in metric, ' | ||||
|             'currently built-in metrics are: ' | ||||
|             'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,' | ||||
|             'log_loss, mape, f1, micro_f1, macro_f1, ap. ' | ||||
|             'please pass a customized metric function to AutoML.fit(metric=func)') | ||||
|             metric_name + " is not a built-in metric, " | ||||
|             "currently built-in metrics are: " | ||||
|             "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," | ||||
|             "log_loss, mape, f1, micro_f1, macro_f1, ap. " | ||||
|             "please pass a customized metric function to AutoML.fit(metric=func)" | ||||
|         ) | ||||
|     return score | ||||
| 
 | ||||
| 
 | ||||
| def get_y_pred(estimator, X, eval_metric, obj): | ||||
|     if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj: | ||||
|     if eval_metric in ["roc_auc", "ap"] and "binary" in obj: | ||||
|         y_pred_classes = estimator.predict_proba(X) | ||||
|         y_pred = y_pred_classes[ | ||||
|             :, 1] if y_pred_classes.ndim > 1 else y_pred_classes | ||||
|     elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']: | ||||
|         y_pred = y_pred_classes[:, 1] if y_pred_classes.ndim > 1 else y_pred_classes | ||||
|     elif eval_metric in ["log_loss", "roc_auc", "roc_auc_ovr", "roc_auc_ovo"]: | ||||
|         y_pred = estimator.predict_proba(X) | ||||
|     else: | ||||
|         y_pred = estimator.predict(X) | ||||
|     return y_pred | ||||
| 
 | ||||
| 
 | ||||
| def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_test, | ||||
|                     groups_test, eval_metric, obj, labels=None, | ||||
|                     log_training_metric=False, fit_kwargs={}): | ||||
| def _eval_estimator( | ||||
|     config, | ||||
|     estimator, | ||||
|     X_train, | ||||
|     y_train, | ||||
|     X_test, | ||||
|     y_test, | ||||
|     weight_test, | ||||
|     groups_test, | ||||
|     eval_metric, | ||||
|     obj, | ||||
|     labels=None, | ||||
|     log_training_metric=False, | ||||
|     fit_kwargs={}, | ||||
| ): | ||||
|     if isinstance(eval_metric, str): | ||||
|         pred_start = time.time() | ||||
|         test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj) | ||||
|         pred_time = (time.time() - pred_start) / X_test.shape[0] | ||||
|         test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test, | ||||
|                                               labels, weight_test, groups_test) | ||||
|         test_loss = sklearn_metric_loss_score( | ||||
|             eval_metric, test_pred_y, y_test, labels, weight_test, groups_test | ||||
|         ) | ||||
|         metric_for_logging = {} | ||||
|         if log_training_metric: | ||||
|             train_pred_y = get_y_pred(estimator, X_train, eval_metric, obj) | ||||
|             metric_for_logging['train_loss'] = sklearn_metric_loss_score( | ||||
|                 eval_metric, train_pred_y, y_train, labels, | ||||
|                 fit_kwargs.get('sample_weight'), fit_kwargs.get('groups')) | ||||
|             metric_for_logging["train_loss"] = sklearn_metric_loss_score( | ||||
|                 eval_metric, | ||||
|                 train_pred_y, | ||||
|                 y_train, | ||||
|                 labels, | ||||
|                 fit_kwargs.get("sample_weight"), | ||||
|                 fit_kwargs.get("groups"), | ||||
|             ) | ||||
|     else:  # customized metric function | ||||
|         test_loss, metric_for_logging = eval_metric( | ||||
|             X_test, y_test, estimator, labels, X_train, y_train, weight_test, | ||||
|             fit_kwargs.get('sample_weight'), config, groups_test, | ||||
|             fit_kwargs.get('groups')) | ||||
|             X_test, | ||||
|             y_test, | ||||
|             estimator, | ||||
|             labels, | ||||
|             X_train, | ||||
|             y_train, | ||||
|             weight_test, | ||||
|             fit_kwargs.get("sample_weight"), | ||||
|             config, | ||||
|             groups_test, | ||||
|             fit_kwargs.get("groups"), | ||||
|         ) | ||||
|         if isinstance(metric_for_logging, dict): | ||||
|             pred_time = metric_for_logging.get('pred_time', 0) | ||||
|             pred_time = metric_for_logging.get("pred_time", 0) | ||||
|         test_pred_y = None | ||||
|         # eval_metric may return test_pred_y but not necessarily. Setting None for now. | ||||
|     return test_loss, metric_for_logging, pred_time, test_pred_y | ||||
| 
 | ||||
| 
 | ||||
| def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test, | ||||
|                   groups_test, eval_metric, obj, labels=None, budget=None, | ||||
|                   log_training_metric=False, fit_kwargs={}): | ||||
| def get_test_loss( | ||||
|     config, | ||||
|     estimator, | ||||
|     X_train, | ||||
|     y_train, | ||||
|     X_test, | ||||
|     y_test, | ||||
|     weight_test, | ||||
|     groups_test, | ||||
|     eval_metric, | ||||
|     obj, | ||||
|     labels=None, | ||||
|     budget=None, | ||||
|     log_training_metric=False, | ||||
|     fit_kwargs={}, | ||||
| ): | ||||
| 
 | ||||
|     start = time.time() | ||||
|     # if groups_test is not None: | ||||
| @ -195,16 +263,37 @@ def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_te | ||||
|     #     fit_kwargs['y_val'] = y_test | ||||
|     estimator.fit(X_train, y_train, budget, **fit_kwargs) | ||||
|     test_loss, metric_for_logging, pred_time, _ = _eval_estimator( | ||||
|         config, estimator, X_train, y_train, X_test, y_test, | ||||
|         weight_test, groups_test, eval_metric, obj, | ||||
|         labels, log_training_metric, fit_kwargs) | ||||
|         config, | ||||
|         estimator, | ||||
|         X_train, | ||||
|         y_train, | ||||
|         X_test, | ||||
|         y_test, | ||||
|         weight_test, | ||||
|         groups_test, | ||||
|         eval_metric, | ||||
|         obj, | ||||
|         labels, | ||||
|         log_training_metric, | ||||
|         fit_kwargs, | ||||
|     ) | ||||
|     train_time = time.time() - start | ||||
|     return test_loss, metric_for_logging, train_time, pred_time | ||||
| 
 | ||||
| 
 | ||||
| def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, | ||||
|                       task, eval_metric, best_val_loss, | ||||
|                       log_training_metric=False, fit_kwargs={}): | ||||
| def evaluate_model_CV( | ||||
|     config, | ||||
|     estimator, | ||||
|     X_train_all, | ||||
|     y_train_all, | ||||
|     budget, | ||||
|     kf, | ||||
|     task, | ||||
|     eval_metric, | ||||
|     best_val_loss, | ||||
|     log_training_metric=False, | ||||
|     fit_kwargs={}, | ||||
| ): | ||||
|     start_time = time.time() | ||||
|     total_val_loss = 0 | ||||
|     total_metric = None | ||||
| @ -213,7 +302,7 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, | ||||
|     valid_fold_num = total_fold_num = 0 | ||||
|     n = kf.get_n_splits() | ||||
|     X_train_split, y_train_split = X_train_all, y_train_all | ||||
|     if task in ('binary', 'multi'): | ||||
|     if task in ("binary", "multi"): | ||||
|         labels = np.unique(y_train_all) | ||||
|     else: | ||||
|         labels = None | ||||
| @ -225,8 +314,8 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, | ||||
|         groups = kf.groups | ||||
|         kf = kf.split(X_train_split, y_train_split, groups) | ||||
|         shuffle = False | ||||
|     elif isinstance(kf, TimeSeriesSplit) and task == 'forecast': | ||||
|         y_train_all = pd.DataFrame(y_train_all, columns=['y']) | ||||
|     elif isinstance(kf, TimeSeriesSplit) and task == "forecast": | ||||
|         y_train_all = pd.DataFrame(y_train_all, columns=["y"]) | ||||
|         train = X_train_all.join(y_train_all) | ||||
|         kf = kf.split(train) | ||||
|         shuffle = False | ||||
| @ -237,8 +326,8 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, | ||||
|     rng = np.random.RandomState(2020) | ||||
|     val_loss_list = [] | ||||
|     budget_per_train = budget / n | ||||
|     if 'sample_weight' in fit_kwargs: | ||||
|         weight = fit_kwargs['sample_weight'] | ||||
|     if "sample_weight" in fit_kwargs: | ||||
|         weight = fit_kwargs["sample_weight"] | ||||
|         weight_val = None | ||||
|     else: | ||||
|         weight = weight_val = None | ||||
| @ -246,37 +335,48 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, | ||||
|         if shuffle: | ||||
|             train_index = rng.permutation(train_index) | ||||
|         if isinstance(X_train_all, pd.DataFrame): | ||||
|             X_train, X_val = X_train_split.iloc[ | ||||
|                 train_index], X_train_split.iloc[val_index] | ||||
|             X_train = X_train_split.iloc[train_index] | ||||
|             X_val = X_train_split.iloc[val_index] | ||||
|         else: | ||||
|             X_train, X_val = X_train_split[ | ||||
|                 train_index], X_train_split[val_index] | ||||
|             X_train, X_val = X_train_split[train_index], X_train_split[val_index] | ||||
|         y_train, y_val = y_train_split[train_index], y_train_split[val_index] | ||||
|         estimator.cleanup() | ||||
|         if weight is not None: | ||||
|             fit_kwargs['sample_weight'], weight_val = weight[ | ||||
|                 train_index], weight[val_index] | ||||
|             fit_kwargs["sample_weight"], weight_val = ( | ||||
|                 weight[train_index], | ||||
|                 weight[val_index], | ||||
|             ) | ||||
|         if groups is not None: | ||||
|             fit_kwargs['groups'] = groups[train_index] | ||||
|             fit_kwargs["groups"] = groups[train_index] | ||||
|             groups_val = groups[val_index] | ||||
|         else: | ||||
|             groups_val = None | ||||
|         val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss( | ||||
|             config, estimator, X_train, y_train, X_val, y_val, weight_val, | ||||
|             groups_val, eval_metric, task, labels, budget_per_train, | ||||
|             log_training_metric=log_training_metric, fit_kwargs=fit_kwargs) | ||||
|             config, | ||||
|             estimator, | ||||
|             X_train, | ||||
|             y_train, | ||||
|             X_val, | ||||
|             y_val, | ||||
|             weight_val, | ||||
|             groups_val, | ||||
|             eval_metric, | ||||
|             task, | ||||
|             labels, | ||||
|             budget_per_train, | ||||
|             log_training_metric=log_training_metric, | ||||
|             fit_kwargs=fit_kwargs, | ||||
|         ) | ||||
|         if weight is not None: | ||||
|             fit_kwargs['sample_weight'] = weight | ||||
|             fit_kwargs["sample_weight"] = weight | ||||
|         valid_fold_num += 1 | ||||
|         total_fold_num += 1 | ||||
|         total_val_loss += val_loss_i | ||||
|         if log_training_metric or not isinstance(eval_metric, str): | ||||
|             if isinstance(total_metric, list): | ||||
|                 total_metric = [ | ||||
|                     total_metric[i] + v for i, v in enumerate(metric_i)] | ||||
|                 total_metric = [total_metric[i] + v for i, v in enumerate(metric_i)] | ||||
|             elif isinstance(total_metric, dict): | ||||
|                 total_metric = { | ||||
|                     k: total_metric[k] + v for k, v in metric_i.items()} | ||||
|                 total_metric = {k: total_metric[k] + v for k, v in metric_i.items()} | ||||
|             elif total_metric is not None: | ||||
|                 total_metric += metric_i | ||||
|             else: | ||||
| @ -307,35 +407,73 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf, | ||||
| 
 | ||||
| 
 | ||||
| def compute_estimator( | ||||
|     X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf, | ||||
|     config_dic, task, estimator_name, eval_method, eval_metric, | ||||
|     best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False, | ||||
|     fit_kwargs={} | ||||
|     X_train, | ||||
|     y_train, | ||||
|     X_val, | ||||
|     y_val, | ||||
|     weight_val, | ||||
|     groups_val, | ||||
|     budget, | ||||
|     kf, | ||||
|     config_dic, | ||||
|     task, | ||||
|     estimator_name, | ||||
|     eval_method, | ||||
|     eval_metric, | ||||
|     best_val_loss=np.Inf, | ||||
|     n_jobs=1, | ||||
|     estimator_class=None, | ||||
|     log_training_metric=False, | ||||
|     fit_kwargs={}, | ||||
| ): | ||||
|     estimator_class = estimator_class or get_estimator_class( | ||||
|         task, estimator_name) | ||||
|     estimator = estimator_class( | ||||
|         **config_dic, task=task, n_jobs=n_jobs) | ||||
|     if 'holdout' in eval_method: | ||||
|     estimator_class = estimator_class or get_estimator_class(task, estimator_name) | ||||
|     estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs) | ||||
|     if "holdout" in eval_method: | ||||
|         val_loss, metric_for_logging, train_time, pred_time = get_test_loss( | ||||
|             config_dic, estimator, X_train, y_train, X_val, y_val, weight_val, | ||||
|             groups_val, eval_metric, task, budget=budget, | ||||
|             log_training_metric=log_training_metric, fit_kwargs=fit_kwargs) | ||||
|             config_dic, | ||||
|             estimator, | ||||
|             X_train, | ||||
|             y_train, | ||||
|             X_val, | ||||
|             y_val, | ||||
|             weight_val, | ||||
|             groups_val, | ||||
|             eval_metric, | ||||
|             task, | ||||
|             budget=budget, | ||||
|             log_training_metric=log_training_metric, | ||||
|             fit_kwargs=fit_kwargs, | ||||
|         ) | ||||
|     else: | ||||
|         val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV( | ||||
|             config_dic, estimator, X_train, y_train, budget, kf, task, | ||||
|             eval_metric, best_val_loss, log_training_metric=log_training_metric, | ||||
|             fit_kwargs=fit_kwargs) | ||||
|             config_dic, | ||||
|             estimator, | ||||
|             X_train, | ||||
|             y_train, | ||||
|             budget, | ||||
|             kf, | ||||
|             task, | ||||
|             eval_metric, | ||||
|             best_val_loss, | ||||
|             log_training_metric=log_training_metric, | ||||
|             fit_kwargs=fit_kwargs, | ||||
|         ) | ||||
|     return estimator, val_loss, metric_for_logging, train_time, pred_time | ||||
| 
 | ||||
| 
 | ||||
| def train_estimator( | ||||
|     X_train, y_train, config_dic, task, | ||||
|     estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={} | ||||
|     X_train, | ||||
|     y_train, | ||||
|     config_dic, | ||||
|     task, | ||||
|     estimator_name, | ||||
|     n_jobs=1, | ||||
|     estimator_class=None, | ||||
|     budget=None, | ||||
|     fit_kwargs={}, | ||||
| ): | ||||
|     start_time = time.time() | ||||
|     estimator_class = estimator_class or get_estimator_class( | ||||
|         task, estimator_name) | ||||
|     estimator_class = estimator_class or get_estimator_class(task, estimator_name) | ||||
|     estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs) | ||||
|     if X_train is not None: | ||||
|         train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs) | ||||
| @ -347,14 +485,14 @@ def train_estimator( | ||||
| 
 | ||||
| def get_classification_objective(num_labels: int) -> str: | ||||
|     if num_labels == 2: | ||||
|         objective_name = 'binary' | ||||
|         objective_name = "binary" | ||||
|     else: | ||||
|         objective_name = 'multi' | ||||
|         objective_name = "multi" | ||||
|     return objective_name | ||||
| 
 | ||||
| 
 | ||||
| def norm_confusion_matrix(y_true, y_pred): | ||||
|     '''normalized confusion matrix | ||||
|     """normalized confusion matrix | ||||
| 
 | ||||
|     Args: | ||||
|         estimator: A multi-class classification estimator | ||||
| @ -363,15 +501,16 @@ def norm_confusion_matrix(y_true, y_pred): | ||||
| 
 | ||||
|     Returns: | ||||
|         A normalized confusion matrix | ||||
|     ''' | ||||
|     """ | ||||
|     from sklearn.metrics import confusion_matrix | ||||
| 
 | ||||
|     conf_mat = confusion_matrix(y_true, y_pred) | ||||
|     norm_conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis] | ||||
|     norm_conf_mat = conf_mat.astype("float") / conf_mat.sum(axis=1)[:, np.newaxis] | ||||
|     return norm_conf_mat | ||||
| 
 | ||||
| 
 | ||||
| def multi_class_curves(y_true, y_pred_proba, curve_func): | ||||
|     '''Binarize the data for multi-class tasks and produce ROC or precision-recall curves | ||||
|     """Binarize the data for multi-class tasks and produce ROC or precision-recall curves | ||||
| 
 | ||||
|     Args: | ||||
|         y_true: A numpy array or a pandas series of true labels | ||||
| @ -384,8 +523,9 @@ def multi_class_curves(y_true, y_pred_proba, curve_func): | ||||
|             curve_x[0] is an 1D array of the x coordinates of class 0 | ||||
|         The second dictionary curve_y stores the y coordinates of each curve, e.g., | ||||
|             curve_y[0] is an 1D array of the y coordinates of class 0 | ||||
|     ''' | ||||
|     """ | ||||
|     from sklearn.preprocessing import label_binarize | ||||
| 
 | ||||
|     classes = np.unique(y_true) | ||||
|     y_true_binary = label_binarize(y_true, classes=classes) | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										873
									
								
								flaml/model.py
									
									
									
									
									
								
							
							
						
						
									
										873
									
								
								flaml/model.py
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -1 +1 @@ | ||||
| __version__ = "0.6.3" | ||||
| __version__ = "0.6.4" | ||||
|  | ||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @ -32,7 +32,7 @@ setuptools.setup( | ||||
|     long_description=long_description, | ||||
|     long_description_content_type="text/markdown", | ||||
|     url="https://github.com/microsoft/FLAML", | ||||
|     packages=setuptools.find_packages(), | ||||
|     packages=setuptools.find_packages(include=["flaml*"]), | ||||
|     install_requires=install_requires, | ||||
|     extras_require={ | ||||
|         "notebook": [ | ||||
|  | ||||
| @ -30,9 +30,11 @@ def test_forecast_automl(budget=5): | ||||
|     } | ||||
|     """The main flaml automl API""" | ||||
|     try: | ||||
|         import prophet | ||||
| 
 | ||||
|         automl.fit(dataframe=df, **settings, period=time_horizon) | ||||
|     except ImportError: | ||||
|         print("not using FBProphet due to ImportError") | ||||
|         print("not using prophet due to ImportError") | ||||
|         automl.fit( | ||||
|             dataframe=df, | ||||
|             **settings, | ||||
| @ -79,7 +81,7 @@ def test_forecast_automl(budget=5): | ||||
|     try: | ||||
|         automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon) | ||||
|     except ImportError: | ||||
|         print("not using FBProphet due to ImportError") | ||||
|         print("not using prophet due to ImportError") | ||||
|         automl.fit( | ||||
|             X_train=X_train, | ||||
|             y_train=y_train, | ||||
| @ -94,6 +96,8 @@ def test_numpy(): | ||||
|     y_train = np.random.random(size=72) | ||||
|     automl = AutoML() | ||||
|     try: | ||||
|         import prophet | ||||
| 
 | ||||
|         automl.fit( | ||||
|             X_train=X_train[:60],  # a single column of timestamp | ||||
|             y_train=y_train,  # value for each timestamp | ||||
| @ -105,9 +109,9 @@ def test_numpy(): | ||||
|         print(automl.predict(X_train[60:])) | ||||
|         print(automl.predict(12)) | ||||
|     except ValueError: | ||||
|         print("ValueError for FBProphet is raised as expected.") | ||||
|         print("ValueError for prophet is raised as expected.") | ||||
|     except ImportError: | ||||
|         print("not using FBProphet due to ImportError") | ||||
|         print("not using prophet due to ImportError") | ||||
|         automl = AutoML() | ||||
|         automl.fit( | ||||
|             X_train=X_train[:72],  # a single column of timestamp | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Chi Wang
						Chi Wang