| 
									
										
										
										
											2021-11-06 09:37:33 -07:00
										 |  |  | # ! | 
					
						
							|  |  |  | #  * Copyright (c) Microsoft Corporation. All rights reserved. | 
					
						
							|  |  |  | #  * Licensed under the MIT License. See LICENSE file in the | 
					
						
							|  |  |  | #  * project root for license information. | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | import numpy as np | 
					
						
							|  |  |  | from scipy.sparse import vstack, issparse | 
					
						
							|  |  |  | import pandas as pd | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  | from pandas import DataFrame, Series | 
					
						
							| 
									
										
										
										
											2021-10-08 16:09:43 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | from .training_log import training_log_reader | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-04-20 17:32:58 +02:00
										 |  |  | from datetime import datetime | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  | from typing import Dict, Union, List | 
					
						
							| 
									
										
										
										
											2021-04-20 17:32:58 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-03 12:45:16 -05:00
										 |  |  | # TODO: if your task is not specified in here, define your task as an all-capitalized word | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  | SEQCLASSIFICATION = "seq-classification" | 
					
						
							| 
									
										
										
										
											2022-01-02 20:12:34 -05:00
										 |  |  | MULTICHOICECLASSIFICATION = "multichoice-classification" | 
					
						
							| 
									
										
										
										
											2022-01-03 13:44:10 -05:00
										 |  |  | TOKENCLASSIFICATION = "token-classification" | 
					
						
							| 
									
										
										
										
											2022-01-02 20:12:34 -05:00
										 |  |  | CLASSIFICATION = ( | 
					
						
							|  |  |  |     "binary", | 
					
						
							| 
									
										
										
										
											2022-03-25 17:00:08 -04:00
										 |  |  |     "multiclass", | 
					
						
							| 
									
										
										
										
											2022-01-02 20:12:34 -05:00
										 |  |  |     "classification", | 
					
						
							|  |  |  |     SEQCLASSIFICATION, | 
					
						
							|  |  |  |     MULTICHOICECLASSIFICATION, | 
					
						
							| 
									
										
										
										
											2022-01-03 13:44:10 -05:00
										 |  |  |     TOKENCLASSIFICATION, | 
					
						
							| 
									
										
										
										
											2022-01-02 20:12:34 -05:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  | SEQREGRESSION = "seq-regression" | 
					
						
							|  |  |  | REGRESSION = ("regression", SEQREGRESSION) | 
					
						
							| 
									
										
										
										
											2022-01-24 21:39:36 -05:00
										 |  |  | TS_FORECASTREGRESSION = ( | 
					
						
							|  |  |  |     "forecast", | 
					
						
							|  |  |  |     "ts_forecast", | 
					
						
							|  |  |  |     "ts_forecast_regression", | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | TS_FORECASTCLASSIFICATION = "ts_forecast_classification" | 
					
						
							|  |  |  | TS_FORECAST = ( | 
					
						
							|  |  |  |     *TS_FORECASTREGRESSION, | 
					
						
							|  |  |  |     TS_FORECASTCLASSIFICATION, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2021-10-30 12:48:57 -04:00
										 |  |  | TS_TIMESTAMP_COL = "ds" | 
					
						
							|  |  |  | TS_VALUE_COL = "y" | 
					
						
							| 
									
										
										
										
											2021-12-03 12:45:16 -05:00
										 |  |  | SUMMARIZATION = "summarization" | 
					
						
							|  |  |  | NLG_TASKS = (SUMMARIZATION,) | 
					
						
							|  |  |  | NLU_TASKS = ( | 
					
						
							|  |  |  |     SEQREGRESSION, | 
					
						
							|  |  |  |     SEQCLASSIFICATION, | 
					
						
							| 
									
										
										
										
											2022-01-02 20:12:34 -05:00
										 |  |  |     MULTICHOICECLASSIFICATION, | 
					
						
							| 
									
										
										
										
											2022-01-03 13:44:10 -05:00
										 |  |  |     TOKENCLASSIFICATION, | 
					
						
							| 
									
										
										
										
											2021-12-03 12:45:16 -05:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2021-10-08 16:09:43 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-23 14:26:39 -05:00
										 |  |  | def _is_nlp_task(task): | 
					
						
							| 
									
										
										
										
											2021-12-20 17:19:32 -05:00
										 |  |  |     if task in NLU_TASKS or task in NLG_TASKS: | 
					
						
							| 
									
										
										
										
											2021-11-23 14:26:39 -05:00
										 |  |  |         return True | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  | def load_openml_dataset( | 
					
						
							|  |  |  |     dataset_id, data_dir=None, random_state=0, dataset_format="dataframe" | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     """Load dataset from open ML.
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     If the file is not cached locally, download it from open ML. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							| 
									
										
										
										
											2021-12-16 17:11:33 -08:00
										 |  |  |         dataset_id: An integer of the dataset id in openml. | 
					
						
							|  |  |  |         data_dir: A string of the path to store and load the data. | 
					
						
							|  |  |  |         random_state: An integer of the random seed for splitting data. | 
					
						
							| 
									
										
										
										
											2021-08-12 02:02:22 -04:00
										 |  |  |         dataset_format: A string specifying the format of returned dataset. Default is 'dataframe'. | 
					
						
							|  |  |  |             Can choose from ['dataframe', 'array']. | 
					
						
							|  |  |  |             If 'dataframe', the returned dataset will be a Pandas DataFrame. | 
					
						
							|  |  |  |             If 'array', the returned dataset will be a NumPy array or a SciPy sparse matrix. | 
					
						
							| 
									
										
										
										
											2021-12-16 17:11:33 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     Returns: | 
					
						
							| 
									
										
										
										
											2021-12-16 17:11:33 -08:00
										 |  |  |         X_train: Training data. | 
					
						
							|  |  |  |         X_test:  Test data. | 
					
						
							|  |  |  |         y_train: A series or array of labels for training data. | 
					
						
							|  |  |  |         y_test:  A series or array of labels for test data. | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     import os | 
					
						
							|  |  |  |     import openml | 
					
						
							|  |  |  |     import pickle | 
					
						
							|  |  |  |     from sklearn.model_selection import train_test_split | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     filename = "openml_ds" + str(dataset_id) + ".pkl" | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     filepath = os.path.join(data_dir, filename) | 
					
						
							|  |  |  |     if os.path.isfile(filepath): | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |         print("load dataset from", filepath) | 
					
						
							|  |  |  |         with open(filepath, "rb") as f: | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             dataset = pickle.load(f) | 
					
						
							|  |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |         print("download dataset from openml") | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |         dataset = openml.datasets.get_dataset(dataset_id) | 
					
						
							|  |  |  |         if not os.path.exists(data_dir): | 
					
						
							|  |  |  |             os.makedirs(data_dir) | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |         with open(filepath, "wb") as f: | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     print("Dataset name:", dataset.name) | 
					
						
							|  |  |  |     X, y, *__ = dataset.get_data( | 
					
						
							|  |  |  |         target=dataset.default_target_attribute, dataset_format=dataset_format | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     print( | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |         "X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}".format( | 
					
						
							|  |  |  |             X_train.shape, | 
					
						
							|  |  |  |             y_train.shape, | 
					
						
							|  |  |  |             X_test.shape, | 
					
						
							|  |  |  |             y_test.shape, | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |         ) | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     return X_train, X_test, y_train, y_test | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def load_openml_task(task_id, data_dir): | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     """Load task from open ML.
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-04-08 09:29:55 -07:00
										 |  |  |     Use the first fold of the task. | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     If the file is not cached locally, download it from open ML. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							| 
									
										
										
										
											2021-12-16 17:11:33 -08:00
										 |  |  |         task_id: An integer of the task id in openml. | 
					
						
							|  |  |  |         data_dir: A string of the path to store and load the data. | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     Returns: | 
					
						
							| 
									
										
										
										
											2021-12-16 17:11:33 -08:00
										 |  |  |         X_train: A dataframe of training data. | 
					
						
							|  |  |  |         X_test:  A dataframe of test data. | 
					
						
							|  |  |  |         y_train: A series of labels for training data. | 
					
						
							|  |  |  |         y_test:  A series of labels for test data. | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     import os | 
					
						
							|  |  |  |     import openml | 
					
						
							|  |  |  |     import pickle | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     task = openml.tasks.get_task(task_id) | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     filename = "openml_task" + str(task_id) + ".pkl" | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     filepath = os.path.join(data_dir, filename) | 
					
						
							|  |  |  |     if os.path.isfile(filepath): | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |         print("load dataset from", filepath) | 
					
						
							|  |  |  |         with open(filepath, "rb") as f: | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             dataset = pickle.load(f) | 
					
						
							|  |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |         print("download dataset from openml") | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |         dataset = task.get_dataset() | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |         with open(filepath, "wb") as f: | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) | 
					
						
							| 
									
										
										
										
											2021-07-20 17:00:44 -07:00
										 |  |  |     X, y, _, _ = dataset.get_data(task.target_name) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     train_indices, test_indices = task.get_train_test_split_indices( | 
					
						
							|  |  |  |         repeat=0, | 
					
						
							|  |  |  |         fold=0, | 
					
						
							|  |  |  |         sample=0, | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-07-20 17:00:44 -07:00
										 |  |  |     X_train = X.iloc[train_indices] | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     y_train = y[train_indices] | 
					
						
							| 
									
										
										
										
											2021-07-20 17:00:44 -07:00
										 |  |  |     X_test = X.iloc[test_indices] | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     y_test = y[test_indices] | 
					
						
							|  |  |  |     print( | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |         "X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}".format( | 
					
						
							|  |  |  |             X_train.shape, | 
					
						
							|  |  |  |             y_train.shape, | 
					
						
							|  |  |  |             X_test.shape, | 
					
						
							|  |  |  |             y_test.shape, | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |         ) | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     return X_train, X_test, y_train, y_test | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_output_from_log(filename, time_budget): | 
					
						
							| 
									
										
										
										
											2021-12-16 17:11:33 -08:00
										 |  |  |     """Get output from log file.
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							| 
									
										
										
										
											2021-11-06 09:37:33 -07:00
										 |  |  |         filename: A string of the log file name. | 
					
						
							|  |  |  |         time_budget: A float of the time budget in seconds. | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     Returns: | 
					
						
							| 
									
										
										
										
											2021-11-06 09:37:33 -07:00
										 |  |  |         search_time_list: A list of the finished time of each logged iter. | 
					
						
							|  |  |  |         best_error_list: A list of the best validation error after each logged iter. | 
					
						
							|  |  |  |         error_list: A list of the validation error of each logged iter. | 
					
						
							|  |  |  |         config_list: A list of the estimator, sample size and config of each logged iter. | 
					
						
							|  |  |  |         logged_metric_list: A list of the logged metric of each logged iter. | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     best_config = None | 
					
						
							|  |  |  |     best_learner = None | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     best_val_loss = float("+inf") | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-23 16:26:46 -04:00
										 |  |  |     search_time_list = [] | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |     config_list = [] | 
					
						
							|  |  |  |     best_error_list = [] | 
					
						
							|  |  |  |     error_list = [] | 
					
						
							|  |  |  |     logged_metric_list = [] | 
					
						
							|  |  |  |     best_config_list = [] | 
					
						
							|  |  |  |     with training_log_reader(filename) as reader: | 
					
						
							|  |  |  |         for record in reader.records(): | 
					
						
							| 
									
										
										
										
											2021-08-23 19:36:51 -04:00
										 |  |  |             time_used = record.wall_clock_time | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             val_loss = record.validation_loss | 
					
						
							|  |  |  |             config = record.config | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |             learner = record.learner.split("_")[0] | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             sample_size = record.sample_size | 
					
						
							| 
									
										
										
										
											2021-09-01 16:25:04 -07:00
										 |  |  |             metric = record.logged_metric | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-23 19:36:51 -04:00
										 |  |  |             if time_used < time_budget and np.isfinite(val_loss): | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                 if val_loss < best_val_loss: | 
					
						
							|  |  |  |                     best_val_loss = val_loss | 
					
						
							|  |  |  |                     best_config = config | 
					
						
							|  |  |  |                     best_learner = learner | 
					
						
							|  |  |  |                     best_config_list.append(best_config) | 
					
						
							| 
									
										
										
										
											2021-08-23 16:26:46 -04:00
										 |  |  |                 search_time_list.append(time_used) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                 best_error_list.append(best_val_loss) | 
					
						
							| 
									
										
										
										
											2021-09-01 16:25:04 -07:00
										 |  |  |                 logged_metric_list.append(metric) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                 error_list.append(val_loss) | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                 config_list.append( | 
					
						
							|  |  |  |                     { | 
					
						
							|  |  |  |                         "Current Learner": learner, | 
					
						
							|  |  |  |                         "Current Sample": sample_size, | 
					
						
							|  |  |  |                         "Current Hyper-parameters": record.config, | 
					
						
							|  |  |  |                         "Best Learner": best_learner, | 
					
						
							|  |  |  |                         "Best Hyper-parameters": best_config, | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |     return ( | 
					
						
							|  |  |  |         search_time_list, | 
					
						
							|  |  |  |         best_error_list, | 
					
						
							|  |  |  |         error_list, | 
					
						
							|  |  |  |         config_list, | 
					
						
							|  |  |  |         logged_metric_list, | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def concat(X1, X2): | 
					
						
							| 
									
										
										
										
											2021-12-16 17:11:33 -08:00
										 |  |  |     """concatenate two matrices vertically.""" | 
					
						
							| 
									
										
										
										
											2021-11-28 10:14:25 -08:00
										 |  |  |     if isinstance(X1, (DataFrame, Series)): | 
					
						
							| 
									
										
										
										
											2021-02-05 21:41:14 -08:00
										 |  |  |         df = pd.concat([X1, X2], sort=False) | 
					
						
							|  |  |  |         df.reset_index(drop=True, inplace=True) | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  |         if isinstance(X1, DataFrame): | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |             cat_columns = X1.select_dtypes(include="category").columns | 
					
						
							| 
									
										
										
										
											2021-02-05 21:41:14 -08:00
										 |  |  |             if len(cat_columns): | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                 df[cat_columns] = df[cat_columns].astype("category") | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |         return df | 
					
						
							|  |  |  |     if issparse(X1): | 
					
						
							|  |  |  |         return vstack((X1, X2)) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         return np.concatenate([X1, X2]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class DataTransformer: | 
					
						
							| 
									
										
										
										
											2021-11-06 12:44:10 -07:00
										 |  |  |     """Transform input training data.""" | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  |     def fit_transform(self, X: Union[DataFrame, np.array], y, task): | 
					
						
							| 
									
										
										
										
											2021-11-06 09:37:33 -07:00
										 |  |  |         """Fit transformer and process the input training data according to the task type.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Args: | 
					
						
							|  |  |  |             X: A numpy array or a pandas dataframe of training data. | 
					
						
							|  |  |  |             y: A numpy array or a pandas series of labels. | 
					
						
							|  |  |  |             task: A string of the task type, e.g., | 
					
						
							|  |  |  |                 'classification', 'regression', 'ts_forecast', 'rank'. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Returns: | 
					
						
							|  |  |  |             X: Processed numpy array or pandas dataframe of training data. | 
					
						
							|  |  |  |             y: Processed numpy array or pandas series of labels. | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  |         if _is_nlp_task(task): | 
					
						
							|  |  |  |             # if the mode is NLP, check the type of input, each column must be either string or | 
					
						
							|  |  |  |             # ids (input ids, token type id, attention mask, etc.) | 
					
						
							|  |  |  |             str_columns = [] | 
					
						
							|  |  |  |             for column in X.columns: | 
					
						
							|  |  |  |                 if isinstance(X[column].iloc[0], str): | 
					
						
							|  |  |  |                     str_columns.append(column) | 
					
						
							|  |  |  |             if len(str_columns) > 0: | 
					
						
							|  |  |  |                 X[str_columns] = X[str_columns].astype("string") | 
					
						
							|  |  |  |             self._str_columns = str_columns | 
					
						
							|  |  |  |         elif isinstance(X, DataFrame): | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             X = X.copy() | 
					
						
							|  |  |  |             n = X.shape[0] | 
					
						
							| 
									
										
										
										
											2021-04-21 16:22:54 +02:00
										 |  |  |             cat_columns, num_columns, datetime_columns = [], [], [] | 
					
						
							| 
									
										
										
										
											2021-03-19 09:50:47 -07:00
										 |  |  |             drop = False | 
					
						
							| 
									
										
										
										
											2022-01-24 21:39:36 -05:00
										 |  |  |             if task in TS_FORECAST: | 
					
						
							| 
									
										
										
										
											2021-10-30 12:48:57 -04:00
										 |  |  |                 X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL}) | 
					
						
							|  |  |  |                 ds_col = X.pop(TS_TIMESTAMP_COL) | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  |                 if isinstance(y, Series): | 
					
						
							| 
									
										
										
										
											2021-10-30 12:48:57 -04:00
										 |  |  |                     y = y.rename(TS_VALUE_COL) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             for column in X.columns: | 
					
						
							| 
									
										
										
										
											2021-04-20 17:32:58 +02:00
										 |  |  |                 # sklearn\utils\validation.py needs int/float values | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                 if X[column].dtype.name in ("object", "category"): | 
					
						
							|  |  |  |                     if ( | 
					
						
							|  |  |  |                         X[column].nunique() == 1 | 
					
						
							|  |  |  |                         or X[column].nunique(dropna=True) | 
					
						
							|  |  |  |                         == n - X[column].isnull().sum() | 
					
						
							|  |  |  |                     ): | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                         X.drop(columns=column, inplace=True) | 
					
						
							| 
									
										
										
										
											2021-03-19 09:50:47 -07:00
										 |  |  |                         drop = True | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                     elif X[column].dtype.name == "category": | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                         current_categories = X[column].cat.categories | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                         if "__NAN__" not in current_categories: | 
					
						
							|  |  |  |                             X[column] = ( | 
					
						
							|  |  |  |                                 X[column] | 
					
						
							|  |  |  |                                 .cat.add_categories("__NAN__") | 
					
						
							|  |  |  |                                 .fillna("__NAN__") | 
					
						
							|  |  |  |                             ) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                         cat_columns.append(column) | 
					
						
							|  |  |  |                     else: | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                         X[column] = X[column].fillna("__NAN__") | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                         cat_columns.append(column) | 
					
						
							| 
									
										
										
										
											2021-11-18 11:19:53 -08:00
										 |  |  |                 elif X[column].nunique(dropna=True) < 2: | 
					
						
							|  |  |  |                     X.drop(columns=column, inplace=True) | 
					
						
							|  |  |  |                     drop = True | 
					
						
							|  |  |  |                 else:  # datetime or numeric | 
					
						
							|  |  |  |                     if X[column].dtype.name == "datetime64[ns]": | 
					
						
							|  |  |  |                         tmp_dt = X[column].dt | 
					
						
							|  |  |  |                         new_columns_dict = { | 
					
						
							|  |  |  |                             f"year_{column}": tmp_dt.year, | 
					
						
							|  |  |  |                             f"month_{column}": tmp_dt.month, | 
					
						
							|  |  |  |                             f"day_{column}": tmp_dt.day, | 
					
						
							|  |  |  |                             f"hour_{column}": tmp_dt.hour, | 
					
						
							|  |  |  |                             f"minute_{column}": tmp_dt.minute, | 
					
						
							|  |  |  |                             f"second_{column}": tmp_dt.second, | 
					
						
							|  |  |  |                             f"dayofweek_{column}": tmp_dt.dayofweek, | 
					
						
							|  |  |  |                             f"dayofyear_{column}": tmp_dt.dayofyear, | 
					
						
							|  |  |  |                             f"quarter_{column}": tmp_dt.quarter, | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                         for key, value in new_columns_dict.items(): | 
					
						
							|  |  |  |                             if ( | 
					
						
							|  |  |  |                                 key not in X.columns | 
					
						
							|  |  |  |                                 and value.nunique(dropna=False) >= 2 | 
					
						
							|  |  |  |                             ): | 
					
						
							|  |  |  |                                 X[key] = value | 
					
						
							|  |  |  |                                 num_columns.append(key) | 
					
						
							|  |  |  |                         X[column] = X[column].map(datetime.toordinal) | 
					
						
							|  |  |  |                         datetime_columns.append(column) | 
					
						
							|  |  |  |                         del tmp_dt | 
					
						
							|  |  |  |                     X[column] = X[column].fillna(np.nan) | 
					
						
							|  |  |  |                     num_columns.append(column) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             X = X[cat_columns + num_columns] | 
					
						
							| 
									
										
										
										
											2022-01-24 21:39:36 -05:00
										 |  |  |             if task in TS_FORECAST: | 
					
						
							| 
									
										
										
										
											2021-10-30 12:48:57 -04:00
										 |  |  |                 X.insert(0, TS_TIMESTAMP_COL, ds_col) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             if cat_columns: | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                 X[cat_columns] = X[cat_columns].astype("category") | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             if num_columns: | 
					
						
							| 
									
										
										
										
											2021-03-19 09:50:47 -07:00
										 |  |  |                 X_num = X[num_columns] | 
					
						
							| 
									
										
										
										
											2021-09-01 16:25:04 -07:00
										 |  |  |                 if np.issubdtype(X_num.columns.dtype, np.integer) and ( | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                     drop | 
					
						
							|  |  |  |                     or min(X_num.columns) != 0 | 
					
						
							| 
									
										
										
										
											2021-09-01 16:25:04 -07:00
										 |  |  |                     or max(X_num.columns) != X_num.shape[1] - 1 | 
					
						
							|  |  |  |                 ): | 
					
						
							| 
									
										
										
										
											2021-03-19 09:50:47 -07:00
										 |  |  |                     X_num.columns = range(X_num.shape[1]) | 
					
						
							| 
									
										
										
										
											2021-09-01 16:25:04 -07:00
										 |  |  |                     drop = True | 
					
						
							| 
									
										
										
										
											2021-04-08 09:29:55 -07:00
										 |  |  |                 else: | 
					
						
							|  |  |  |                     drop = False | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                 from sklearn.impute import SimpleImputer | 
					
						
							|  |  |  |                 from sklearn.compose import ColumnTransformer | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  |                 self.transformer = ColumnTransformer( | 
					
						
							|  |  |  |                     [ | 
					
						
							|  |  |  |                         ( | 
					
						
							|  |  |  |                             "continuous", | 
					
						
							|  |  |  |                             SimpleImputer(missing_values=np.nan, strategy="median"), | 
					
						
							|  |  |  |                             X_num.columns, | 
					
						
							|  |  |  |                         ) | 
					
						
							|  |  |  |                     ] | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2021-03-19 09:50:47 -07:00
										 |  |  |                 X[num_columns] = self.transformer.fit_transform(X_num) | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |             self._cat_columns, self._num_columns, self._datetime_columns = ( | 
					
						
							|  |  |  |                 cat_columns, | 
					
						
							|  |  |  |                 num_columns, | 
					
						
							|  |  |  |                 datetime_columns, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2021-03-19 09:50:47 -07:00
										 |  |  |             self._drop = drop | 
					
						
							| 
									
										
										
										
											2021-12-20 17:19:32 -05:00
										 |  |  |         if ( | 
					
						
							| 
									
										
										
										
											2022-01-03 13:44:10 -05:00
										 |  |  |             (task in CLASSIFICATION or not pd.api.types.is_numeric_dtype(y)) | 
					
						
							| 
									
										
										
										
											2021-12-20 17:19:32 -05:00
										 |  |  |             and task not in NLG_TASKS | 
					
						
							| 
									
										
										
										
											2022-01-03 13:44:10 -05:00
										 |  |  |             and task != TOKENCLASSIFICATION | 
					
						
							| 
									
										
										
										
											2021-12-20 17:19:32 -05:00
										 |  |  |         ): | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             from sklearn.preprocessing import LabelEncoder | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             self.label_transformer = LabelEncoder() | 
					
						
							|  |  |  |             y = self.label_transformer.fit_transform(y) | 
					
						
							| 
									
										
										
										
											2021-09-01 16:25:04 -07:00
										 |  |  |         else: | 
					
						
							|  |  |  |             self.label_transformer = None | 
					
						
							| 
									
										
										
										
											2021-11-06 09:37:33 -07:00
										 |  |  |         self._task = task | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |         return X, y | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  |     def transform(self, X: Union[DataFrame, np.array]): | 
					
						
							| 
									
										
										
										
											2021-11-06 09:37:33 -07:00
										 |  |  |         """Process data using fit transformer.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Args: | 
					
						
							|  |  |  |             X: A numpy array or a pandas dataframe of training data. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Returns: | 
					
						
							|  |  |  |             X: Processed numpy array or pandas dataframe of training data. | 
					
						
							|  |  |  |         """
 | 
					
						
							| 
									
										
										
										
											2021-04-24 02:14:29 +02:00
										 |  |  |         X = X.copy() | 
					
						
							| 
									
										
										
										
											2021-11-16 14:06:20 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if _is_nlp_task(self._task): | 
					
						
							|  |  |  |             # if the mode is NLP, check the type of input, each column must be either string or | 
					
						
							|  |  |  |             # ids (input ids, token type id, attention mask, etc.) | 
					
						
							|  |  |  |             if len(self._str_columns) > 0: | 
					
						
							|  |  |  |                 X[self._str_columns] = X[self._str_columns].astype("string") | 
					
						
							|  |  |  |         elif isinstance(X, DataFrame): | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |             cat_columns, num_columns, datetime_columns = ( | 
					
						
							|  |  |  |                 self._cat_columns, | 
					
						
							|  |  |  |                 self._num_columns, | 
					
						
							|  |  |  |                 self._datetime_columns, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2022-01-24 21:39:36 -05:00
										 |  |  |             if self._task in TS_FORECAST: | 
					
						
							| 
									
										
										
										
											2021-10-30 12:48:57 -04:00
										 |  |  |                 X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL}) | 
					
						
							|  |  |  |                 ds_col = X.pop(TS_TIMESTAMP_COL) | 
					
						
							| 
									
										
										
										
											2021-11-18 11:19:53 -08:00
										 |  |  |             for column in datetime_columns: | 
					
						
							|  |  |  |                 tmp_dt = X[column].dt | 
					
						
							|  |  |  |                 new_columns_dict = { | 
					
						
							|  |  |  |                     f"year_{column}": tmp_dt.year, | 
					
						
							|  |  |  |                     f"month_{column}": tmp_dt.month, | 
					
						
							|  |  |  |                     f"day_{column}": tmp_dt.day, | 
					
						
							|  |  |  |                     f"hour_{column}": tmp_dt.hour, | 
					
						
							|  |  |  |                     f"minute_{column}": tmp_dt.minute, | 
					
						
							|  |  |  |                     f"second_{column}": tmp_dt.second, | 
					
						
							|  |  |  |                     f"dayofweek_{column}": tmp_dt.dayofweek, | 
					
						
							|  |  |  |                     f"dayofyear_{column}": tmp_dt.dayofyear, | 
					
						
							|  |  |  |                     f"quarter_{column}": tmp_dt.quarter, | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |                 for new_col_name, new_col_value in new_columns_dict.items(): | 
					
						
							|  |  |  |                     if new_col_name not in X.columns and new_col_name in num_columns: | 
					
						
							|  |  |  |                         X[new_col_name] = new_col_value | 
					
						
							|  |  |  |                 X[column] = X[column].map(datetime.toordinal) | 
					
						
							|  |  |  |                 del tmp_dt | 
					
						
							| 
									
										
										
										
											2021-05-25 17:30:08 +02:00
										 |  |  |             X = X[cat_columns + num_columns].copy() | 
					
						
							| 
									
										
										
										
											2022-01-24 21:39:36 -05:00
										 |  |  |             if self._task in TS_FORECAST: | 
					
						
							| 
									
										
										
										
											2021-10-30 12:48:57 -04:00
										 |  |  |                 X.insert(0, TS_TIMESTAMP_COL, ds_col) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             for column in cat_columns: | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                 if X[column].dtype.name == "object": | 
					
						
							|  |  |  |                     X[column] = X[column].fillna("__NAN__") | 
					
						
							|  |  |  |                 elif X[column].dtype.name == "category": | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |                     current_categories = X[column].cat.categories | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                     if "__NAN__" not in current_categories: | 
					
						
							|  |  |  |                         X[column] = ( | 
					
						
							|  |  |  |                             X[column].cat.add_categories("__NAN__").fillna("__NAN__") | 
					
						
							|  |  |  |                         ) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             if cat_columns: | 
					
						
							| 
									
										
										
										
											2021-09-11 21:19:18 -07:00
										 |  |  |                 X[cat_columns] = X[cat_columns].astype("category") | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |             if num_columns: | 
					
						
							| 
									
										
										
										
											2021-03-19 09:50:47 -07:00
										 |  |  |                 X_num = X[num_columns].fillna(np.nan) | 
					
						
							|  |  |  |                 if self._drop: | 
					
						
							|  |  |  |                     X_num.columns = range(X_num.shape[1]) | 
					
						
							|  |  |  |                 X[num_columns] = self.transformer.transform(X_num) | 
					
						
							| 
									
										
										
										
											2020-12-04 09:40:27 -08:00
										 |  |  |         return X | 
					
						
							| 
									
										
										
										
											2021-09-01 16:25:04 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def group_counts(groups): | 
					
						
							|  |  |  |     _, i, c = np.unique(groups, return_counts=True, return_index=True) | 
					
						
							|  |  |  |     return c[np.argsort(i)] |