'''!
 * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''

from datetime import datetime

import numpy as np
import pandas as pd
from scipy.sparse import issparse, vstack

from .training_log import training_log_reader


def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
                        dataset_format='dataframe'):
    '''Load dataset from OpenML.

    If the file is not cached locally, download it from OpenML.

    Args:
        dataset_id: An integer of the dataset id in OpenML.
        data_dir: A string of the path to store and load the data.
        random_state: An integer of the random seed for splitting data.
        dataset_format: A string specifying the format of the returned dataset.
            Default is 'dataframe'. Can choose from ['dataframe', 'array'].
            If 'dataframe', the returned dataset is a pandas DataFrame.
            If 'array', the returned dataset is a NumPy array or a SciPy
            sparse matrix.

    Returns:
        X_train: Training data.
        X_test: Test data.
        y_train: A series or array of labels for the training data.
        y_test: A series or array of labels for the test data.
    '''
    import os
    import pickle

    import openml
    from sklearn.model_selection import train_test_split

    filename = 'openml_ds' + str(dataset_id) + '.pkl'
    filepath = os.path.join(data_dir, filename)
    if os.path.isfile(filepath):
        # Reuse the locally cached (pickled) dataset if it exists.
        print('load dataset from', filepath)
        with open(filepath, 'rb') as f:
            dataset = pickle.load(f)
    else:
        # Otherwise download from OpenML and cache it for later runs.
        print('download dataset from openml')
        dataset = openml.datasets.get_dataset(dataset_id)
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        with open(filepath, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    print('Dataset name:', dataset.name)
    X, y, *__ = dataset.get_data(
        target=dataset.default_target_attribute, dataset_format=dataset_format)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    print(
        'X_train.shape: {}, y_train.shape: {};\nX_test.shape: {}, y_test.shape: {}'.format(
            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
        )
    )
    return X_train, X_test, y_train, y_test
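
# A minimal usage sketch (commented out so nothing runs on import; the
# dataset id and cache directory are illustrative assumptions, not values
# mandated by this module):
#
#     X_train, X_test, y_train, y_test = load_openml_dataset(
#         dataset_id=1169, data_dir='./data')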


def load_openml_task(task_id, data_dir):
    '''Load task from OpenML.

    Use the first fold of the task.
    If the file is not cached locally, download it from OpenML.

    Args:
        task_id: An integer of the task id in OpenML.
        data_dir: A string of the path to store and load the data.

    Returns:
        X_train: A dataframe of training data.
        X_test: A dataframe of test data.
        y_train: A series of labels for the training data.
        y_test: A series of labels for the test data.
    '''
    import os
    import pickle

    import openml

    task = openml.tasks.get_task(task_id)
    filename = 'openml_task' + str(task_id) + '.pkl'
    filepath = os.path.join(data_dir, filename)
    if os.path.isfile(filepath):
        print('load dataset from', filepath)
        with open(filepath, 'rb') as f:
            dataset = pickle.load(f)
    else:
        print('download dataset from openml')
        dataset = task.get_dataset()
        # Create the cache directory if needed (mirrors load_openml_dataset).
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        with open(filepath, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    X, y, _, _ = dataset.get_data(task.target_name)
    # Split according to the task's predefined first repeat/fold/sample.
    train_indices, test_indices = task.get_train_test_split_indices(
        repeat=0,
        fold=0,
        sample=0,
    )
    X_train = X.iloc[train_indices]
    y_train = y[train_indices]
    X_test = X.iloc[test_indices]
    y_test = y[test_indices]
    print(
        'X_train.shape: {}, y_train.shape: {},\nX_test.shape: {}, y_test.shape: {}'.format(
            X_train.shape, y_train.shape, X_test.shape, y_test.shape,
        )
    )
    return X_train, X_test, y_train, y_test
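
# A minimal usage sketch (commented out; the task id is an illustrative
# assumption, and the task's predefined split is downloaded on first use):
#
#     X_train, X_test, y_train, y_test = load_openml_task(
#         task_id=7592, data_dir='./data')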


def get_output_from_log(filename, time_budget):
    '''Get output from a log file.

    Args:
        filename: A string of the log file name.
        time_budget: A float of the time budget in seconds.

    Returns:
        search_time_list: A list of the finished time of each logged iteration.
        best_error_list: A list of the best validation error after each
            logged iteration.
        error_list: A list of the validation error of each logged iteration.
        config_list: A list of the estimator, sample size and config of each
            logged iteration.
        logged_metric_list: A list of the logged metric of each logged iteration.
    '''
    best_config = None
    best_learner = None
    best_val_loss = float('+inf')

    search_time_list = []
    config_list = []
    best_error_list = []
    error_list = []
    logged_metric_list = []
    best_config_list = []
    with training_log_reader(filename) as reader:
        for record in reader.records():
            time_used = record.total_search_time
            val_loss = record.validation_loss
            config = record.config
            learner = record.learner.split('_')[0]
            sample_size = record.sample_size
            train_loss = record.logged_metric

            # Only keep iterations that finished within the time budget,
            # tracking the running best validation loss among them.
            if time_used < time_budget:
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_config = config
                    best_learner = learner
                    best_config_list.append(best_config)
                search_time_list.append(time_used)
                best_error_list.append(best_val_loss)
                logged_metric_list.append(train_loss)
                error_list.append(val_loss)
                config_list.append({"Current Learner": learner,
                                    "Current Sample": sample_size,
                                    "Current Hyper-parameters": record.config,
                                    "Best Learner": best_learner,
                                    "Best Hyper-parameters": best_config})

    return (search_time_list, best_error_list, error_list, config_list,
            logged_metric_list)
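
# A minimal plotting sketch (commented out; assumes matplotlib is installed,
# a log file 'flaml.log' exists, and the logged loss is 1 - accuracy so that
# 1 - loss reads as validation accuracy -- all illustrative assumptions):
#
#     import matplotlib.pyplot as plt
#     time_history, best_valid_loss_history, _, _, _ = get_output_from_log(
#         filename='flaml.log', time_budget=60)
#     plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')
#     plt.xlabel('Wall clock time (s)')
#     plt.ylabel('Validation accuracy')
#     plt.show()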


def concat(X1, X2):
    '''Concatenate two matrices vertically.
    '''
    if isinstance(X1, (pd.DataFrame, pd.Series)):
        df = pd.concat([X1, X2], sort=False)
        df.reset_index(drop=True, inplace=True)
        if isinstance(X1, pd.DataFrame):
            # pd.concat can downcast category columns to object when the two
            # frames' categories differ, so restore the category dtype.
            cat_columns = X1.select_dtypes(include='category').columns
            if len(cat_columns):
                df[cat_columns] = df[cat_columns].astype('category')
        return df
    if issparse(X1):
        return vstack((X1, X2))
    else:
        return np.concatenate([X1, X2])
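
# A minimal sketch of concat's type dispatch (commented out; values are
# illustrative):
#
#     concat(pd.Series([1, 2]), pd.Series([3]))   # pandas Series of length 3
#     concat(np.ones((2, 2)), np.zeros((1, 2)))   # (3, 2) NumPy array
#     # sparse inputs go through scipy.sparse.vstack instead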


class DataTransformer:
    '''Transform X, y.
    '''

    def fit_transform(self, X, y, task):
        if isinstance(X, pd.DataFrame):
            X = X.copy()
            n = X.shape[0]
            cat_columns, num_columns, datetime_columns = [], [], []
            drop = False
            for column in X.columns:
                # sklearn/utils/validation.py needs int/float values
                if X[column].dtype.name in ('object', 'category'):
                    # Drop a column if it is constant, or if every non-null
                    # value is unique (e.g., an ID-like column).
                    if X[column].nunique() == 1 or X[column].nunique(
                            dropna=True) == n - X[column].isnull().sum():
                        X.drop(columns=column, inplace=True)
                        drop = True
                    elif X[column].dtype.name == 'category':
                        current_categories = X[column].cat.categories
                        if '__NAN__' not in current_categories:
                            X[column] = X[column].cat.add_categories(
                                '__NAN__').fillna('__NAN__')
                        cat_columns.append(column)
                    else:
                        X[column] = X[column].fillna('__NAN__')
                        cat_columns.append(column)
                else:
                    if X[column].nunique(dropna=True) < 2:
                        X.drop(columns=column, inplace=True)
                        drop = True
                    else:
                        if X[column].dtype.name == 'datetime64[ns]':
                            # Expand a datetime column into numeric calendar
                            # features, then replace it with its ordinal.
                            tmp_dt = X[column].dt
                            new_columns_dict = {
                                f'year_{column}': tmp_dt.year,
                                f'month_{column}': tmp_dt.month,
                                f'day_{column}': tmp_dt.day,
                                f'hour_{column}': tmp_dt.hour,
                                f'minute_{column}': tmp_dt.minute,
                                f'second_{column}': tmp_dt.second,
                                f'dayofweek_{column}': tmp_dt.dayofweek,
                                f'dayofyear_{column}': tmp_dt.dayofyear,
                                f'quarter_{column}': tmp_dt.quarter,
                            }
                            for new_col_name, new_col in new_columns_dict.items():
                                if new_col_name not in X.columns and \
                                        new_col.nunique(dropna=False) >= 2:
                                    X[new_col_name] = new_col
                                    num_columns.append(new_col_name)
                            X[column] = X[column].map(datetime.toordinal)
                            datetime_columns.append(column)
                            del tmp_dt
                        else:
                            X[column] = X[column].fillna(np.nan)
                            num_columns.append(column)
            X = X[cat_columns + num_columns]
            if cat_columns:
                X[cat_columns] = X[cat_columns].astype('category')
            if num_columns:
                X_num = X[num_columns]
                if drop and np.issubdtype(X_num.columns.dtype, np.integer):
                    X_num.columns = range(X_num.shape[1])
                else:
                    drop = False
                from sklearn.compose import ColumnTransformer
                from sklearn.impute import SimpleImputer
                # Median-impute missing values in the numeric columns.
                self.transformer = ColumnTransformer([(
                    'continuous',
                    SimpleImputer(missing_values=np.nan, strategy='median'),
                    X_num.columns)])
                X[num_columns] = self.transformer.fit_transform(X_num)
            self._cat_columns, self._num_columns, self._datetime_columns = \
                cat_columns, num_columns, datetime_columns
            self._drop = drop

        if task == 'regression':
            self.label_transformer = None
        else:
            # Encode class labels as integers for non-regression tasks.
            from sklearn.preprocessing import LabelEncoder
            self.label_transformer = LabelEncoder()
            y = self.label_transformer.fit_transform(y)
        return X, y

    def transform(self, X):
        X = X.copy()
        if isinstance(X, pd.DataFrame):
            cat_columns, num_columns, datetime_columns = self._cat_columns, \
                self._num_columns, self._datetime_columns
            if datetime_columns:
                # Re-derive the same calendar features as in fit_transform.
                for column in datetime_columns:
                    tmp_dt = X[column].dt
                    new_columns_dict = {
                        f'year_{column}': tmp_dt.year,
                        f'month_{column}': tmp_dt.month,
                        f'day_{column}': tmp_dt.day,
                        f'hour_{column}': tmp_dt.hour,
                        f'minute_{column}': tmp_dt.minute,
                        f'second_{column}': tmp_dt.second,
                        f'dayofweek_{column}': tmp_dt.dayofweek,
                        f'dayofyear_{column}': tmp_dt.dayofyear,
                        f'quarter_{column}': tmp_dt.quarter,
                    }
                    for new_col_name, new_col in new_columns_dict.items():
                        if new_col_name not in X.columns and \
                                new_col.nunique(dropna=False) >= 2:
                            X[new_col_name] = new_col
                    X[column] = X[column].map(datetime.toordinal)
                    del tmp_dt
            X = X[cat_columns + num_columns].copy()
            for column in cat_columns:
                if X[column].dtype.name == 'object':
                    X[column] = X[column].fillna('__NAN__')
                elif X[column].dtype.name == 'category':
                    current_categories = X[column].cat.categories
                    if '__NAN__' not in current_categories:
                        X[column] = X[column].cat.add_categories(
                            '__NAN__').fillna('__NAN__')
            if cat_columns:
                X[cat_columns] = X[cat_columns].astype('category')
            if num_columns:
                X_num = X[num_columns].fillna(np.nan)
                if self._drop:
                    X_num.columns = range(X_num.shape[1])
                X[num_columns] = self.transformer.transform(X_num)
        return X
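
# A minimal usage sketch (commented out; 'train_df', 'train_y', and 'test_df'
# are hypothetical names). Fit on training data, then reuse the fitted
# transformer on test data so both see identical columns and encodings:
#
#     transformer = DataTransformer()
#     X_train, y_train = transformer.fit_transform(train_df, train_y,
#                                                  task='classification')
#     X_test = transformer.transform(test_df)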