# !
#  * Copyright (c) FLAML authors. All rights reserved.
#  * Licensed under the MIT License. See LICENSE file in the
#  * project root for license information.
from contextlib import contextmanager
from functools import partial
import signal
import os
from typing import Callable, List
import numpy as np
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier, DummyRegressor
from scipy.sparse import issparse
import logging
import shutil
from pandas import DataFrame, Series, to_datetime
import sys
import math
from . import tune
from .data import (
    group_counts,
    CLASSIFICATION,
    TS_FORECASTREGRESSION,
    TS_TIMESTAMP_COL,
    TS_VALUE_COL,
    SEQCLASSIFICATION,
    SEQREGRESSION,
    TOKENCLASSIFICATION,
    SUMMARIZATION,
    NLG_TASKS,
    MULTICHOICECLASSIFICATION,
)
try:
    import psutil
except ImportError:
    psutil = None
try:
    import resource
except ImportError:
    resource = None


logger = logging.getLogger("flaml.automl")

FREE_MEM_RATIO = 0.2


def TimeoutHandler(sig, frame):
    raise TimeoutError(sig, frame)


@contextmanager
def limit_resource(memory_limit, time_limit):
    if memory_limit > 0:
        soft, hard = resource.getrlimit(resource.RLIMIT_AS)
        if soft < 0 and (hard < 0 or memory_limit <= hard) or memory_limit < soft:
            try:
                resource.setrlimit(resource.RLIMIT_AS, (int(memory_limit), hard))
            except ValueError:
                # according to https://bugs.python.org/issue40518, it's a mac-specific error
                pass
    main_thread = False
    if time_limit is not None:
        try:
            signal.signal(signal.SIGALRM, TimeoutHandler)
            signal.alarm(int(time_limit) or 1)
            main_thread = True
        except ValueError:
            pass
    try:
        yield
    finally:
        if main_thread:
            signal.alarm(0)
        if memory_limit > 0:
            resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
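

# Example usage (a minimal sketch; assumes a POSIX system where the `resource`
# module is available, and that `memory_limit` is given in bytes):
#
#     with limit_resource(memory_limit=2 * 1024**3, time_limit=60):
#         model.fit(X, y)  # raises MemoryError/TimeoutError beyond the limits
#
# Note that the SIGALRM-based time limit only takes effect in the main thread;
# elsewhere, signal.signal raises ValueError and the limit is silently skipped.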


class BaseEstimator:
    """The abstract class for all learners.

    Typical examples:
    * XGBoostEstimator: for regression.
    * XGBoostSklearnEstimator: for classification.
    * LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
        for both regression and classification.
    """

    def __init__(self, task="binary", **config):
        """Constructor.

        Args:
            task: A string of the task type, one of
                'binary', 'multiclass', 'regression', 'rank', 'seq-classification',
                'seq-regression', 'token-classification', 'multichoice-classification',
                'summarization', 'ts_forecast', 'ts_forecast_classification'.
            config: A dictionary containing the hyperparameter names and 'n_jobs' as keys.
                n_jobs is the number of parallel threads.
        """
        self._task = task
        self.params = self.config2params(config)
        self.estimator_class = self._model = None
        if "_estimator_type" in config:
            self._estimator_type = self.params.pop("_estimator_type")
        else:
            self._estimator_type = (
                "classifier" if task in CLASSIFICATION else "regressor"
            )

    def get_params(self, deep=False):
        params = self.params.copy()
        params["task"] = self._task
        if hasattr(self, "_estimator_type"):
            params["_estimator_type"] = self._estimator_type
        return params

    @property
    def classes_(self):
        return self._model.classes_

    @property
    def n_features_in_(self):
        return self._model.n_features_in_

    @property
    def model(self):
        """Trained model after fit() is called, or None before fit() is called."""
        return self._model

    @property
    def estimator(self):
        """Trained model after fit() is called, or None before fit() is called."""
        return self._model

    def _preprocess(self, X):
        return X

    def _fit(self, X_train, y_train, **kwargs):
        current_time = time.time()
        if "groups" in kwargs:
            kwargs = kwargs.copy()
            groups = kwargs.pop("groups")
            if self._task == "rank":
                kwargs["group"] = group_counts(groups)
                # groups_val = kwargs.get('groups_val')
                # if groups_val is not None:
                #     kwargs['eval_group'] = [group_counts(groups_val)]
                #     kwargs['eval_set'] = [(kwargs['X_val'], kwargs['y_val'])]
                #     kwargs['verbose'] = False
                #     del kwargs['groups_val'], kwargs['X_val'], kwargs['y_val']
        X_train = self._preprocess(X_train)
        model = self.estimator_class(**self.params)
        if logger.level == logging.DEBUG:
            # xgboost 1.6 doesn't display all the params in the model str
            logger.debug(f"flaml.model - {model} fit started with params {self.params}")
        model.fit(X_train, y_train, **kwargs)
        if logger.level == logging.DEBUG:
            logger.debug(f"flaml.model - {model} fit finished")
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def fit(self, X_train, y_train, budget=None, **kwargs):
        """Train the model from given training data.

        Args:
            X_train: A numpy array or a dataframe of training data in shape n*m.
            y_train: A numpy array or a series of labels in shape n*1.
            budget: A float of the time budget in seconds.

        Returns:
            train_time: A float of the training time in seconds.
        """
        if (
            getattr(self, "limit_resource", None)
            and resource is not None
            and (budget is not None or psutil is not None)
        ):
            start_time = time.time()
            mem = psutil.virtual_memory() if psutil is not None else None
            try:
                with limit_resource(
                    mem.available * (1 - FREE_MEM_RATIO)
                    + psutil.Process(os.getpid()).memory_info().rss
                    if mem is not None
                    else -1,
                    budget,
                ):
                    train_time = self._fit(X_train, y_train, **kwargs)
            except (MemoryError, TimeoutError) as e:
                logger.warning(f"{e.__class__} {e}")
                if self._task in CLASSIFICATION:
                    model = DummyClassifier()
                else:
                    model = DummyRegressor()
                X_train = self._preprocess(X_train)
                model.fit(X_train, y_train)
                self._model = model
                train_time = time.time() - start_time
        else:
            train_time = self._fit(X_train, y_train, **kwargs)
        return train_time

    def predict(self, X, **kwargs):
        """Predict label from features.

        Args:
            X: A numpy array or a dataframe of featurized instances, shape n*m.

        Returns:
            A numpy array of shape n*1.
            Each element is the label for an instance.
        """
        if self._model is not None:
            X = self._preprocess(X)
            return self._model.predict(X)
        else:
            logger.warning(
                "Estimator is not fit yet. Please run fit() before predict()."
            )
            return np.ones(X.shape[0])

    def predict_proba(self, X, **kwargs):
        """Predict the probability of each class from features.

        Only works for classification problems.

        Args:
            X: A numpy array of featurized instances, shape n*m.

        Returns:
            A numpy array of shape n*c. c is the # classes.
            Each element at (i, j) is the probability for instance i to be in
                class j.
        """
        assert self._task in CLASSIFICATION, "predict_proba() only for classification."
        X = self._preprocess(X)
        return self._model.predict_proba(X)

    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
        """Report the evaluation score of a trained estimator.

        Args:
            X_val: A pandas dataframe of the validation input data.
            y_val: A pandas series of the validation label.
            kwargs: keyword argument of the evaluation function, for example:
                - metric: A string of the metric name or a function
                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
                'mape'. Default is 'auto'.
                If metric is given, the score will report the user-specified metric.
                If metric is not given, the metric is set to accuracy for classification
                and r2 for regression.
                You can also pass a customized metric function; for examples on how to
                pass a customized metric function, please check
                [test/nlp/test_autohf_custom_metric.py](https://github.com/microsoft/FLAML/blob/main/test/nlp/test_autohf_custom_metric.py) and
                [test/automl/test_multiclass.py](https://github.com/microsoft/FLAML/blob/main/test/automl/test_multiclass.py).

        Returns:
            The evaluation score on the validation dataset.
        """
        from .ml import metric_loss_score
        from .ml import is_min_metric

        if self._model is not None:
            if self._task == "rank":
                raise NotImplementedError(
                    "AutoML.score() is not implemented for ranking"
                )
            else:
                X_val = self._preprocess(X_val)
                metric = kwargs.get("metric", None)
                if metric:
                    y_pred = self.predict(X_val, **kwargs)
                    if is_min_metric(metric):
                        return metric_loss_score(metric, y_pred, y_val)
                    else:
                        return 1.0 - metric_loss_score(metric, y_pred, y_val)
                else:
                    return self._model.score(X_val, y_val, **kwargs)
        else:
            logger.warning(
                "Estimator is not fit yet. Please run fit() before score()."
            )
            return 0.0

    def cleanup(self):
        del self._model
        self._model = None

    @classmethod
    def search_space(cls, data_size, task, **params):
        """[required method] search space.

        Args:
            data_size: A tuple of two integers, number of rows and columns.
            task: A str of the task type, e.g., "binary", "multiclass", "regression".

        Returns:
            A dictionary of the search space.
            Each key is the name of a hyperparameter, and value is a dict with
                its domain (required) and low_cost_init_value, init_value,
                cat_hp_cost (if applicable).
                e.g., ```{'domain': tune.randint(lower=1, upper=10), 'init_value': 1}```.
        """
        return {}

    @classmethod
    def size(cls, config: dict) -> float:
        """[optional method] memory size of the estimator in bytes.

        Args:
            config: A dict of the hyperparameter config.

        Returns:
            A float of the memory size required by the estimator to train the
            given config.
        """
        return 1.0

    @classmethod
    def cost_relative2lgbm(cls) -> float:
        """[optional method] relative cost compared to lightgbm."""
        return 1.0

    @classmethod
    def init(cls):
        """[optional method] initialize the class."""
        pass

    def config2params(self, config: dict) -> dict:
        """[optional method] config dict to params dict.

        Args:
            config: A dict of the hyperparameter config.

        Returns:
            A dict that will be passed to self.estimator_class's constructor.
        """
        params = config.copy()
        if "FLAML_sample_size" in params:
            params.pop("FLAML_sample_size")
        return params
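

# Example of a custom estimator built on this interface (a minimal sketch
# adapted from FLAML's documentation pattern; the external `rgf` package and
# the hyperparameter choices are illustrative assumptions, not part of this
# module):
#
#     class MyRegularizedGreedyForest(SKLearnEstimator):
#         def __init__(self, task="binary", **config):
#             super().__init__(task, **config)
#             if task in CLASSIFICATION:
#                 from rgf.sklearn import RGFClassifier
#                 self.estimator_class = RGFClassifier
#             else:
#                 from rgf.sklearn import RGFRegressor
#                 self.estimator_class = RGFRegressor
#
#         @classmethod
#         def search_space(cls, data_size, task):
#             return {
#                 "max_leaf": {
#                     "domain": tune.lograndint(lower=4, upper=data_size[0]),
#                     "init_value": 4,
#                     "low_cost_init_value": 4,
#                 },
#             }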


class TransformersEstimator(BaseEstimator):
    """The class for fine-tuning language models, using huggingface transformers API."""

    ITER_HP = "global_max_steps"

    def __init__(self, task="seq-classification", **config):
        super().__init__(task, **config)
        import uuid

        self.trial_id = str(uuid.uuid1().hex)[:8]
        if task not in NLG_TASKS:
            from .nlp.huggingface.training_args import (
                TrainingArgumentsForAuto as TrainingArguments,
            )
        else:
            from .nlp.huggingface.training_args import (
                Seq2SeqTrainingArgumentsForAuto as TrainingArguments,
            )
        self._TrainingArguments = TrainingArguments

    @staticmethod
    def _join(X_train, y_train, task):
        y_train = DataFrame(y_train, index=X_train.index)
        y_train.columns = ["label"] if task != TOKENCLASSIFICATION else ["labels"]
        train_df = X_train.join(y_train)
        return train_df

    @classmethod
    def search_space(cls, data_size, task, **params):
        search_space_dict = {
            "learning_rate": {
                "domain": tune.loguniform(lower=1e-6, upper=1e-3),
                "init_value": 1e-5,
            },
            "num_train_epochs": {
                "domain": tune.loguniform(lower=0.1, upper=10.0),
                "init_value": 3.0,  # to be consistent with roberta
            },
            "per_device_train_batch_size": {
                "domain": tune.choice([4, 8, 16, 32]),
                "init_value": 32,
            },
            "warmup_ratio": {
                "domain": tune.uniform(lower=0.0, upper=0.3),
                "init_value": 0.0,
            },
            "weight_decay": {
                "domain": tune.uniform(lower=0.0, upper=0.3),
                "init_value": 0.0,
            },
            "adam_epsilon": {
                "domain": tune.loguniform(lower=1e-8, upper=1e-6),
                "init_value": 1e-6,
            },
            "seed": {"domain": tune.choice(list(range(40, 45))), "init_value": 42},
            "global_max_steps": {
                "domain": sys.maxsize,
                "init_value": sys.maxsize,
            },
        }
        return search_space_dict

    @property
    def checkpoint_freq(self):
        return (
            int(
                min(self._training_args.num_train_epochs, 1)
                * len(self._X_train)
                / self._training_args.per_device_train_batch_size
                / self._training_args.ckpt_per_epoch
            )
            + 1
        )

    @property
    def fp16(self):
        return self._kwargs.get("gpu_per_trial") and self._training_args.fp16

    @property
    def no_cuda(self):
        return not self._kwargs.get("gpu_per_trial")

    def _set_training_args(self, **kwargs):
        from .nlp.utils import date_str, Counter

        for key, val in kwargs.items():
            assert key not in self.params, (
                "Since {} is in the search space, it cannot exist in 'custom_fit_kwargs' at the same time. "
                "If you need to fix the value of {} to {}, the only way is to add a single-value domain in the search "
                "space by adding:\n'{}': {{'domain': {}}} to 'custom_hp'. For example: "
                'automl_settings["custom_hp"] = {{"transformer": {{"model_path": {{"domain": '
                '"google/electra-small-discriminator"}}}}}}'.format(key, key, val, key, val)
            )

        # If the user has specified any custom args for TrainingArguments, update these arguments
        self._training_args = self._TrainingArguments(**kwargs)

        # Update the attributes in TrainingArguments with self.params values
        for key, val in self.params.items():
            if hasattr(self._training_args, key):
                setattr(self._training_args, key, val)

        # Update the attributes in TrainingArguments that depend on the values of self.params
        local_dir = os.path.join(
            self._training_args.output_dir, "train_{}".format(date_str())
        )
        if self._use_ray is True:
            import ray

            self._training_args.output_dir = ray.tune.get_trial_dir()
        else:
            self._training_args.output_dir = Counter.get_trial_fold_name(
                local_dir, self.params, self.trial_id
            )
        self._training_args.eval_steps = (
            self._training_args.logging_steps
        ) = self._training_args.saving_steps = self.checkpoint_freq
        self._training_args.fp16 = self.fp16
        self._training_args.no_cuda = self.no_cuda

    def _preprocess(self, X, y=None, **kwargs):
        from .nlp.utils import tokenize_text, is_a_list_of_str

        is_str = str(X.dtypes[0]) in ("string", "str")
        is_list_of_str = is_a_list_of_str(X[list(X.keys())[0]].to_list()[0])

        if is_str or is_list_of_str:
            return tokenize_text(
                X=X,
                Y=y,
                task=self._task,
                hf_args=self._training_args,
                tokenizer=self.tokenizer,
            )
        else:
            return X, None

    def _model_init(self):
        from .nlp.utils import load_model

        this_model = load_model(
            checkpoint_path=self._training_args.model_path,
            task=self._task,
            num_labels=self.num_labels,
        )
        return this_model

    def preprocess_data(self, X, y):
        from datasets import Dataset

        if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
            processed_X, _ = self._preprocess(X=X, **self._kwargs)
            processed_y = y
        else:
            processed_X, processed_y = self._preprocess(X=X, y=y, **self._kwargs)
        processed_dataset = Dataset.from_pandas(
            TransformersEstimator._join(processed_X, processed_y, self._task)
        )
        return processed_dataset, processed_X, processed_y

    @property
    def num_labels(self):
        if self._task == SEQREGRESSION:
            return 1
        elif self._task == SEQCLASSIFICATION:
            return len(set(self._y_train))
        elif self._task == TOKENCLASSIFICATION:
            return len(self._training_args.label_list)
        else:
            return None

    @property
    def tokenizer(self):
        from transformers import AutoTokenizer

        if self._task == SUMMARIZATION:
            return AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=self._training_args.model_path,
                cache_dir=None,
                use_fast=True,
                revision="main",
                use_auth_token=None,
            )
        else:
            return AutoTokenizer.from_pretrained(
                self._training_args.model_path,
                use_fast=True,
                # For a roberta model, if the call comes from .fit() (rather than from
                # .predict(), where model_path is updated to the checkpoint name),
                # add_prefix_space must be set to True to avoid the assertion error at
                # https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/tokenization_roberta_fast.py#L249
                add_prefix_space="roberta" in self._training_args.model_path
                and not getattr(self, "_pred_flag", False),
            )

    @property
    def data_collator(self):
        from .nlp.huggingface.data_collator import task_to_datacollator_class

        return (
            task_to_datacollator_class[self._task](
                tokenizer=self.tokenizer,
                pad_to_multiple_of=8,  # if self._training_args.fp16 else None,
            )
            if self._task in (MULTICHOICECLASSIFICATION, TOKENCLASSIFICATION)
            else None
        )

    def fit(
        self,
        X_train: DataFrame,
        y_train: Series,
        budget=None,
        X_val=None,
        y_val=None,
        gpu_per_trial=None,
        metric=None,
        **kwargs,
    ):
        import transformers

        transformers.logging.set_verbosity_error()

        from transformers import TrainerCallback
        from transformers.trainer_utils import set_seed
        from .nlp.huggingface.trainer import TrainerForAuto

        try:
            from ray.tune import is_session_enabled

            self._use_ray = is_session_enabled()
        except ImportError:
            self._use_ray = False

        this_params = self.params
        self._kwargs = kwargs
        self._X_train, self._y_train = X_train, y_train
        self._set_training_args(**kwargs)
        train_dataset, self._X_train, self._y_train = self.preprocess_data(
            X_train, y_train
        )
        if X_val is not None:
            eval_dataset, self._X_val, self._y_val = self.preprocess_data(X_val, y_val)
        else:
            eval_dataset, self._X_val, self._y_val = None, None, None

        set_seed(self.params.get("seed", self._training_args.seed))
        self._metric = metric

        class EarlyStoppingCallbackForAuto(TrainerCallback):
            def on_train_begin(self, args, state, control, **callback_kwargs):
                self.train_begin_time = time.time()

            def on_step_begin(self, args, state, control, **callback_kwargs):
                self.step_begin_time = time.time()

            def on_step_end(self, args, state, control, **callback_kwargs):
                if state.global_step == 1:
                    self.time_per_iter = time.time() - self.step_begin_time
                if (
                    budget
                    and (
                        time.time() + self.time_per_iter
                        > self.train_begin_time + budget
                    )
                    or state.global_step >= this_params[TransformersEstimator.ITER_HP]
                ):
                    control.should_training_stop = True
                    control.should_save = True
                    control.should_evaluate = True
                return control

            def on_epoch_end(self, args, state, control, **callback_kwargs):
                if (
                    control.should_training_stop
                    or state.epoch + 1 >= args.num_train_epochs
                ):
                    control.should_save = True
                    control.should_evaluate = True

        self._trainer = TrainerForAuto(
            args=self._training_args,
            model_init=self._model_init,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            compute_metrics=self._compute_metrics_by_dataset_name,
            callbacks=[EarlyStoppingCallbackForAuto],
        )

        if self._task in NLG_TASKS:
            setattr(self._trainer, "_is_seq2seq", True)

        # When not using ray for tuning, limit CUDA_VISIBLE_DEVICES to
        # math.ceil(gpu_per_trial) GPUs, so each estimator does not see all the GPUs
        if gpu_per_trial is not None:
            tmp_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
            self._trainer.args._n_gpu = gpu_per_trial

            # if gpu_per_trial == 0:
            #     os.environ["CUDA_VISIBLE_DEVICES"] = ""
            if tmp_cuda_visible_devices.count(",") != math.ceil(gpu_per_trial) - 1:
                os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
                    [str(x) for x in range(math.ceil(gpu_per_trial))]
                )

        start_time = time.time()
        self._trainer.train()

        if gpu_per_trial is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = tmp_cuda_visible_devices

        self.params[self.ITER_HP] = self._trainer.state.global_step

        self._checkpoint_path = self._select_checkpoint(self._trainer)
        self._ckpt_remains = list(self._trainer.ckpt_to_metric.keys())

        if hasattr(self._trainer, "intermediate_results"):
            self.intermediate_results = [
                x[1]
                for x in sorted(
                    self._trainer.intermediate_results.items(), key=lambda x: x[0]
                )
            ]
        self._trainer = None

        return time.time() - start_time

    def _delete_one_ckpt(self, ckpt_location):
        if self._use_ray is False:
            try:
                shutil.rmtree(ckpt_location)
            except FileNotFoundError:
                logger.warning("checkpoint {} not found".format(ckpt_location))

    def cleanup(self):
        super().cleanup()
        if hasattr(self, "_ckpt_remains"):
            for each_ckpt in self._ckpt_remains:
                self._delete_one_ckpt(each_ckpt)

    def _select_checkpoint(self, trainer):
        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

        if trainer.ckpt_to_metric:
            best_ckpt, _ = min(
                trainer.ckpt_to_metric.items(), key=lambda x: x[1]["eval_loss"]
            )
            best_ckpt_global_step = trainer.ckpt_to_global_step[best_ckpt]
            for each_ckpt in list(trainer.ckpt_to_metric):
                if each_ckpt != best_ckpt:
                    del trainer.ckpt_to_metric[each_ckpt]
                    del trainer.ckpt_to_global_step[each_ckpt]
                    self._delete_one_ckpt(each_ckpt)
        else:
            best_ckpt_global_step = trainer.state.global_step
            best_ckpt = os.path.join(
                trainer.args.output_dir,
                f"{PREFIX_CHECKPOINT_DIR}-{best_ckpt_global_step}",
            )
        self.params[self.ITER_HP] = best_ckpt_global_step
        logger.debug(trainer.state.global_step)
        logger.debug(trainer.ckpt_to_global_step)
        return best_ckpt

    def _compute_metrics_by_dataset_name(self, eval_pred):
        # TODO: call self._metric(eval_pred, self)
        if isinstance(self._metric, str):
            from .ml import metric_loss_score
            from .nlp.utils import postprocess_prediction_and_true

            predictions, y_true = eval_pred
            # postprocess the matrix prediction and ground truth into a user-readable
            # format, e.g., for summarization, decode into text
            processed_predictions, processed_y_true = postprocess_prediction_and_true(
                task=self._task,
                y_pred=predictions,
                tokenizer=self.tokenizer,
                hf_args=self._training_args,
                y_true=y_true,
            )
            metric_dict = {
                "automl_metric": metric_loss_score(
                    metric_name=self._metric,
                    y_processed_predict=processed_predictions,
                    y_processed_true=processed_y_true,
                    labels=self._training_args.label_list,
                )
            }
        else:
            # TODO: debug to see how a custom metric can take both tokenized (here)
            # and untokenized input (ml.py)
            loss, metric_dict = self._metric(
                X_test=self._X_val,
                y_test=self._y_val,
                estimator=self,
                labels=None,
                X_train=self._X_train,
                y_train=self._y_train,
            )
            metric_dict["automl_metric"] = loss

        return metric_dict

    def _init_model_for_predict(self):
        from .nlp.huggingface.trainer import TrainerForAuto

        # Need to reinit training_args because of a bug in deepspeed: if not reinit,
        # the deepspeed config will be inconsistent with the HF config, see
        # https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py#L947
        self._pred_flag = True
        training_args = self._TrainingArguments(
            local_rank=-1, model_path=self._checkpoint_path, fp16=self.fp16
        )
        for key, val in self._training_args.__dict__.items():
            if key not in ("local_rank", "model_path", "fp16"):
                setattr(training_args, key, val)
        self._training_args = training_args

        new_trainer = TrainerForAuto(
            model=self._model_init(),
            args=self._training_args,
            data_collator=self.data_collator,
            compute_metrics=self._compute_metrics_by_dataset_name,
        )
        if self._task in NLG_TASKS:
            setattr(new_trainer, "_is_seq2seq", True)
        return new_trainer

    def predict_proba(self, X, **pred_kwargs):
        from datasets import Dataset

        if pred_kwargs:
            for key, val in pred_kwargs.items():
                setattr(self._training_args, key, val)

        assert (
            self._task in CLASSIFICATION
        ), "predict_proba() only for classification tasks."

        X_test, _ = self._preprocess(X, **self._kwargs)
        test_dataset = Dataset.from_pandas(X_test)
        new_trainer = self._init_model_for_predict()
        predictions = new_trainer.predict(test_dataset)
        return predictions.predictions

    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
        import transformers

        transformers.logging.set_verbosity_error()
        self._metric = kwargs["metric"]
        eval_dataset, X_val, y_val = self.preprocess_data(X_val, y_val)
        new_trainer = self._init_model_for_predict()
        return new_trainer.evaluate(eval_dataset)

    def predict(self, X, **pred_kwargs):
        import transformers
        from datasets import Dataset
        from .nlp.utils import postprocess_prediction_and_true

        transformers.logging.set_verbosity_error()

        if pred_kwargs:
            for key, val in pred_kwargs.items():
                setattr(self._training_args, key, val)

        X_test, _ = self._preprocess(X, **self._kwargs)
        test_dataset = Dataset.from_pandas(X_test)
        new_trainer = self._init_model_for_predict()

        if self._task not in NLG_TASKS:
            predictions = new_trainer.predict(test_dataset)
        else:
            predictions = new_trainer.predict(
                test_dataset,
                metric_key_prefix="predict",
            )
        post_y_pred, _ = postprocess_prediction_and_true(
            task=self._task,
            y_pred=predictions.predictions,
            tokenizer=self.tokenizer,
            hf_args=self._training_args,
            X=X,
        )
        return post_y_pred

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params[TransformersEstimator.ITER_HP] = params.get(
            TransformersEstimator.ITER_HP, sys.maxsize
        )
        return params


class TransformersEstimatorModelSelection(TransformersEstimator):
    def __init__(self, task="seq-classification", **config):
        super().__init__(task, **config)

    @classmethod
    def search_space(cls, data_size, task, **params):
        search_space_dict = TransformersEstimator.search_space(
            data_size, task, **params
        )

        # For model selection, use the same search space regardless of the memory
        # constraint; if OOM occurs, the user should shrink the search space themselves
        search_space_dict["model_path"] = {
            "domain": tune.choice(
                [
                    "google/electra-base-discriminator",
                    "bert-base-uncased",
                    "roberta-base",
                    "facebook/muppet-roberta-base",
                    "google/electra-small-discriminator",
                ]
            ),
            "init_value": "facebook/muppet-roberta-base",
        }
        return search_space_dict
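

# Example usage (a minimal sketch; these estimators are typically tuned through
# flaml.AutoML, assuming string-valued feature columns as expected by
# TransformersEstimator._preprocess):
#
#     from flaml import AutoML
#     automl = AutoML()
#     automl.fit(X_train, y_train, task="seq-classification",
#                estimator_list=["transformer"], time_budget=300)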


class SKLearnEstimator(BaseEstimator):
    """The base class for tuning scikit-learn estimators."""

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)

    def _preprocess(self, X):
        if isinstance(X, DataFrame):
            cat_columns = X.select_dtypes(include=["category"]).columns
            if not cat_columns.empty:
                X = X.copy()
                X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # numpy array is not of numeric dtype
            X = DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X


class LGBMEstimator(BaseEstimator):
    """The class for tuning LGBM, using sklearn API."""

    ITER_HP = "n_estimators"
    HAS_CALLBACK = True
    DEFAULT_ITER = 100

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(5, min(32768, int(data_size[0])))  # upper must be larger than lower
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "num_leaves": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "min_child_samples": {
                "domain": tune.lograndint(lower=2, upper=2**7 + 1),
                "init_value": 20,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1.0),
                "init_value": 0.1,
            },
            "log_max_bin": {  # log transformed with base 2
                "domain": tune.lograndint(lower=3, upper=11),
                "init_value": 8,
            },
            "colsample_bytree": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "reg_alpha": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1 / 1024,
            },
            "reg_lambda": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1.0,
            },
        }

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        if "log_max_bin" in params:
            # e.g., log_max_bin=8 maps to max_bin=(1 << 8) - 1 == 255
            params["max_bin"] = (1 << params.pop("log_max_bin")) - 1
        return params

    @classmethod
    def size(cls, config):
        num_leaves = int(
            round(
                config.get("num_leaves")
                or config.get("max_leaves")
                or 1 << config.get("max_depth", 16)
            )
        )
        n_estimators = int(round(config["n_estimators"]))
        # rough model-size estimate in bytes; e.g., num_leaves=32, n_estimators=100
        # gives (32*3 + 31*4 + 1.0) * 100 * 8 = 176,800 bytes
        return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        if "verbose" not in self.params:
            self.params["verbose"] = -1
        if "regression" == task:
            from lightgbm import LGBMRegressor

            self.estimator_class = LGBMRegressor
        elif "rank" == task:
            from lightgbm import LGBMRanker

            self.estimator_class = LGBMRanker
        else:
            from lightgbm import LGBMClassifier

            self.estimator_class = LGBMClassifier
        self._time_per_iter = None
        self._train_size = 0
        self._mem_per_iter = -1
        self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0) is not None

    def _preprocess(self, X):
        if (
            not isinstance(X, DataFrame)
            and issparse(X)
            and np.issubdtype(X.dtype, np.integer)
        ):
            X = X.astype(float)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # numpy array is not of numeric dtype
            X = DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        deadline = start_time + budget if budget else np.inf
        n_iter = self.params.get(self.ITER_HP, self.DEFAULT_ITER)
        trained = False
        if not self.HAS_CALLBACK:
            mem0 = psutil.virtual_memory().available if psutil is not None else 1
            if (
                (
                    not self._time_per_iter
                    or abs(self._train_size - X_train.shape[0]) > 4
                )
                and budget is not None
                or self._mem_per_iter < 0
                and psutil is not None
            ) and n_iter > 1:
                self.params[self.ITER_HP] = 1
                self._t1 = self._fit(X_train, y_train, **kwargs)
                if budget is not None and self._t1 >= budget or n_iter == 1:
                    return self._t1
                mem1 = psutil.virtual_memory().available if psutil is not None else 1
                self._mem1 = mem0 - mem1
                self.params[self.ITER_HP] = min(n_iter, 4)
                self._t2 = self._fit(X_train, y_train, **kwargs)
                mem2 = psutil.virtual_memory().available if psutil is not None else 1
                self._mem2 = max(mem0 - mem2, self._mem1)
                # if self._mem1 <= 0:
                #     self._mem_per_iter = self._mem2 / (self.params[self.ITER_HP] + 1)
                # elif self._mem2 <= 0:
                #     self._mem_per_iter = self._mem1
                # else:
                self._mem_per_iter = min(
                    self._mem1, self._mem2 / self.params[self.ITER_HP]
                )
                # if self._mem_per_iter <= 1 and psutil is not None:
                #     n_iter = self.params[self.ITER_HP]
                self._time_per_iter = (
                    (self._t2 - self._t1) / (self.params[self.ITER_HP] - 1)
                    if self._t2 > self._t1
                    else self._t1
                    if self._t1
                    else 0.001
                )
                self._train_size = X_train.shape[0]
                if (
                    budget is not None
                    and self._t1 + self._t2 >= budget
                    or n_iter == self.params[self.ITER_HP]
                ):
                    # self.params[self.ITER_HP] = n_iter
                    return time.time() - start_time
                trained = True
            # logger.debug(mem0)
            # logger.debug(self._mem_per_iter)
            if n_iter > 1:
                max_iter = min(
                    n_iter,
                    int(
                        (budget - time.time() + start_time - self._t1)
                        / self._time_per_iter
                        + 1
                    )
                    if budget is not None
                    else n_iter,
                    int((1 - FREE_MEM_RATIO) * mem0 / self._mem_per_iter)
                    if psutil is not None and self._mem_per_iter > 0
                    else n_iter,
                )
                if trained and max_iter <= self.params[self.ITER_HP]:
                    return time.time() - start_time
                # when not trained, train at least one iter
                self.params[self.ITER_HP] = max(max_iter, 1)
        if self.HAS_CALLBACK:
            kwargs_callbacks = kwargs.get("callbacks")
            if kwargs_callbacks:
                callbacks = kwargs_callbacks + self._callbacks(start_time, deadline)
                kwargs.pop("callbacks")
            else:
                callbacks = self._callbacks(start_time, deadline)
            if isinstance(self, XGBoostSklearnEstimator):
                from xgboost import __version__

                if __version__ >= "1.6.0":
                    # since xgboost>=1.6.0, callbacks can't be passed in fit()
                    self.params["callbacks"] = callbacks
                    callbacks = None
            self._fit(
                X_train,
                y_train,
                callbacks=callbacks,
                **kwargs,
            )
            if callbacks is None:
                # for xgboost>=1.6.0, pop callbacks to enable pickle
                callbacks = self.params.pop("callbacks")
                self._model.set_params(callbacks=callbacks[:-1])
            best_iteration = (
                self._model.get_booster().best_iteration
                if isinstance(self, XGBoostSklearnEstimator)
                else self._model.best_iteration_
            )
            if best_iteration is not None:
                self._model.set_params(n_estimators=best_iteration + 1)
        else:
            self._fit(X_train, y_train, **kwargs)
        train_time = time.time() - start_time
        return train_time

    def _callbacks(self, start_time, deadline) -> List[Callable]:
        return [partial(self._callback, start_time, deadline)]

    def _callback(self, start_time, deadline, env) -> None:
        from lightgbm.callback import EarlyStopException

        now = time.time()
        if env.iteration == 0:
            self._time_per_iter = now - start_time
        if now + self._time_per_iter > deadline:
            raise EarlyStopException(env.iteration, env.evaluation_result_list)
        if psutil is not None:
            mem = psutil.virtual_memory()
            if mem.available / mem.total < FREE_MEM_RATIO:
                raise EarlyStopException(env.iteration, env.evaluation_result_list)
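

# Example usage of LGBMEstimator (a minimal sketch; standalone, outside of AutoML):
#
#     est = LGBMEstimator(task="regression", n_estimators=100, learning_rate=0.1)
#     est.fit(X_train, y_train, budget=10)  # stops early near the 10-second deadline
#     y_pred = est.predict(X_test)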


class XGBoostEstimator(SKLearnEstimator):
    """The class for tuning XGBoost regressor, not using sklearn API."""

    DEFAULT_ITER = 10

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(5, min(32768, int(data_size[0])))  # upper must be larger than lower
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_leaves": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_depth": {
                "domain": tune.choice([0, 6, 12]),
                "init_value": 0,
            },
            "min_child_weight": {
                "domain": tune.loguniform(lower=0.001, upper=128),
                "init_value": 1.0,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1.0),
                "init_value": 0.1,
            },
            "subsample": {
                "domain": tune.uniform(lower=0.1, upper=1.0),
                "init_value": 1.0,
            },
            "colsample_bylevel": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "colsample_bytree": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "reg_alpha": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1 / 1024,
            },
            "reg_lambda": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1.0,
            },
        }

    @classmethod
    def size(cls, config):
        return LGBMEstimator.size(config)

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.6

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        max_depth = params["max_depth"] = params.get("max_depth", 0)
        if max_depth == 0:
            params["grow_policy"] = params.get("grow_policy", "lossguide")
            params["tree_method"] = params.get("tree_method", "hist")
        # params["booster"] = params.get("booster", "gbtree")
        params["use_label_encoder"] = params.get("use_label_encoder", False)
        if "n_jobs" in config:
            params["nthread"] = params.pop("n_jobs")
        return params

    def __init__(
        self,
        task="regression",
        **config,
    ):
        super().__init__(task, **config)
        self.params["verbosity"] = 0

    def fit(self, X_train, y_train, budget=None, **kwargs):
        import xgboost as xgb

        start_time = time.time()
        deadline = start_time + budget if budget else np.inf
        if issparse(X_train):
            if xgb.__version__ < "1.6.0":
                # "auto" fails for sparse input since xgboost 1.6.0
                self.params["tree_method"] = "auto"
        else:
            X_train = self._preprocess(X_train)
        if "sample_weight" in kwargs:
            dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs["sample_weight"])
        else:
            dtrain = xgb.DMatrix(X_train, label=y_train)

        objective = self.params.get("objective")
        if isinstance(objective, str):
            obj = None
        else:
            obj = objective
        if "objective" in self.params:
            del self.params["objective"]
        _n_estimators = self.params.pop("n_estimators")
        callbacks = XGBoostEstimator._callbacks(start_time, deadline)
        if callbacks:
            self._model = xgb.train(
                self.params,
                dtrain,
                _n_estimators,
                obj=obj,
                callbacks=callbacks,
            )
            self.params["n_estimators"] = self._model.best_iteration + 1
        else:
            self._model = xgb.train(self.params, dtrain, _n_estimators, obj=obj)
            self.params["n_estimators"] = _n_estimators
        self.params["objective"] = objective
        del dtrain
        train_time = time.time() - start_time
        return train_time

    def predict(self, X, **kwargs):
        import xgboost as xgb

        if not issparse(X):
            X = self._preprocess(X)
        dtest = xgb.DMatrix(X)
        return super().predict(dtest)

    @classmethod
    def _callbacks(cls, start_time, deadline):
        try:
            from xgboost.callback import TrainingCallback
        except ImportError:  # for xgboost<1.3
            return None

        class ResourceLimit(TrainingCallback):
            def after_iteration(self, model, epoch, evals_log) -> bool:
                now = time.time()
                if epoch == 0:
                    self._time_per_iter = now - start_time
                if now + self._time_per_iter > deadline:
                    return True
                if psutil is not None:
                    mem = psutil.virtual_memory()
                    if mem.available / mem.total < FREE_MEM_RATIO:
                        return True
                return False

        return [ResourceLimit()]
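

# Example usage of XGBoostEstimator (a minimal sketch; this class wraps
# xgb.train() directly, so predict() builds a DMatrix internally):
#
#     est = XGBoostEstimator(task="regression", n_estimators=50, max_leaves=16)
#     est.fit(X_train, y_train, budget=20)
#     y_pred = est.predict(X_test)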


class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
    """The class for tuning XGBoost with unlimited depth, using sklearn API."""

    DEFAULT_ITER = 10

    @classmethod
    def search_space(cls, data_size, **params):
        space = XGBoostEstimator.search_space(data_size)
        space.pop("max_depth")
        return space

    @classmethod
    def cost_relative2lgbm(cls):
        return XGBoostEstimator.cost_relative2lgbm()

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        max_depth = params["max_depth"] = params.get("max_depth", 0)
        if max_depth == 0:
            params["grow_policy"] = params.get("grow_policy", "lossguide")
            params["tree_method"] = params.get("tree_method", "hist")
        params["use_label_encoder"] = params.get("use_label_encoder", False)
        return params

    def __init__(
        self,
        task="binary",
        **config,
    ):
        super().__init__(task, **config)
        del self.params["verbose"]
        self.params["verbosity"] = 0
        import xgboost as xgb

        self.estimator_class = xgb.XGBRegressor
        if "rank" == task:
            self.estimator_class = xgb.XGBRanker
        elif task in CLASSIFICATION:
            self.estimator_class = xgb.XGBClassifier
        self._xgb_version = xgb.__version__

    def fit(self, X_train, y_train, budget=None, **kwargs):
        if issparse(X_train) and self._xgb_version < "1.6.0":
            # "auto" fails for sparse input since xgboost 1.6.0
            self.params["tree_method"] = "auto"
        if kwargs.get("gpu_per_trial"):
            self.params["tree_method"] = "gpu_hist"
            kwargs.pop("gpu_per_trial")
        return super().fit(X_train, y_train, budget, **kwargs)

    def _callbacks(self, start_time, deadline) -> List[Callable]:
        return XGBoostEstimator._callbacks(start_time, deadline)


class XGBoostLimitDepthEstimator(XGBoostSklearnEstimator):
    """The class for tuning XGBoost with limited depth, using sklearn API."""

    @classmethod
    def search_space(cls, data_size, **params):
        space = XGBoostEstimator.search_space(data_size)
        space.pop("max_leaves")
        upper = max(6, int(np.log2(data_size[0])))
        space["max_depth"] = {
            "domain": tune.randint(lower=1, upper=min(upper, 16)),
            "init_value": 6,
            "low_cost_init_value": 1,
        }
        space["learning_rate"]["init_value"] = 0.3
        space["n_estimators"]["init_value"] = 10
        return space

    @classmethod
    def cost_relative2lgbm(cls):
        return 64
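

# Worked example for the max_depth bound above (illustrative arithmetic): with
# data_size = (100000, 20), int(np.log2(100000)) == 16, so upper = max(6, 16) = 16
# and the domain is tune.randint(lower=1, upper=min(16, 16)); for very small data
# (e.g. 32 rows, where int(np.log2(32)) == 5) the bound falls back to the
# minimum of 6.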


class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
    """The class for tuning Random Forest."""

    HAS_CALLBACK = False
    nrows = 101

    @classmethod
    def search_space(cls, data_size, task, **params):
        RandomForestEstimator.nrows = int(data_size[0])
        upper = min(2048, RandomForestEstimator.nrows)
        init = 1 / np.sqrt(data_size[1]) if task in CLASSIFICATION else 1
        lower = min(0.1, init)
        space = {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=max(5, upper)),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_features": {
                "domain": tune.loguniform(lower=lower, upper=1.0),
                "init_value": init,
            },
            "max_leaves": {
                "domain": tune.lograndint(
                    lower=4,
                    upper=max(5, min(32768, RandomForestEstimator.nrows >> 1)),
                ),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
        }
        if task in CLASSIFICATION:
            space["criterion"] = {
                "domain": tune.choice(["gini", "entropy"]),
                # "init_value": "gini",
            }
        return space

    @classmethod
    def cost_relative2lgbm(cls):
        return 2

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        if "max_leaves" in params:
            params["max_leaf_nodes"] = params.get(
                "max_leaf_nodes", params.pop("max_leaves")
            )
        if self._task not in CLASSIFICATION and "criterion" in config:
            params.pop("criterion")
        return params

    def __init__(
        self,
        task="binary",
        **params,
    ):
        super().__init__(task, **params)
        self.params["verbose"] = 0
        self.estimator_class = RandomForestRegressor
        if task in CLASSIFICATION:
            self.estimator_class = RandomForestClassifier
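

# A hedged example of RandomForestEstimator.config2params (values are made up,
# and it assumes the base config2params simply copies the config): the tuned
# "max_leaves" key is translated to sklearn's "max_leaf_nodes", and "criterion"
# is dropped for non-classification tasks.
#
#   est = RandomForestEstimator(task="regression")
#   est.config2params({"n_estimators": 8, "max_leaves": 32, "criterion": "gini"})
#   # -> {"n_estimators": 8, "max_leaf_nodes": 32}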


class ExtraTreesEstimator(RandomForestEstimator):
    """The class for tuning Extra Trees."""

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.9

    def __init__(self, task="binary", **params):
        super().__init__(task, **params)
        if "regression" in task:
            self.estimator_class = ExtraTreesRegressor
        else:
            self.estimator_class = ExtraTreesClassifier


class LRL1Classifier(SKLearnEstimator):
    """The class for tuning Logistic Regression with L1 regularization."""

    @classmethod
    def search_space(cls, **params):
        return {
            "C": {
                "domain": tune.loguniform(lower=0.03125, upper=32768.0),
                "init_value": 1.0,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 160

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params["tol"] = params.get("tol", 0.0001)
        params["solver"] = params.get("solver", "saga")
        params["penalty"] = params.get("penalty", "l1")
        return params

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        assert task in CLASSIFICATION, "LogisticRegression for classification task only"
        self.estimator_class = LogisticRegression


class LRL2Classifier(SKLearnEstimator):
    """The class for tuning Logistic Regression with L2 regularization."""

    limit_resource = True

    @classmethod
    def search_space(cls, **params):
        return LRL1Classifier.search_space(**params)

    @classmethod
    def cost_relative2lgbm(cls):
        return 25

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params["tol"] = params.get("tol", 0.0001)
        params["solver"] = params.get("solver", "lbfgs")
        params["penalty"] = params.get("penalty", "l2")
        return params

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        assert task in CLASSIFICATION, "LogisticRegression for classification task only"
        self.estimator_class = LogisticRegression
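

# A minimal sketch of how the two logistic-regression learners differ
# (illustrative; X_train and y_train are assumptions): both share the same "C"
# search space, but LRL1Classifier defaults to penalty="l1" with the "saga"
# solver, while LRL2Classifier defaults to penalty="l2" with "lbfgs".
#
#   l1 = LRL1Classifier(task="binary", C=1.0)
#   l2 = LRL2Classifier(task="binary", C=1.0)
#   l1.fit(X_train, y_train)
#   l2.fit(X_train, y_train)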


class CatBoostEstimator(BaseEstimator):
    """The class for tuning CatBoost."""

    ITER_HP = "n_estimators"
    DEFAULT_ITER = 1000

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(min(round(1500000 / data_size[0]), 150), 12)
        return {
            "early_stopping_rounds": {
                "domain": tune.lograndint(lower=10, upper=upper),
                "init_value": 10,
                "low_cost_init_value": 10,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=0.005, upper=0.2),
                "init_value": 0.1,
            },
            "n_estimators": {
                "domain": 8192,
                "init_value": 8192,
            },
        }

    @classmethod
    def size(cls, config):
        n_estimators = config.get("n_estimators", 8192)
        max_leaves = 64
        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

    @classmethod
    def cost_relative2lgbm(cls):
        return 15

    def _preprocess(self, X):
        if isinstance(X, DataFrame):
            cat_columns = X.select_dtypes(include=["category"]).columns
            if not cat_columns.empty:
                X = X.copy()
                X[cat_columns] = X[cat_columns].apply(
                    lambda x: x.cat.rename_categories(
                        [
                            str(c) if isinstance(c, float) else c
                            for c in x.cat.categories
                        ]
                    )
                )
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # numpy array is not of numeric dtype
            X = DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params["n_estimators"] = params.get("n_estimators", 8192)
        if "n_jobs" in params:
            params["thread_count"] = params.pop("n_jobs")
        return params

    def __init__(
        self,
        task="binary",
        **config,
    ):
        super().__init__(task, **config)
        self.params.update(
            {
                "verbose": config.get("verbose", False),
                "random_seed": config.get("random_seed", 10242048),
            }
        )
        from catboost import CatBoostRegressor

        self.estimator_class = CatBoostRegressor
        if task in CLASSIFICATION:
            from catboost import CatBoostClassifier

            self.estimator_class = CatBoostClassifier

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        deadline = start_time + budget if budget else np.inf
        train_dir = f"catboost_{str(start_time)}"
        X_train = self._preprocess(X_train)
        if isinstance(X_train, DataFrame):
            cat_features = list(X_train.select_dtypes(include="category").columns)
        else:
            cat_features = []
        # hold out the last 10% of the rows (at most 1000) as the eval set
        n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
        X_tr, y_tr = X_train[:n], y_train[:n]
        if "sample_weight" in kwargs:
            weight = kwargs["sample_weight"]
            if weight is not None:
                kwargs["sample_weight"] = weight[:n]
        else:
            weight = None
        from catboost import Pool, __version__

        model = self.estimator_class(train_dir=train_dir, **self.params)
        if __version__ >= "0.26":
            model.fit(
                X_tr,
                y_tr,
                cat_features=cat_features,
                eval_set=Pool(
                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
                ),
                callbacks=CatBoostEstimator._callbacks(start_time, deadline),
                **kwargs,
            )
        else:
            model.fit(
                X_tr,
                y_tr,
                cat_features=cat_features,
                eval_set=Pool(
                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
                ),
                **kwargs,
            )
        shutil.rmtree(train_dir, ignore_errors=True)
        if weight is not None:
            kwargs["sample_weight"] = weight
        self._model = model
        self.params[self.ITER_HP] = self._model.tree_count_
        train_time = time.time() - start_time
        return train_time

    @classmethod
    def _callbacks(cls, start_time, deadline):
        class ResourceLimit:
            def after_iteration(self, info) -> bool:
                now = time.time()
                if info.iteration == 1:
                    self._time_per_iter = now - start_time
                if now + self._time_per_iter > deadline:
                    return False  # out of time: returning False stops CatBoost training
                if psutil is not None:
                    mem = psutil.virtual_memory()
                    if mem.available / mem.total < FREE_MEM_RATIO:
                        return False  # low free memory: stop training
                return True  # can continue

        return [ResourceLimit()]
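

# Note on the two ResourceLimit callbacks in this module: the XGBoost callback
# protocol treats a True return from after_iteration as "stop training", while
# the CatBoost protocol treats a False return as "stop" and True as "continue".
# Both variants stop when the projected next iteration would cross the time
# deadline, or when free memory drops below FREE_MEM_RATIO of total memory.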


class KNeighborsEstimator(BaseEstimator):
    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(512, int(data_size[0] / 2))
        return {
            "n_neighbors": {
                "domain": tune.lograndint(lower=1, upper=max(2, upper)),
                "init_value": 5,
                "low_cost_init_value": 1,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 30

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params["weights"] = params.get("weights", "distance")
        return params

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        if task in CLASSIFICATION:
            from sklearn.neighbors import KNeighborsClassifier

            self.estimator_class = KNeighborsClassifier
        else:
            from sklearn.neighbors import KNeighborsRegressor

            self.estimator_class = KNeighborsRegressor

    def _preprocess(self, X):
        if isinstance(X, DataFrame):
            cat_columns = X.select_dtypes(["category"]).columns
            if X.shape[1] == len(cat_columns):
                raise ValueError(
                    "KNeighborsEstimator requires at least one numeric feature"
                )
            X = X.drop(cat_columns, axis=1)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # drop categorical columns if any
            X = DataFrame(X)
            cat_columns = []
            for col in X.columns:
                if isinstance(X[col][0], str):
                    cat_columns.append(col)
            X = X.drop(cat_columns, axis=1)
            X = X.to_numpy()
        return X
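

# An illustrative sketch of KNeighborsEstimator._preprocess (the frame below is
# an assumption): categorical columns are dropped because nearest-neighbor
# distances are only meaningful on numeric features.
#
#   df = DataFrame({"num": [1.0, 2.0], "cat": Series(["a", "b"], dtype="category")})
#   KNeighborsEstimator(task="binary")._preprocess(df)  # keeps only "num"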


class Prophet(SKLearnEstimator):
    """The class for tuning Prophet."""

    @classmethod
    def search_space(cls, **params):
        space = {
            "changepoint_prior_scale": {
                "domain": tune.loguniform(lower=0.001, upper=0.05),
                "init_value": 0.05,
                "low_cost_init_value": 0.001,
            },
            "seasonality_prior_scale": {
                "domain": tune.loguniform(lower=0.01, upper=10),
                "init_value": 10,
            },
            "holidays_prior_scale": {
                "domain": tune.loguniform(lower=0.01, upper=10),
                "init_value": 10,
            },
            "seasonality_mode": {
                "domain": tune.choice(["additive", "multiplicative"]),
                "init_value": "multiplicative",
            },
        }
        return space

    def __init__(self, task="ts_forecast", n_jobs=1, **params):
        super().__init__(task, **params)

    def _join(self, X_train, y_train):
        assert TS_TIMESTAMP_COL in X_train, (
            "Dataframe for training ts_forecast model must have column "
            f'"{TS_TIMESTAMP_COL}" with the dates in X_train.'
        )
        y_train = DataFrame(y_train, columns=[TS_VALUE_COL])
        train_df = X_train.join(y_train)
        return train_df

    def fit(self, X_train, y_train, budget=None, **kwargs):
        from prophet import Prophet

        current_time = time.time()
        train_df = self._join(X_train, y_train)
        train_df = self._preprocess(train_df)
        cols = list(train_df)
        cols.remove(TS_TIMESTAMP_COL)
        cols.remove(TS_VALUE_COL)
        logging.getLogger("prophet").setLevel(logging.WARNING)
        model = Prophet(**self.params)
        for regressor in cols:
            model.add_regressor(regressor)
        with suppress_stdout_stderr():
            model.fit(train_df)
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def predict(self, X, **kwargs):
        if isinstance(X, int):
            raise ValueError(
                "predict() with steps is only supported for arima/sarimax."
                " For Prophet, pass a dataframe with the first column containing"
                " the timestamp values."
            )
        if self._model is not None:
            X = self._preprocess(X)
            forecast = self._model.predict(X)
            return forecast["yhat"]
        else:
            logger.warning(
                "Estimator is not fit yet. Please run fit() before predict()."
            )
            return np.ones(X.shape[0])

    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
        from sklearn.metrics import r2_score

        from .ml import metric_loss_score

        y_pred = self.predict(X_val)
        self._metric = kwargs.get("metric", None)
        if self._metric:
            return metric_loss_score(self._metric, y_pred, y_val)
        else:
            # r2_score expects (y_true, y_pred) in that order
            return r2_score(y_val, y_pred)
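

# A hedged usage sketch for the Prophet wrapper (X_train, y_train and X_future
# are assumptions): the training frame must carry the timestamp column
# TS_TIMESTAMP_COL, and any extra columns are added as Prophet regressors.
#
#   est = Prophet(task="ts_forecast", seasonality_mode="additive")
#   est.fit(X_train, y_train)      # X_train: DataFrame, timestamps in column one
#   y_hat = est.predict(X_future)  # X_future: future timestamps (+ regressors)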


class ARIMA(Prophet):
    """The class for tuning ARIMA."""

    @classmethod
    def search_space(cls, **params):
        space = {
            "p": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "d": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "q": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
        }
        return space

    def _join(self, X_train, y_train):
        train_df = super()._join(X_train, y_train)
        train_df.index = to_datetime(train_df[TS_TIMESTAMP_COL])
        train_df = train_df.drop(TS_TIMESTAMP_COL, axis=1)
        return train_df

    def fit(self, X_train, y_train, budget=None, **kwargs):
        import warnings

        warnings.filterwarnings("ignore")
        from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator

        current_time = time.time()
        train_df = self._join(X_train, y_train)
        train_df = self._preprocess(train_df)
        regressors = list(train_df)
        regressors.remove(TS_VALUE_COL)
        if regressors:
            model = ARIMA_estimator(
                train_df[[TS_VALUE_COL]],
                exog=train_df[regressors],
                order=(self.params["p"], self.params["d"], self.params["q"]),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
        else:
            model = ARIMA_estimator(
                train_df,
                order=(self.params["p"], self.params["d"], self.params["q"]),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
        with suppress_stdout_stderr():
            model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def predict(self, X, **kwargs):
        if self._model is not None:
            if isinstance(X, int):
                forecast = self._model.forecast(steps=X)
            elif isinstance(X, DataFrame):
                start = X[TS_TIMESTAMP_COL].iloc[0]
                end = X[TS_TIMESTAMP_COL].iloc[-1]
                if len(X.columns) > 1:
                    X = self._preprocess(X.drop(columns=TS_TIMESTAMP_COL))
                    regressors = list(X)
                    forecast = self._model.predict(
                        start=start, end=end, exog=X[regressors]
                    )
                else:
                    forecast = self._model.predict(start=start, end=end)
            else:
                raise ValueError(
                    "X needs to be either a pandas DataFrame with dates as the first column"
                    " or an integer number of periods for predict()."
                )
            return forecast
        else:
            return np.ones(X if isinstance(X, int) else X.shape[0])
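

# Illustrative calls for ARIMA.predict (est and X_future are assumptions):
#
#   est.predict(12)        # forecast the next 12 periods
#   est.predict(X_future)  # DataFrame whose first column holds the timestamps;
#                          # remaining columns are passed as exog regressors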


class SARIMAX(ARIMA):
    """The class for tuning SARIMAX."""

    @classmethod
    def search_space(cls, **params):
        space = {
            "p": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "d": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "q": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "P": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "D": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "Q": {
                "domain": tune.qrandint(lower=0, upper=10, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "s": {
                "domain": tune.choice([1, 4, 6, 12]),
                "init_value": 12,
            },
        }
        return space

    def fit(self, X_train, y_train, budget=None, **kwargs):
        import warnings

        warnings.filterwarnings("ignore")
        from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator

        current_time = time.time()
        train_df = self._join(X_train, y_train)
        train_df = self._preprocess(train_df)
        regressors = list(train_df)
        regressors.remove(TS_VALUE_COL)
        if regressors:
            model = SARIMAX_estimator(
                train_df[[TS_VALUE_COL]],
                exog=train_df[regressors],
                order=(self.params["p"], self.params["d"], self.params["q"]),
                # statsmodels' SARIMAX takes the seasonal part as `seasonal_order`
                seasonal_order=(
                    self.params["P"],
                    self.params["D"],
                    self.params["Q"],
                    self.params["s"],
                ),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
        else:
            model = SARIMAX_estimator(
                train_df,
                order=(self.params["p"], self.params["d"], self.params["q"]),
                seasonal_order=(
                    self.params["P"],
                    self.params["D"],
                    self.params["Q"],
                    self.params["s"],
                ),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
        with suppress_stdout_stderr():
            model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time


class TS_SKLearn(SKLearnEstimator):
    """The class for tuning SKLearn Regressors for time-series forecasting, using hcrystalball."""

    base_class = SKLearnEstimator

    @classmethod
    def search_space(cls, data_size, pred_horizon, **params):
        space = cls.base_class.search_space(data_size, **params)
        space.update(
            {
                "optimize_for_horizon": {
                    "domain": tune.choice([True, False]),
                    "init_value": False,
                    "low_cost_init_value": False,
                },
                "lags": {
                    "domain": tune.randint(
                        lower=1, upper=max(2, int(np.sqrt(data_size[0])))
                    ),
                    "init_value": 3,
                },
            }
        )
        return space

    def __init__(self, task="ts_forecast", **params):
        super().__init__(task, **params)
        self.hcrystaball_model = None
        self.ts_task = (
            "regression" if task in TS_FORECASTREGRESSION else "classification"
        )

    def transform_X(self, X):
        cols = list(X)
        if len(cols) == 1:
            ds_col = cols[0]
            X = DataFrame(index=X[ds_col])
        elif len(cols) > 1:
            ds_col = cols[0]
            exog_cols = cols[1:]
            X = X[exog_cols].set_index(X[ds_col])
        return X
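
    # A worked example of transform_X (illustrative): the first column becomes
    # the date index and any remaining columns the exogenous features.
    #
    #   X = DataFrame({"ds": ["2021-01-01", "2021-01-02"], "temp": [3.1, 4.2]})
    #   TS_SKLearn(task="ts_forecast").transform_X(X)
    #   # -> DataFrame indexed by the "ds" values, with the single column "temp"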

    def _fit(self, X_train, y_train, budget=None, **kwargs):
        from hcrystalball.wrappers import get_sklearn_wrapper

        X_train = self.transform_X(X_train)
        X_train = self._preprocess(X_train)
        params = self.params.copy()
        lags = params.pop("lags")
        optimize_for_horizon = params.pop("optimize_for_horizon")
        estimator = self.base_class(task=self.ts_task, **params)
        self.hcrystaball_model = get_sklearn_wrapper(estimator.estimator_class)
        self.hcrystaball_model.lags = int(lags)
        self.hcrystaball_model.fit(X_train, y_train)
        if optimize_for_horizon:
            # Direct multi-step forecast strategy: fit a separate model for each horizon
            model_list = []
            for i in range(1, kwargs["period"] + 1):
                (
                    X_fit,
                    y_fit,
                ) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
                    X_train, y_train, i
                )
                self.hcrystaball_model.model.set_params(**estimator.params)
                model = self.hcrystaball_model.model.fit(X_fit, y_fit)
                model_list.append(model)
            self._model = model_list
        else:
            (
                X_fit,
                y_fit,
            ) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
                X_train, y_train, kwargs["period"]
            )
            self.hcrystaball_model.model.set_params(**estimator.params)
            model = self.hcrystaball_model.model.fit(X_fit, y_fit)
            self._model = model

    def fit(self, X_train, y_train, budget=None, **kwargs):
        current_time = time.time()
        self._fit(X_train, y_train, budget=budget, **kwargs)
        train_time = time.time() - current_time
        return train_time

    def predict(self, X, **kwargs):
        if self._model is not None:
            X = self.transform_X(X)
            X = self._preprocess(X)
            if isinstance(self._model, list):
                assert len(self._model) == len(
                    X
                ), "Model is optimized for horizon, length of X must be equal to `period`."
                preds = []
                for i in range(1, len(self._model) + 1):
                    (
                        X_pred,
                        _,
                    ) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
                        X.iloc[:i, :]
                    )
                    preds.append(self._model[i - 1].predict(X_pred)[-1])
                forecast = DataFrame(
                    data=np.asarray(preds).reshape(-1, 1),
                    columns=[self.hcrystaball_model.name],
                    index=X.index,
                )
            else:
                (
                    X_pred,
                    _,
                ) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(X)
                forecast = self._model.predict(X_pred)
            return forecast
        else:
            logger.warning(
                "Estimator is not fit yet. Please run fit() before predict()."
            )
            return np.ones(X.shape[0])


class LGBM_TS(TS_SKLearn):
    """The class for tuning LGBM Regressor for time-series forecasting."""

    base_class = LGBMEstimator


class XGBoost_TS(TS_SKLearn):
    """The class for tuning XGBoost Regressor for time-series forecasting."""

    base_class = XGBoostSklearnEstimator


# catboost regressor is invalid because it has a `name` parameter, making it incompatible with hcrystalball
# class CatBoost_TS(TS_SKLearn):
#     base_class = CatBoostEstimator


class RF_TS(TS_SKLearn):
    """The class for tuning Random Forest Regressor for time-series forecasting."""

    base_class = RandomForestEstimator


class ExtraTrees_TS(TS_SKLearn):
    """The class for tuning Extra Trees Regressor for time-series forecasting."""

    base_class = ExtraTreesEstimator


class XGBoostLimitDepth_TS(TS_SKLearn):
    """The class for tuning XGBoost Regressor with limited depth for time-series forecasting."""

    base_class = XGBoostLimitDepthEstimator


class suppress_stdout_stderr(object):
    def __init__(self):
        # Open a pair of null files
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = (os.dup(1), os.dup(2))

    def __enter__(self):
        # Assign the null pointers to stdout and stderr.
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)

    def __exit__(self, *_):
        # Re-assign the real stdout/stderr back to (1) and (2)
        os.dup2(self.save_fds[0], 1)
        os.dup2(self.save_fds[1], 2)
        # Close the null and the saved descriptors so no file descriptors leak
        for fd in self.null_fds + list(self.save_fds):
            os.close(fd)
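

# Usage sketch for suppress_stdout_stderr: because it swaps the OS-level file
# descriptors 1 and 2, it also silences output from C extensions, which
# redirecting sys.stdout alone would not catch.
#
#   with suppress_stdout_stderr():
#       noisy_third_party_call()  # hypothetical function; prints are swallowed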