"""!
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
"""
import time
import logging

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import issparse
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor, LGBMRanker

from . import tune
from .data import group_counts

logger = logging.getLogger(__name__)


class BaseEstimator:
    """The abstract class for all learners.

    Typical example:
        XGBoostEstimator: for regression
        XGBoostSklearnEstimator: for classification
        LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
            for both regression and classification
    """

    def __init__(self, task="binary", **params):
        """Constructor

        Args:
            task: A string of the task type, one of
                'binary', 'multi', 'regression', 'rank', 'forecast'
            n_jobs: An integer of the number of parallel threads
            params: A dictionary of the hyperparameter names and values
        """
        self.params = params
        self.estimator_class = self._model = None
        self._task = task
        if "_estimator_type" in params:
            self._estimator_type = params["_estimator_type"]
            del self.params["_estimator_type"]
        else:
            self._estimator_type = (
                "classifier" if task in ("binary", "multi") else "regressor"
            )

    def get_params(self, deep=False):
        params = self.params.copy()
        params["task"] = self._task
        if hasattr(self, "_estimator_type"):
            params["_estimator_type"] = self._estimator_type
        return params

    @property
    def classes_(self):
        return self._model.classes_

    @property
    def n_features_in_(self):
        return self.model.n_features_in_

    @property
    def model(self):
        """Trained model after fit() is called, or None before fit() is called."""
        return self._model

    @property
    def estimator(self):
        """Trained model after fit() is called, or None before fit() is called."""
        return self._model

    def _preprocess(self, X):
        return X

    def _fit(self, X_train, y_train, **kwargs):
        current_time = time.time()
        if "groups" in kwargs:
            kwargs = kwargs.copy()
            if self._task == "rank":
                kwargs["group"] = group_counts(kwargs["groups"])
                # groups_val = kwargs.get('groups_val')
                # if groups_val is not None:
                #     kwargs['eval_group'] = [group_counts(groups_val)]
                #     kwargs['eval_set'] = [
                #         (kwargs['X_val'], kwargs['y_val'])]
                #     kwargs['verbose'] = False
                #     del kwargs['groups_val'], kwargs['X_val'], kwargs['y_val']
            del kwargs["groups"]
        X_train = self._preprocess(X_train)
        model = self.estimator_class(**self.params)
        model.fit(X_train, y_train, **kwargs)
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def fit(self, X_train, y_train, budget=None, **kwargs):
        """Train the model from given training data.

        Args:
            X_train: A numpy array of training data in shape n*m
            y_train: A numpy array of labels in shape n*1
            budget: A float of the time budget in seconds

        Returns:
            train_time: A float of the training time in seconds
        """
        return self._fit(X_train, y_train, **kwargs)

    def predict(self, X_test):
        """Predict label from features.

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*1.
            Each element is the label for an instance
        """
        if self._model is not None:
            X_test = self._preprocess(X_test)
            return self._model.predict(X_test)
        else:
            return np.ones(X_test.shape[0])

    def predict_proba(self, X_test):
        """Predict the probability of each class from features.

        Only works for classification problems

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*c. c is the # classes
            Each element at (i,j) is the probability for instance i to be in
                class j
        """
        assert self._task in (
            "binary",
            "multi",
        ), "predict_proba() only for classification task."
        X_test = self._preprocess(X_test)
        return self._model.predict_proba(X_test)

    def cleanup(self):
        pass

    @classmethod
    def search_space(cls, **params):
        """[required method] search space.

        Returns:
            A dictionary of the search space.
            Each key is the name of a hyperparameter, and value is a dict with
                its domain and init_value (optional), cat_hp_cost (optional)
                e.g.,
                {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}
        """
        return {}

    @classmethod
    def size(cls, config: dict) -> float:
        """[optional method] memory size of the estimator in bytes.

        Args:
            config: A dict of the hyperparameter config

        Returns:
            A float of the memory size required by the estimator to train the
            given config
        """
        return 1.0

    @classmethod
    def cost_relative2lgbm(cls) -> float:
        """[optional method] relative cost compared to lightgbm."""
        return 1.0

    @classmethod
    def init(cls):
        """[optional method] initialize the class."""
        pass
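

# A minimal sketch (not part of the library) of how a custom learner can be
# built on BaseEstimator; `MyRegressor` is a hypothetical sklearn-style
# estimator class and the hyperparameter name `my_param` is made up:
#
#     class MyEstimator(BaseEstimator):
#         @classmethod
#         def search_space(cls, data_size, **params):
#             return {
#                 "my_param": {
#                     "domain": tune.lograndint(lower=1, upper=32768),
#                     "init_value": 1,
#                 },
#             }
#
#         def __init__(self, task="binary", my_param=1, **params):
#             super().__init__(task, **params)
#             self.params["my_param"] = int(round(my_param))
#             self.estimator_class = MyRegressor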


class SKLearnEstimator(BaseEstimator):
    def __init__(self, task="binary", **params):
        super().__init__(task, **params)

    def _preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            cat_columns = X.select_dtypes(include=["category"]).columns
            if not cat_columns.empty:
                X = X.copy()
                X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # numpy array is not of numeric dtype
            X = pd.DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X
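

# For example (a sketch, assuming a DataFrame with one pandas category column):
# `_preprocess` replaces each category with its integer code, so
#     pd.DataFrame({"c": pd.Series(["a", "b", "a"], dtype="category")})
# becomes a frame whose "c" column holds the codes 0, 1, 0.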


class LGBMEstimator(BaseEstimator):
    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(32768, int(data_size))
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "num_leaves": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "min_child_samples": {
                "domain": tune.lograndint(lower=2, upper=2 ** 7 + 1),
                "init_value": 20,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1.0),
                "init_value": 0.1,
            },
            # 'subsample': {
            #     'domain': tune.uniform(lower=0.1, upper=1.0),
            #     'init_value': 1.0,
            # },
            "log_max_bin": {  # log transformed with base 2
                "domain": tune.lograndint(lower=3, upper=11),
                "init_value": 8,
            },
            "colsample_bytree": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "reg_alpha": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1 / 1024,
            },
            "reg_lambda": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1.0,
            },
        }
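
    # size() below is a heuristic, not a measurement: a tree with k leaves has
    # k - 1 internal nodes; assume roughly 3 stored values per leaf and 4 per
    # internal node, at 8 bytes each, multiplied by the number of trees.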
    @classmethod
    def size(cls, config):
        num_leaves = int(round(config.get("num_leaves") or config["max_leaves"]))
        n_estimators = int(round(config["n_estimators"]))
        return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8

    def __init__(self, task="binary", log_max_bin=8, **params):
        super().__init__(task, **params)
        if "objective" not in self.params:
            # Default: 'regression' for LGBMRegressor,
            # 'binary' or 'multiclass' for LGBMClassifier
            objective = "regression"
            if "binary" in task:
                objective = "binary"
            elif "multi" in task:
                objective = "multiclass"
            elif "rank" == task:
                objective = "lambdarank"
            self.params["objective"] = objective
        if "n_estimators" in self.params:
            self.params["n_estimators"] = int(round(self.params["n_estimators"]))
        if "num_leaves" in self.params:
            self.params["num_leaves"] = int(round(self.params["num_leaves"]))
        if "min_child_samples" in self.params:
            self.params["min_child_samples"] = int(
                round(self.params["min_child_samples"])
            )
        if "max_bin" not in self.params:
            # max_bin = 2**log_max_bin - 1, e.g., 255 when log_max_bin is 8;
            # the parentheses are needed because `-` binds tighter than `<<`
            self.params["max_bin"] = (1 << int(round(log_max_bin))) - 1
        if "verbose" not in self.params:
            self.params["verbose"] = -1
        # if "subsample_freq" not in self.params:
        #     self.params['subsample_freq'] = 1
        if "regression" == task:
            self.estimator_class = LGBMRegressor
        elif "rank" == task:
            self.estimator_class = LGBMRanker
        else:
            self.estimator_class = LGBMClassifier
        self._time_per_iter = None
        self._train_size = 0

    def _preprocess(self, X):
        if (
            not isinstance(X, pd.DataFrame)
            and issparse(X)
            and np.issubdtype(X.dtype, np.integer)
        ):
            X = X.astype(float)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # numpy array is not of numeric dtype
            X = pd.DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X
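
    # fit() below budgets training time with a two-probe scheme: train with
    # 1 tree to measure the startup cost t1, then with 4 trees; the difference
    # divided by the 3 extra iterations estimates the per-iteration time,
    # which then caps n_estimators so training fits in the remaining budget.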
    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        n_iter = self.params["n_estimators"]
        if (
            not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4
        ) and budget is not None:
            self.params["n_estimators"] = 1
            self._t1 = self._fit(X_train, y_train, **kwargs)
            if self._t1 >= budget:
                self.params["n_estimators"] = n_iter
                return self._t1
            self.params["n_estimators"] = 4
            self._t2 = self._fit(X_train, y_train, **kwargs)
            self._time_per_iter = (
                (self._t2 - self._t1) / (self.params["n_estimators"] - 1)
                if self._t2 > self._t1
                else self._t1
                if self._t1
                else 0.001
            )
            self._train_size = X_train.shape[0]
            if self._t1 + self._t2 >= budget or n_iter == self.params["n_estimators"]:
                self.params["n_estimators"] = n_iter
                return time.time() - start_time
        if budget is not None:
            self.params["n_estimators"] = min(
                n_iter,
                int(
                    (budget - time.time() + start_time - self._t1) / self._time_per_iter
                    + 1
                ),
            )
        if self.params["n_estimators"] > 0:
            self._fit(X_train, y_train, **kwargs)
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time


class XGBoostEstimator(SKLearnEstimator):
    """Not using sklearn API, used for regression."""

    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(32768, int(data_size))
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_leaves": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "min_child_weight": {
                "domain": tune.loguniform(lower=0.001, upper=128),
                "init_value": 1,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1.0),
                "init_value": 0.1,
            },
            "subsample": {
                "domain": tune.uniform(lower=0.1, upper=1.0),
                "init_value": 1.0,
            },
            "colsample_bylevel": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "colsample_bytree": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "reg_alpha": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1 / 1024,
            },
            "reg_lambda": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1.0,
            },
        }

    @classmethod
    def size(cls, config):
        return LGBMEstimator.size(config)

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.6

    def __init__(
        self,
        task="regression",
        all_thread=False,
        n_jobs=1,
        n_estimators=4,
        max_leaves=4,
        subsample=1.0,
        min_child_weight=1,
        learning_rate=0.1,
        reg_lambda=1.0,
        reg_alpha=0.0,
        colsample_bylevel=1.0,
        colsample_bytree=1.0,
        tree_method="auto",
        **params,
    ):
        super().__init__(task, **params)
        self._n_estimators = int(round(n_estimators))
        self.params.update(
            {
                "max_leaves": int(round(max_leaves)),
                "max_depth": params.get("max_depth", 0),
                "grow_policy": params.get("grow_policy", "lossguide"),
                "tree_method": tree_method,
                "verbosity": params.get("verbosity", 0),
                "nthread": n_jobs,
                "learning_rate": float(learning_rate),
                "subsample": float(subsample),
                "reg_alpha": float(reg_alpha),
                "reg_lambda": float(reg_lambda),
                "min_child_weight": float(min_child_weight),
                "booster": params.get("booster", "gbtree"),
                "colsample_bylevel": float(colsample_bylevel),
                "colsample_bytree": float(colsample_bytree),
                "objective": params.get("objective"),
            }
        )
        if all_thread:
            del self.params["nthread"]

    def get_params(self, deep=False):
        params = super().get_params()
        params["n_jobs"] = params["nthread"]
        return params

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        if not issparse(X_train):
            self.params["tree_method"] = "hist"
            X_train = self._preprocess(X_train)
        if "sample_weight" in kwargs:
            dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs["sample_weight"])
        else:
            dtrain = xgb.DMatrix(X_train, label=y_train)

        objective = self.params.get("objective")
        if isinstance(objective, str):
            obj = None
        else:
            obj = objective
            if "objective" in self.params:
                del self.params["objective"]
        self._model = xgb.train(self.params, dtrain, self._n_estimators, obj=obj)
        self.params["objective"] = objective
        del dtrain
        train_time = time.time() - start_time
        return train_time

    def predict(self, X_test):
        if not issparse(X_test):
            X_test = self._preprocess(X_test)
        dtest = xgb.DMatrix(X_test)
        return super().predict(dtest)
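

# Note on the custom-objective path above: when `objective` is a callable, it
# is passed to xgb.train as `obj`. A sketch of the signature xgb.train expects
# (squared error, purely for illustration):
#
#     def my_obj(preds, dtrain):
#         labels = dtrain.get_label()
#         grad = preds - labels        # first-order gradient
#         hess = np.ones_like(preds)   # second-order gradient
#         return grad, hess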


class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
    """Using sklearn API, used for classification."""

    @classmethod
    def search_space(cls, data_size, **params):
        return XGBoostEstimator.search_space(data_size)

    @classmethod
    def cost_relative2lgbm(cls):
        return XGBoostEstimator.cost_relative2lgbm()

    def __init__(
        self,
        task="binary",
        n_jobs=1,
        n_estimators=4,
        max_leaves=4,
        subsample=1.0,
        min_child_weight=1,
        learning_rate=0.1,
        reg_lambda=1.0,
        reg_alpha=0.0,
        colsample_bylevel=1.0,
        colsample_bytree=1.0,
        tree_method="hist",
        **params,
    ):
        super().__init__(task, **params)
        del self.params["objective"]
        del self.params["max_bin"]
        del self.params["verbose"]
        self.params.update(
            {
                "n_estimators": int(round(n_estimators)),
                "max_leaves": int(round(max_leaves)),
                "max_depth": 0,
                "grow_policy": params.get("grow_policy", "lossguide"),
                "tree_method": tree_method,
                "n_jobs": n_jobs,
                "verbosity": 0,
                "learning_rate": float(learning_rate),
                "subsample": float(subsample),
                "reg_alpha": float(reg_alpha),
                "reg_lambda": float(reg_lambda),
                "min_child_weight": float(min_child_weight),
                "booster": params.get("booster", "gbtree"),
                "colsample_bylevel": float(colsample_bylevel),
                "colsample_bytree": float(colsample_bytree),
                "use_label_encoder": params.get("use_label_encoder", False),
            }
        )

        self.estimator_class = xgb.XGBRegressor
        if "rank" == task:
            self.estimator_class = xgb.XGBRanker
        elif task in ("binary", "multi"):
            self.estimator_class = xgb.XGBClassifier
        self._time_per_iter = None
        self._train_size = 0

    def fit(self, X_train, y_train, budget=None, **kwargs):
        if issparse(X_train):
            self.params["tree_method"] = "auto"
        return super().fit(X_train, y_train, budget, **kwargs)


class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
    @classmethod
    def search_space(cls, data_size, task, **params):
        data_size = int(data_size)
        upper = min(2048, data_size)
        space = {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_features": {
                "domain": tune.loguniform(lower=0.1, upper=1.0),
                "init_value": 1.0,
            },
            "max_leaves": {
                "domain": tune.lograndint(lower=4, upper=min(32768, data_size)),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
        }
        if task in ("binary", "multi"):
            space["criterion"] = {
                "domain": tune.choice(["gini", "entropy"]),
                # 'init_value': 'gini',
            }
        return space

    @classmethod
    def cost_relative2lgbm(cls):
        return 2.0

    def __init__(
        self,
        task="binary",
        n_jobs=1,
        n_estimators=4,
        max_features=1.0,
        criterion="gini",
        max_leaves=4,
        **params,
    ):
        super().__init__(task, **params)
        del self.params["objective"]
        del self.params["max_bin"]
        self.params.update(
            {
                "n_estimators": int(round(n_estimators)),
                "n_jobs": n_jobs,
                "verbose": 0,
                "max_features": float(max_features),
                "max_leaf_nodes": params.get("max_leaf_nodes", int(round(max_leaves))),
            }
        )
        self.estimator_class = RandomForestRegressor
        if task in ("binary", "multi"):
            self.estimator_class = RandomForestClassifier
            self.params["criterion"] = criterion

    def get_params(self, deep=False):
        params = super().get_params()
        return params


class ExtraTreeEstimator(RandomForestEstimator):
    @classmethod
    def cost_relative2lgbm(cls):
        return 1.9

    def __init__(self, task="binary", **params):
        super().__init__(task, **params)
        if "regression" in task:
            self.estimator_class = ExtraTreesRegressor
        else:
            self.estimator_class = ExtraTreesClassifier


class LRL1Classifier(SKLearnEstimator):
    @classmethod
    def search_space(cls, **params):
        return {
            "C": {
                "domain": tune.loguniform(lower=0.03125, upper=32768.0),
                "init_value": 1.0,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 160

    def __init__(self, task="binary", n_jobs=1, tol=0.0001, C=1.0, **params):
        super().__init__(task, **params)
        self.params.update(
            {
                "penalty": params.get("penalty", "l1"),
                "tol": float(tol),
                "C": float(C),
                "solver": params.get("solver", "saga"),
                "n_jobs": n_jobs,
            }
        )
        assert task in (
            "binary",
            "multi",
        ), "LogisticRegression for classification task only"
        self.estimator_class = LogisticRegression
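

# Why "saga" above: in scikit-learn, penalty="l1" is only supported by the
# "saga" and "liblinear" solvers; "lbfgs" (the default used by LRL2Classifier
# below) handles l2 only.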


class LRL2Classifier(SKLearnEstimator):
    @classmethod
    def search_space(cls, **params):
        return LRL1Classifier.search_space(**params)

    @classmethod
    def cost_relative2lgbm(cls):
        return 25

    def __init__(self, task="binary", n_jobs=1, tol=0.0001, C=1.0, **params):
        super().__init__(task, **params)
        self.params.update(
            {
                "penalty": params.get("penalty", "l2"),
                "tol": float(tol),
                "C": float(C),
                "solver": params.get("solver", "lbfgs"),
                "n_jobs": n_jobs,
            }
        )
        assert task in (
            "binary",
            "multi",
        ), "LogisticRegression for classification task only"
        self.estimator_class = LogisticRegression


class CatBoostEstimator(BaseEstimator):
    _time_per_iter = None
    _train_size = 0

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(min(round(1500000 / data_size), 150), 12)
        return {
            "early_stopping_rounds": {
                "domain": tune.lograndint(lower=10, upper=upper),
                "init_value": 10,
                "low_cost_init_value": 10,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=0.005, upper=0.2),
                "init_value": 0.1,
            },
        }

    @classmethod
    def size(cls, config):
        n_estimators = 8192
        max_leaves = 64
        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

    @classmethod
    def cost_relative2lgbm(cls):
        return 15

    @classmethod
    def init(cls):
        CatBoostEstimator._time_per_iter = None
        CatBoostEstimator._train_size = 0

    def _preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            cat_columns = X.select_dtypes(include=["category"]).columns
            if not cat_columns.empty:
                X = X.copy()
                X[cat_columns] = X[cat_columns].apply(
                    lambda x: x.cat.rename_categories(
                        [
                            str(c) if isinstance(c, float) else c
                            for c in x.cat.categories
                        ]
                    )
                )
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # numpy array is not of numeric dtype
            X = pd.DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X

    def __init__(
        self,
        task="binary",
        n_jobs=1,
        n_estimators=8192,
        learning_rate=0.1,
        early_stopping_rounds=4,
        **params,
    ):
        super().__init__(task, **params)
        self.params.update(
            {
                "early_stopping_rounds": int(round(early_stopping_rounds)),
                "n_estimators": n_estimators,
                "learning_rate": learning_rate,
                "thread_count": n_jobs,
                "verbose": params.get("verbose", False),
                "random_seed": params.get("random_seed", 10242048),
            }
        )
        from catboost import CatBoostRegressor

        self.estimator_class = CatBoostRegressor
        if task in ("binary", "multi"):
            from catboost import CatBoostClassifier

            self.estimator_class = CatBoostClassifier

    def get_params(self, deep=False):
        params = super().get_params()
        params["n_jobs"] = params["thread_count"]
        return params

    def fit(self, X_train, y_train, budget=None, **kwargs):
        import shutil

        start_time = time.time()
        train_dir = f"catboost_{str(start_time)}"
        n_iter = self.params["n_estimators"]
        X_train = self._preprocess(X_train)
        if isinstance(X_train, pd.DataFrame):
            cat_features = list(X_train.select_dtypes(include="category").columns)
        else:
            cat_features = []
        # from catboost import CatBoostError
        # try:
        if (
            not CatBoostEstimator._time_per_iter
            or abs(CatBoostEstimator._train_size - len(y_train)) > 4
        ) and budget:
            # measure the time per iteration
            self.params["n_estimators"] = 1
            CatBoostEstimator._smallmodel = self.estimator_class(
                train_dir=train_dir, **self.params
            )
            CatBoostEstimator._smallmodel.fit(
                X_train, y_train, cat_features=cat_features, **kwargs
            )
            CatBoostEstimator._t1 = time.time() - start_time
            if CatBoostEstimator._t1 >= budget:
                self.params["n_estimators"] = n_iter
                self._model = CatBoostEstimator._smallmodel
                shutil.rmtree(train_dir, ignore_errors=True)
                return CatBoostEstimator._t1
            self.params["n_estimators"] = 4
            CatBoostEstimator._smallmodel = self.estimator_class(
                train_dir=train_dir, **self.params
            )
            CatBoostEstimator._smallmodel.fit(
                X_train, y_train, cat_features=cat_features, **kwargs
            )
            CatBoostEstimator._time_per_iter = (
                time.time() - start_time - CatBoostEstimator._t1
            ) / (self.params["n_estimators"] - 1)
            if CatBoostEstimator._time_per_iter <= 0:
                CatBoostEstimator._time_per_iter = CatBoostEstimator._t1
            CatBoostEstimator._train_size = len(y_train)
            if (
                time.time() - start_time >= budget
                or n_iter == self.params["n_estimators"]
            ):
                self.params["n_estimators"] = n_iter
                self._model = CatBoostEstimator._smallmodel
                shutil.rmtree(train_dir, ignore_errors=True)
                return time.time() - start_time
        if budget:
            train_times = 1
            self.params["n_estimators"] = min(
                n_iter,
                int(
                    (budget - time.time() + start_time - CatBoostEstimator._t1)
                    / train_times
                    / CatBoostEstimator._time_per_iter
                    + 1
                ),
            )
            self._model = CatBoostEstimator._smallmodel
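
        # Final fit: hold out the tail of the training data (10%, capped at
        # 1000 rows) as an eval_set so CatBoost's early_stopping_rounds can
        # stop training well before n_estimators when the metric plateaus.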
        if self.params["n_estimators"] > 0:
            n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
            X_tr, y_tr = X_train[:n], y_train[:n]
            if "sample_weight" in kwargs:
                weight = kwargs["sample_weight"]
                if weight is not None:
                    kwargs["sample_weight"] = weight[:n]
            else:
                weight = None
            from catboost import Pool

            model = self.estimator_class(train_dir=train_dir, **self.params)
            model.fit(
                X_tr,
                y_tr,
                cat_features=cat_features,
                eval_set=Pool(
                    data=X_train[n:], label=y_train[n:], cat_features=cat_features
                ),
                **kwargs,
            )  # model.get_best_iteration()
            shutil.rmtree(train_dir, ignore_errors=True)
            if weight is not None:
                kwargs["sample_weight"] = weight
            self._model = model
        # except CatBoostError:
        #     self._model = None
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time


class KNeighborsEstimator(BaseEstimator):
    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(512, int(data_size / 2))
        return {
            "n_neighbors": {
                "domain": tune.lograndint(lower=1, upper=upper),
                "init_value": 5,
                "low_cost_init_value": 1,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 30

    def __init__(self, task="binary", n_jobs=1, n_neighbors=5, **params):
        super().__init__(task, **params)
        self.params.update(
            {
                "n_neighbors": int(round(n_neighbors)),
                "weights": params.get("weights", "distance"),
                "n_jobs": n_jobs,
            }
        )
        from sklearn.neighbors import KNeighborsRegressor

        self.estimator_class = KNeighborsRegressor
        if task in ("binary", "multi"):
            from sklearn.neighbors import KNeighborsClassifier

            self.estimator_class = KNeighborsClassifier

    def _preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            cat_columns = X.select_dtypes(["category"]).columns
            if X.shape[1] == len(cat_columns):
                raise ValueError("KNeighbors requires at least one numeric feature")
            X = X.drop(cat_columns, axis=1)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # drop categorical columns if any
            X = pd.DataFrame(X)
            cat_columns = []
            for col in X.columns:
                if isinstance(X[col][0], str):
                    cat_columns.append(col)
            X = X.drop(cat_columns, axis=1)
            X = X.to_numpy()
        return X
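

# Categorical columns are dropped (rather than encoded) above because
# Euclidean distance over arbitrary integer category codes would impose a
# meaningless ordering on the categories.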


class Prophet(BaseEstimator):
    @classmethod
    def search_space(cls, **params):
        space = {
            "changepoint_prior_scale": {
                "domain": tune.loguniform(lower=0.001, upper=1000),
                "init_value": 0.01,
                "low_cost_init_value": 0.001,
            },
            "seasonality_prior_scale": {
                "domain": tune.loguniform(lower=0.01, upper=100),
                "init_value": 1,
            },
            "holidays_prior_scale": {
                "domain": tune.loguniform(lower=0.01, upper=100),
                "init_value": 1,
            },
            "seasonality_mode": {
                "domain": tune.choice(["additive", "multiplicative"]),
                "init_value": "multiplicative",
            },
        }
        return space

    def __init__(self, task="forecast", **params):
        if "n_jobs" in params:
            params.pop("n_jobs")
        super().__init__(task, **params)

    def _join(self, X_train, y_train):
        assert "ds" in X_train, (
            "Dataframe for training forecast model must have column"
            ' "ds" with the dates in X_train.'
        )
        y_train = pd.DataFrame(y_train, columns=["y"])
        train_df = X_train.join(y_train)
        return train_df

    def fit(self, X_train, y_train, budget=None, **kwargs):
        from prophet import Prophet

        current_time = time.time()
        train_df = self._join(X_train, y_train)
        model = Prophet(**self.params).fit(train_df)
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def predict(self, X_test):
        if isinstance(X_test, int):
            raise ValueError(
                "predict() with steps is only supported for arima/sarimax."
                " For Prophet, pass a dataframe with a date column named ds."
            )
        if self._model is not None:
            forecast = self._model.predict(X_test)
            return forecast["yhat"]
        else:
            logger.warning(
                "Estimator is not fit yet. Please run fit() before predict()."
            )
            return np.ones(X_test.shape[0])
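

# A rough usage sketch (assuming the `prophet` package is installed and `df`
# is a pandas DataFrame with a datetime column "ds" and numeric column "y",
# indexed so that X and y align for the join):
#
#     est = Prophet(seasonality_mode="multiplicative")
#     est.fit(df[["ds"]], df["y"].values)
#     yhat = est.predict(df[["ds"]])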


class ARIMA(Prophet):
    @classmethod
    def search_space(cls, **params):
        space = {
            "p": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "d": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "q": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
        }
        return space

    def _join(self, X_train, y_train):
        train_df = super()._join(X_train, y_train)
        train_df.index = pd.to_datetime(train_df["ds"])
        train_df = train_df.drop("ds", axis=1)
        return train_df

    def fit(self, X_train, y_train, budget=None, **kwargs):
        import warnings

        warnings.filterwarnings("ignore")
        from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator

        current_time = time.time()
        train_df = self._join(X_train, y_train)
        model = ARIMA_estimator(
            train_df,
            order=(self.params["p"], self.params["d"], self.params["q"]),
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def predict(self, X_test):
        if self._model is not None:
            if isinstance(X_test, int):
                forecast = self._model.forecast(steps=X_test)
            elif isinstance(X_test, pd.DataFrame):
                start = X_test.iloc[0, 0]
                end = X_test.iloc[-1, 0]
                forecast = self._model.predict(start=start, end=end)
            else:
                raise ValueError(
                    "X_test needs to be either a pd.DataFrame with dates as column ds"
                    " or an int number of periods for predict()."
                )
            return forecast
        else:
            return np.ones(X_test if isinstance(X_test, int) else X_test.shape[0])
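

# In the (p, d, q) order above, p is the number of autoregressive lags, d the
# degree of differencing, and q the size of the moving-average window, as in
# statsmodels' ARIMA.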


class SARIMAX(ARIMA):
    @classmethod
    def search_space(cls, **params):
        space = {
            "p": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "d": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "q": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 2,
                "low_cost_init_value": 0,
            },
            "P": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "D": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "Q": {
                "domain": tune.quniform(lower=0, upper=10, q=1),
                "init_value": 1,
                "low_cost_init_value": 0,
            },
            "s": {
                "domain": tune.choice([1, 4, 6, 12]),
                "init_value": 12,
            },
        }
        return space

    def fit(self, X_train, y_train, budget=None, **kwargs):
        from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator

        current_time = time.time()
        train_df = self._join(X_train, y_train)
        model = SARIMAX_estimator(
            train_df,
            order=(self.params["p"], self.params["d"], self.params["q"]),
            # statsmodels' keyword is `seasonal_order`, not `seasonality_order`
            seasonal_order=(
                self.params["P"],
                self.params["D"],
                self.params["Q"],
                self.params["s"],
            ),
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time
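

# The seasonal_order tuple (P, D, Q, s) mirrors (p, d, q) at the seasonal lag:
# s is the season length (e.g., 12 for monthly data with a yearly cycle), and
# P, D, Q are the seasonal AR, differencing, and MA orders.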