'''!
 * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''

import numpy as np
import xgboost as xgb
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
from scipy.sparse import issparse
import pandas as pd
from . import tune

import logging

logger = logging.getLogger(__name__)


class BaseEstimator:
    '''The abstract class for all learners

    Typical examples:
        XGBoostEstimator: for regression
        XGBoostSklearnEstimator: for classification
        LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
            for both regression and classification
    '''

    def __init__(self, task='binary:logistic', **params):
        '''Constructor

        Args:
            task: A string of the task type, one of
                'binary:logistic', 'multi:softmax', 'regression'
            n_jobs: An integer of the number of parallel threads
            params: A dictionary of the hyperparameter names and values
        '''
        self.params = params
        # set by subclasses; _model stays None until fit() is called
        self.estimator_class = self._model = None
        self._task = task
        if '_estimator_type' in params:
            self._estimator_type = params['_estimator_type']
        else:
            self._estimator_type = "regressor" if task == 'regression' \
                else "classifier"

    def get_params(self, deep=False):
        params = self.params.copy()
        params["task"] = self._task
        if hasattr(self, '_estimator_type'):
            params['_estimator_type'] = self._estimator_type
        return params

    @property
    def classes_(self):
        return self._model.classes_

    @property
    def n_features_in_(self):
        return self.model.n_features_in_

    @property
    def model(self):
        '''Trained model after fit() is called, or None before fit() is called
        '''
        return self._model

    def _preprocess(self, X):
        return X

    def _fit(self, X_train, y_train, **kwargs):
        current_time = time.time()
        X_train = self._preprocess(X_train)
        model = self.estimator_class(**self.params)
        model.fit(X_train, y_train, **kwargs)
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def fit(self, X_train, y_train, budget=None, **kwargs):
        '''Train the model from given training data

        Args:
            X_train: A numpy array of training data in shape n*m
            y_train: A numpy array of labels in shape n*1
            budget: A float of the time budget in seconds

        Returns:
            train_time: A float of the training time in seconds
        '''
        return self._fit(X_train, y_train, **kwargs)

    def predict(self, X_test):
        '''Predict label from features

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*1.
            Each element is the label for an instance
        '''
        if self._model is not None:
            X_test = self._preprocess(X_test)
            return self._model.predict(X_test)
        else:
            # no model has been trained; fall back to a constant label
            return np.ones(X_test.shape[0])

    def predict_proba(self, X_test):
        '''Predict the probability of each class from features

        Only works for classification problems

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*c. c is the # classes
            Each element at (i,j) is the probability for instance i to be in
                class j
        '''
        if 'regression' in self._task:
            raise ValueError('Regression tasks do not support predict_proba')
        else:
            X_test = self._preprocess(X_test)
            return self._model.predict_proba(X_test)

    def cleanup(self):
        pass

    @classmethod
    def search_space(cls, **params):
        '''[required method] search space

        Returns:
            A dictionary of the search space.
            Each key is the name of a hyperparameter, and value is a dict with
                its domain and init_value (optional), cat_hp_cost (optional)
                e.g.,
                {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}
        '''
        return {}

    @classmethod
    def size(cls, config):
        '''[optional method] memory size of the estimator in bytes

        Args:
            config: A dict of the hyperparameter config

        Returns:
            A float of the memory size required by the estimator to train the
                given config
        '''
        return 1.0

    @classmethod
    def cost_relative2lgbm(cls):
        '''[optional method] relative cost compared to lightgbm'''
        return 1.0


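# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the library): a new learner plugs in
# by subclassing BaseEstimator, setting `estimator_class`, and overriding
# `search_space`. The estimator choice (sklearn's Ridge) and its
# hyperparameter range below are hypothetical examples, not tuned defaults.
def _example_custom_estimator():
    from sklearn.linear_model import Ridge

    class _RidgeEstimator(BaseEstimator):

        @classmethod
        def search_space(cls, **params):
            return {
                'alpha': {
                    'domain': tune.loguniform(lower=1e-4, upper=10.0),
                    'init_value': 1.0,
                },
            }

        def __init__(self, task='regression', alpha=1.0, **params):
            super().__init__(task, **params)
            self.params = {'alpha': float(alpha)}
            self.estimator_class = Ridge

    return _RidgeEstimator

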
class SKLearnEstimator(BaseEstimator):

    def _preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.copy()
            # sklearn estimators cannot consume pandas 'category' dtype;
            # replace each categorical column with its integer codes
            cat_columns = X.select_dtypes(include=['category']).columns
            X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
        return X


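# Usage sketch (hypothetical data): SKLearnEstimator._preprocess replaces
# each pandas 'category' column with its integer codes before fitting.
def _example_category_preprocess():
    df = pd.DataFrame({
        'color': pd.Series(['red', 'blue', 'red'], dtype='category'),
        'size': [1.0, 2.0, 3.0],
    })
    # 'color' becomes codes [1, 0, 1]; 'size' is left unchanged
    return SKLearnEstimator(task='binary:logistic')._preprocess(df)

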
class LGBMEstimator(BaseEstimator):

    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(32768, int(data_size))
        return {
            'n_estimators': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'num_leaves': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'min_child_samples': {
                'domain': tune.qloguniform(lower=2, upper=2**7, q=1),
                'init_value': 20,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1.0),
                'init_value': 0.1,
            },
            'subsample': {
                'domain': tune.uniform(lower=0.1, upper=1.0),
                'init_value': 1.0,
            },
            'log_max_bin': {
                'domain': tune.qloguniform(lower=3, upper=10, q=1),
                'init_value': 8,
            },
            'colsample_bytree': {
                'domain': tune.uniform(lower=0.01, upper=1.0),
                'init_value': 1.0,
            },
            'reg_alpha': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1024),
                'init_value': 1 / 1024,
            },
            'reg_lambda': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1024),
                'init_value': 1.0,
            },
        }

    @classmethod
    def size(cls, config):
        num_leaves = int(round(config.get('num_leaves') or config['max_leaves']))
        n_estimators = int(round(config['n_estimators']))
        # rough per-tree footprint: 3 values per leaf, 4 per split node,
        # 8 bytes each
        return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8

    def __init__(
        self, task='binary:logistic', n_jobs=1,
        n_estimators=2, num_leaves=2, min_child_samples=20, learning_rate=0.1,
        subsample=1.0, reg_lambda=1.0, reg_alpha=0.0,
        colsample_bytree=1.0, log_max_bin=8, **params
    ):
        super().__init__(task, **params)
        # objective default: 'regression' for LGBMRegressor,
        # 'binary' or 'multiclass' for LGBMClassifier
        if 'regression' in task:
            objective = 'regression'
        elif 'binary' in task:
            objective = 'binary'
        elif 'multi' in task:
            objective = 'multiclass'
        else:
            objective = 'regression'
        self.params = {
            "n_estimators": int(round(n_estimators)),
            "num_leaves": int(round(num_leaves)),
            'objective': params.get("objective", objective),
            'n_jobs': n_jobs,
            'learning_rate': float(learning_rate),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_samples': int(round(min_child_samples)),
            'colsample_bytree': float(colsample_bytree),
            'subsample': float(subsample),
        }
        # max_bin is 2^log_max_bin - 1 when derived from log_max_bin
        self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
            1 << int(round(log_max_bin))) - 1
        if 'regression' in task:
            self.estimator_class = LGBMRegressor
        else:
            self.estimator_class = LGBMClassifier
        self._time_per_iter = None
        self._train_size = 0

    def _preprocess(self, X):
        # LightGBM accepts sparse input, but integer-typed sparse matrices
        # need to be cast to float first
        if not isinstance(X, pd.DataFrame) and issparse(X) and np.issubdtype(
                X.dtype, np.integer):
            X = X.astype(float)
        return X

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        n_iter = self.params["n_estimators"]
        if (not self._time_per_iter or abs(
                self._train_size - X_train.shape[0]) > 4) and budget is not None:
            # estimate the time per boosting iteration from a 1-round
            # and a 4-round fit
            self.params["n_estimators"] = 1
            self._t1 = self._fit(X_train, y_train, **kwargs)
            if self._t1 >= budget:
                self.params["n_estimators"] = n_iter
                return self._t1
            self.params["n_estimators"] = 4
            self._t2 = self._fit(X_train, y_train, **kwargs)
            self._time_per_iter = (self._t2 - self._t1) / (
                self.params["n_estimators"] - 1) if self._t2 > self._t1 \
                else self._t1 if self._t1 else 0.001
            self._train_size = X_train.shape[0]
            if self._t1 + self._t2 >= budget or n_iter == self.params[
                    "n_estimators"]:
                self.params["n_estimators"] = n_iter
                return time.time() - start_time
        if budget is not None:
            # cap the number of iterations by the remaining time budget
            self.params["n_estimators"] = min(n_iter, int(
                (budget - time.time() + start_time - self._t1)
                / self._time_per_iter + 1))
        if self.params["n_estimators"] > 0:
            self._fit(X_train, y_train, **kwargs)
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time


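# Usage sketch (hypothetical data and budget): LGBMEstimator.fit times a
# 1-iteration and a 4-iteration fit to estimate seconds per boosting round,
# then shrinks n_estimators so the final fit stays within `budget`.
def _example_budget_aware_fit(X_train, y_train):
    estimator = LGBMEstimator(task='regression', n_estimators=100)
    train_time = estimator.fit(X_train, y_train, budget=10)  # 10-second budget
    return estimator.predict(X_train), train_time

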
class XGBoostEstimator(SKLearnEstimator):
    ''' not using sklearn API, used for regression '''

    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(32768, int(data_size))
        return {
            'n_estimators': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'max_leaves': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'min_child_weight': {
                'domain': tune.loguniform(lower=0.001, upper=128),
                'init_value': 1,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1.0),
                'init_value': 0.1,
            },
            'subsample': {
                'domain': tune.uniform(lower=0.1, upper=1.0),
                'init_value': 1.0,
            },
            'colsample_bylevel': {
                'domain': tune.uniform(lower=0.01, upper=1.0),
                'init_value': 1.0,
            },
            'colsample_bytree': {
                'domain': tune.uniform(lower=0.01, upper=1.0),
                'init_value': 1.0,
            },
            'reg_alpha': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1024),
                'init_value': 1 / 1024,
            },
            'reg_lambda': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1024),
                'init_value': 1.0,
            },
        }

    @classmethod
    def size(cls, config):
        return LGBMEstimator.size(config)

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.6

    def __init__(
        self, task='regression', all_thread=False, n_jobs=1,
        n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1,
        learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0,
        colsample_bytree=1.0, tree_method='auto', **params
    ):
        super().__init__(task, **params)
        self._n_estimators = int(round(n_estimators))
        self.params = {
            'max_leaves': int(round(max_leaves)),
            'max_depth': params.get('max_depth', 0),
            'grow_policy': params.get("grow_policy", 'lossguide'),
            'tree_method': tree_method,
            'verbosity': params.get('verbosity', 0),
            'nthread': n_jobs,
            'learning_rate': float(learning_rate),
            'subsample': float(subsample),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'booster': params.get('booster', 'gbtree'),
            'colsample_bylevel': float(colsample_bylevel),
            'colsample_bytree': float(colsample_bytree),
            'objective': params.get("objective"),
        }
        if all_thread:
            # let xgboost use all threads by leaving nthread unset
            del self.params['nthread']

    def get_params(self, deep=False):
        params = super().get_params()
        params["n_jobs"] = params['nthread']
        return params

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        if not issparse(X_train):
            self.params['tree_method'] = 'hist'
            X_train = self._preprocess(X_train)
        if 'sample_weight' in kwargs:
            dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs[
                'sample_weight'])
        else:
            dtrain = xgb.DMatrix(X_train, label=y_train)

        objective = self.params.get('objective')
        if isinstance(objective, str):
            obj = None
        else:
            # a callable objective is passed to xgb.train directly,
            # not through the params dict
            obj = objective
            if 'objective' in self.params:
                del self.params['objective']
        self._model = xgb.train(self.params, dtrain, self._n_estimators,
                                obj=obj)
        self.params['objective'] = objective
        del dtrain
        train_time = time.time() - start_time
        return train_time

    def predict(self, X_test):
        if not issparse(X_test):
            X_test = self._preprocess(X_test)
        dtest = xgb.DMatrix(X_test)
        return super().predict(dtest)


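# Sketch of the native-API path taken by XGBoostEstimator.fit (assumes dense
# numeric inputs): wrap the data in a DMatrix and call xgb.train for a fixed
# number of boosting rounds, bypassing the sklearn wrapper.
def _example_xgb_native_train(X_train, y_train):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    params = {'tree_method': 'hist', 'grow_policy': 'lossguide',
              'max_depth': 0, 'max_leaves': 4, 'verbosity': 0}
    return xgb.train(params, dtrain, num_boost_round=4)

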
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
    ''' using sklearn API, used for classification '''

    @classmethod
    def search_space(cls, data_size, **params):
        return XGBoostEstimator.search_space(data_size)

    @classmethod
    def cost_relative2lgbm(cls):
        return XGBoostEstimator.cost_relative2lgbm()

    def __init__(
        self, task='binary:logistic', n_jobs=1,
        n_estimators=4, max_leaves=4, subsample=1.0,
        min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0,
        colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist',
        **params
    ):
        super().__init__(task, **params)
        self.params = {
            "n_estimators": int(round(n_estimators)),
            'max_leaves': int(round(max_leaves)),
            'max_depth': 0,
            'grow_policy': params.get("grow_policy", 'lossguide'),
            'tree_method': tree_method,
            'verbosity': 0,
            'n_jobs': n_jobs,
            'learning_rate': float(learning_rate),
            'subsample': float(subsample),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'booster': params.get('booster', 'gbtree'),
            'colsample_bylevel': float(colsample_bylevel),
            'colsample_bytree': float(colsample_bytree),
        }
        if 'regression' in task:
            self.estimator_class = xgb.XGBRegressor
        else:
            self.estimator_class = xgb.XGBClassifier
        self._time_per_iter = None
        self._train_size = 0

    def fit(self, X_train, y_train, budget=None, **kwargs):
        if issparse(X_train):
            self.params['tree_method'] = 'auto'
        return super().fit(X_train, y_train, budget, **kwargs)


class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):

    @classmethod
    def search_space(cls, data_size, task, **params):
        upper = min(2048, int(data_size))
        space = {
            'n_estimators': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'max_features': {
                'domain': tune.loguniform(lower=0.1, upper=1.0),
                'init_value': 1.0,
            },
        }
        if task != 'regression':
            space['criterion'] = {
                'domain': tune.choice(['gini', 'entropy']),
                # 'init_value': 'gini',
            }
        return space

    @classmethod
    def size(cls, config):
        return 1.0

    @classmethod
    def cost_relative2lgbm(cls):
        return 2.0

    def __init__(
        self, task='binary:logistic', n_jobs=1,
        n_estimators=4, max_features=1.0, criterion='gini', **params
    ):
        super().__init__(task, **params)
        self.params = {
            "n_estimators": int(round(n_estimators)),
            "n_jobs": n_jobs,
            'max_features': float(max_features),
        }
        if 'regression' in task:
            self.estimator_class = RandomForestRegressor
        else:
            self.estimator_class = RandomForestClassifier
            # criterion is only tuned for classification
            self.params['criterion'] = criterion
        self._time_per_iter = None
        self._train_size = 0

    def get_params(self, deep=False):
        params = super().get_params()
        return params


class ExtraTreeEstimator(RandomForestEstimator):

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.9

    def __init__(self, task='binary:logistic', **params):
        super().__init__(task, **params)
        if 'regression' in task:
            self.estimator_class = ExtraTreesRegressor
        else:
            self.estimator_class = ExtraTreesClassifier


class LRL1Classifier(SKLearnEstimator):

    @classmethod
    def search_space(cls, **params):
        return {
            'C': {
                'domain': tune.loguniform(lower=0.03125, upper=32768.0),
                'init_value': 1.0,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 160

    def __init__(
        self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
        **params
    ):
        super().__init__(task, **params)
        self.params = {
            'penalty': params.get("penalty", 'l1'),
            'tol': float(tol),
            'C': float(C),
            'solver': params.get("solver", 'saga'),
            'n_jobs': n_jobs,
        }
        if 'regression' in task:
            self.estimator_class = None
            raise NotImplementedError('LR does not support regression task')
        else:
            self.estimator_class = LogisticRegression


class LRL2Classifier(SKLearnEstimator):

    @classmethod
    def search_space(cls, **params):
        return LRL1Classifier.search_space(**params)

    @classmethod
    def cost_relative2lgbm(cls):
        return 25

    def __init__(
        self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
        **params
    ):
        super().__init__(task, **params)
        self.params = {
            'penalty': params.get("penalty", 'l2'),
            'tol': float(tol),
            'C': float(C),
            'solver': params.get("solver", 'lbfgs'),
            'n_jobs': n_jobs,
        }
        if 'regression' in task:
            self.estimator_class = None
            raise NotImplementedError('LR does not support regression task')
        else:
            self.estimator_class = LogisticRegression


class CatBoostEstimator(BaseEstimator):

    _time_per_iter = None
    _train_size = 0

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(min(round(1500000 / data_size), 150), 11)
        return {
            'early_stopping_rounds': {
                'domain': tune.qloguniform(lower=10, upper=upper, q=1),
                'init_value': 10,
                'low_cost_init_value': 10,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=.005, upper=.2),
                'init_value': 0.1,
            },
        }

    @classmethod
    def size(cls, config):
        n_estimators = 8192
        max_leaves = 64
        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

    @classmethod
    def cost_relative2lgbm(cls):
        return 15

    def __init__(
        self, task='binary:logistic', n_jobs=1,
        n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
    ):
        super().__init__(task, **params)
        self.params = {
            "early_stopping_rounds": int(round(early_stopping_rounds)),
            "n_estimators": n_estimators,
            'learning_rate': learning_rate,
            'thread_count': n_jobs,
            'verbose': params.get('verbose', False),
            'random_seed': params.get("random_seed", 10242048),
        }
        if 'regression' in task:
            from catboost import CatBoostRegressor
            self.estimator_class = CatBoostRegressor
        else:
            from catboost import CatBoostClassifier
            self.estimator_class = CatBoostClassifier

    def get_params(self, deep=False):
        params = super().get_params()
        params['n_jobs'] = params['thread_count']
        return params

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        n_iter = self.params["n_estimators"]
        if isinstance(X_train, pd.DataFrame):
            cat_features = list(X_train.select_dtypes(
                include='category').columns)
        else:
            cat_features = []
        from catboost import CatBoostError
        try:
            if (not CatBoostEstimator._time_per_iter or abs(
                    CatBoostEstimator._train_size - len(y_train)) > 4) and budget:
                # measure the time per iteration
                self.params["n_estimators"] = 1
                CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
                CatBoostEstimator._smallmodel.fit(
                    X_train, y_train, cat_features=cat_features, **kwargs)
                CatBoostEstimator._t1 = time.time() - start_time
                if CatBoostEstimator._t1 >= budget:
                    self.params["n_estimators"] = n_iter
                    self._model = CatBoostEstimator._smallmodel
                    return CatBoostEstimator._t1
                self.params["n_estimators"] = 4
                CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
                CatBoostEstimator._smallmodel.fit(
                    X_train, y_train, cat_features=cat_features, **kwargs)
                CatBoostEstimator._time_per_iter = (
                    time.time() - start_time - CatBoostEstimator._t1) / (
                    self.params["n_estimators"] - 1)
                if CatBoostEstimator._time_per_iter <= 0:
                    CatBoostEstimator._time_per_iter = CatBoostEstimator._t1
                CatBoostEstimator._train_size = len(y_train)
                if time.time() - start_time >= budget or n_iter == self.params[
                        "n_estimators"]:
                    self.params["n_estimators"] = n_iter
                    self._model = CatBoostEstimator._smallmodel
                    return time.time() - start_time
            if budget:
                train_times = 1
                self.params["n_estimators"] = min(n_iter, int(
                    (budget - time.time() + start_time - CatBoostEstimator._t1)
                    / train_times / CatBoostEstimator._time_per_iter + 1))
                self._model = CatBoostEstimator._smallmodel
            if self.params["n_estimators"] > 0:
                # hold out the last 10% (at most 1000) rows as the eval set
                # for early stopping
                n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
                X_tr, y_tr = X_train[:n], y_train[:n]
                if 'sample_weight' in kwargs:
                    weight = kwargs['sample_weight']
                    if weight is not None:
                        kwargs['sample_weight'] = weight[:n]
                else:
                    weight = None
                from catboost import Pool
                model = self.estimator_class(**self.params)
                model.fit(
                    X_tr, y_tr, cat_features=cat_features,
                    eval_set=Pool(
                        data=X_train[n:], label=y_train[n:],
                        cat_features=cat_features),
                    **kwargs)  # model.get_best_iteration()
                if weight is not None:
                    kwargs['sample_weight'] = weight
                self._model = model
        except CatBoostError:
            self._model = None
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time


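# Sketch of the holdout rule inside CatBoostEstimator.fit: the last 10% of
# rows (capped at 1000) form the eval_set, so CatBoost's early stopping can
# pick the best iteration within the remaining budget.
def _example_catboost_holdout(n_rows):
    n = max(int(n_rows * 0.9), n_rows - 1000)
    return n  # rows [:n] train; rows [n:] validate

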
class KNeighborsEstimator(BaseEstimator):

    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(512, int(data_size / 2))
        return {
            'n_neighbors': {
                'domain': tune.qloguniform(lower=1, upper=upper, q=1),
                'init_value': 5,
                'low_cost_init_value': 1,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 30

    def __init__(
        self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
    ):
        super().__init__(task, **params)
        self.params = {
            'n_neighbors': int(round(n_neighbors)),
            'weights': params.get('weights', 'distance'),
            'n_jobs': n_jobs,
        }
        if 'regression' in task:
            from sklearn.neighbors import KNeighborsRegressor
            self.estimator_class = KNeighborsRegressor
        else:
            from sklearn.neighbors import KNeighborsClassifier
            self.estimator_class = KNeighborsClassifier

    def _preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            # drop categorical columns; distance is only defined on numerics
            cat_columns = X.select_dtypes(['category']).columns
            if X.shape[1] == len(cat_columns):
                raise ValueError(
                    "KNeighborsEstimator requires at least one numeric feature")
            X = X.drop(cat_columns, axis=1)
        return X