'''!
 * Copyright (c) 2020 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
import scipy.sparse
import pandas as pd

class BaseEstimator:
    '''The abstract class for all learners

    Typical example:
        XGBoostEstimator: for regression
        XGBoostSklearnEstimator: for classification
        LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
            for both regression and classification
    '''

    def __init__(self, objective_name='binary:logistic', **params):
        '''Constructor

        Args:
            objective_name: A string of the objective name, one of
                'binary:logistic', 'multi:softmax', 'regression'
            n_jobs: An integer of the number of parallel threads
            params: A dictionary of the hyperparameter names and values
        '''
        self.params = params
        self.estimator_class = None
        self.objective_name = objective_name
        if '_estimator_type' in params:
            self._estimator_type = params['_estimator_type']
        else:
            self._estimator_type = "regressor" if objective_name == 'regression' \
                else "classifier"

    def get_params(self, deep=False):
        params = self.params.copy()
        params["objective_name"] = self.objective_name
        if hasattr(self, '_estimator_type'):
            params['_estimator_type'] = self._estimator_type
        return params

    @property
    def classes_(self):
        return self.model.classes_

    def preprocess(self, X):
        return X
    def _fit(self, X_train, y_train):
        current_time = time.time()
        X_train = self.preprocess(X_train)
        model = self.estimator_class(**self.params)
        model.fit(X_train, y_train)
        train_time = time.time() - current_time
        self.model = model
        return train_time
    def fit(self, X_train, y_train, budget=None):
        '''Train the model from given training data

        Args:
            X_train: A numpy array of training data in shape n*m
            y_train: A numpy array of labels in shape n*1
            budget: A float of the time budget in seconds

        Returns:
            train_time: A float of the training time in seconds
        '''
        return self._fit(X_train, y_train)
    def predict(self, X_test):
        '''Predict label from features

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*1.
            Each element is the label for an instance
        '''
        X_test = self.preprocess(X_test)
        return self.model.predict(X_test)
    def predict_proba(self, X_test):
        '''Predict the probability of each class from features

        Only works for classification problems

        Args:
            model: An object of trained model with method predict_proba()
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*c. c is the # classes
            Each element at (i,j) is the probability for instance i to be in
                class j
        '''
        if 'regression' in self.objective_name:
            raise ValueError('Regression tasks do not support predict_proba')
        else:
            X_test = self.preprocess(X_test)
            return self.model.predict_proba(X_test)

    def cleanup(self):
        pass

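# --- Illustrative sketch, not part of the original module --------------------
# A minimal example of how a new learner plugs into BaseEstimator: set
# self.params to the constructor arguments of the wrapped model and point
# self.estimator_class at the class to instantiate. The scikit-learn classes
# are real; the estimator name and hyperparameter choice are hypothetical.
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


class _ExampleDecisionTreeEstimator(BaseEstimator):

    def __init__(self, objective_name='binary:logistic', max_depth=4,
                 **params):
        super().__init__(objective_name, **params)
        # BaseEstimator._fit() will call self.estimator_class(**self.params)
        self.params = {'max_depth': int(round(max_depth))}
        if 'regression' in objective_name:
            self.estimator_class = DecisionTreeRegressor
        else:
            self.estimator_class = DecisionTreeClassifier
# ------------------------------------------------------------------------------
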
class SKLearnEstimator(BaseEstimator):

    def preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.copy()
            # scikit-learn estimators cannot consume pandas 'category' dtype
            # directly, so encode each categorical column as its integer codes
            cat_columns = X.select_dtypes(include=['category']).columns
            X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
        return X

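# Illustrative check (not part of the original module) of what
# SKLearnEstimator.preprocess does to a categorical frame; wrapping it in a
# function avoids running anything at import time, and the data is made up.
def _example_category_encoding():
    df = pd.DataFrame({'color': pd.Categorical(['red', 'blue', 'red']),
                       'size': [1.0, 2.0, 3.0]})
    encoded = SKLearnEstimator().preprocess(df)
    # 'color' is now its integer category codes ([1, 0, 1], since categories
    # sort alphabetically); 'size' is untouched
    return encoded
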
class LGBMEstimator(BaseEstimator):

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 n_estimators=2, max_leaves=2, min_child_weight=1e-3,
                 learning_rate=0.1, subsample=1.0, reg_lambda=1.0,
                 reg_alpha=0.0, colsample_bylevel=1.0, colsample_bytree=1.0,
                 log_max_bin=8, **params):
        super().__init__(objective_name, **params)
        # Default: 'regression' for LGBMRegressor,
        # 'binary' or 'multiclass' for LGBMClassifier
        if 'regression' in objective_name:
            final_objective_name = 'regression'
        elif 'binary' in objective_name:
            final_objective_name = 'binary'
        elif 'multi' in objective_name:
            final_objective_name = 'multiclass'
        else:
            final_objective_name = 'regression'
        self.params = {
            "n_estimators": int(round(n_estimators)),
            "num_leaves": params['num_leaves'] if 'num_leaves' in params
            else int(round(max_leaves)),
            'objective': params["objective"] if "objective" in params
            else final_objective_name,
            'n_jobs': n_jobs,
            'learning_rate': float(learning_rate),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'colsample_bytree': float(colsample_bytree),
            'subsample': float(subsample),
        }
        # max_bin is searched on a log scale, e.g. log_max_bin=8 -> max_bin=255
        self.params['max_bin'] = params['max_bin'] if 'max_bin' in params \
            else (1 << int(round(log_max_bin))) - 1
        if 'regression' in objective_name:
            self.estimator_class = LGBMRegressor
        else:
            self.estimator_class = LGBMClassifier
        self.time_per_iter = None
        self.train_size = 0
    def preprocess(self, X):
        # LightGBM cannot handle sparse matrices with integer dtype
        if not isinstance(X, pd.DataFrame) and scipy.sparse.issparse(X) \
                and np.issubdtype(X.dtype, np.integer):
            X = X.astype(float)
        return X
    def fit(self, X_train, y_train, budget=None):
        start_time = time.time()
        n_iter = self.params["n_estimators"]
        if (not self.time_per_iter or
                abs(self.train_size - X_train.shape[0]) > 4) and budget is not None:
            # estimate the time per boosting iteration by fitting with
            # 1 tree and then 4 trees
            self.params["n_estimators"] = 1
            self.t1 = self._fit(X_train, y_train)
            if self.t1 >= budget:
                self.params["n_estimators"] = n_iter
                return self.t1
            self.params["n_estimators"] = 4
            self.t2 = self._fit(X_train, y_train)
            self.time_per_iter = (self.t2 - self.t1) / (
                self.params["n_estimators"] - 1) if self.t2 > self.t1 \
                else self.t1 if self.t1 else 0.001
            self.train_size = X_train.shape[0]
            if self.t1 + self.t2 >= budget or n_iter == self.params[
                    "n_estimators"]:
                self.params["n_estimators"] = n_iter
                return time.time() - start_time
        if budget is not None:
            # cap n_estimators so the estimated training time fits the
            # remaining budget
            self.params["n_estimators"] = min(n_iter, int(
                (budget - time.time() + start_time - self.t1)
                / self.time_per_iter + 1))
        if self.params["n_estimators"] > 0:
            self._fit(X_train, y_train)
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time

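# Worked numbers (illustrative, not from the original code) for the budget
# logic above: if the 1-tree probe takes t1 = 2s and the 4-tree probe takes
# t2 = 5s, then time_per_iter = (5 - 2) / (4 - 1) = 1s. With budget = 20s and
# t1 + t2 = 7s already spent, the final fit is capped at
# min(n_iter, int((20 - 7 - 2) / 1 + 1)) = min(n_iter, 12) estimators.
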
class XGBoostEstimator(SKLearnEstimator):
    ''' not using sklearn API, used for regression '''
    def __init__(self, objective_name='regression', all_thread=False, n_jobs=1,
                 n_estimators=4, max_leaves=4, subsample=1.0,
                 min_child_weight=1, learning_rate=0.1, reg_lambda=1.0,
                 reg_alpha=0.0, colsample_bylevel=1.0, colsample_bytree=1.0,
                 tree_method='auto', **params):
        super().__init__(objective_name, **params)
        self.n_estimators = int(round(n_estimators))
        self.max_leaves = int(round(max_leaves))
        self.grids = []
        self.params = {
            'max_leaves': int(round(max_leaves)),
            'max_depth': 0,
            'grow_policy': params["grow_policy"] if "grow_policy" in params
            else 'lossguide',
            'tree_method': tree_method,
            'verbosity': 0,
            'nthread': n_jobs,
            'learning_rate': float(learning_rate),
            'subsample': float(subsample),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'booster': params['booster'] if 'booster' in params else 'gbtree',
            'colsample_bylevel': float(colsample_bylevel),
            'colsample_bytree': float(colsample_bytree),
        }
        if all_thread:
            # let xgboost use all threads by removing the nthread setting
            del self.params['nthread']
    def get_params(self, deep=False):
        params = super().get_params()
        params["n_jobs"] = params['nthread']
        return params
    def fit(self, X_train, y_train, budget=None):
        current_time = time.time()
        if not scipy.sparse.issparse(X_train):
            self.params['tree_method'] = 'hist'
            X_train = self.preprocess(X_train)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        if self.max_leaves > 0:
            xgb_model = xgb.train(self.params, dtrain, self.n_estimators)
            del dtrain
            train_time = time.time() - current_time
            self.model = xgb_model
            return train_time
        else:
            return None
    def predict(self, X_test):
        if not scipy.sparse.issparse(X_test):
            X_test = self.preprocess(X_test)
        dtest = xgb.DMatrix(X_test)
        return super().predict(dtest)

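# Illustrative sketch (not part of the original module): the native-API flow
# above, end to end, on made-up data; the sample size and hyperparameter
# values are arbitrary.
def _example_xgboost_native_api():
    X = np.random.rand(100, 5)
    y = np.random.rand(100)
    est = XGBoostEstimator(objective_name='regression', n_estimators=10,
                           max_leaves=8)
    est.fit(X, y)           # builds a DMatrix internally and calls xgb.train
    return est.predict(X)   # wraps X in a DMatrix before model.predict
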
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
    ''' using sklearn API, used for classification '''
    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 n_estimators=4, max_leaves=4, subsample=1.0,
                 min_child_weight=1, learning_rate=0.1, reg_lambda=1.0,
                 reg_alpha=0.0, colsample_bylevel=1.0, colsample_bytree=1.0,
                 tree_method='hist', **params):
        super().__init__(objective_name, **params)
        self.params = {
            "n_estimators": int(round(n_estimators)),
            'max_leaves': int(round(max_leaves)),
            'max_depth': 0,
            'grow_policy': params["grow_policy"] if "grow_policy" in params
            else 'lossguide',
            'tree_method': tree_method,
            'verbosity': 0,
            'n_jobs': n_jobs,
            'learning_rate': float(learning_rate),
            'subsample': float(subsample),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'booster': params['booster'] if 'booster' in params else 'gbtree',
            'colsample_bylevel': float(colsample_bylevel),
            'colsample_bytree': float(colsample_bytree),
        }
        if 'regression' in objective_name:
            self.estimator_class = XGBRegressor
        else:
            self.estimator_class = XGBClassifier
        self.time_per_iter = None
        self.train_size = 0
    def fit(self, X_train, y_train, budget=None):
        if scipy.sparse.issparse(X_train):
            # switch from 'hist' to 'auto' for sparse input
            self.params['tree_method'] = 'auto'
        return super().fit(X_train, y_train, budget)

class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 n_estimators=4, max_leaves=4, max_features=1.0,
                 min_samples_split=2, min_samples_leaf=1, criterion=1,
                 **params):
        super().__init__(objective_name, **params)
        self.params = {
            "n_estimators": int(round(n_estimators)),
            "n_jobs": n_jobs,
            'max_features': float(max_features),
        }
        if 'regression' in objective_name:
            self.estimator_class = RandomForestRegressor
        else:
            self.estimator_class = RandomForestClassifier
            # criterion is searched as a float: <=1.5 -> 'gini', >1.5 -> 'entropy'
            self.params['criterion'] = 'entropy' if criterion > 1.5 else 'gini'
        self.time_per_iter = None
        self.train_size = 0

    def get_params(self, deep=False):
        params = super().get_params()
        params["criterion"] = 1 if params["criterion"] == 'gini' else 2
        return params

class ExtraTreeEstimator(RandomForestEstimator):

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 n_estimators=4, max_leaves=4, max_features=1.0,
                 min_samples_split=2, min_samples_leaf=1, criterion=1,
                 **params):
        super().__init__(objective_name, **params)
        self.params = {
            "n_estimators": int(round(n_estimators)),
            "n_jobs": n_jobs,
            'max_features': float(max_features),
        }
        if 'regression' in objective_name:
            from sklearn.ensemble import ExtraTreesRegressor
            self.estimator_class = ExtraTreesRegressor
        else:
            from sklearn.ensemble import ExtraTreesClassifier
            self.estimator_class = ExtraTreesClassifier
            self.params['criterion'] = 'entropy' if criterion > 1.5 else 'gini'
        self.time_per_iter = None
        self.train_size = 0

class LRL1Classifier(SKLearnEstimator):

    def __init__(self, tol=0.0001, C=1.0,
                 objective_name='binary:logistic', n_jobs=1, **params):
        super().__init__(objective_name, **params)
        self.params = {
            'penalty': 'l1',
            'tol': float(tol),
            'C': float(C),
            # 'saga' is one of the few sklearn solvers supporting the l1 penalty
            'solver': 'saga',
            'n_jobs': n_jobs,
        }
        if 'regression' in objective_name:
            self.estimator_class = None
            raise NotImplementedError(
                'LogisticRegression does not support regression tasks')
        else:
            self.estimator_class = LogisticRegression

class LRL2Classifier(SKLearnEstimator):

    def __init__(self, tol=0.0001, C=1.0,
                 objective_name='binary:logistic', n_jobs=1, **params):
        super().__init__(objective_name, **params)
        self.params = {
            'penalty': 'l2',
            'tol': float(tol),
            'C': float(C),
            'solver': 'lbfgs',
            'n_jobs': n_jobs,
        }
        if 'regression' in objective_name:
            self.estimator_class = None
            raise NotImplementedError(
                'LogisticRegression does not support regression tasks')
        else:
            self.estimator_class = LogisticRegression

class CatBoostEstimator(BaseEstimator):

    # the time-per-iteration estimate is cached at class level so repeated
    # fits on similar-sized data can reuse it
    time_per_iter = None
    train_size = 0

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 n_estimators=8192, exp_max_depth=64, learning_rate=0.1,
                 rounds=4, l2_leaf_reg=3, **params):
        super().__init__(objective_name, **params)
        self.params = {
            "early_stopping_rounds": int(round(rounds)),
            "n_estimators": n_estimators,
            'learning_rate': learning_rate,
            'thread_count': n_jobs,
            'verbose': False,
            'random_seed': params["random_seed"] if "random_seed" in params
            else 10242048,
        }
        if 'regression' in objective_name:
            from catboost import CatBoostRegressor
            self.estimator_class = CatBoostRegressor
        else:
            from catboost import CatBoostClassifier
            self.estimator_class = CatBoostClassifier
    def get_params(self, deep=False):
        params = super().get_params()
        params['n_jobs'] = params['thread_count']
        params['rounds'] = params['early_stopping_rounds']
        return params
    def fit(self, X_train, y_train, budget=None):
        start_time = time.time()
        n_iter = self.params["n_estimators"]
        if isinstance(X_train, pd.DataFrame):
            cat_features = list(X_train.select_dtypes(
                include='category').columns)
        else:
            cat_features = []
        if (not CatBoostEstimator.time_per_iter or
                abs(CatBoostEstimator.train_size - len(y_train)) > 4) and budget:
            # measure the time per iteration
            self.params["n_estimators"] = 1
            CatBoostEstimator.model = self.estimator_class(**self.params)
            CatBoostEstimator.model.fit(
                X_train, y_train, cat_features=cat_features)
            CatBoostEstimator.t1 = time.time() - start_time
            if CatBoostEstimator.t1 >= budget:
                self.params["n_estimators"] = n_iter
                self.model = CatBoostEstimator.model
                return CatBoostEstimator.t1
            self.params["n_estimators"] = 4
            CatBoostEstimator.model = self.estimator_class(**self.params)
            CatBoostEstimator.model.fit(
                X_train, y_train, cat_features=cat_features)
            CatBoostEstimator.time_per_iter = (
                time.time() - start_time - CatBoostEstimator.t1) / (
                self.params["n_estimators"] - 1)
            if CatBoostEstimator.time_per_iter <= 0:
                CatBoostEstimator.time_per_iter = CatBoostEstimator.t1
            CatBoostEstimator.train_size = len(y_train)
            if time.time() - start_time >= budget or n_iter == self.params[
                    "n_estimators"]:
                self.params["n_estimators"] = n_iter
                self.model = CatBoostEstimator.model
                return time.time() - start_time
        if budget:
            train_times = 1
            self.params["n_estimators"] = min(n_iter, int(
                (budget - time.time() + start_time - CatBoostEstimator.t1)
                / train_times / CatBoostEstimator.time_per_iter + 1))
            self.model = CatBoostEstimator.model
        if self.params["n_estimators"] > 0:
            # hold out the last 10% (at most 1000) instances for early stopping
            l = max(int(len(y_train) * 0.9), len(y_train) - 1000)
            X_tr, y_tr = X_train[:l], y_train[:l]
            from catboost import Pool
            model = self.estimator_class(**self.params)
            model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=Pool(
                data=X_train[l:], label=y_train[l:],
                cat_features=cat_features))
            self.model = model
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time

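# Worked numbers (illustrative, not from the original code) for the holdout
# split above: with len(y_train) = 50000, l = max(45000, 49000) = 49000, so
# the final fit trains on the first 49000 rows and early-stops against the
# last 1000.
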
class KNeighborsEstimator(BaseEstimator):

    def __init__(self, objective_name='binary:logistic', n_jobs=1,
                 n_neighbors=5, **params):
        super().__init__(objective_name, **params)
        self.params = {
            'n_neighbors': int(round(n_neighbors)),
            'weights': 'distance',
            'n_jobs': n_jobs,
        }
        if 'regression' in objective_name:
            from sklearn.neighbors import KNeighborsRegressor
            self.estimator_class = KNeighborsRegressor
        else:
            from sklearn.neighbors import KNeighborsClassifier
            self.estimator_class = KNeighborsClassifier

    def preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            # distance-based neighbors only make sense on numeric features,
            # so drop the categorical columns
            cat_columns = X.select_dtypes(['category']).columns
            if X.shape[1] == len(cat_columns):
                raise ValueError(
                    "kneighbor requires at least one numeric feature")
            X = X.drop(cat_columns, axis=1)
        return X
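
# Illustrative sketch (not part of the original module): training one of the
# learners above under a time budget; the data and budget values are made up.
if __name__ == '__main__':
    X = np.random.rand(1000, 10)
    y = np.random.randint(2, size=1000)
    learner = LGBMEstimator(objective_name='binary:logistic',
                            n_estimators=100, max_leaves=16)
    # fit() probes with 1 and 4 trees, estimates the time per iteration, and
    # caps n_estimators so training stays within the 1-second budget
    train_time = learner.fit(X, y, budget=1)
    proba = learner.predict_proba(X)
    print(train_time, proba.shape)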