fixing auto metric bug (#387)

This commit is contained in:
Xueqing Liu 2022-01-07 19:25:58 -05:00 committed by GitHub
parent d4273669e6
commit c54c1246c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 70 additions and 35 deletions

View File

@ -73,7 +73,9 @@ class SearchState:
self.total_time_used - self.time_best_found, self.total_time_used - self.time_best_found,
) )
def __init__(self, learner_class, data_size, task, starting_point=None, period=None): def __init__(
self, learner_class, data_size, task, starting_point=None, period=None
):
self.init_eci = learner_class.cost_relative2lgbm() self.init_eci = learner_class.cost_relative2lgbm()
self._search_space_domain = {} self._search_space_domain = {}
self.init_config = {} self.init_config = {}
@ -83,7 +85,9 @@ class SearchState:
self.ls_ever_converged = False self.ls_ever_converged = False
self.learner_class = learner_class self.learner_class = learner_class
if task == TS_FORECAST: if task == TS_FORECAST:
search_space = learner_class.search_space(data_size=data_size, task=task, pred_horizon=period) search_space = learner_class.search_space(
data_size=data_size, task=task, pred_horizon=period
)
else: else:
search_space = learner_class.search_space(data_size=data_size, task=task) search_space = learner_class.search_space(data_size=data_size, task=task)
for name, space in search_space.items(): for name, space in search_space.items():
@ -820,7 +824,11 @@ class AutoML(BaseEstimator):
dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]" dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values." ), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
if y_train_all is not None: if y_train_all is not None:
y_df = pd.DataFrame(y_train_all) if isinstance(y_train_all, pd.Series) else pd.DataFrame(y_train_all, columns=['labels']) y_df = (
pd.DataFrame(y_train_all)
if isinstance(y_train_all, pd.Series)
else pd.DataFrame(y_train_all, columns=["labels"])
)
dataframe = dataframe.join(y_df) dataframe = dataframe.join(y_df)
duplicates = dataframe.duplicated() duplicates = dataframe.duplicated()
if any(duplicates): if any(duplicates):
@ -881,7 +889,9 @@ class AutoML(BaseEstimator):
self._nrow, self._ndim = X_train_all.shape self._nrow, self._ndim = X_train_all.shape
if self._state.task == TS_FORECAST: if self._state.task == TS_FORECAST:
X_train_all = pd.DataFrame(X_train_all) X_train_all = pd.DataFrame(X_train_all)
X_train_all, y_train_all = self._validate_ts_data(X_train_all, y_train_all) X_train_all, y_train_all = self._validate_ts_data(
X_train_all, y_train_all
)
X, y = X_train_all, y_train_all X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None: elif dataframe is not None and label is not None:
assert isinstance( assert isinstance(

View File

@ -1790,24 +1790,26 @@ class SARIMAX(ARIMA):
class TS_SKLearn_Regressor(SKLearnEstimator): class TS_SKLearn_Regressor(SKLearnEstimator):
""" The class for tuning SKLearn Regressors for time-series forecasting, using hcrystalball""" """The class for tuning SKLearn Regressors for time-series forecasting, using hcrystalball"""
base_class = SKLearnEstimator base_class = SKLearnEstimator
@classmethod @classmethod
def search_space(cls, data_size, pred_horizon, **params): def search_space(cls, data_size, pred_horizon, **params):
space = cls.base_class.search_space(data_size, **params) space = cls.base_class.search_space(data_size, **params)
space.update({ space.update(
"optimize_for_horizon": { {
"domain": tune.choice([True, False]), "optimize_for_horizon": {
"init_value": False, "domain": tune.choice([True, False]),
"low_cost_init_value": False, "init_value": False,
}, "low_cost_init_value": False,
"lags": { },
"domain": tune.randint(lower=1, upper=data_size[0] - pred_horizon), "lags": {
"init_value": 3, "domain": tune.randint(lower=1, upper=data_size[0] - pred_horizon),
}, "init_value": 3,
}) },
}
)
return space return space
def __init__(self, task=TS_FORECAST, **params): def __init__(self, task=TS_FORECAST, **params):
@ -1841,13 +1843,23 @@ class TS_SKLearn_Regressor(SKLearnEstimator):
# Direct Multi-step Forecast Strategy - fit a separate model for each horizon # Direct Multi-step Forecast Strategy - fit a separate model for each horizon
model_list = [] model_list = []
for i in range(1, kwargs["period"] + 1): for i in range(1, kwargs["period"] + 1):
X_fit, y_fit = self.hcrystaball_model._transform_data_to_tsmodel_input_format(X_train, y_train, i) (
X_fit,
y_fit,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_train, y_train, i
)
self.hcrystaball_model.model.set_params(**estimator.params) self.hcrystaball_model.model.set_params(**estimator.params)
model = self.hcrystaball_model.model.fit(X_fit, y_fit) model = self.hcrystaball_model.model.fit(X_fit, y_fit)
model_list.append(model) model_list.append(model)
self._model = model_list self._model = model_list
else: else:
X_fit, y_fit = self.hcrystaball_model._transform_data_to_tsmodel_input_format(X_train, y_train, kwargs["period"]) (
X_fit,
y_fit,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_train, y_train, kwargs["period"]
)
self.hcrystaball_model.model.set_params(**estimator.params) self.hcrystaball_model.model.set_params(**estimator.params)
model = self.hcrystaball_model.model.fit(X_fit, y_fit) model = self.hcrystaball_model.model.fit(X_fit, y_fit)
self._model = model self._model = model
@ -1863,18 +1875,30 @@ class TS_SKLearn_Regressor(SKLearnEstimator):
X_test = self.transform_X(X_test) X_test = self.transform_X(X_test)
X_test = self._preprocess(X_test) X_test = self._preprocess(X_test)
if isinstance(self._model, list): if isinstance(self._model, list):
assert ( assert len(self._model) == len(
len(self._model) == len(X_test) X_test
), "Model is optimized for horizon, length of X_test must be equal to `period`." ), "Model is optimized for horizon, length of X_test must be equal to `period`."
preds = [] preds = []
for i in range(1, len(self._model) + 1): for i in range(1, len(self._model) + 1):
X_pred, _ = self.hcrystaball_model._transform_data_to_tsmodel_input_format(X_test.iloc[:i, :]) (
X_pred,
_,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_test.iloc[:i, :]
)
preds.append(self._model[i - 1].predict(X_pred)[-1]) preds.append(self._model[i - 1].predict(X_pred)[-1])
forecast = pd.DataFrame(data=np.asarray(preds).reshape(-1, 1), forecast = pd.DataFrame(
columns=[self.hcrystaball_model.name], data=np.asarray(preds).reshape(-1, 1),
index=X_test.index) columns=[self.hcrystaball_model.name],
index=X_test.index,
)
else: else:
X_pred, _ = self.hcrystaball_model._transform_data_to_tsmodel_input_format(X_test) (
X_pred,
_,
) = self.hcrystaball_model._transform_data_to_tsmodel_input_format(
X_test
)
forecast = self._model.predict(X_pred) forecast = self._model.predict(X_pred)
return forecast return forecast
else: else:
@ -1885,35 +1909,36 @@ class TS_SKLearn_Regressor(SKLearnEstimator):
class LGBM_TS_Regressor(TS_SKLearn_Regressor): class LGBM_TS_Regressor(TS_SKLearn_Regressor):
""" The class for tuning LGBM Regressor for time-series forecasting""" """The class for tuning LGBM Regressor for time-series forecasting"""
base_class = LGBMEstimator base_class = LGBMEstimator
class XGBoost_TS_Regressor(TS_SKLearn_Regressor): class XGBoost_TS_Regressor(TS_SKLearn_Regressor):
""" The class for tuning XGBoost Regressor for time-series forecasting""" """The class for tuning XGBoost Regressor for time-series forecasting"""
base_class = XGBoostSklearnEstimator base_class = XGBoostSklearnEstimator
# catboost regressor is invalid because it has a `name` parameter, making it incompatible with hcrystalball # catboost regressor is invalid because it has a `name` parameter, making it incompatible with hcrystalball
# class CatBoost_TS_Regressor(TS_Regressor): # class CatBoost_TS_Regressor(TS_Regressor):
# base_class = CatBoostEstimator # base_class = CatBoostEstimator
class RF_TS_Regressor(TS_SKLearn_Regressor): class RF_TS_Regressor(TS_SKLearn_Regressor):
""" The class for tuning Random Forest Regressor for time-series forecasting""" """The class for tuning Random Forest Regressor for time-series forecasting"""
base_class = RandomForestEstimator base_class = RandomForestEstimator
class ExtraTrees_TS_Regressor(TS_SKLearn_Regressor): class ExtraTrees_TS_Regressor(TS_SKLearn_Regressor):
""" The class for tuning Extra Trees Regressor for time-series forecasting""" """The class for tuning Extra Trees Regressor for time-series forecasting"""
base_class = ExtraTreesEstimator base_class = ExtraTreesEstimator
class XGBoostLimitDepth_TS_Regressor(TS_SKLearn_Regressor): class XGBoostLimitDepth_TS_Regressor(TS_SKLearn_Regressor):
""" The class for tuning XGBoost Regressor with limited depth for time-series forecasting""" """The class for tuning XGBoost Regressor with limited depth for time-series forecasting"""
base_class = XGBoostLimitDepthEstimator base_class = XGBoostLimitDepthEstimator

View File

@ -16,15 +16,15 @@ from ..data import (
def load_default_huggingface_metric_for_task(task): def load_default_huggingface_metric_for_task(task):
if task == SEQCLASSIFICATION: if task == SEQCLASSIFICATION:
return "accuracy", "max" return "accuracy"
elif task == SEQREGRESSION: elif task == SEQREGRESSION:
return "rmse", "max" return "rmse"
elif task == SUMMARIZATION: elif task == SUMMARIZATION:
return "rouge", "max" return "rouge"
elif task == MULTICHOICECLASSIFICATION: elif task == MULTICHOICECLASSIFICATION:
return "accuracy", "max" return "accuracy"
elif task == TOKENCLASSIFICATION: elif task == TOKENCLASSIFICATION:
return "seqeval", "max" return "seqeval"
global tokenized_column_names global tokenized_column_names