datetime feature engineering (#285)

resolve #284
When transforming test data, keep a derived datetime feature column whenever that column was kept in the training data, instead of re-checking its number of unique values on the test data.
This commit is contained in:
Chi Wang 2021-11-18 11:19:53 -08:00 committed by GitHub
parent 72caa2172d
commit db1fb9b47b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -269,40 +269,35 @@ class DataTransformer:
else: else:
X[column] = X[column].fillna("__NAN__") X[column] = X[column].fillna("__NAN__")
cat_columns.append(column) cat_columns.append(column)
else: elif X[column].nunique(dropna=True) < 2:
# print(X[column].dtype.name) X.drop(columns=column, inplace=True)
if X[column].nunique(dropna=True) < 2: drop = True
X.drop(columns=column, inplace=True) else: # datetime or numeric
drop = True if X[column].dtype.name == "datetime64[ns]":
else: tmp_dt = X[column].dt
if X[column].dtype.name == "datetime64[ns]": new_columns_dict = {
tmp_dt = X[column].dt f"year_{column}": tmp_dt.year,
new_columns_dict = { f"month_{column}": tmp_dt.month,
f"year_{column}": tmp_dt.year, f"day_{column}": tmp_dt.day,
f"month_{column}": tmp_dt.month, f"hour_{column}": tmp_dt.hour,
f"day_{column}": tmp_dt.day, f"minute_{column}": tmp_dt.minute,
f"hour_{column}": tmp_dt.hour, f"second_{column}": tmp_dt.second,
f"minute_{column}": tmp_dt.minute, f"dayofweek_{column}": tmp_dt.dayofweek,
f"second_{column}": tmp_dt.second, f"dayofyear_{column}": tmp_dt.dayofyear,
f"dayofweek_{column}": tmp_dt.dayofweek, f"quarter_{column}": tmp_dt.quarter,
f"dayofyear_{column}": tmp_dt.dayofyear, }
f"quarter_{column}": tmp_dt.quarter, for key, value in new_columns_dict.items():
} if (
for new_col_name in new_columns_dict.keys(): key not in X.columns
if ( and value.nunique(dropna=False) >= 2
new_col_name not in X.columns ):
and new_columns_dict.get(new_col_name).nunique( X[key] = value
dropna=False num_columns.append(key)
) X[column] = X[column].map(datetime.toordinal)
>= 2 datetime_columns.append(column)
): del tmp_dt
X[new_col_name] = new_columns_dict.get(new_col_name) X[column] = X[column].fillna(np.nan)
num_columns.append(new_col_name) num_columns.append(column)
X[column] = X[column].map(datetime.toordinal)
datetime_columns.append(column)
del tmp_dt
X[column] = X[column].fillna(np.nan)
num_columns.append(column)
X = X[cat_columns + num_columns] X = X[cat_columns + num_columns]
if task == TS_FORECAST: if task == TS_FORECAST:
X.insert(0, TS_TIMESTAMP_COL, ds_col) X.insert(0, TS_TIMESTAMP_COL, ds_col)
@ -380,29 +375,24 @@ class DataTransformer:
if self._task == TS_FORECAST: if self._task == TS_FORECAST:
X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL}) X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
ds_col = X.pop(TS_TIMESTAMP_COL) ds_col = X.pop(TS_TIMESTAMP_COL)
if datetime_columns: for column in datetime_columns:
for column in datetime_columns: tmp_dt = X[column].dt
tmp_dt = X[column].dt new_columns_dict = {
new_columns_dict = { f"year_{column}": tmp_dt.year,
f"year_{column}": tmp_dt.year, f"month_{column}": tmp_dt.month,
f"month_{column}": tmp_dt.month, f"day_{column}": tmp_dt.day,
f"day_{column}": tmp_dt.day, f"hour_{column}": tmp_dt.hour,
f"hour_{column}": tmp_dt.hour, f"minute_{column}": tmp_dt.minute,
f"minute_{column}": tmp_dt.minute, f"second_{column}": tmp_dt.second,
f"second_{column}": tmp_dt.second, f"dayofweek_{column}": tmp_dt.dayofweek,
f"dayofweek_{column}": tmp_dt.dayofweek, f"dayofyear_{column}": tmp_dt.dayofyear,
f"dayofyear_{column}": tmp_dt.dayofyear, f"quarter_{column}": tmp_dt.quarter,
f"quarter_{column}": tmp_dt.quarter, }
} for new_col_name, new_col_value in new_columns_dict.items():
for new_col_name in new_columns_dict.keys(): if new_col_name not in X.columns and new_col_name in num_columns:
if ( X[new_col_name] = new_col_value
new_col_name not in X.columns X[column] = X[column].map(datetime.toordinal)
and new_columns_dict.get(new_col_name).nunique(dropna=False) del tmp_dt
>= 2
):
X[new_col_name] = new_columns_dict.get(new_col_name)
X[column] = X[column].map(datetime.toordinal)
del tmp_dt
X = X[cat_columns + num_columns].copy() X = X[cat_columns + num_columns].copy()
if self._task == TS_FORECAST: if self._task == TS_FORECAST:
X.insert(0, TS_TIMESTAMP_COL, ds_col) X.insert(0, TS_TIMESTAMP_COL, ds_col)