mirror of
https://github.com/microsoft/autogen.git
synced 2025-11-16 18:14:30 +00:00
datetime feature engineering (#285)
Resolves #284. When transforming test data, keep a derived column as long as it was kept in the training data.
This commit is contained in:
parent
72caa2172d
commit
db1fb9b47b
104
flaml/data.py
104
flaml/data.py
@ -269,40 +269,35 @@ class DataTransformer:
|
|||||||
else:
|
else:
|
||||||
X[column] = X[column].fillna("__NAN__")
|
X[column] = X[column].fillna("__NAN__")
|
||||||
cat_columns.append(column)
|
cat_columns.append(column)
|
||||||
else:
|
elif X[column].nunique(dropna=True) < 2:
|
||||||
# print(X[column].dtype.name)
|
X.drop(columns=column, inplace=True)
|
||||||
if X[column].nunique(dropna=True) < 2:
|
drop = True
|
||||||
X.drop(columns=column, inplace=True)
|
else: # datetime or numeric
|
||||||
drop = True
|
if X[column].dtype.name == "datetime64[ns]":
|
||||||
else:
|
tmp_dt = X[column].dt
|
||||||
if X[column].dtype.name == "datetime64[ns]":
|
new_columns_dict = {
|
||||||
tmp_dt = X[column].dt
|
f"year_{column}": tmp_dt.year,
|
||||||
new_columns_dict = {
|
f"month_{column}": tmp_dt.month,
|
||||||
f"year_{column}": tmp_dt.year,
|
f"day_{column}": tmp_dt.day,
|
||||||
f"month_{column}": tmp_dt.month,
|
f"hour_{column}": tmp_dt.hour,
|
||||||
f"day_{column}": tmp_dt.day,
|
f"minute_{column}": tmp_dt.minute,
|
||||||
f"hour_{column}": tmp_dt.hour,
|
f"second_{column}": tmp_dt.second,
|
||||||
f"minute_{column}": tmp_dt.minute,
|
f"dayofweek_{column}": tmp_dt.dayofweek,
|
||||||
f"second_{column}": tmp_dt.second,
|
f"dayofyear_{column}": tmp_dt.dayofyear,
|
||||||
f"dayofweek_{column}": tmp_dt.dayofweek,
|
f"quarter_{column}": tmp_dt.quarter,
|
||||||
f"dayofyear_{column}": tmp_dt.dayofyear,
|
}
|
||||||
f"quarter_{column}": tmp_dt.quarter,
|
for key, value in new_columns_dict.items():
|
||||||
}
|
if (
|
||||||
for new_col_name in new_columns_dict.keys():
|
key not in X.columns
|
||||||
if (
|
and value.nunique(dropna=False) >= 2
|
||||||
new_col_name not in X.columns
|
):
|
||||||
and new_columns_dict.get(new_col_name).nunique(
|
X[key] = value
|
||||||
dropna=False
|
num_columns.append(key)
|
||||||
)
|
X[column] = X[column].map(datetime.toordinal)
|
||||||
>= 2
|
datetime_columns.append(column)
|
||||||
):
|
del tmp_dt
|
||||||
X[new_col_name] = new_columns_dict.get(new_col_name)
|
X[column] = X[column].fillna(np.nan)
|
||||||
num_columns.append(new_col_name)
|
num_columns.append(column)
|
||||||
X[column] = X[column].map(datetime.toordinal)
|
|
||||||
datetime_columns.append(column)
|
|
||||||
del tmp_dt
|
|
||||||
X[column] = X[column].fillna(np.nan)
|
|
||||||
num_columns.append(column)
|
|
||||||
X = X[cat_columns + num_columns]
|
X = X[cat_columns + num_columns]
|
||||||
if task == TS_FORECAST:
|
if task == TS_FORECAST:
|
||||||
X.insert(0, TS_TIMESTAMP_COL, ds_col)
|
X.insert(0, TS_TIMESTAMP_COL, ds_col)
|
||||||
@ -380,29 +375,24 @@ class DataTransformer:
|
|||||||
if self._task == TS_FORECAST:
|
if self._task == TS_FORECAST:
|
||||||
X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
|
X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
|
||||||
ds_col = X.pop(TS_TIMESTAMP_COL)
|
ds_col = X.pop(TS_TIMESTAMP_COL)
|
||||||
if datetime_columns:
|
for column in datetime_columns:
|
||||||
for column in datetime_columns:
|
tmp_dt = X[column].dt
|
||||||
tmp_dt = X[column].dt
|
new_columns_dict = {
|
||||||
new_columns_dict = {
|
f"year_{column}": tmp_dt.year,
|
||||||
f"year_{column}": tmp_dt.year,
|
f"month_{column}": tmp_dt.month,
|
||||||
f"month_{column}": tmp_dt.month,
|
f"day_{column}": tmp_dt.day,
|
||||||
f"day_{column}": tmp_dt.day,
|
f"hour_{column}": tmp_dt.hour,
|
||||||
f"hour_{column}": tmp_dt.hour,
|
f"minute_{column}": tmp_dt.minute,
|
||||||
f"minute_{column}": tmp_dt.minute,
|
f"second_{column}": tmp_dt.second,
|
||||||
f"second_{column}": tmp_dt.second,
|
f"dayofweek_{column}": tmp_dt.dayofweek,
|
||||||
f"dayofweek_{column}": tmp_dt.dayofweek,
|
f"dayofyear_{column}": tmp_dt.dayofyear,
|
||||||
f"dayofyear_{column}": tmp_dt.dayofyear,
|
f"quarter_{column}": tmp_dt.quarter,
|
||||||
f"quarter_{column}": tmp_dt.quarter,
|
}
|
||||||
}
|
for new_col_name, new_col_value in new_columns_dict.items():
|
||||||
for new_col_name in new_columns_dict.keys():
|
if new_col_name not in X.columns and new_col_name in num_columns:
|
||||||
if (
|
X[new_col_name] = new_col_value
|
||||||
new_col_name not in X.columns
|
X[column] = X[column].map(datetime.toordinal)
|
||||||
and new_columns_dict.get(new_col_name).nunique(dropna=False)
|
del tmp_dt
|
||||||
>= 2
|
|
||||||
):
|
|
||||||
X[new_col_name] = new_columns_dict.get(new_col_name)
|
|
||||||
X[column] = X[column].map(datetime.toordinal)
|
|
||||||
del tmp_dt
|
|
||||||
X = X[cat_columns + num_columns].copy()
|
X = X[cat_columns + num_columns].copy()
|
||||||
if self._task == TS_FORECAST:
|
if self._task == TS_FORECAST:
|
||||||
X.insert(0, TS_TIMESTAMP_COL, ds_col)
|
X.insert(0, TS_TIMESTAMP_COL, ds_col)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user