From ad42889a3b50520a981cb1e0b65b0c747cbb61df Mon Sep 17 00:00:00 2001 From: Gian Pio Domiziani <50718324+gianpDomiziani@users.noreply.github.com> Date: Wed, 21 Apr 2021 16:22:54 +0200 Subject: [PATCH] datetime columns preprocess for validation data fixed. (#73) * datetime columns preprocess for validation data fixed. * code line formatted. --- flaml/data.py | 12 +++++++++--- test/test_automl.py | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/flaml/data.py b/flaml/data.py index 6a9c5dbf3..d00f10238 100644 --- a/flaml/data.py +++ b/flaml/data.py @@ -192,12 +192,13 @@ class DataTransformer: if isinstance(X, pd.DataFrame): X = X.copy() n = X.shape[0] - cat_columns, num_columns = [], [] + cat_columns, num_columns, datetime_columns = [], [], [] drop = False for column in X.columns: # sklearn\utils\validation.py needs int/float values if X[column].dtype.name == 'datetime64[ns]': X[column] = X[column].map(datetime.toordinal) + datetime_columns.append(column) if X[column].dtype.name in ('object', 'category'): if X[column].nunique() == 1 or X[column].nunique( dropna=True) == n - X[column].isnull().sum(): @@ -236,7 +237,8 @@ class DataTransformer: SimpleImputer(missing_values=np.nan, strategy='median'), X_num.columns)]) X[num_columns] = self.transformer.fit_transform(X_num) - self._cat_columns, self._num_columns = cat_columns, num_columns + self._cat_columns, self._num_columns, self._datetime_columns = cat_columns, \ + num_columns, datetime_columns self._drop = drop if task == 'regression': @@ -249,7 +251,11 @@ class DataTransformer: def transform(self, X): if isinstance(X, pd.DataFrame): - cat_columns, num_columns = self._cat_columns, self._num_columns + cat_columns, num_columns, datetime_columns = self._cat_columns, \ + self._num_columns, self._datetime_columns + if datetime_columns: + for dt_column in datetime_columns: + X[dt_column] = X[dt_column].map(datetime.toordinal) X = X[cat_columns + num_columns].copy() for column in cat_columns: # print(column, X[column].dtype.name) diff --git a/test/test_automl.py b/test/test_automl.py index b005d8771..4fbf8296c 100644 --- a/test/test_automl.py +++ b/test/test_automl.py @@ -4,6 +4,9 @@ import numpy as np import scipy.sparse from sklearn.datasets import load_boston, load_iris, load_wine +import pandas as pd +from datetime import datetime + from flaml import AutoML from flaml.data import get_output_from_log @@ -219,6 +222,23 @@ class TestAutoML(unittest.TestCase): print(automl_experiment.model) print(automl_experiment.predict_proba(X_train)[:5]) + def test_datetime_columns(self): + + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": 'mse', + "task": 'regression', + "log_file_name": "test/datetime_columns.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True + } + + fake_df = pd.DataFrame({'A': [datetime(1900, 2, 3), datetime(1900, 3, 4)]}) + y = np.array([0, 1]) + automl_experiment.fit(X_train=fake_df, X_val=fake_df, y_train=y, y_val=y, **automl_settings) + def test_regression(self): automl_experiment = AutoML()