From 21fa6c10ec6f963edbda99e9020ad585ae60a1a5 Mon Sep 17 00:00:00 2001 From: Xueqing Liu Date: Wed, 3 Aug 2022 00:11:29 -0400 Subject: [PATCH] Fixing the issue that FLAML trial number is significantly smaller than Transformers.hyperparameter_search (#657) * fix 636 * adding low cost config * update padding; update tokenization output y type (series -> DF); update low cost init config * updating todf; updating metric_loss_score --- flaml/ml.py | 128 ++++++++++--------------- flaml/model.py | 72 +++++++------- flaml/nlp/huggingface/data_collator.py | 4 +- flaml/nlp/huggingface/utils.py | 39 ++++++-- setup.py | 4 +- test/nlp/test_autohf_custom_metric.py | 6 +- 6 files changed, 129 insertions(+), 124 deletions(-) diff --git a/flaml/ml.py b/flaml/ml.py index 49c0b0bf0..cc7a0e4b8 100644 --- a/flaml/ml.py +++ b/flaml/ml.py @@ -141,7 +141,6 @@ def metric_loss_score( groups=None, ): # y_processed_predict and y_processed_true are processed id labels if the original were the token labels - if is_in_sklearn_metric_name_set(metric_name): return sklearn_metric_loss_score( metric_name, @@ -152,86 +151,61 @@ def metric_loss_score( groups, ) else: - """ - hf's datasets.load_metric("pearsonr") returns nan (hf's bug), overwriting it here - """ - if metric_name == "spearmanr": - from scipy.stats import spearmanr + try: + import datasets - y_true = ( - y_processed_true.to_list() - if isinstance(y_processed_true, pd.Series) - else list(y_processed_true) + datasets_metric_name = huggingface_submetric_to_metric.get( + metric_name, metric_name.split(":")[0] ) - score = spearmanr(list(y_processed_predict), y_true)[0] - metric_mode = "max" - elif metric_name == "pearsonr": - from scipy.stats import pearsonr + metric = datasets.load_metric(datasets_metric_name) + metric_mode = huggingface_metric_to_mode[datasets_metric_name] - y_true = ( - y_processed_true.to_list() - if type(y_processed_true) == pd.Series - else list(y_processed_true) + if metric_name.startswith("seqeval"): + y_processed_true = [ + [labels[tr] for tr in each_list] for each_list in y_processed_true + ] + elif metric in ("pearsonr", "spearmanr"): + y_processed_true = ( + y_processed_true.to_list() + if isinstance(y_processed_true, pd.Series) + else list(y_processed_true) + ) + score_dict = metric.compute( + predictions=y_processed_predict, references=y_processed_true + ) + if "rouge" in metric_name: + score = score_dict[metric_name].mid.fmeasure + elif metric_name.startswith("seqeval"): + metric_submetric_names = metric_name.split(":") + score = score_dict[ + metric_submetric_names[1] + if len(metric_submetric_names) > 1 + else "overall_accuracy" + ] + else: + score = score_dict[metric_name] + except ImportError: + raise ValueError( + metric_name + + " is not an built-in sklearn metric and nlp is not installed. " + "Currently built-in sklearn metrics are: " + "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," + "log_loss, mape, f1, micro_f1, macro_f1, ap. " + "If the metric is an nlp metric, please pip install flaml[nlp] ", + "or pass a customized metric function to AutoML.fit(metric=func)", + ) + # If the metric is not found from huggingface dataset metric list (i.e., FileNotFoundError) + # ask the user to provide a custom metric + except FileNotFoundError: + raise ValueError( + metric_name + " is neither an sklearn metric nor a huggingface metric. " + "Currently built-in sklearn metrics are: " + "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," + "log_loss, mape, f1, micro_f1, macro_f1, ap. 
" + "Currently built-in huggingface metrics are: " + + ", ".join(huggingface_metric_to_mode.keys()) + + ". Please pass a customized metric function to AutoML.fit(metric=func)" ) - score = pearsonr(list(y_processed_predict), y_true)[0] - metric_mode = "max" - else: - try: - import datasets - - datasets_metric_name = huggingface_submetric_to_metric.get( - metric_name, metric_name.split(":")[0] - ) - metric = datasets.load_metric(datasets_metric_name) - metric_mode = huggingface_metric_to_mode[datasets_metric_name] - - if "rouge" in metric_name: - score = metric.compute( - predictions=y_processed_predict, references=y_processed_true - )[metric_name].mid.fmeasure - elif metric_name.startswith("seqeval"): - - y_processed_true = [ - [labels[tr] for tr in each_list] - for each_list in y_processed_true - ] - metric_submetric_names = metric_name.split(":") - - score = metric.compute( - predictions=y_processed_predict, references=y_processed_true - )[ - metric_submetric_names[1] - if len(metric_submetric_names) > 1 - else "overall_accuracy" - ] - - else: - score = metric.compute( - predictions=y_processed_predict, references=y_processed_true - )[metric_name] - except ImportError: - raise ValueError( - metric_name - + " is not an built-in sklearn metric and nlp is not installed. " - "Currently built-in sklearn metrics are: " - "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," - "log_loss, mape, f1, micro_f1, macro_f1, ap. " - "If the metric is an nlp metric, please pip install flaml[nlp] ", - "or pass a customized metric function to AutoML.fit(metric=func)", - ) - # If the metric is not found from huggingface dataset metric list (i.e., FileNotFoundError) - # ask the user to provide a custom metric - except FileNotFoundError: - raise ValueError( - metric_name - + " is neither an sklearn metric nor a huggingface metric. " - "Currently built-in sklearn metrics are: " - "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," - "log_loss, mape, f1, micro_f1, macro_f1, ap. " - "Currently built-in huggingface metrics are: " - + ", ".join(huggingface_metric_to_mode.keys()) - + ". 
Please pass a customized metric function to AutoML.fit(metric=func)" - ) if metric_mode == "max": return 1 - score else: diff --git a/flaml/model.py b/flaml/model.py index a11e6668c..0eb6e1b61 100644 --- a/flaml/model.py +++ b/flaml/model.py @@ -406,13 +406,6 @@ class TransformersEstimator(BaseEstimator): ) self._TrainingArguments = TrainingArguments - @staticmethod - def _join(X_train, y_train, task): - y_train = DataFrame(y_train, index=X_train.index) - y_train.columns = ["label"] if task != TOKENCLASSIFICATION else ["labels"] - train_df = X_train.join(y_train) - return train_df - @classmethod def search_space(cls, data_size, task, **params): search_space_dict = { @@ -422,13 +415,18 @@ class TransformersEstimator(BaseEstimator): }, "num_train_epochs": { "domain": tune.choice([1, 2, 3, 4, 5]), - "init_value": 3.0, # to be consistent with roberta + "init_value": 3, # to be consistent with roberta + "low_cost_init_value": 1, }, "per_device_train_batch_size": { "domain": tune.choice([4, 8, 16, 32, 64]), "init_value": 32, + "low_cost_init_value": 64, + }, + "seed": { + "domain": tune.choice(range(1, 40)), + "init_value": 20, }, - "seed": {"domain": tune.randint(1, 40), "init_value": 20}, "global_max_steps": { "domain": sys.maxsize, "init_value": sys.maxsize, @@ -498,7 +496,7 @@ class TransformersEstimator(BaseEstimator): ) setattr(self._training_args, "max_seq_length", None) - def _preprocess(self, X, y=None, **kwargs): + def _tokenize_text(self, X, y=None, **kwargs): from .nlp.huggingface.utils import tokenize_text from .nlp.utils import is_a_list_of_str @@ -514,7 +512,7 @@ class TransformersEstimator(BaseEstimator): tokenizer=self.tokenizer, ) else: - return X, None + return X, y def _model_init(self): from .nlp.huggingface.utils import load_model @@ -526,18 +524,15 @@ class TransformersEstimator(BaseEstimator): ) return this_model - def preprocess_data(self, X, y): + def _preprocess_data(self, X, y): from datasets import Dataset - if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION): - processed_X, _ = self._preprocess(X=X, **self._kwargs) - processed_y = y - else: - processed_X, processed_y = self._preprocess(X=X, y=y, **self._kwargs) + processed_X, processed_y_df = self._tokenize_text(X=X, y=y, **self._kwargs) + # convert y from pd.DataFrame back to pd.Series + processed_y = processed_y_df.iloc[:, 0] + + processed_dataset = Dataset.from_pandas(processed_X.join(processed_y_df)) - processed_dataset = Dataset.from_pandas( - TransformersEstimator._join(processed_X, processed_y, self._task) - ) return processed_dataset, processed_X, processed_y @property @@ -574,14 +569,25 @@ class TransformersEstimator(BaseEstimator): def data_collator(self): from .nlp.huggingface.data_collator import task_to_datacollator_class - return ( - task_to_datacollator_class[self._task]( - tokenizer=self.tokenizer, - pad_to_multiple_of=8, # if self._training_args.fp16 else None, - ) - if self._task in (MULTICHOICECLASSIFICATION, TOKENCLASSIFICATION) - else None - ) + data_collator_class = task_to_datacollator_class.get(self._task) + + if data_collator_class: + kwargs = { + "model": self._model_init(), # need to set model, or there's ValueError: Expected input batch_size (..) to match target batch_size (..) 
+ "label_pad_token_id": -100, # pad with token id -100 + "pad_to_multiple_of": 8, # pad to multiple of 8 because quote Transformers: "This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta)" + "tokenizer": self.tokenizer, + } + + for key in list(kwargs.keys()): + if ( + key not in data_collator_class.__dict__.keys() + and key != "tokenizer" + ): + del kwargs[key] + return data_collator_class(**kwargs) + else: + return None def fit( self, @@ -619,11 +625,11 @@ class TransformersEstimator(BaseEstimator): ) # If using roberta model, must set add_prefix_space to True to avoid the assertion error at # https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/tokenization_roberta_fast.py#L249 - train_dataset, self._X_train, self._y_train = self.preprocess_data( + train_dataset, self._X_train, self._y_train = self._preprocess_data( X_train, y_train ) if X_val is not None: - eval_dataset, self._X_val, self._y_val = self.preprocess_data(X_val, y_val) + eval_dataset, self._X_val, self._y_val = self._preprocess_data(X_val, y_val) else: eval_dataset, self._X_val, self._y_val = None, None, None @@ -825,7 +831,7 @@ class TransformersEstimator(BaseEstimator): self._task in CLASSIFICATION ), "predict_proba() only for classification tasks." - X_test, _ = self._preprocess(X, **self._kwargs) + X_test, _ = self._tokenize_text(X, **self._kwargs) test_dataset = Dataset.from_pandas(X_test) new_trainer = self._init_model_for_predict() @@ -839,7 +845,7 @@ class TransformersEstimator(BaseEstimator): self._metric = kwargs["metric"] - eval_dataset, X_val, y_val = self.preprocess_data(X_val, y_val) + eval_dataset, X_val, y_val = self._preprocess_data(X_val, y_val) new_trainer = self._init_model_for_predict() return new_trainer.evaluate(eval_dataset) @@ -855,7 +861,7 @@ class TransformersEstimator(BaseEstimator): for key, val in pred_kwargs.items(): setattr(self._training_args, key, val) - X_test, _ = self._preprocess(X, **self._kwargs) + X_test, _ = self._tokenize_text(X, **self._kwargs) test_dataset = Dataset.from_pandas(X_test) new_trainer = self._init_model_for_predict() diff --git a/flaml/nlp/huggingface/data_collator.py b/flaml/nlp/huggingface/data_collator.py index 1203a536c..7f33dc330 100644 --- a/flaml/nlp/huggingface/data_collator.py +++ b/flaml/nlp/huggingface/data_collator.py @@ -2,10 +2,11 @@ from dataclasses import dataclass from transformers.data.data_collator import ( DataCollatorWithPadding, DataCollatorForTokenClassification, + DataCollatorForSeq2Seq, ) from collections import OrderedDict -from flaml.data import TOKENCLASSIFICATION, MULTICHOICECLASSIFICATION +from flaml.data import TOKENCLASSIFICATION, MULTICHOICECLASSIFICATION, SUMMARIZATION @dataclass @@ -43,5 +44,6 @@ task_to_datacollator_class = OrderedDict( [ (TOKENCLASSIFICATION, DataCollatorForTokenClassification), (MULTICHOICECLASSIFICATION, DataCollatorForMultipleChoiceClassification), + (SUMMARIZATION, DataCollatorForSeq2Seq), ] ) diff --git a/flaml/nlp/huggingface/utils.py b/flaml/nlp/huggingface/utils.py index 13a5c01c1..728fded88 100644 --- a/flaml/nlp/huggingface/utils.py +++ b/flaml/nlp/huggingface/utils.py @@ -12,7 +12,21 @@ from ...data import ( ) +def todf(X, Y, column_name): + """ + todf converts Y from any format (list, pandas.Series, numpy array) to a DataFrame before being returned + """ + if Y is not None: + Y = pd.DataFrame(Y, index=X.index) + Y.columns = column_name + return Y + + def tokenize_text(X, Y=None, task=None, hf_args=None, 
tokenizer=None): + label_col_name = None + # label_col_name is the name of the label column Y, label_col_name = ['labels'] for TOKENCLASSIFICATION and SUMMARIZATION, + # label_col_name = ['label'] for other tasks. todf is used by all tasks except for SUMMARIZATION, + # because the outputs of tokenize_seq2seq are already two DataFrames so no conversion needed. if task in (SEQCLASSIFICATION, SEQREGRESSION): X_tokenized = tokenize_onedataframe( X, @@ -21,15 +35,23 @@ def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None): hf_args=hf_args, prefix_str="", ) - return X_tokenized, None + Y_tokenized = Y + label_col_name = ["label"] elif task == TOKENCLASSIFICATION: - return tokenize_text_tokclassification( + X_tokenized, Y_tokenized = tokenize_text_tokclassification( X, Y, tokenizer=tokenizer, hf_args=hf_args ) + label_col_name = ["labels"] elif task in NLG_TASKS: return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args) elif task == MULTICHOICECLASSIFICATION: - return tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args) + X_tokenized = tokenize_text_multiplechoice( + X, tokenizer=tokenizer, hf_args=hf_args + ) + label_col_name = ["label"] + Y_tokenized = Y + Y_tokenized = todf(X_tokenized, Y_tokenized, label_col_name) + return X_tokenized, Y_tokenized def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None): @@ -49,7 +71,7 @@ def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None): hf_args=hf_args, prefix_str="", ) - model_outputs["label"] = [ + model_outputs["labels"] = [ [(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label] for label in model_outputs["input_ids"] ] @@ -238,7 +260,7 @@ def tokenize_row( # tokenizer.pad_token = tokenizer.eos_token tokenized_example = tokenizer( *tuple(this_row), - padding="max_length", + padding="max_length" if hf_args and hf_args.pad_to_max_length else False, max_length=hf_args.max_seq_length if hf_args else None, truncation=True, ) @@ -270,7 +292,7 @@ def tokenize_text_multiplechoice(X, tokenizer, hf_args=None): X_tokenized = pd.DataFrame(columns=tokenized_column_names) X_tokenized[tokenized_column_names] = d output = X_tokenized.join(X) - return output, None + return output def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False): @@ -292,7 +314,7 @@ def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False): *tuple([first_sentences, second_sentences]), truncation=True, max_length=hf_args.max_seq_length if hf_args else None, - padding=False, + padding="max_length" if hf_args and hf_args.pad_to_max_length else False, ) tmp_column_names = sorted(tokenized_example.keys()) @@ -318,13 +340,14 @@ def postprocess_prediction_and_true( # If y_true is None, we use X to compute y_is_pad (i.e., whether y_true is -100 in that position), and use y_is_pad to remove the -100 in the prediction, and return the postprocessed prediction (not the y_true) y_predict = pd.Series(np.argmax(y_pred, axis=2).tolist()) if y_true is None: - _, y_is_pad = tokenize_text( + _, y_is_pad_df = tokenize_text( X, y_predict, task=task, hf_args=hf_args, tokenizer=tokenizer, ) + y_is_pad = y_is_pad_df.iloc[:, 0] else: y_is_pad = y_true label_len = len(hf_args.label_list) diff --git a/setup.py b/setup.py index b9bb210d1..dc5edd77c 100644 --- a/setup.py +++ b/setup.py @@ -59,7 +59,7 @@ setuptools.setup( "statsmodels>=0.12.2", "psutil==5.8.0", "dataclasses", - "transformers[torch]>=4.14", + "transformers[torch]==4.18", "datasets", "nltk", "rouge_score", @@ -81,7 +81,7 @@ 
setuptools.setup( "vowpalwabbit", ], "nlp": [ - "transformers[torch]>=4.14", + "transformers[torch]==4.18", "datasets", "nltk", "rouge_score", diff --git a/test/nlp/test_autohf_custom_metric.py b/test/nlp/test_autohf_custom_metric.py index ee5317d15..ac38039b2 100644 --- a/test/nlp/test_autohf_custom_metric.py +++ b/test/nlp/test_autohf_custom_metric.py @@ -24,11 +24,11 @@ def custom_metric( estimator._trainer = None else: trainer = estimator._trainer + X_test, y_test = estimator._tokenize_text(X_test) + if y_test is not None: - X_test = estimator._preprocess(X_test) - eval_dataset = Dataset.from_pandas(TransformersEstimator._join(X_test, y_test)) + eval_dataset = Dataset.from_pandas(X_test.join(y_test)) else: - X_test = estimator._preprocess(X_test) eval_dataset = Dataset.from_pandas(X_test) estimator_metric_backup = estimator._metric
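---
Notes (not part of the patch; minimal sketches under stated assumptions):

The low_cost_init_value entries added to TransformersEstimator.search_space are
what address the small trial count in #657: they give the searcher a cheap
starting point (1 epoch, the largest batch size), so more trials fit into the
same time budget. The sketch below shows the same mechanism with plain
flaml.tune; the objective function and the sample count are invented for
illustration, and the mapping from low_cost_init_value to
low_cost_partial_config reflects how FLAML's AutoML is expected to wire the
estimator search space into the tuner.

    from flaml import tune

    def objective(config):
        # made-up stand-in for fine-tuning: cost grows with epochs, shrinks with batch size
        cost = config["num_train_epochs"] * 64 / config["per_device_train_batch_size"]
        return {"val_loss": 1.0 / (1.0 + cost)}

    analysis = tune.run(
        objective,
        config={
            "num_train_epochs": tune.choice([1, 2, 3, 4, 5]),
            "per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]),
        },
        # the cheap starting point, mirroring the new low_cost_init_value entries
        low_cost_partial_config={
            "num_train_epochs": 1,
            "per_device_train_batch_size": 64,
        },
        metric="val_loss",
        mode="min",
        num_samples=20,
    )
    print(analysis.best_config)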
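The rewritten data_collator property builds one kwargs dict and then drops
every key that the selected collator class does not declare. This relies on a
dataclass detail: only fields with default values become class attributes and
therefore show up in the class __dict__, which is why tokenizer (no default)
needs an explicit exemption. A self-contained sketch with toy dataclasses
standing in for the Transformers collators:

    from dataclasses import dataclass
    from typing import Any, Optional

    @dataclass
    class ToyCollatorForTokenClassification:
        tokenizer: Any                      # no default -> not in the class __dict__
        label_pad_token_id: int = -100      # default    -> in the class __dict__
        pad_to_multiple_of: Optional[int] = None

    @dataclass
    class ToyCollatorForSeq2Seq:
        tokenizer: Any
        model: Optional[Any] = None
        label_pad_token_id: int = -100
        pad_to_multiple_of: Optional[int] = None

    def build_collator(collator_class, tokenizer, model):
        # same filtering idea as TransformersEstimator.data_collator
        kwargs = {
            "model": model,
            "label_pad_token_id": -100,
            "pad_to_multiple_of": 8,
            "tokenizer": tokenizer,
        }
        for key in list(kwargs.keys()):
            if key not in collator_class.__dict__ and key != "tokenizer":
                del kwargs[key]
        return collator_class(**kwargs)

    # "model" is dropped for token classification but kept for seq2seq
    print(build_collator(ToyCollatorForTokenClassification, tokenizer="tok", model="m"))
    print(build_collator(ToyCollatorForSeq2Seq, tokenizer="tok", model="m"))

With the actual Transformers classes, model should survive only for
DataCollatorForSeq2Seq, which is the collator this patch registers for
SUMMARIZATION.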
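The "series -> DF" part of the change works through the new todf helper:
tokenize_text now returns the labels as a single-column DataFrame aligned with
X's index, and _preprocess_data converts that back to a Series after joining it
with the tokenized features. A pandas-only illustration (the data is invented;
the column name follows the patch):

    import pandas as pd

    def todf(X, Y, column_name):
        # same conversion as in flaml/nlp/huggingface/utils.py:
        # list / Series / numpy array -> DataFrame on X's index
        if Y is not None:
            Y = pd.DataFrame(Y, index=X.index)
            Y.columns = column_name
        return Y

    X_tokenized = pd.DataFrame({"input_ids": [[101, 2307], [101, 2919]]}, index=[10, 11])
    y = [1, 0]  # labels may arrive as a list, pd.Series, or numpy array

    y_df = todf(X_tokenized, y, ["label"])  # single-column DataFrame named "label"
    y_series = y_df.iloc[:, 0]              # what _preprocess_data hands back as y
    train_df = X_tokenized.join(y_df)       # what is fed to Dataset.from_pandas
    print(train_df)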