From 21fa6c10ec6f963edbda99e9020ad585ae60a1a5 Mon Sep 17 00:00:00 2001 From: Xueqing Liu Date: Wed, 3 Aug 2022 00:11:29 -0400 Subject: [PATCH] Fixing the issue that FLAML trial number is significantly smaller than Transformers.hyperparameter_search (#657) * fix 636 * adding low cost config * update padding; update tokenization output y type (series -> DF); update low cost init config * updating todf; updating metric_loss_score --- flaml/ml.py | 128 ++++++++++--------------- flaml/model.py | 72 +++++++------- flaml/nlp/huggingface/data_collator.py | 4 +- flaml/nlp/huggingface/utils.py | 39 ++++++-- setup.py | 4 +- test/nlp/test_autohf_custom_metric.py | 6 +- 6 files changed, 129 insertions(+), 124 deletions(-) diff --git a/flaml/ml.py b/flaml/ml.py index 49c0b0bf0..cc7a0e4b8 100644 --- a/flaml/ml.py +++ b/flaml/ml.py @@ -141,7 +141,6 @@ def metric_loss_score( groups=None, ): # y_processed_predict and y_processed_true are processed id labels if the original were the token labels - if is_in_sklearn_metric_name_set(metric_name): return sklearn_metric_loss_score( metric_name, @@ -152,86 +151,61 @@ def metric_loss_score( groups, ) else: - """ - hf's datasets.load_metric("pearsonr") returns nan (hf's bug), overwriting it here - """ - if metric_name == "spearmanr": - from scipy.stats import spearmanr + try: + import datasets - y_true = ( - y_processed_true.to_list() - if isinstance(y_processed_true, pd.Series) - else list(y_processed_true) + datasets_metric_name = huggingface_submetric_to_metric.get( + metric_name, metric_name.split(":")[0] ) - score = spearmanr(list(y_processed_predict), y_true)[0] - metric_mode = "max" - elif metric_name == "pearsonr": - from scipy.stats import pearsonr + metric = datasets.load_metric(datasets_metric_name) + metric_mode = huggingface_metric_to_mode[datasets_metric_name] - y_true = ( - y_processed_true.to_list() - if type(y_processed_true) == pd.Series - else list(y_processed_true) + if metric_name.startswith("seqeval"): + y_processed_true = [ + [labels[tr] for tr in each_list] for each_list in y_processed_true + ] + elif metric in ("pearsonr", "spearmanr"): + y_processed_true = ( + y_processed_true.to_list() + if isinstance(y_processed_true, pd.Series) + else list(y_processed_true) + ) + score_dict = metric.compute( + predictions=y_processed_predict, references=y_processed_true + ) + if "rouge" in metric_name: + score = score_dict[metric_name].mid.fmeasure + elif metric_name.startswith("seqeval"): + metric_submetric_names = metric_name.split(":") + score = score_dict[ + metric_submetric_names[1] + if len(metric_submetric_names) > 1 + else "overall_accuracy" + ] + else: + score = score_dict[metric_name] + except ImportError: + raise ValueError( + metric_name + + " is not an built-in sklearn metric and nlp is not installed. " + "Currently built-in sklearn metrics are: " + "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," + "log_loss, mape, f1, micro_f1, macro_f1, ap. " + "If the metric is an nlp metric, please pip install flaml[nlp] ", + "or pass a customized metric function to AutoML.fit(metric=func)", + ) + # If the metric is not found from huggingface dataset metric list (i.e., FileNotFoundError) + # ask the user to provide a custom metric + except FileNotFoundError: + raise ValueError( + metric_name + " is neither an sklearn metric nor a huggingface metric. " + "Currently built-in sklearn metrics are: " + "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," + "log_loss, mape, f1, micro_f1, macro_f1, ap. 
" + "Currently built-in huggingface metrics are: " + + ", ".join(huggingface_metric_to_mode.keys()) + + ". Please pass a customized metric function to AutoML.fit(metric=func)" ) - score = pearsonr(list(y_processed_predict), y_true)[0] - metric_mode = "max" - else: - try: - import datasets - - datasets_metric_name = huggingface_submetric_to_metric.get( - metric_name, metric_name.split(":")[0] - ) - metric = datasets.load_metric(datasets_metric_name) - metric_mode = huggingface_metric_to_mode[datasets_metric_name] - - if "rouge" in metric_name: - score = metric.compute( - predictions=y_processed_predict, references=y_processed_true - )[metric_name].mid.fmeasure - elif metric_name.startswith("seqeval"): - - y_processed_true = [ - [labels[tr] for tr in each_list] - for each_list in y_processed_true - ] - metric_submetric_names = metric_name.split(":") - - score = metric.compute( - predictions=y_processed_predict, references=y_processed_true - )[ - metric_submetric_names[1] - if len(metric_submetric_names) > 1 - else "overall_accuracy" - ] - - else: - score = metric.compute( - predictions=y_processed_predict, references=y_processed_true - )[metric_name] - except ImportError: - raise ValueError( - metric_name - + " is not an built-in sklearn metric and nlp is not installed. " - "Currently built-in sklearn metrics are: " - "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," - "log_loss, mape, f1, micro_f1, macro_f1, ap. " - "If the metric is an nlp metric, please pip install flaml[nlp] ", - "or pass a customized metric function to AutoML.fit(metric=func)", - ) - # If the metric is not found from huggingface dataset metric list (i.e., FileNotFoundError) - # ask the user to provide a custom metric - except FileNotFoundError: - raise ValueError( - metric_name - + " is neither an sklearn metric nor a huggingface metric. " - "Currently built-in sklearn metrics are: " - "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo," - "log_loss, mape, f1, micro_f1, macro_f1, ap. " - "Currently built-in huggingface metrics are: " - + ", ".join(huggingface_metric_to_mode.keys()) - + ". 
Please pass a customized metric function to AutoML.fit(metric=func)" - ) if metric_mode == "max": return 1 - score else: diff --git a/flaml/model.py b/flaml/model.py index a11e6668c..0eb6e1b61 100644 --- a/flaml/model.py +++ b/flaml/model.py @@ -406,13 +406,6 @@ class TransformersEstimator(BaseEstimator): ) self._TrainingArguments = TrainingArguments - @staticmethod - def _join(X_train, y_train, task): - y_train = DataFrame(y_train, index=X_train.index) - y_train.columns = ["label"] if task != TOKENCLASSIFICATION else ["labels"] - train_df = X_train.join(y_train) - return train_df - @classmethod def search_space(cls, data_size, task, **params): search_space_dict = { @@ -422,13 +415,18 @@ class TransformersEstimator(BaseEstimator): }, "num_train_epochs": { "domain": tune.choice([1, 2, 3, 4, 5]), - "init_value": 3.0, # to be consistent with roberta + "init_value": 3, # to be consistent with roberta + "low_cost_init_value": 1, }, "per_device_train_batch_size": { "domain": tune.choice([4, 8, 16, 32, 64]), "init_value": 32, + "low_cost_init_value": 64, + }, + "seed": { + "domain": tune.choice(range(1, 40)), + "init_value": 20, }, - "seed": {"domain": tune.randint(1, 40), "init_value": 20}, "global_max_steps": { "domain": sys.maxsize, "init_value": sys.maxsize, @@ -498,7 +496,7 @@ class TransformersEstimator(BaseEstimator): ) setattr(self._training_args, "max_seq_length", None) - def _preprocess(self, X, y=None, **kwargs): + def _tokenize_text(self, X, y=None, **kwargs): from .nlp.huggingface.utils import tokenize_text from .nlp.utils import is_a_list_of_str @@ -514,7 +512,7 @@ class TransformersEstimator(BaseEstimator): tokenizer=self.tokenizer, ) else: - return X, None + return X, y def _model_init(self): from .nlp.huggingface.utils import load_model @@ -526,18 +524,15 @@ class TransformersEstimator(BaseEstimator): ) return this_model - def preprocess_data(self, X, y): + def _preprocess_data(self, X, y): from datasets import Dataset - if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION): - processed_X, _ = self._preprocess(X=X, **self._kwargs) - processed_y = y - else: - processed_X, processed_y = self._preprocess(X=X, y=y, **self._kwargs) + processed_X, processed_y_df = self._tokenize_text(X=X, y=y, **self._kwargs) + # convert y from pd.DataFrame back to pd.Series + processed_y = processed_y_df.iloc[:, 0] + + processed_dataset = Dataset.from_pandas(processed_X.join(processed_y_df)) - processed_dataset = Dataset.from_pandas( - TransformersEstimator._join(processed_X, processed_y, self._task) - ) return processed_dataset, processed_X, processed_y @property @@ -574,14 +569,25 @@ class TransformersEstimator(BaseEstimator): def data_collator(self): from .nlp.huggingface.data_collator import task_to_datacollator_class - return ( - task_to_datacollator_class[self._task]( - tokenizer=self.tokenizer, - pad_to_multiple_of=8, # if self._training_args.fp16 else None, - ) - if self._task in (MULTICHOICECLASSIFICATION, TOKENCLASSIFICATION) - else None - ) + data_collator_class = task_to_datacollator_class.get(self._task) + + if data_collator_class: + kwargs = { + "model": self._model_init(), # need to set model, or there's ValueError: Expected input batch_size (..) to match target batch_size (..) 
+ "label_pad_token_id": -100, # pad with token id -100 + "pad_to_multiple_of": 8, # pad to multiple of 8 because quote Transformers: "This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta)" + "tokenizer": self.tokenizer, + } + + for key in list(kwargs.keys()): + if ( + key not in data_collator_class.__dict__.keys() + and key != "tokenizer" + ): + del kwargs[key] + return data_collator_class(**kwargs) + else: + return None def fit( self, @@ -619,11 +625,11 @@ class TransformersEstimator(BaseEstimator): ) # If using roberta model, must set add_prefix_space to True to avoid the assertion error at # https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/tokenization_roberta_fast.py#L249 - train_dataset, self._X_train, self._y_train = self.preprocess_data( + train_dataset, self._X_train, self._y_train = self._preprocess_data( X_train, y_train ) if X_val is not None: - eval_dataset, self._X_val, self._y_val = self.preprocess_data(X_val, y_val) + eval_dataset, self._X_val, self._y_val = self._preprocess_data(X_val, y_val) else: eval_dataset, self._X_val, self._y_val = None, None, None @@ -825,7 +831,7 @@ class TransformersEstimator(BaseEstimator): self._task in CLASSIFICATION ), "predict_proba() only for classification tasks." - X_test, _ = self._preprocess(X, **self._kwargs) + X_test, _ = self._tokenize_text(X, **self._kwargs) test_dataset = Dataset.from_pandas(X_test) new_trainer = self._init_model_for_predict() @@ -839,7 +845,7 @@ class TransformersEstimator(BaseEstimator): self._metric = kwargs["metric"] - eval_dataset, X_val, y_val = self.preprocess_data(X_val, y_val) + eval_dataset, X_val, y_val = self._preprocess_data(X_val, y_val) new_trainer = self._init_model_for_predict() return new_trainer.evaluate(eval_dataset) @@ -855,7 +861,7 @@ class TransformersEstimator(BaseEstimator): for key, val in pred_kwargs.items(): setattr(self._training_args, key, val) - X_test, _ = self._preprocess(X, **self._kwargs) + X_test, _ = self._tokenize_text(X, **self._kwargs) test_dataset = Dataset.from_pandas(X_test) new_trainer = self._init_model_for_predict() diff --git a/flaml/nlp/huggingface/data_collator.py b/flaml/nlp/huggingface/data_collator.py index 1203a536c..7f33dc330 100644 --- a/flaml/nlp/huggingface/data_collator.py +++ b/flaml/nlp/huggingface/data_collator.py @@ -2,10 +2,11 @@ from dataclasses import dataclass from transformers.data.data_collator import ( DataCollatorWithPadding, DataCollatorForTokenClassification, + DataCollatorForSeq2Seq, ) from collections import OrderedDict -from flaml.data import TOKENCLASSIFICATION, MULTICHOICECLASSIFICATION +from flaml.data import TOKENCLASSIFICATION, MULTICHOICECLASSIFICATION, SUMMARIZATION @dataclass @@ -43,5 +44,6 @@ task_to_datacollator_class = OrderedDict( [ (TOKENCLASSIFICATION, DataCollatorForTokenClassification), (MULTICHOICECLASSIFICATION, DataCollatorForMultipleChoiceClassification), + (SUMMARIZATION, DataCollatorForSeq2Seq), ] ) diff --git a/flaml/nlp/huggingface/utils.py b/flaml/nlp/huggingface/utils.py index 13a5c01c1..728fded88 100644 --- a/flaml/nlp/huggingface/utils.py +++ b/flaml/nlp/huggingface/utils.py @@ -12,7 +12,21 @@ from ...data import ( ) +def todf(X, Y, column_name): + """ + todf converts Y from any format (list, pandas.Series, numpy array) to a DataFrame before being returned + """ + if Y is not None: + Y = pd.DataFrame(Y, index=X.index) + Y.columns = column_name + return Y + + def tokenize_text(X, Y=None, task=None, hf_args=None, 
tokenizer=None): + label_col_name = None + # label_col_name is the name of the label column Y, label_col_name = ['labels'] for TOKENCLASSIFICATION and SUMMARIZATION, + # label_col_name = ['label'] for other tasks. todf is used by all tasks except for SUMMARIZATION, + # because the outputs of tokenize_seq2seq are already two DataFrames so no conversion needed. if task in (SEQCLASSIFICATION, SEQREGRESSION): X_tokenized = tokenize_onedataframe( X, @@ -21,15 +35,23 @@ def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None): hf_args=hf_args, prefix_str="", ) - return X_tokenized, None + Y_tokenized = Y + label_col_name = ["label"] elif task == TOKENCLASSIFICATION: - return tokenize_text_tokclassification( + X_tokenized, Y_tokenized = tokenize_text_tokclassification( X, Y, tokenizer=tokenizer, hf_args=hf_args ) + label_col_name = ["labels"] elif task in NLG_TASKS: return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args) elif task == MULTICHOICECLASSIFICATION: - return tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args) + X_tokenized = tokenize_text_multiplechoice( + X, tokenizer=tokenizer, hf_args=hf_args + ) + label_col_name = ["label"] + Y_tokenized = Y + Y_tokenized = todf(X_tokenized, Y_tokenized, label_col_name) + return X_tokenized, Y_tokenized def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None): @@ -49,7 +71,7 @@ def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None): hf_args=hf_args, prefix_str="", ) - model_outputs["label"] = [ + model_outputs["labels"] = [ [(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label] for label in model_outputs["input_ids"] ] @@ -238,7 +260,7 @@ def tokenize_row( # tokenizer.pad_token = tokenizer.eos_token tokenized_example = tokenizer( *tuple(this_row), - padding="max_length", + padding="max_length" if hf_args and hf_args.pad_to_max_length else False, max_length=hf_args.max_seq_length if hf_args else None, truncation=True, ) @@ -270,7 +292,7 @@ def tokenize_text_multiplechoice(X, tokenizer, hf_args=None): X_tokenized = pd.DataFrame(columns=tokenized_column_names) X_tokenized[tokenized_column_names] = d output = X_tokenized.join(X) - return output, None + return output def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False): @@ -292,7 +314,7 @@ def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False): *tuple([first_sentences, second_sentences]), truncation=True, max_length=hf_args.max_seq_length if hf_args else None, - padding=False, + padding="max_length" if hf_args and hf_args.pad_to_max_length else False, ) tmp_column_names = sorted(tokenized_example.keys()) @@ -318,13 +340,14 @@ def postprocess_prediction_and_true( # If y_true is None, we use X to compute y_is_pad (i.e., whether y_true is -100 in that position), and use y_is_pad to remove the -100 in the prediction, and return the postprocessed prediction (not the y_true) y_predict = pd.Series(np.argmax(y_pred, axis=2).tolist()) if y_true is None: - _, y_is_pad = tokenize_text( + _, y_is_pad_df = tokenize_text( X, y_predict, task=task, hf_args=hf_args, tokenizer=tokenizer, ) + y_is_pad = y_is_pad_df.iloc[:, 0] else: y_is_pad = y_true label_len = len(hf_args.label_list) diff --git a/setup.py b/setup.py index b9bb210d1..dc5edd77c 100644 --- a/setup.py +++ b/setup.py @@ -59,7 +59,7 @@ setuptools.setup( "statsmodels>=0.12.2", "psutil==5.8.0", "dataclasses", - "transformers[torch]>=4.14", + "transformers[torch]==4.18", "datasets", "nltk", "rouge_score", @@ -81,7 +81,7 @@ 
setuptools.setup( "vowpalwabbit", ], "nlp": [ - "transformers[torch]>=4.14", + "transformers[torch]==4.18", "datasets", "nltk", "rouge_score", diff --git a/test/nlp/test_autohf_custom_metric.py b/test/nlp/test_autohf_custom_metric.py index ee5317d15..ac38039b2 100644 --- a/test/nlp/test_autohf_custom_metric.py +++ b/test/nlp/test_autohf_custom_metric.py @@ -24,11 +24,11 @@ def custom_metric( estimator._trainer = None else: trainer = estimator._trainer + X_test, y_test = estimator._tokenize_text(X_test) + if y_test is not None: - X_test = estimator._preprocess(X_test) - eval_dataset = Dataset.from_pandas(TransformersEstimator._join(X_test, y_test)) + eval_dataset = Dataset.from_pandas(X_test.join(y_test)) else: - X_test = estimator._preprocess(X_test) eval_dataset = Dataset.from_pandas(X_test) estimator_metric_backup = estimator._metric
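---
Notes (not part of the patch; minimal sketches under stated assumptions):

The low_cost_init_value entries added to TransformersEstimator.search_space are
what address the small trial count in #657: they give the searcher a cheap
starting point (1 epoch, the largest batch size), so more trials fit into the
same time budget. The sketch below shows the same mechanism with plain
flaml.tune; the objective function and the sample count are invented for
illustration, and the mapping from low_cost_init_value to
low_cost_partial_config reflects how FLAML's AutoML is expected to wire the
estimator search space into the tuner.

    from flaml import tune

    def objective(config):
        # made-up stand-in for fine-tuning: cost grows with epochs, shrinks with batch size
        cost = config["num_train_epochs"] * 64 / config["per_device_train_batch_size"]
        return {"val_loss": 1.0 / (1.0 + cost)}

    analysis = tune.run(
        objective,
        config={
            "num_train_epochs": tune.choice([1, 2, 3, 4, 5]),
            "per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]),
        },
        # the cheap starting point, mirroring the new low_cost_init_value entries
        low_cost_partial_config={
            "num_train_epochs": 1,
            "per_device_train_batch_size": 64,
        },
        metric="val_loss",
        mode="min",
        num_samples=20,
    )
    print(analysis.best_config)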
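The rewritten data_collator property builds one kwargs dict and then drops
every key that the selected collator class does not declare. This relies on a
dataclass detail: only fields with default values become class attributes and
therefore show up in the class __dict__, which is why tokenizer (no default)
needs an explicit exemption. A self-contained sketch with toy dataclasses
standing in for the Transformers collators:

    from dataclasses import dataclass
    from typing import Any, Optional

    @dataclass
    class ToyCollatorForTokenClassification:
        tokenizer: Any                      # no default -> not in the class __dict__
        label_pad_token_id: int = -100      # default    -> in the class __dict__
        pad_to_multiple_of: Optional[int] = None

    @dataclass
    class ToyCollatorForSeq2Seq:
        tokenizer: Any
        model: Optional[Any] = None
        label_pad_token_id: int = -100
        pad_to_multiple_of: Optional[int] = None

    def build_collator(collator_class, tokenizer, model):
        # same filtering idea as TransformersEstimator.data_collator
        kwargs = {
            "model": model,
            "label_pad_token_id": -100,
            "pad_to_multiple_of": 8,
            "tokenizer": tokenizer,
        }
        for key in list(kwargs.keys()):
            if key not in collator_class.__dict__ and key != "tokenizer":
                del kwargs[key]
        return collator_class(**kwargs)

    # "model" is dropped for token classification but kept for seq2seq
    print(build_collator(ToyCollatorForTokenClassification, tokenizer="tok", model="m"))
    print(build_collator(ToyCollatorForSeq2Seq, tokenizer="tok", model="m"))

With the actual Transformers classes, model should survive only for
DataCollatorForSeq2Seq, which is the collator this patch registers for
SUMMARIZATION.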
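The "series -> DF" part of the change works through the new todf helper:
tokenize_text now returns the labels as a single-column DataFrame aligned with
X's index, and _preprocess_data converts that back to a Series after joining it
with the tokenized features. A pandas-only illustration (the data is invented;
the column name follows the patch):

    import pandas as pd

    def todf(X, Y, column_name):
        # same conversion as in flaml/nlp/huggingface/utils.py:
        # list / Series / numpy array -> DataFrame on X's index
        if Y is not None:
            Y = pd.DataFrame(Y, index=X.index)
            Y.columns = column_name
        return Y

    X_tokenized = pd.DataFrame({"input_ids": [[101, 2307], [101, 2919]]}, index=[10, 11])
    y = [1, 0]  # labels may arrive as a list, pd.Series, or numpy array

    y_df = todf(X_tokenized, y, ["label"])  # single-column DataFrame named "label"
    y_series = y_df.iloc[:, 0]              # what _preprocess_data hands back as y
    train_df = X_tokenized.join(y_df)       # what is fed to Dataset.from_pandas
    print(train_df)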