Mirror of https://github.com/microsoft/autogen.git
Synced 2025-11-04 03:39:52 +00:00
Fixing the issue that FLAML trial number is significantly smaller than Transformers.hyperparameter_search (#657)

* fix #636
* adding low cost config
* update padding; update tokenization output y type (Series -> DataFrame); update low cost init config
* updating todf; updating metric_loss_score
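The "low cost config" items above are what restore the trial count: FLAML starts the search from the cheapest corner of the space (one epoch, the largest batch size), so many more trials fit into the same time budget than when every trial trains for the default three epochs. A minimal sketch of the same mechanism through the public flaml.tune API; the function, the loss formula, and the sample count here are illustrative, not part of this commit:

    from flaml import tune

    def train_fn(config):
        # stand-in for an expensive fine-tuning run; wall-clock cost
        # grows roughly linearly with num_train_epochs
        tune.report(loss=1.0 / config["num_train_epochs"])

    analysis = tune.run(
        train_fn,
        config={"num_train_epochs": tune.choice([1, 2, 3, 4, 5])},
        # analogous to "low_cost_init_value": 1 in the estimator search space below
        low_cost_partial_config={"num_train_epochs": 1},
        metric="loss",
        mode="min",
        num_samples=10,
    )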
This commit is contained in:
parent 74e6026ab9
commit 21fa6c10ec

flaml/ml.py (60 lines changed)
@@ -141,7 +141,6 @@ def metric_loss_score(
     groups=None,
 ):
     # y_processed_predict and y_processed_true are processed id labels if the original were the token labels
-
     if is_in_sklearn_metric_name_set(metric_name):
         return sklearn_metric_loss_score(
             metric_name,
@@ -151,30 +150,6 @@ def metric_loss_score(
             sample_weight,
             groups,
         )
-    else:
-        """
-        hf's datasets.load_metric("pearsonr") returns nan (hf's bug), overwriting it here
-        """
-        if metric_name == "spearmanr":
-            from scipy.stats import spearmanr
-
-            y_true = (
-                y_processed_true.to_list()
-                if isinstance(y_processed_true, pd.Series)
-                else list(y_processed_true)
-            )
-            score = spearmanr(list(y_processed_predict), y_true)[0]
-            metric_mode = "max"
-        elif metric_name == "pearsonr":
-            from scipy.stats import pearsonr
-
-            y_true = (
-                y_processed_true.to_list()
-                if type(y_processed_true) == pd.Series
-                else list(y_processed_true)
-            )
-            score = pearsonr(list(y_processed_predict), y_true)[0]
-            metric_mode = "max"
     else:
         try:
             import datasets
@@ -185,30 +160,30 @@ def metric_loss_score(
             metric = datasets.load_metric(datasets_metric_name)
             metric_mode = huggingface_metric_to_mode[datasets_metric_name]

-            if "rouge" in metric_name:
-                score = metric.compute(
-                    predictions=y_processed_predict, references=y_processed_true
-                )[metric_name].mid.fmeasure
-            elif metric_name.startswith("seqeval"):
-
+            if metric_name.startswith("seqeval"):
                 y_processed_true = [
-                    [labels[tr] for tr in each_list]
-                    for each_list in y_processed_true
+                    [labels[tr] for tr in each_list] for each_list in y_processed_true
                 ]
-                metric_submetric_names = metric_name.split(":")
-
-                score = metric.compute(
-                    predictions=y_processed_predict, references=y_processed_true
-                )[
+            elif metric in ("pearsonr", "spearmanr"):
+                y_processed_true = (
+                    y_processed_true.to_list()
+                    if isinstance(y_processed_true, pd.Series)
+                    else list(y_processed_true)
+                )
+            score_dict = metric.compute(
+                predictions=y_processed_predict, references=y_processed_true
+            )
+            if "rouge" in metric_name:
+                score = score_dict[metric_name].mid.fmeasure
+            elif metric_name.startswith("seqeval"):
+                metric_submetric_names = metric_name.split(":")
+                score = score_dict[
                     metric_submetric_names[1]
                     if len(metric_submetric_names) > 1
                     else "overall_accuracy"
                 ]
-
             else:
-                score = metric.compute(
-                    predictions=y_processed_predict, references=y_processed_true
-                )[metric_name]
+                score = score_dict[metric_name]
         except ImportError:
             raise ValueError(
                 metric_name
@@ -223,8 +198,7 @@ def metric_loss_score(
         # ask the user to provide a custom metric
         except FileNotFoundError:
             raise ValueError(
-                metric_name
-                + " is neither an sklearn metric nor a huggingface metric. "
+                metric_name + " is neither an sklearn metric nor a huggingface metric. "
                 "Currently built-in sklearn metrics are: "
                 "r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,"
                 "log_loss, mape, f1, micro_f1, macro_f1, ap. "
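The metric_loss_score changes above fold the scipy-based spearmanr/pearsonr special cases into the datasets branch and call metric.compute exactly once, storing the result in score_dict and indexing it per metric afterwards. A hedged sketch of the resulting pattern, assuming the datasets and seqeval packages are installed (datasets.load_metric was the current API at the time of this commit; it has since been deprecated in favor of the evaluate package):

    import datasets

    # compute once, then index: mirrors the new score_dict flow
    metric = datasets.load_metric("seqeval")
    score_dict = metric.compute(
        predictions=[["B-PER", "O"], ["O"]],
        references=[["B-PER", "O"], ["B-ORG"]],
    )
    score = score_dict["overall_accuracy"]  # or "overall_f1", "overall_precision", ...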
@@ -406,13 +406,6 @@ class TransformersEstimator(BaseEstimator):
             )
         self._TrainingArguments = TrainingArguments

-    @staticmethod
-    def _join(X_train, y_train, task):
-        y_train = DataFrame(y_train, index=X_train.index)
-        y_train.columns = ["label"] if task != TOKENCLASSIFICATION else ["labels"]
-        train_df = X_train.join(y_train)
-        return train_df
-
     @classmethod
     def search_space(cls, data_size, task, **params):
         search_space_dict = {
@@ -422,13 +415,18 @@ class TransformersEstimator(BaseEstimator):
             },
             "num_train_epochs": {
                 "domain": tune.choice([1, 2, 3, 4, 5]),
-                "init_value": 3.0,  # to be consistent with roberta
+                "init_value": 3,  # to be consistent with roberta
+                "low_cost_init_value": 1,
             },
             "per_device_train_batch_size": {
                 "domain": tune.choice([4, 8, 16, 32, 64]),
                 "init_value": 32,
+                "low_cost_init_value": 64,
+            },
+            "seed": {
+                "domain": tune.choice(range(1, 40)),
+                "init_value": 20,
             },
-            "seed": {"domain": tune.randint(1, 40), "init_value": 20},
             "global_max_steps": {
                 "domain": sys.maxsize,
                 "init_value": sys.maxsize,
@@ -498,7 +496,7 @@ class TransformersEstimator(BaseEstimator):
             )
             setattr(self._training_args, "max_seq_length", None)

-    def _preprocess(self, X, y=None, **kwargs):
+    def _tokenize_text(self, X, y=None, **kwargs):
         from .nlp.huggingface.utils import tokenize_text
         from .nlp.utils import is_a_list_of_str
@@ -514,7 +512,7 @@ class TransformersEstimator(BaseEstimator):
                 tokenizer=self.tokenizer,
             )
         else:
-            return X, None
+            return X, y

     def _model_init(self):
         from .nlp.huggingface.utils import load_model
@@ -526,18 +524,15 @@ class TransformersEstimator(BaseEstimator):
         )
         return this_model

-    def preprocess_data(self, X, y):
+    def _preprocess_data(self, X, y):
         from datasets import Dataset

-        if (self._task not in NLG_TASKS) and (self._task != TOKENCLASSIFICATION):
-            processed_X, _ = self._preprocess(X=X, **self._kwargs)
-            processed_y = y
-        else:
-            processed_X, processed_y = self._preprocess(X=X, y=y, **self._kwargs)
-
-        processed_dataset = Dataset.from_pandas(
-            TransformersEstimator._join(processed_X, processed_y, self._task)
-        )
+        processed_X, processed_y_df = self._tokenize_text(X=X, y=y, **self._kwargs)
+        # convert y from pd.DataFrame back to pd.Series
+        processed_y = processed_y_df.iloc[:, 0]
+
+        processed_dataset = Dataset.from_pandas(processed_X.join(processed_y_df))
+
         return processed_dataset, processed_X, processed_y

     @property
@@ -574,14 +569,25 @@ class TransformersEstimator(BaseEstimator):
     def data_collator(self):
         from .nlp.huggingface.data_collator import task_to_datacollator_class

-        return (
-            task_to_datacollator_class[self._task](
-                tokenizer=self.tokenizer,
-                pad_to_multiple_of=8,  # if self._training_args.fp16 else None,
-            )
-            if self._task in (MULTICHOICECLASSIFICATION, TOKENCLASSIFICATION)
-            else None
-        )
+        data_collator_class = task_to_datacollator_class.get(self._task)
+
+        if data_collator_class:
+            kwargs = {
+                "model": self._model_init(),  # need to set model, or there's ValueError: Expected input batch_size (..) to match target batch_size (..)
+                "label_pad_token_id": -100,  # pad with token id -100
+                "pad_to_multiple_of": 8,  # pad to multiple of 8 because quote Transformers: "This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta)"
+                "tokenizer": self.tokenizer,
+            }
+
+            for key in list(kwargs.keys()):
+                if (
+                    key not in data_collator_class.__dict__.keys()
+                    and key != "tokenizer"
+                ):
+                    del kwargs[key]
+            return data_collator_class(**kwargs)
+        else:
+            return None

     def fit(
         self,
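The rewritten data_collator property builds one candidate kwargs dict and drops every key the resolved collator class does not declare; the diff checks data_collator_class.__dict__, with a special case for tokenizer because a field without a default value does not show up there. A hedged sketch of the same filtering idea using dataclasses.fields, which reports tokenizer too and so needs no special case; the checkpoint name is arbitrary:

    from dataclasses import fields

    from transformers import AutoTokenizer
    from transformers.data.data_collator import DataCollatorForTokenClassification

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    candidate_kwargs = {
        "tokenizer": tokenizer,
        "model": None,               # not a declared field here -> dropped
        "label_pad_token_id": -100,  # declared field -> kept
        "pad_to_multiple_of": 8,     # declared field -> kept
    }
    accepted = {f.name for f in fields(DataCollatorForTokenClassification)}
    collator = DataCollatorForTokenClassification(
        **{k: v for k, v in candidate_kwargs.items() if k in accepted}
    )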
@@ -619,11 +625,11 @@ class TransformersEstimator(BaseEstimator):
         )  # If using roberta model, must set add_prefix_space to True to avoid the assertion error at
         # https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/tokenization_roberta_fast.py#L249

-        train_dataset, self._X_train, self._y_train = self.preprocess_data(
+        train_dataset, self._X_train, self._y_train = self._preprocess_data(
             X_train, y_train
         )
         if X_val is not None:
-            eval_dataset, self._X_val, self._y_val = self.preprocess_data(X_val, y_val)
+            eval_dataset, self._X_val, self._y_val = self._preprocess_data(X_val, y_val)
         else:
             eval_dataset, self._X_val, self._y_val = None, None, None
@@ -825,7 +831,7 @@ class TransformersEstimator(BaseEstimator):
             self._task in CLASSIFICATION
         ), "predict_proba() only for classification tasks."

-        X_test, _ = self._preprocess(X, **self._kwargs)
+        X_test, _ = self._tokenize_text(X, **self._kwargs)
         test_dataset = Dataset.from_pandas(X_test)

         new_trainer = self._init_model_for_predict()
@@ -839,7 +845,7 @@ class TransformersEstimator(BaseEstimator):

         self._metric = kwargs["metric"]

-        eval_dataset, X_val, y_val = self.preprocess_data(X_val, y_val)
+        eval_dataset, X_val, y_val = self._preprocess_data(X_val, y_val)

         new_trainer = self._init_model_for_predict()
         return new_trainer.evaluate(eval_dataset)
@@ -855,7 +861,7 @@ class TransformersEstimator(BaseEstimator):
             for key, val in pred_kwargs.items():
                 setattr(self._training_args, key, val)

-        X_test, _ = self._preprocess(X, **self._kwargs)
+        X_test, _ = self._tokenize_text(X, **self._kwargs)
         test_dataset = Dataset.from_pandas(X_test)

         new_trainer = self._init_model_for_predict()
@@ -2,10 +2,11 @@ from dataclasses import dataclass
 from transformers.data.data_collator import (
     DataCollatorWithPadding,
     DataCollatorForTokenClassification,
+    DataCollatorForSeq2Seq,
 )
 from collections import OrderedDict

-from flaml.data import TOKENCLASSIFICATION, MULTICHOICECLASSIFICATION
+from flaml.data import TOKENCLASSIFICATION, MULTICHOICECLASSIFICATION, SUMMARIZATION


 @dataclass
@@ -43,5 +44,6 @@ task_to_datacollator_class = OrderedDict(
     [
         (TOKENCLASSIFICATION, DataCollatorForTokenClassification),
         (MULTICHOICECLASSIFICATION, DataCollatorForMultipleChoiceClassification),
+        (SUMMARIZATION, DataCollatorForSeq2Seq),
     ]
 )
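With the new mapping, summarization tasks get DataCollatorForSeq2Seq, which pads the labels alongside the inputs, using -100 (the index the loss ignores) as the label pad value. A hedged usage sketch with an arbitrary checkpoint and toy token ids:

    from transformers import AutoTokenizer
    from transformers.data.data_collator import DataCollatorForSeq2Seq

    tokenizer = AutoTokenizer.from_pretrained("t5-small")  # illustrative checkpoint
    collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer, label_pad_token_id=-100, pad_to_multiple_of=8
    )
    batch = collator(
        [
            {"input_ids": [31, 7, 12], "labels": [8, 5]},
            {"input_ids": [31, 7], "labels": [9]},
        ]
    )
    # batch["labels"] is rectangular now, padded with -100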
@@ -12,7 +12,21 @@ from ...data import (
 )


+def todf(X, Y, column_name):
+    """
+    todf converts Y from any format (list, pandas.Series, numpy array) to a DataFrame before being returned
+    """
+    if Y is not None:
+        Y = pd.DataFrame(Y, index=X.index)
+        Y.columns = column_name
+    return Y
+
+
 def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
+    label_col_name = None
+    # label_col_name is the name of the label column Y, label_col_name = ['labels'] for TOKENCLASSIFICATION and SUMMARIZATION,
+    # label_col_name = ['label'] for other tasks. todf is used by all tasks except for SUMMARIZATION,
+    # because the outputs of tokenize_seq2seq are already two DataFrames so no conversion needed.
     if task in (SEQCLASSIFICATION, SEQREGRESSION):
         X_tokenized = tokenize_onedataframe(
             X,
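A hedged usage sketch for the new todf helper; the function body is copied verbatim from the hunk above so the snippet runs standalone:

    import pandas as pd

    def todf(X, Y, column_name):
        # verbatim from the hunk above
        if Y is not None:
            Y = pd.DataFrame(Y, index=X.index)
            Y.columns = column_name
        return Y

    X = pd.DataFrame({"sentence": ["a b c", "d e"]}, index=[10, 11])
    y_df = todf(X, [0, 1], ["label"])
    assert list(y_df.columns) == ["label"]
    assert list(y_df.index) == [10, 11]  # index follows X, not 0..n-1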
@@ -21,15 +35,23 @@ def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
             hf_args=hf_args,
             prefix_str="",
         )
-        return X_tokenized, None
+        Y_tokenized = Y
+        label_col_name = ["label"]
     elif task == TOKENCLASSIFICATION:
-        return tokenize_text_tokclassification(
+        X_tokenized, Y_tokenized = tokenize_text_tokclassification(
             X, Y, tokenizer=tokenizer, hf_args=hf_args
         )
+        label_col_name = ["labels"]
     elif task in NLG_TASKS:
         return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args)
     elif task == MULTICHOICECLASSIFICATION:
-        return tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args)
+        X_tokenized = tokenize_text_multiplechoice(
+            X, tokenizer=tokenizer, hf_args=hf_args
+        )
+        label_col_name = ["label"]
+        Y_tokenized = Y
+    Y_tokenized = todf(X_tokenized, Y_tokenized, label_col_name)
+    return X_tokenized, Y_tokenized


 def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
@@ -49,7 +71,7 @@ def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
             hf_args=hf_args,
             prefix_str="",
         )
-        model_outputs["label"] = [
+        model_outputs["labels"] = [
             [(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label]
             for label in model_outputs["input_ids"]
         ]
@@ -238,7 +260,7 @@ def tokenize_row(
     # tokenizer.pad_token = tokenizer.eos_token
     tokenized_example = tokenizer(
         *tuple(this_row),
-        padding="max_length",
+        padding="max_length" if hf_args and hf_args.pad_to_max_length else False,
         max_length=hf_args.max_seq_length if hf_args else None,
         truncation=True,
     )
@@ -270,7 +292,7 @@ def tokenize_text_multiplechoice(X, tokenizer, hf_args=None):
     X_tokenized = pd.DataFrame(columns=tokenized_column_names)
     X_tokenized[tokenized_column_names] = d
     output = X_tokenized.join(X)
-    return output, None
+    return output


 def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
@@ -292,7 +314,7 @@ def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
         *tuple([first_sentences, second_sentences]),
         truncation=True,
         max_length=hf_args.max_seq_length if hf_args else None,
-        padding=False,
+        padding="max_length" if hf_args and hf_args.pad_to_max_length else False,
     )
     tmp_column_names = sorted(tokenized_example.keys())
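Both padding changes above (tokenize_row and tokenize_swag) make "max_length" padding opt-in via hf_args.pad_to_max_length; by default sequences now stay unpadded and the data collator pads each batch to its own longest member, which is cheaper for short batches. A hedged sketch of the two behaviors with a plain tokenizer and an arbitrary checkpoint:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    text = "FLAML tunes transformers"

    dynamic = tokenizer(text, padding=False, max_length=128, truncation=True)
    fixed = tokenizer(text, padding="max_length", max_length=128, truncation=True)

    assert len(dynamic["input_ids"]) < 128  # padded later, per batch, by the collator
    assert len(fixed["input_ids"]) == 128   # padded now, to the full max_seq_length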
@@ -318,13 +340,14 @@ def postprocess_prediction_and_true(
         # If y_true is None, we use X to compute y_is_pad (i.e., whether y_true is -100 in that position), and use y_is_pad to remove the -100 in the prediction, and return the postprocessed prediction (not the y_true)
         y_predict = pd.Series(np.argmax(y_pred, axis=2).tolist())
         if y_true is None:
-            _, y_is_pad = tokenize_text(
+            _, y_is_pad_df = tokenize_text(
                 X,
                 y_predict,
                 task=task,
                 hf_args=hf_args,
                 tokenizer=tokenizer,
             )
+            y_is_pad = y_is_pad_df.iloc[:, 0]
         else:
             y_is_pad = y_true
         label_len = len(hf_args.label_list)

setup.py (4 lines changed)
@@ -59,7 +59,7 @@ setuptools.setup(
             "statsmodels>=0.12.2",
             "psutil==5.8.0",
             "dataclasses",
-            "transformers[torch]>=4.14",
+            "transformers[torch]==4.18",
             "datasets",
             "nltk",
             "rouge_score",
@@ -81,7 +81,7 @@ setuptools.setup(
             "vowpalwabbit",
         ],
         "nlp": [
-            "transformers[torch]>=4.14",
+            "transformers[torch]==4.18",
             "datasets",
             "nltk",
             "rouge_score",
@@ -24,11 +24,11 @@ def custom_metric(
         estimator._trainer = None
     else:
         trainer = estimator._trainer
+    X_test, y_test = estimator._tokenize_text(X_test)

     if y_test is not None:
-        X_test = estimator._preprocess(X_test)
-        eval_dataset = Dataset.from_pandas(TransformersEstimator._join(X_test, y_test))
+        eval_dataset = Dataset.from_pandas(X_test.join(y_test))
     else:
-        X_test = estimator._preprocess(X_test)
         eval_dataset = Dataset.from_pandas(X_test)

     estimator_metric_backup = estimator._metric