from itertools import chain
from typing import Dict, Any

import numpy as np
import pandas as pd

from ..data import (
    SUMMARIZATION,
    SEQREGRESSION,
    SEQCLASSIFICATION,
    MULTICHOICECLASSIFICATION,
    TOKENCLASSIFICATION,
    NLG_TASKS,
)


def load_default_huggingface_metric_for_task(task):
    """Return the default Hugging Face metric name for the given task."""
    if task == SEQCLASSIFICATION:
        return "accuracy"
    elif task == SEQREGRESSION:
        return "r2"
    elif task == SUMMARIZATION:
        return "rouge1"
    elif task == MULTICHOICECLASSIFICATION:
        return "accuracy"
    elif task == TOKENCLASSIFICATION:
        return "seqeval"


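# Example (illustrative, not part of the module):
#
#     load_default_huggingface_metric_for_task(SEQCLASSIFICATION)  # -> "accuracy"
#     load_default_huggingface_metric_for_task(SUMMARIZATION)      # -> "rouge1"

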
def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        X_tokenized = tokenize_onedataframe(
            X,
            tokenizer=tokenizer,
            task=task,
            hf_args=hf_args,
            prefix_str="",
        )
        return X_tokenized, None
    elif task == TOKENCLASSIFICATION:
        return tokenize_text_tokclassification(
            X, Y, tokenizer=tokenizer, hf_args=hf_args
        )
    elif task in NLG_TASKS:
        return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args)
    elif task == MULTICHOICECLASSIFICATION:
        return tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args)


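# Example (illustrative sketch; the tokenizer checkpoint and data below are
# assumptions, not part of this module): tokenize a sequence-classification
# DataFrame.
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
#     X = pd.DataFrame({"sentence": ["a good movie", "a bad movie"]})
#     X_tokenized, _ = tokenize_text(X, task=SEQCLASSIFICATION, tokenizer=tok)
#     # X_tokenized has one column per tokenizer output, e.g. "input_ids"

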
def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
    model_inputs = tokenize_onedataframe(
        X,
        tokenizer=tokenizer,
        task=task,
        hf_args=hf_args,
        prefix_str="summarize: ",
    )
    model_outputs = None
    if Y is not None:
        model_outputs = tokenize_onedataframe(
            Y.to_frame(),
            tokenizer=tokenizer,
            task=task,
            hf_args=hf_args,
            prefix_str="",
        )
        # Replace padding token ids in the labels with -100 so that padding
        # positions are ignored by the cross-entropy loss.
        model_outputs["label"] = [
            [(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label]
            for label in model_outputs["input_ids"]
        ]
        model_outputs = model_outputs.drop(
            columns=["attention_mask", "input_ids", "decoder_input_ids"]
        )
    return model_inputs, model_outputs


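# Example (illustrative): if tokenizer.pad_token_id == 0, a label row
# [363, 19, 3, 0, 0] becomes [363, 19, 3, -100, -100], so the padding
# positions do not contribute to the loss.

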
def tokenize_and_align_labels(
    examples,
    tokenizer,
    label_to_id,
    b_to_i_label,
    hf_args=None,
    X_sent_key=None,
    Y_sent_key=None,
    return_column_name=False,
):
    tokenized_inputs = tokenizer(
        [list(examples[X_sent_key])],
        padding="max_length"
        if hf_args and hf_args.pad_to_max_length
        else False,  # to be consistent with https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner.py#L394
        truncation=True,
        max_length=hf_args.max_seq_length if hf_args else None,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    if Y_sent_key is not None:
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=0):
            # Special tokens have no word id; label them -100 so they are
            # ignored by the loss.
            if word_idx is None:
                label_ids.append(-100)
            # The first subtoken of each word gets the word's label.
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[examples[Y_sent_key][word_idx]])
            # For the other subtokens of a word, the label_all_tokens flag
            # controls whether to copy the word's label or to mask with -100.
            else:
                if hf_args.label_all_tokens:
                    # If a B- word is split into multiple subtokens, map the additional subtokens to I-
                    label_ids.append(
                        b_to_i_label[label_to_id[examples[Y_sent_key][word_idx]]]
                    )
                else:
                    label_ids.append(-100)
            previous_word_idx = word_idx
        tokenized_inputs["labels"] = label_ids
    tmp_column_names = sorted(tokenized_inputs.keys())
    tokenized_input_and_labels = [tokenized_inputs[x] for x in tmp_column_names]
    for key_idx, each_key in enumerate(tmp_column_names):
        if each_key != "labels":
            tokenized_input_and_labels[key_idx] = tokenized_input_and_labels[key_idx][0]
    if return_column_name:
        return tokenized_input_and_labels, tmp_column_names
    else:
        return tokenized_input_and_labels


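# Example (illustrative): for the words ["New", "York"] with labels
# ["B-LOC", "I-LOC"], a subword tokenizer may produce
# [CLS] New Yo ##rk [SEP] with word_ids [None, 0, 1, 1, None]. The aligned
# label ids are then [-100, B-LOC, I-LOC, -100, -100], or, with
# label_all_tokens=True, [-100, B-LOC, I-LOC, I-LOC, -100].

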
def tokenize_text_tokclassification(X, Y, tokenizer, hf_args=None):
    # Prepare label_to_id and b_to_i_label so that, when the label_all_tokens
    # flag is True, B- labels can be converted to their I- counterparts.
    label_to_id = {i: i for i in range(len(hf_args.label_list))}
    b_to_i_label = []
    for idx, label in enumerate(hf_args.label_list):
        if label.startswith("B-") and label.replace("B-", "I-") in hf_args.label_list:
            b_to_i_label.append(hf_args.label_list.index(label.replace("B-", "I-")))
        else:
            b_to_i_label.append(idx)

    if Y is not None:
        X_and_Y = pd.concat([X, Y.to_frame()], axis=1)
        X_key = list(X.keys())[0]
        Y_key = list(Y.to_frame().keys())[0]
        # Tokenize the first row once, only to recover the output column names.
        _, tokenized_column_names = tokenize_and_align_labels(
            X_and_Y.iloc[0],
            tokenizer=tokenizer,
            hf_args=hf_args,
            X_sent_key=X_key,
            Y_sent_key=Y_key,
            return_column_name=True,
            label_to_id=label_to_id,
            b_to_i_label=b_to_i_label,
        )
        X_and_Y_tokenized = X_and_Y.apply(
            lambda x: tokenize_and_align_labels(
                x,
                tokenizer=tokenizer,
                hf_args=hf_args,
                X_sent_key=X_key,
                Y_sent_key=Y_key,
                label_to_id=label_to_id,
                b_to_i_label=b_to_i_label,
            ),
            axis=1,
            result_type="expand",
        )
        label_idx = tokenized_column_names.index("labels")
        other_indices = sorted(
            set(range(len(tokenized_column_names))).difference({label_idx})
        )
        other_column_names = [tokenized_column_names[x] for x in other_indices]
        d = X_and_Y_tokenized.iloc[:, other_indices]
        y_tokenized = X_and_Y_tokenized.iloc[:, label_idx]
    else:
        X_key = list(X.keys())[0]

        _, tokenized_column_names = tokenize_and_align_labels(
            X.iloc[0],
            tokenizer=tokenizer,
            hf_args=hf_args,
            X_sent_key=X_key,
            Y_sent_key=None,
            return_column_name=True,
            label_to_id=label_to_id,
            b_to_i_label=b_to_i_label,
        )

        d = X.apply(
            lambda x: tokenize_and_align_labels(
                x,
                tokenizer=tokenizer,
                hf_args=hf_args,
                X_sent_key=X_key,
                Y_sent_key=None,
                label_to_id=label_to_id,
                b_to_i_label=b_to_i_label,
            ),
            axis=1,
            result_type="expand",
        )
        other_column_names = tokenized_column_names
        y_tokenized = None
    X_tokenized = pd.DataFrame(columns=other_column_names)
    X_tokenized[other_column_names] = d
    return X_tokenized, y_tokenized


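# Example (illustrative): with label_list ["O", "B-LOC", "I-LOC"],
# label_to_id is the identity map {0: 0, 1: 1, 2: 2} (labels are already
# encoded as ids) and b_to_i_label is [0, 2, 2], i.e. "B-LOC" (index 1)
# maps to "I-LOC" (index 2).

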
def tokenize_onedataframe(
    X,
    tokenizer,
    task=None,
    hf_args=None,
    prefix_str=None,
):
    with tokenizer.as_target_tokenizer():
        # Tokenize the first row once, only to recover the output column names.
        _, tokenized_column_names = tokenize_row(
            dict(X.iloc[0]),
            tokenizer,
            prefix=(prefix_str,) if task == SUMMARIZATION else None,
            task=task,
            hf_args=hf_args,
            return_column_name=True,
        )
        d = X.apply(
            lambda x: tokenize_row(
                x,
                tokenizer,
                prefix=(prefix_str,) if task == SUMMARIZATION else None,
                task=task,
                hf_args=hf_args,
            ),
            axis=1,
            result_type="expand",
        )
        X_tokenized = pd.DataFrame(columns=tokenized_column_names)
        X_tokenized[tokenized_column_names] = d
        return X_tokenized


def tokenize_row(
    this_row,
    tokenizer,
    prefix=None,
    task=None,
    hf_args=None,
    return_column_name=False,
):
    if prefix:
        this_row = tuple(["".join(x) for x in zip(prefix, this_row)])

    # tokenizer.pad_token = tokenizer.eos_token
    tokenized_example = tokenizer(
        *tuple(this_row),
        padding="max_length",
        max_length=hf_args.max_seq_length if hf_args else None,
        truncation=True,
    )
    if task in NLG_TASKS:
        tokenized_example["decoder_input_ids"] = tokenized_example["input_ids"]
    tmp_column_names = sorted(tokenized_example.keys())

    if return_column_name:
        return [tokenized_example[x] for x in tmp_column_names], tmp_column_names
    else:
        return [tokenized_example[x] for x in tmp_column_names]


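# Example (illustrative): for summarization, prefix=("summarize: ",) turns a
# row ("long article text",) into ("summarize: long article text",) before
# tokenization; zip truncates to the shorter of prefix and row.

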
def tokenize_text_multiplechoice(X, tokenizer, hf_args=None):
    t = X[["sent1", "sent2", "ending0", "ending1", "ending2", "ending3"]]
    _, tokenized_column_names = tokenize_swag(
        t.iloc[0],
        tokenizer=tokenizer,
        hf_args=hf_args,
        return_column_name=True,
    )
    d = t.apply(
        lambda x: tokenize_swag(x, tokenizer=tokenizer, hf_args=hf_args),
        axis=1,
        result_type="expand",
    )

    X_tokenized = pd.DataFrame(columns=tokenized_column_names)
    X_tokenized[tokenized_column_names] = d
    output = X_tokenized.join(X)
    return output, None


def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
    # Repeat the first sentence four times, once per candidate ending.
    first_sentences = [[this_row["sent1"]] * 4]
    # sent2 is the beginning (the noun part) of the second sentence.
    question_headers = this_row["sent2"]
    # The four candidate second sentences are formed by combining the noun
    # part with each of the four endings.
    second_sentences = [
        question_headers + " " + this_row[key]
        for key in ["ending0", "ending1", "ending2", "ending3"]
    ]

    # Flatten from a 2-dimensional to a 1-dimensional list.
    first_sentences = list(chain(*first_sentences))

    tokenized_example = tokenizer(
        *tuple([first_sentences, second_sentences]),
        truncation=True,
        max_length=hf_args.max_seq_length if hf_args else None,
        padding=False,
    )
    tmp_column_names = sorted(tokenized_example.keys())

    if return_column_name:
        return [tokenized_example[x] for x in tmp_column_names], tmp_column_names
    else:
        return [tokenized_example[x] for x in tmp_column_names]


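# Example (illustrative): a SWAG-style row with sent1 "A drum line passes a
# school", sent2 "The line", and endings "turns left", "stops", "speeds up",
# "marches on" is expanded to four (sent1, sent2 + ending) pairs, which the
# tokenizer encodes as one example with four candidate sequences.

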
def is_a_list_of_str(this_obj):
    return isinstance(this_obj, (list, np.ndarray)) and all(
        isinstance(x, str) for x in this_obj
    )


def _clean_value(value: Any) -> str:
    if isinstance(value, float):
        return "{:.5}".format(value)
    else:
        return str(value).replace("/", "_")


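# Example (illustrative): _clean_value(0.123456) -> "0.12346" (5 significant
# digits) and _clean_value("a/b") -> "a_b".

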
def format_vars(resolved_vars: Dict) -> str:
    """Formats the resolved variable dict into a single string."""
    out = []
    for path, value in sorted(resolved_vars.items()):
        if path[0] in ["run", "env", "resources_per_trial"]:
            continue  # TrialRunner already has these in the experiment_tag
        pieces = []
        last_string = True
        for k in path[::-1]:
            if isinstance(k, int):
                pieces.append(str(k))
            elif last_string:
                last_string = False
                pieces.append(k)
        pieces.reverse()
        out.append(_clean_value("_".join(pieces)) + "=" + _clean_value(value))
    return ",".join(out)


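# Example (illustrative):
#
#     format_vars({("learning_rate",): 1e-05, ("num_train_epochs",): 3})
#     # -> "learning_rate=1e-05,num_train_epochs=3"

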
counter = 0


def date_str():
    from datetime import datetime

    return datetime.today().strftime("%Y-%m-%d_%H-%M-%S")


def _generate_dirname(experiment_tag, trial_id):
    generated_dirname = f"train_{str(trial_id)}_{experiment_tag}"
    # Truncate to at most 130 characters before appending the timestamp.
    generated_dirname = generated_dirname[:130]
    generated_dirname += f"_{date_str()}"
    return generated_dirname.replace("/", "_")


def get_logdir_name(dirname, local_dir):
    import os

    local_dir = os.path.expanduser(local_dir)
    logdir = os.path.join(local_dir, dirname)
    return logdir


class Counter:
    counter = 0

    @staticmethod
    def get_trial_fold_name(local_dir, trial_config, trial_id):
        Counter.counter += 1
        experiment_tag = "{0}_{1}".format(
            str(Counter.counter), format_vars(trial_config)
        )
        logdir = get_logdir_name(
            _generate_dirname(experiment_tag, trial_id=trial_id), local_dir
        )
        return logdir


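# Example (illustrative; the config and trial id are assumptions): with
# local_dir "~/logs", trial_config {("learning_rate",): 1e-05} and trial_id
# "abc123", the first call returns a path like
# "<home>/logs/train_abc123_1_learning_rate=1e-05_<timestamp>".

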
def load_model(checkpoint_path, task, num_labels=None):
    import transformers

    transformers.logging.set_verbosity_error()

    from transformers import AutoConfig
    from .huggingface.switch_head_auto import (
        AutoSeqClassificationHead,
        MODEL_CLASSIFICATION_HEAD_MAPPING,
    )
    from ..data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION

    def get_this_model(checkpoint_path, task, model_config):
        from transformers import AutoModelForSequenceClassification
        from transformers import AutoModelForSeq2SeqLM
        from transformers import AutoModelForMultipleChoice
        from transformers import AutoModelForTokenClassification

        if task in (SEQCLASSIFICATION, SEQREGRESSION):
            return AutoModelForSequenceClassification.from_pretrained(
                checkpoint_path, config=model_config
            )
        elif task == TOKENCLASSIFICATION:
            return AutoModelForTokenClassification.from_pretrained(
                checkpoint_path, config=model_config
            )
        elif task in NLG_TASKS:
            return AutoModelForSeq2SeqLM.from_pretrained(
                checkpoint_path, config=model_config
            )
        elif task == MULTICHOICECLASSIFICATION:
            return AutoModelForMultipleChoice.from_pretrained(
                checkpoint_path, config=model_config
            )

    def is_pretrained_model_in_classification_head_list(model_type):
        return model_type in MODEL_CLASSIFICATION_HEAD_MAPPING

    def _set_model_config(checkpoint_path):
        if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
            model_config = AutoConfig.from_pretrained(
                checkpoint_path,
                num_labels=model_config_num_labels,
            )
            return model_config
        else:
            model_config = AutoConfig.from_pretrained(checkpoint_path)
            return model_config

    current_config = AutoConfig.from_pretrained(checkpoint_path)
    this_model_type, this_vocab_size = (
        current_config.model_type,
        current_config.vocab_size,
    )

    if task == SEQCLASSIFICATION:
        num_labels_old = current_config.num_labels
        if is_pretrained_model_in_classification_head_list(this_model_type):
            model_config_num_labels = num_labels_old
        else:
            model_config_num_labels = num_labels
        new_config = _set_model_config(checkpoint_path)

        if is_pretrained_model_in_classification_head_list(this_model_type):
            if num_labels != num_labels_old:
                # Load the model with the old head size, then swap in a new
                # classification head with the desired number of labels.
                this_model = get_this_model(checkpoint_path, task, new_config)
                new_config.num_labels = num_labels
                this_model.num_labels = num_labels
                this_model.classifier = (
                    AutoSeqClassificationHead.from_model_type_and_config(
                        this_model_type, new_config
                    )
                )
            else:
                this_model = get_this_model(checkpoint_path, task, new_config)
        else:
            this_model = get_this_model(checkpoint_path, task, new_config)
        this_model.resize_token_embeddings(this_vocab_size)
        return this_model
    else:
        if task == SEQREGRESSION:
            model_config_num_labels = 1
        elif task == TOKENCLASSIFICATION:
            model_config_num_labels = num_labels
        model_config = _set_model_config(checkpoint_path)
        this_model = get_this_model(checkpoint_path, task, model_config)
        return this_model


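# Example (illustrative sketch; the checkpoint name is an assumption):
#
#     model = load_model("bert-base-uncased", SEQCLASSIFICATION, num_labels=2)
#     # Returns an AutoModelForSequenceClassification with a 2-label head.

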
def postprocess_prediction_and_true(
    task, y_pred, tokenizer, hf_args, y_true=None, X=None
):
    # Postprocess the matrix prediction y_pred and the ground truth y_true
    # into user-readable format, e.g., decode into text for summarization.
    if task == SEQCLASSIFICATION:
        return np.argmax(y_pred, axis=1), y_true
    elif task == SEQREGRESSION:
        return np.squeeze(y_pred), y_true  # predictions.reshape((len(predictions),))
    elif task == TOKENCLASSIFICATION:
        assert (y_true is not None) or (
            X is not None
        ), "One of y_true and X must not be None"
        # If y_true is not None, use y_true to remove the -100 entries in the
        # prediction (postprocessing), and return the postprocessed y_true and
        # prediction.
        # If y_true is None, use X to compute y_is_pad (i.e., whether y_true is
        # -100 at that position), use y_is_pad to remove the -100 entries in
        # the prediction, and return the postprocessed prediction only.
        y_predict = pd.Series(np.argmax(y_pred, axis=2).tolist())
        if y_true is None:
            _, y_is_pad = tokenize_text(
                X,
                y_predict,
                task=task,
                hf_args=hf_args,
                tokenizer=tokenizer,
            )
        else:
            y_is_pad = y_true
        label_len = len(hf_args.label_list)
        zip_pred_ispad = [
            [(p, ispd) for (p, ispd) in zip(each_pred, each_is_pad) if ispd != -100]
            for (each_pred, each_is_pad) in zip(y_predict, y_is_pad)
        ]
        # To compute precision and recall, y_pred and y_true must be converted
        # to string labels (B-PER, I-PER, etc.), so that the category-based
        # precision/recall scores (i.e., PER, LOC, etc.) can be computed.
        y_pred_label = [
            [
                hf_args.label_list[p] if 0 <= p < label_len else -1
                for (p, ispd) in each_list
            ]
            for each_list in zip_pred_ispad
        ]
        if y_true is not None:
            y_true_label = [
                [tr for (p, tr) in each_list] for each_list in zip_pred_ispad
            ]
        else:
            y_true_label = None
        return y_pred_label, y_true_label
    elif task == SUMMARIZATION:
        if isinstance(y_pred, tuple):
            y_pred = np.argmax(y_pred[0], axis=2)
        decoded_preds = tokenizer.batch_decode(y_pred, skip_special_tokens=True)

        import nltk

        nltk.download("punkt")
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in decoded_preds]

        if y_true is not None:
            # Replace -100 in the labels with the pad token id before decoding.
            y_true_labels = np.where(y_true != -100, y_true, tokenizer.pad_token_id)
            decoded_y_true_labels = tokenizer.batch_decode(
                y_true_labels, skip_special_tokens=True
            )
            decoded_y_true_labels = [label.strip() for label in decoded_y_true_labels]
            decoded_y_true_labels = [
                "\n".join(nltk.sent_tokenize(label)) for label in decoded_y_true_labels
            ]
        else:
            decoded_y_true_labels = None

        return decoded_preds, decoded_y_true_labels
    elif task == MULTICHOICECLASSIFICATION:
        return np.argmax(y_pred, axis=1), y_true


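# Example (illustrative): for sequence classification, a (2, 3) score matrix
# [[0.1, 0.7, 0.2], [0.9, 0.05, 0.05]] is reduced by argmax to the class
# predictions [1, 0].

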
class LabelEncoderforTokenClassification:
    def fit_transform(self, y):
        # If the labels are tokens, convert them to ids.
        if any(isinstance(id, str) for id in y[0]):
            self.label_list = sorted(list(set().union(*y)))
            self._tokenlabel_to_id = {
                self.label_list[id]: id for id in range(len(self.label_list))
            }
            y = y.apply(lambda sent: [self._tokenlabel_to_id[token] for token in sent])
        # If the labels are not tokens, they must be ids.
        else:
            assert all(
                isinstance(id, (int, np.integer)) for id in y[0]
            ), "The labels must either be tokens or ids"
        return y

    def transform(self, y):
        if hasattr(self, "_tokenlabel_to_id"):
            y = y.apply(lambda sent: [self._tokenlabel_to_id[token] for token in sent])
        return y
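

# Example (illustrative):
#
#     encoder = LabelEncoderforTokenClassification()
#     y = pd.Series([["B-LOC", "O"], ["O", "B-LOC"]])
#     y_ids = encoder.fit_transform(y)
#     # encoder.label_list == ["B-LOC", "O"]; y_ids rows are [0, 1] and [1, 0]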