# autogen/flaml/nlp/utils.py

from itertools import chain
from typing import Dict, Any
import numpy as np

from ..data import (
    SUMMARIZATION,
    SEQREGRESSION,
    SEQCLASSIFICATION,
    MULTICHOICECLASSIFICATION,
    TOKENCLASSIFICATION,
    NLG_TASKS,
)


def load_default_huggingface_metric_for_task(task):
    """Return the name of the default HuggingFace metric for ``task``.

    Args:
        task: Task identifier, compared with ``==`` against the task constants
            imported from ``..data``.

    Returns:
        The metric name as a string, or ``None`` when ``task`` matches none of
        the constants handled here.
    """
    default_metrics = (
        (SEQCLASSIFICATION, "accuracy"),
        (SEQREGRESSION, "r2"),
        (SUMMARIZATION, "rouge1"),
        (MULTICHOICECLASSIFICATION, "accuracy"),
        (TOKENCLASSIFICATION, "seqeval"),
    )
    # Scan in declaration order so the first matching constant wins.
    for known_task, metric_name in default_metrics:
        if task == known_task:
            return metric_name
    return None


def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
    """Route ``X`` (and optionally ``Y``) to the tokenizer helper for ``task``.

    Args:
        X: Input features; forwarded unchanged to the task-specific helper.
        Y: Optional labels; only forwarded for token classification and for
            tasks in ``NLG_TASKS``.
        task: Task identifier compared against the constants from ``..data``.
        hf_args: Forwarded to the helper as ``hf_args``.
        tokenizer: Forwarded to the helper as ``tokenizer``.

    Returns:
        A ``(X_tokenized, y_tokenized)`` pair. The second element is always
        ``None`` for sequence classification/regression and multiple choice.
        ``None`` is returned (implicitly in the original) when ``task`` is not
        handled.
    """
    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        tokenized_features = tokenize_onedataframe(
            X,
            tokenizer=tokenizer,
            task=task,
            hf_args=hf_args,
            prefix_str="",
        )
        return tokenized_features, None

    if task == TOKENCLASSIFICATION:
        return tokenize_text_tokclassification(
            X, Y, tokenizer=tokenizer, hf_args=hf_args
        )

    if task in NLG_TASKS:
        return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args)

    if task == MULTICHOICECLASSIFICATION:
        return tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args)

    return None


def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
    """Tokenize sequence-to-sequence inputs and, when given, their targets.

    Args:
        X: Input frame passed to ``tokenize_onedataframe`` with the prefix
            string ``"summarize: "``.
        Y: Optional target object exposing ``to_frame()``; tokenized with an
            empty prefix string.
        tokenizer: Tokenizer forwarded to ``tokenize_onedataframe``; its
            ``pad_token_id`` marks the target positions to mask.
        task: Task identifier forwarded to ``tokenize_onedataframe``.
        hf_args: Forwarded to ``tokenize_onedataframe``.

    Returns:
        ``(model_inputs, labels)``. ``labels`` is ``None`` when ``Y`` is
        ``None``; otherwise it is the tokenized target frame with a ``"label"``
        column added and the ``attention_mask``, ``input_ids`` and
        ``decoder_input_ids`` columns dropped.
    """
    common_kwargs = dict(tokenizer=tokenizer, task=task, hf_args=hf_args)
    model_inputs = tokenize_onedataframe(X, prefix_str="summarize: ", **common_kwargs)
    if Y is None:
        return model_inputs, None

    labels = tokenize_onedataframe(Y.to_frame(), prefix_str="", **common_kwargs)

    def _mask_padding(token_ids):
        # Padding positions become -100; every other token id is kept as is.
        return [
            -100 if token_id == tokenizer.pad_token_id else token_id
            for token_id in token_ids
        ]

    labels["label"] = [_mask_padding(token_ids) for token_ids in labels["input_ids"]]
    unused_columns = ["attention_mask", "input_ids", "decoder_input_ids"]
    labels = labels.drop(columns=unused_columns)
    return model_inputs, labels


def tokenize_and_align_labels(
    examples,
    tokenizer,
    hf_args=None,
    X_sent_key=None,
    Y_sent_key=None,
    return_column_name=False,
):
    """Tokenize one pre-split sentence and align its word labels to tokens.

    Args:
        examples: Row-like mapping. ``examples[X_sent_key]`` is the sequence of
            words; when ``Y_sent_key`` is given, ``examples[Y_sent_key]`` is
            indexed by word position to obtain each word's label.
        tokenizer: Callable invoked on a one-sentence batch with
            ``is_split_into_words=True``. Its result must support
            ``word_ids(batch_index=0)``, ``keys()`` and item get/set.
        hf_args: Optional object providing ``pad_to_max_length`` and
            ``max_seq_length``. When ``None``, no padding is requested and
            ``max_length=None`` is passed to the tokenizer.
        X_sent_key: Key of the word sequence in ``examples``.
        Y_sent_key: Key of the label sequence in ``examples``, or ``None`` to
            skip label alignment.
        return_column_name: When true, also return the sorted column names.

    Returns:
        A list of column values ordered by sorted column name. Every column
        except ``"labels"`` has its batch dimension removed (element ``[0]``).
        With ``return_column_name=True`` a ``(values, column_names)`` tuple is
        returned instead.
    """
    import numbers

    words = [list(examples[X_sent_key])]
    if hf_args is None:
        # Mirror the sibling tokenizers, which tolerate a missing ``hf_args``.
        padding, max_length = False, None
    else:
        # to be consistent with https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner.py#L394
        padding = "max_length" if hf_args.pad_to_max_length else False
        max_length = hf_args.max_seq_length

    tokenized_inputs = tokenizer(
        words,
        padding=padding,
        truncation=True,
        max_length=max_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )

    if Y_sent_key is not None:
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=0):
            if word_idx is None:
                # Special tokens have a word id that is None. We set the label to -100 so they are
                # automatically ignored in the loss function.
                label_ids.append(-100)
                continue
            # Every token of a word (first or continuation) receives that word's label.
            word_label = examples[Y_sent_key][word_idx]
            # NOTE(review): non-numeric labels are skipped rather than mapped to ids, which
            # leaves ``labels`` shorter than the token sequence; callers appear to be expected
            # to pass integer-encoded labels — confirm.
            if isinstance(word_label, numbers.Number):
                label_ids.append(word_label)
        tokenized_inputs["labels"] = label_ids

    column_names = sorted(tokenized_inputs.keys())
    # The tokenizer output is batched (batch size 1); "labels" was built unbatched above.
    column_values = [
        tokenized_inputs[name] if name == "labels" else tokenized_inputs[name][0]
        for name in column_names
    ]
    if return_column_name:
        return column_values, column_names
    return column_values


def tokenize_text_tokclassification(X, Y, tokenizer, hf_args=None):
    """Tokenize a token-classification frame row by row.

    Args:
        X: DataFrame whose first column holds the word sequences.
        Y: Optional labels exposing ``to_frame()``; its first column is used
            as the label column. ``None`` skips label alignment.
        tokenizer: Forwarded to ``tokenize_and_align_labels``.
        hf_args: Forwarded to ``tokenize_and_align_labels``.

    Returns:
        ``(X_tokenized, y_tokenized)`` where ``X_tokenized`` is a DataFrame of
        the non-label tokenizer columns and ``y_tokenized`` is the ``"labels"``
        column of the expanded result, or ``None`` when ``Y`` is ``None``.
    """
    import pandas as pd

    if Y is None:
        frame = X
        text_key = list(X.keys())[0]
        label_key = None
    else:
        frame = pd.concat([X, Y.to_frame()], axis=1)
        text_key = list(X.keys())[0]
        label_key = list(Y.to_frame().keys())[0]

    def _encode_row(row, return_column_name=False):
        return tokenize_and_align_labels(
            row,
            tokenizer=tokenizer,
            hf_args=hf_args,
            X_sent_key=text_key,
            Y_sent_key=label_key,
            return_column_name=return_column_name,
        )

    # Probe the first row only to learn the (sorted) output column names.
    _, column_names = _encode_row(frame.iloc[0], return_column_name=True)
    expanded = frame.apply(_encode_row, axis=1, result_type="expand")

    if Y is None:
        feature_names = column_names
        features = expanded
        y_tokenized = None
    else:
        # Split the positional "labels" column away from the feature columns.
        label_position = column_names.index("labels")
        feature_positions = [
            position
            for position in range(len(column_names))
            if position != label_position
        ]
        feature_names = [column_names[position] for position in feature_positions]
        features = expanded.iloc[:, feature_positions]
        y_tokenized = expanded.iloc[:, label_position]

    X_tokenized = pd.DataFrame(columns=feature_names)
    X_tokenized[feature_names] = features
    return X_tokenized, y_tokenized


def tokenize_onedataframe(
    X,
    tokenizer,
    task=None,
    hf_args=None,
    prefix_str=None,
):
    """Tokenize every row of ``X`` with ``tokenize_row`` and collect the result.

    Args:
        X: DataFrame to tokenize; rows are passed to ``tokenize_row`` one at a
            time.
        tokenizer: Tokenizer forwarded to ``tokenize_row``. All tokenization
            runs inside its ``as_target_tokenizer()`` context.
        task: Task identifier. The prefix is only applied when it equals
            ``SUMMARIZATION``.
        hf_args: Forwarded to ``tokenize_row``.
        prefix_str: String wrapped in a one-element tuple and passed as the
            ``prefix`` of ``tokenize_row`` for summarization.

    Returns:
        A DataFrame with one column per (sorted) tokenizer output key.
    """
    import pandas

    # Compare by value: an identity check (``is``) only matches when the caller
    # passes the very same string object as the constant.
    prefix = (prefix_str,) if task == SUMMARIZATION else None

    with tokenizer.as_target_tokenizer():
        # Probe call: only the sorted output column names are used from it.
        _, tokenized_column_names = tokenize_row(
            dict(X.iloc[0]),
            tokenizer,
            prefix=prefix,
            task=task,
            hf_args=hf_args,
            return_column_name=True,
        )
        d = X.apply(
            lambda x: tokenize_row(
                x,
                tokenizer,
                prefix=prefix,
                task=task,
                hf_args=hf_args,
            ),
            axis=1,
            result_type="expand",
        )
        X_tokenized = pandas.DataFrame(columns=tokenized_column_names)
        X_tokenized[tokenized_column_names] = d
        return X_tokenized


def postprocess_text(preds, labels):
    """Strip predictions and references and put one sentence per line.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        ``(preds, labels)`` as two lists of strings in which the sentences
        found by ``nltk.sent_tokenize`` are joined with newlines.
    """
    import nltk

    # Fetch the sentence tokenizer data only when it is missing, instead of
    # calling the downloader on every invocation.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")

    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


def tokenize_row(
    this_row,
    tokenizer,
    prefix=None,
    task=None,
    hf_args=None,
    return_column_name=False,
):
    """Tokenize the items of one row with padding to ``max_length``.

    Args:
        this_row: Iterable whose items are unpacked as the positional
            arguments of ``tokenizer``.
        tokenizer: Callable returning a mapping of column name to value.
        prefix: Optional iterable of strings; when truthy, each is joined in
            front of the row item at the same position (``zip`` stops at the
            shorter of the two).
        task: Task identifier; for tasks in ``NLG_TASKS`` a
            ``decoder_input_ids`` entry pointing at ``input_ids`` is added.
        hf_args: Optional object providing ``max_seq_length``; ``None`` is
            passed as ``max_length`` when it is falsy.
        return_column_name: When true, also return the sorted column names.

    Returns:
        The tokenizer values ordered by sorted column name, or a
        ``(values, column_names)`` tuple when ``return_column_name`` is true.
    """
    if prefix:
        this_row = tuple("".join(pair) for pair in zip(prefix, this_row))

    texts = tuple(this_row)
    max_length = hf_args.max_seq_length if hf_args else None
    encoded = tokenizer(
        *texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    if task in NLG_TASKS:
        encoded["decoder_input_ids"] = encoded["input_ids"]

    column_names = sorted(encoded.keys())
    column_values = [encoded[name] for name in column_names]
    if return_column_name:
        return column_values, column_names
    return column_values


def tokenize_text_multiplechoice(X, tokenizer, hf_args=None):
    """Tokenize a multiple-choice frame with ``tokenize_swag``.

    Args:
        X: DataFrame containing the columns ``sent1``, ``sent2`` and
            ``ending0`` .. ``ending3``.
        tokenizer: Forwarded to ``tokenize_swag``.
        hf_args: Forwarded to ``tokenize_swag``.

    Returns:
        ``(output, None)`` where ``output`` is the tokenized frame joined with
        the original ``X``.
    """
    import pandas

    swag_columns = ["sent1", "sent2", "ending0", "ending1", "ending2", "ending3"]
    selected = X[swag_columns]

    # Probe the first row only to learn the (sorted) output column names.
    _, column_names = tokenize_swag(
        selected.iloc[0],
        tokenizer=tokenizer,
        hf_args=hf_args,
        return_column_name=True,
    )

    def _encode_row(row):
        return tokenize_swag(row, tokenizer=tokenizer, hf_args=hf_args)

    expanded = selected.apply(_encode_row, axis=1, result_type="expand")

    X_tokenized = pandas.DataFrame(columns=column_names)
    X_tokenized[column_names] = expanded
    return X_tokenized.join(X), None


def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
    """Tokenize one multiple-choice row as four sentence pairs.

    Args:
        this_row: Mapping with the keys ``sent1``, ``sent2`` and ``ending0`` ..
            ``ending3``.
        tokenizer: Callable taking the list of first sentences and the list of
            second sentences; returns a mapping of column name to value.
        hf_args: Optional object providing ``max_seq_length``; ``None`` is
            passed as ``max_length`` when it is falsy.
        return_column_name: When true, also return the sorted column names.

    Returns:
        The tokenizer values ordered by sorted column name, or a
        ``(values, column_names)`` tuple when ``return_column_name`` is true.
    """
    # The same first sentence is paired with each of the four endings.
    first_sentences = [this_row["sent1"]] * 4
    # ``sent2`` is the start of the second sentence; each ending completes it.
    header = this_row["sent2"]
    ending_keys = ("ending0", "ending1", "ending2", "ending3")
    second_sentences = [header + " " + this_row[key] for key in ending_keys]

    max_length = hf_args.max_seq_length if hf_args else None
    encoded = tokenizer(
        first_sentences,
        second_sentences,
        padding=False,
        max_length=max_length,
        truncation=True,
    )

    column_names = sorted(encoded.keys())
    column_values = [encoded[name] for name in column_names]
    if return_column_name:
        return column_values, column_names
    return column_values


def is_a_list_of_str(this_obj):
    """Return True when ``this_obj`` is a list or ndarray containing only ``str`` items.

    An empty list or array counts as a match; any other container type
    (including tuples and plain strings) does not.
    """
    if not isinstance(this_obj, (list, np.ndarray)):
        return False
    for item in this_obj:
        if not isinstance(item, str):
            return False
    return True


def _clean_value(value: Any) -> str:
    if isinstance(value, float):
        return "{:.5}".format(value)
    else:
        return str(value).replace("/", "_")


def format_vars(resolved_vars: Dict) -> str:
    """Formats the resolved variable dict into a single string.

    Args:
        resolved_vars: Mapping from a variable path to its value. A path is a
            sequence of keys (strings and/or integer indices); a plain string
            key is treated as a one-element path.

    Returns:
        Comma-separated ``name=value`` pairs in sorted key order. ``name`` is
        built from the last string component of the path plus any integer
        components that follow it, joined with ``_``. Paths whose first
        component is ``run``, ``env`` or ``resources_per_trial`` are skipped.
    """
    out = []
    for path, value in sorted(resolved_vars.items()):
        # A bare string would otherwise be indexed and iterated character by
        # character, reducing e.g. "learning_rate" to its last letter.
        path_parts = (path,) if isinstance(path, str) else path
        if path_parts[0] in ["run", "env", "resources_per_trial"]:
            continue  # TrialRunner already has these in the experiment_tag
        pieces = []
        last_string = True
        # Walk from the innermost key outwards: keep every integer index but
        # only the first (innermost) string component.
        for k in path_parts[::-1]:
            if isinstance(k, int):
                pieces.append(str(k))
            elif last_string:
                last_string = False
                pieces.append(k)
        pieces.reverse()
        out.append(_clean_value("_".join(pieces)) + "=" + _clean_value(value))
    return ",".join(out)


# NOTE(review): this module-level value is not referenced by any code visible
# in this file (the ``Counter`` class keeps its own class-level count);
# presumably kept for external importers — confirm before removing.
counter = 0


def date_str():
    """Return the current local date and time as ``YYYY-MM-DD_HH-MM-SS``."""
    from datetime import datetime

    timestamp_format = "%Y-%m-%d_%H-%M-%S"
    current_time = datetime.today()
    return current_time.strftime(timestamp_format)


def _generate_dirname(experiment_tag, trial_id):
    """Build a directory name for a trial.

    The name is ``train_<trial_id>_<experiment_tag>`` cut to 130 characters,
    followed by ``_`` and the current ``date_str()`` timestamp, with every
    ``/`` replaced by ``_``.
    """
    base_name = "train_{}_{}".format(str(trial_id), experiment_tag)
    # Truncate before adding the timestamp so the timestamp is never cut off.
    stamped_name = base_name[:130] + "_" + date_str()
    return stamped_name.replace("/", "_")


def get_logdir_name(dirname, local_dir):
    """Return ``dirname`` joined onto ``local_dir`` after ``~`` expansion.

    Args:
        dirname: Name of the directory to place under ``local_dir``.
        local_dir: Parent directory; a leading ``~`` is expanded with
            ``os.path.expanduser``.

    Returns:
        The joined path as a string. Nothing is created on disk.
    """
    import os

    return os.path.join(os.path.expanduser(local_dir), dirname)


class Counter:
    """Hands out numbered log directory names for trials."""

    # Number of directory names generated so far; shared across all callers.
    counter = 0

    @staticmethod
    def get_trial_fold_name(local_dir, trial_config, trial_id):
        """Return the log directory path for the next trial.

        Increments the class-level counter, builds an experiment tag from the
        counter and ``format_vars(trial_config)``, and joins the generated
        directory name onto ``local_dir``.
        """
        Counter.counter += 1
        experiment_tag = f"{Counter.counter}_{format_vars(trial_config)}"
        dirname = _generate_dirname(experiment_tag, trial_id=trial_id)
        return get_logdir_name(dirname, local_dir)


def load_model(checkpoint_path, task, num_labels=None):
    """Load a pretrained HuggingFace model suited to ``task``.

    Args:
        checkpoint_path: Passed to ``from_pretrained`` for both the config and
            the model.
        task: Task identifier selecting the ``AutoModelFor...`` class.
        num_labels: Number of labels wanted for sequence or token
            classification; ignored for the other tasks (sequence regression
            always uses 1).

    Returns:
        The loaded model, or ``None`` when ``task`` matches none of the
        handled tasks. For sequence classification the token embeddings are
        resized to the checkpoint config's ``vocab_size``; when the model type
        is listed in ``MODEL_CLASSIFICATION_HEAD_MAPPING`` and ``num_labels``
        differs from the checkpoint's, the classifier head is replaced.
    """
    import transformers

    transformers.logging.set_verbosity_error()

    from transformers import AutoConfig
    from .huggingface.switch_head_auto import (
        AutoSeqClassificationHead,
        MODEL_CLASSIFICATION_HEAD_MAPPING,
    )
    from ..data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION

    def _instantiate_model(config):
        from transformers import AutoModelForSequenceClassification
        from transformers import AutoModelForSeq2SeqLM
        from transformers import AutoModelForMultipleChoice
        from transformers import AutoModelForTokenClassification

        if task in (SEQCLASSIFICATION, SEQREGRESSION):
            auto_class = AutoModelForSequenceClassification
        elif task == TOKENCLASSIFICATION:
            auto_class = AutoModelForTokenClassification
        elif task in NLG_TASKS:
            auto_class = AutoModelForSeq2SeqLM
        elif task == MULTICHOICECLASSIFICATION:
            auto_class = AutoModelForMultipleChoice
        else:
            return None
        return auto_class.from_pretrained(checkpoint_path, config=config)

    def _build_config(config_num_labels=None):
        # Only the classification/regression style tasks override num_labels.
        if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
            return AutoConfig.from_pretrained(
                checkpoint_path,
                num_labels=config_num_labels,
            )
        return AutoConfig.from_pretrained(checkpoint_path)

    checkpoint_config = AutoConfig.from_pretrained(checkpoint_path)
    model_type = checkpoint_config.model_type
    vocab_size = checkpoint_config.vocab_size

    if task != SEQCLASSIFICATION:
        if task == SEQREGRESSION:
            config_num_labels = 1
        elif task == TOKENCLASSIFICATION:
            config_num_labels = num_labels
        else:
            # Not read by _build_config for the remaining tasks.
            config_num_labels = None
        return _instantiate_model(_build_config(config_num_labels))

    checkpoint_num_labels = checkpoint_config.num_labels
    has_known_head = model_type in MODEL_CLASSIFICATION_HEAD_MAPPING
    # Model types with a known head are loaded with the checkpoint's own label
    # count first; the head is swapped afterwards if the counts differ.
    model_config = _build_config(
        checkpoint_num_labels if has_known_head else num_labels
    )
    model = _instantiate_model(model_config)
    if has_known_head and num_labels != checkpoint_num_labels:
        model_config.num_labels = num_labels
        model.num_labels = num_labels
        model.classifier = AutoSeqClassificationHead.from_model_type_and_config(
            model_type, model_config
        )
    model.resize_token_embeddings(vocab_size)
    return model