# autogen/flaml/nlp/utils.py


import argparse
from dataclasses import dataclass, field
from itertools import chain
from typing import Dict, Any
import numpy as np
from ..data import (
    SUMMARIZATION,
    SEQREGRESSION,
    SEQCLASSIFICATION,
    MULTICHOICECLASSIFICATION,
    TOKENCLASSIFICATION,
    NLG_TASKS,
)


def load_default_huggingface_metric_for_task(task):
    if task == SEQCLASSIFICATION:
return "accuracy"
elif task == SEQREGRESSION:
return "r2"
elif task == SUMMARIZATION:
return "rouge1"
elif task == MULTICHOICECLASSIFICATION:
return "accuracy"
elif task == TOKENCLASSIFICATION:
return "seqeval"
def get_auto_tokenizer(tokenizer_model_path, task):
    from transformers import AutoTokenizer

    if task == SUMMARIZATION:
        return AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=tokenizer_model_path,
            cache_dir=None,
            use_fast=True,
            revision="main",
            use_auth_token=None,
        )
    else:
        return AutoTokenizer.from_pretrained(tokenizer_model_path, use_fast=True)


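# Minimal usage sketch (the checkpoint name here is a hypothetical example):
#   tokenizer = get_auto_tokenizer("t5-small", SUMMARIZATION)
# Summarization loads the fast tokenizer with explicit defaults (main revision,
# no auth token); every other task uses the plain fast-tokenizer path.

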
def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        X_tokenized = tokenize_onedataframe(
            X,
            tokenizer=tokenizer,
            task=task,
            hf_args=hf_args,
            prefix_str="",
        )
        return X_tokenized, None
    elif task == TOKENCLASSIFICATION:
        return tokenize_text_tokclassification(
            X, Y, tokenizer=tokenizer, hf_args=hf_args
        )
    elif task in NLG_TASKS:
        return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args)
    elif task == MULTICHOICECLASSIFICATION:
        return tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args)


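# Dispatch sketch (X_df, y_ser, args, and tok are hypothetical stand-ins for a
# pandas DataFrame/Series, an HFArgs-like object with max_seq_length, and a tokenizer):
#   X_tok, _ = tokenize_text(X_df, task=SEQCLASSIFICATION, hf_args=args, tokenizer=tok)
#   X_tok, y_tok = tokenize_text(X_df, y_ser, task=TOKENCLASSIFICATION, hf_args=args, tokenizer=tok)

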
def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
    model_inputs = tokenize_onedataframe(
        X,
        tokenizer=tokenizer,
        task=task,
        hf_args=hf_args,
        prefix_str="summarize: ",
    )
    labels = None
    if Y is not None:
        labels = tokenize_onedataframe(
            Y.to_frame(),
            tokenizer=tokenizer,
            task=task,
            hf_args=hf_args,
            prefix_str="",
        )
        labels["label"] = [
            [(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label]
            for label in labels["input_ids"]
        ]
        labels = labels.drop(
            columns=["attention_mask", "input_ids", "decoder_input_ids"]
        )
    return model_inputs, labels


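# Note on the -100 masking above: HuggingFace heads typically compute the loss
# with CrossEntropyLoss(ignore_index=-100), so masked label positions are skipped.
# E.g. (made-up token ids, pad_token_id == 0): [52, 8, 1, 0, 0] -> [52, 8, 1, -100, -100].

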
def tokenize_and_align_labels(
    examples,
    tokenizer,
    hf_args=None,
    X_sent_key=None,
    Y_sent_key=None,
    return_column_name=False,
):
    tokenized_inputs = tokenizer(
        [list(examples[X_sent_key])],
        padding="max_length",
        truncation=True,
        max_length=hf_args.max_seq_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    if Y_sent_key is not None:
        previous_word_idx = None
        label_ids = []
        import numbers

        for word_idx in tokenized_inputs.word_ids(batch_index=0):
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                if isinstance(examples[Y_sent_key][word_idx], numbers.Number):
                    label_ids.append(examples[Y_sent_key][word_idx])
                # else:
                #     label_ids.append(label_to_id[label[word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                if isinstance(examples[Y_sent_key][word_idx], numbers.Number):
                    label_ids.append(examples[Y_sent_key][word_idx])
                # else:
                #     label_ids.append(b_to_i_label[label_to_id[label[word_idx]]])
            previous_word_idx = word_idx
        tokenized_inputs["label"] = label_ids
    tmp_column_names = sorted(tokenized_inputs.keys())
    tokenized_input_and_labels = [tokenized_inputs[x] for x in tmp_column_names]
    for key_idx, each_key in enumerate(tmp_column_names):
        if each_key != "label":
            tokenized_input_and_labels[key_idx] = tokenized_input_and_labels[key_idx][0]
    if return_column_name:
        return tokenized_input_and_labels, tmp_column_names
    else:
        return tokenized_input_and_labels


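# Alignment sketch (tokenization is illustrative): for words ["New", "York"]
# with labels [3, 4], a subword tokenizer might yield word_ids
# [None, 0, 1, 1, None] (special tokens map to None), giving
# label ids [-100, 3, 4, 4, -100].

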
def tokenize_text_tokclassification(X, Y, tokenizer, hf_args=None):
    import pandas as pd

    if Y is not None:
        X_and_Y = pd.concat([X, Y.to_frame()], axis=1)
        X_key = list(X.keys())[0]
        Y_key = list(Y.to_frame().keys())[0]
        _, tokenized_column_names = tokenize_and_align_labels(
            X_and_Y.iloc[0],
            tokenizer=tokenizer,
            hf_args=hf_args,
            X_sent_key=X_key,
            Y_sent_key=Y_key,
            return_column_name=True,
        )
        X_and_Y_tokenized = X_and_Y.apply(
            lambda x: tokenize_and_align_labels(
                x,
                tokenizer=tokenizer,
                hf_args=hf_args,
                X_sent_key=X_key,
                Y_sent_key=Y_key,
            ),
            axis=1,
            result_type="expand",
        )
        label_idx = tokenized_column_names.index("label")
        other_indices = sorted(
            set(range(len(tokenized_column_names))).difference({label_idx})
        )
        other_column_names = [tokenized_column_names[x] for x in other_indices]
        d = X_and_Y_tokenized.iloc[:, other_indices]
        y_tokenized = X_and_Y_tokenized.iloc[:, label_idx]
    else:
        X_key = list(X.keys())[0]
        _, tokenized_column_names = tokenize_and_align_labels(
            X.iloc[0],
            tokenizer=tokenizer,
            hf_args=hf_args,
            X_sent_key=X_key,
            Y_sent_key=None,
            return_column_name=True,
        )
        d = X.apply(
            lambda x: tokenize_and_align_labels(
                x,
                tokenizer=tokenizer,
                hf_args=hf_args,
                X_sent_key=X_key,
                Y_sent_key=None,
            ),
            axis=1,
            result_type="expand",
        )
        other_column_names = tokenized_column_names
        y_tokenized = None
    X_tokenized = pd.DataFrame(columns=other_column_names)
    X_tokenized[other_column_names] = d
    return X_tokenized, y_tokenized


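# Usage sketch (hypothetical inputs; y_ser is a Series of per-word label lists):
#   X_tok, y_tok = tokenize_text_tokclassification(X_df, y_ser, tokenizer=tok, hf_args=args)
# The first row is tokenized once just to discover the output column names;
# the whole frame is then tokenized via DataFrame.apply(result_type="expand").

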
def tokenize_onedataframe(
    X,
    tokenizer,
    task=None,
    hf_args=None,
    prefix_str=None,
):
    import pandas

    with tokenizer.as_target_tokenizer():
        _, tokenized_column_names = tokenize_row(
            dict(X.iloc[0]),
            tokenizer,
            prefix=(prefix_str,) if task is SUMMARIZATION else None,
            task=task,
            hf_args=hf_args,
            return_column_name=True,
        )
        d = X.apply(
            lambda x: tokenize_row(
                x,
                tokenizer,
                prefix=(prefix_str,) if task is SUMMARIZATION else None,
                task=task,
                hf_args=hf_args,
            ),
            axis=1,
            result_type="expand",
        )
        X_tokenized = pandas.DataFrame(columns=tokenized_column_names)
        X_tokenized[tokenized_column_names] = d
        return X_tokenized


def postprocess_text(preds, labels):
    import nltk

    nltk.download("punkt")
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    # rougeLSum expects a newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
    return preds, labels


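# Illustrative round trip (actual sentence splits depend on the punkt model):
#   postprocess_text(["First point. Second point."], ["One half. Other half."])
#   # -> (["First point.\nSecond point."], ["One half.\nOther half."])

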
def tokenize_row(
    this_row,
    tokenizer,
    prefix=None,
    task=None,
    hf_args=None,
    return_column_name=False,
):
    assert (
        "max_seq_length" in hf_args.__dict__
    ), "max_seq_length must be provided for glue"
    if prefix:
        this_row = tuple(["".join(x) for x in zip(prefix, this_row)])
    # tokenizer.pad_token = tokenizer.eos_token
    tokenized_example = tokenizer(
        *tuple(this_row),
        padding="max_length",
        max_length=hf_args.max_seq_length,
        truncation=True,
    )
    if task in NLG_TASKS:
        tokenized_example["decoder_input_ids"] = tokenized_example["input_ids"]
    tmp_column_names = sorted(tokenized_example.keys())
    if return_column_name:
        return [tokenized_example[x] for x in tmp_column_names], tmp_column_names
    else:
        return [tokenized_example[x] for x in tmp_column_names]


def tokenize_text_multiplechoice(X, tokenizer, hf_args=None):
    import pandas

    t = X[["sent1", "sent2", "ending0", "ending1", "ending2", "ending3"]]
    _, tokenized_column_names = tokenize_swag(
        t.iloc[0],
        tokenizer=tokenizer,
        hf_args=hf_args,
        return_column_name=True,
    )
    d = t.apply(
        lambda x: tokenize_swag(x, tokenizer=tokenizer, hf_args=hf_args),
        axis=1,
        result_type="expand",
    )
    X_tokenized = pandas.DataFrame(columns=tokenized_column_names)
    X_tokenized[tokenized_column_names] = d
    output = X_tokenized.join(X)
    return output, None


def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
    # Repeat the first sentence four times, once for each candidate ending.
    first_sentences = [[this_row["sent1"]] * 4]
    # sent2 is the shared stem that begins the second sentence.
    question_headers = this_row["sent2"]
    # Each second sentence combines the stem with one of the four candidate endings.
    second_sentences = [
        question_headers + " " + this_row[key]
        for key in ["ending0", "ending1", "ending2", "ending3"]
    ]
    # Flatten from a 2-dimensional to a 1-dimensional list.
    first_sentences = list(chain(*first_sentences))
    tokenized_example = tokenizer(
        *tuple([first_sentences, second_sentences]),
        truncation=True,
        max_length=hf_args.max_seq_length,
        padding=False,
    )
    tmp_column_names = sorted(tokenized_example.keys())
    if return_column_name:
        return [tokenized_example[x] for x in tmp_column_names], tmp_column_names
    else:
        return [tokenized_example[x] for x in tmp_column_names]


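# SWAG-style row sketch (field values are made up): for a row with
# sent1="She opened the door", sent2="She" and endings ending0..ending3,
# the tokenizer receives four (sent1, "She " + ending_i) pairs, i.e. one
# candidate encoding per ending for the multiple-choice head.

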
def separate_config(config, task):
    if task in NLG_TASKS:
        from transformers import Seq2SeqTrainingArguments, TrainingArguments

        trainargs_class_list = [Seq2SeqTrainingArguments, TrainingArguments]
    else:
        from transformers import TrainingArguments

        trainargs_class_list = [TrainingArguments]
    training_args_config = {}
    per_model_config = {}
    for key, val in config.items():
        is_in_training_args = any(key in x.__dict__ for x in trainargs_class_list)
        if is_in_training_args:
            training_args_config[key] = val
        else:
            per_model_config[key] = val
    return training_args_config, per_model_config


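# Split sketch (keys are hypothetical): given
#   config = {"learning_rate": 1e-5, "num_train_epochs": 3, "hidden_dropout_prob": 0.1}
# "learning_rate" and "num_train_epochs" are TrainingArguments attributes, so they
# land in training_args_config; "hidden_dropout_prob" goes to per_model_config.

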
def get_num_labels(task, y_train):
    from ..data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION

    if task == SEQREGRESSION:
        return 1
    elif task == SEQCLASSIFICATION:
        return len(set(y_train))
    elif task == TOKENCLASSIFICATION:
        return len(set([a for b in y_train.tolist() for a in b]))
    else:
        return None


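# Illustrative label counts (pd is pandas; the data is made up):
#   get_num_labels(SEQCLASSIFICATION, pd.Series([0, 1, 2, 1]))        # -> 3
#   get_num_labels(TOKENCLASSIFICATION, pd.Series([[0, 1], [1, 2]]))  # -> 3

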
def is_a_list_of_str(this_obj):
    return isinstance(this_obj, (list, np.ndarray)) and all(
        isinstance(x, str) for x in this_obj
    )


def _clean_value(value: Any) -> str:
    if isinstance(value, float):
        return "{:.5}".format(value)
    else:
        return str(value).replace("/", "_")


def format_vars(resolved_vars: Dict) -> str:
    """Formats the resolved variable dict into a single string."""
    out = []
    for path, value in sorted(resolved_vars.items()):
        if path[0] in ["run", "env", "resources_per_trial"]:
            continue  # TrialRunner already has these in the experiment_tag
        pieces = []
        last_string = True
        for k in path[::-1]:
            if isinstance(k, int):
                pieces.append(str(k))
            elif last_string:
                last_string = False
                pieces.append(k)
        pieces.reverse()
        out.append(_clean_value("_".join(pieces)) + "=" + _clean_value(value))
    return ",".join(out)


counter = 0


def date_str():
    from datetime import datetime

    return datetime.today().strftime("%Y-%m-%d_%H-%M-%S")


def _generate_dirname(experiment_tag, trial_id):
    generated_dirname = f"train_{str(trial_id)}_{experiment_tag}"
    generated_dirname = generated_dirname[:130]
    generated_dirname += f"_{date_str()}"
    return generated_dirname.replace("/", "_")


def get_logdir_name(dirname, local_dir):
    import os

    local_dir = os.path.expanduser(local_dir)
    logdir = os.path.join(local_dir, dirname)
    return logdir


class Counter:
    counter = 0

    @staticmethod
    def get_trial_fold_name(local_dir, trial_config, trial_id):
        Counter.counter += 1
        experiment_tag = "{0}_{1}".format(
            str(Counter.counter), format_vars(trial_config)
        )
        logdir = get_logdir_name(
            _generate_dirname(experiment_tag, trial_id=trial_id), local_dir
        )
        return logdir


def load_model(checkpoint_path, task, num_labels, per_model_config=None):
    import transformers

    transformers.logging.set_verbosity_error()
    from transformers import AutoConfig
    from .huggingface.switch_head_auto import (
        AutoSeqClassificationHead,
        MODEL_CLASSIFICATION_HEAD_MAPPING,
    )
    from ..data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION

    def get_this_model(task, model_config):
        from transformers import AutoModelForSequenceClassification
        from transformers import AutoModelForSeq2SeqLM
        from transformers import AutoModelForMultipleChoice
        from transformers import AutoModelForTokenClassification

        if task in (SEQCLASSIFICATION, SEQREGRESSION):
            return AutoModelForSequenceClassification.from_pretrained(
                checkpoint_path, config=model_config
            )
        elif task == TOKENCLASSIFICATION:
            return AutoModelForTokenClassification.from_pretrained(
                checkpoint_path, config=model_config
            )
        elif task in NLG_TASKS:
            return AutoModelForSeq2SeqLM.from_pretrained(
                checkpoint_path, config=model_config
            )
        elif task == MULTICHOICECLASSIFICATION:
            return AutoModelForMultipleChoice.from_pretrained(
                checkpoint_path, config=model_config
            )

    def is_pretrained_model_in_classification_head_list(model_type):
        return model_type in MODEL_CLASSIFICATION_HEAD_MAPPING

    def _set_model_config(checkpoint_path):
        if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
            if per_model_config:
                model_config = AutoConfig.from_pretrained(
                    checkpoint_path,
                    num_labels=model_config_num_labels,
                    **per_model_config,
                )
            else:
                model_config = AutoConfig.from_pretrained(
                    checkpoint_path, num_labels=model_config_num_labels
                )
            return model_config
        else:
            if per_model_config:
                model_config = AutoConfig.from_pretrained(
                    checkpoint_path,
                    **per_model_config,
                )
            else:
                model_config = AutoConfig.from_pretrained(checkpoint_path)
            return model_config

    current_config = AutoConfig.from_pretrained(checkpoint_path)
    this_model_type, this_vocab_size = (
        current_config.model_type,
        current_config.vocab_size,
    )

    if task == SEQCLASSIFICATION:
        num_labels_old = current_config.num_labels
        if is_pretrained_model_in_classification_head_list(this_model_type):
            model_config_num_labels = num_labels_old
        else:
            model_config_num_labels = num_labels
        new_config = _set_model_config(checkpoint_path)

        if is_pretrained_model_in_classification_head_list(this_model_type):
            if num_labels != num_labels_old:
                this_model = get_this_model(task, new_config)
                new_config.num_labels = num_labels
                this_model.num_labels = num_labels
                this_model.classifier = (
                    AutoSeqClassificationHead.from_model_type_and_config(
                        this_model_type, new_config
                    )
                )
            else:
                this_model = get_this_model(task, new_config)
        else:
            this_model = get_this_model(task, new_config)
        this_model.resize_token_embeddings(this_vocab_size)
        return this_model
    else:
        if task == SEQREGRESSION:
            model_config_num_labels = 1
        elif task == TOKENCLASSIFICATION:
            model_config_num_labels = num_labels
        model_config = _set_model_config(checkpoint_path)
        this_model = get_this_model(task, model_config)
        return this_model


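# Usage sketch (checkpoint and label count are hypothetical):
#   model = load_model("bert-base-uncased", SEQCLASSIFICATION, num_labels=3)
# If the model type has a registered classification head and the label count
# differs from the checkpoint's, the head is swapped as above; otherwise the
# model loads as-is, with token embeddings resized to the checkpoint vocabulary.

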
def compute_checkpoint_freq(
    train_data_size,
    hf_args,
    num_train_epochs,
    batch_size,
):
    ckpt_step_freq = (
        int(
            min(num_train_epochs, 1)
            * train_data_size
            / batch_size
            / hf_args.ckpt_per_epoch
        )
        + 1
    )
    return ckpt_step_freq


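# Worked example (made-up numbers): train_data_size=10000, batch_size=32,
# num_train_epochs >= 1, ckpt_per_epoch=2 gives
# int(1 * 10000 / 32 / 2) + 1 = int(156.25) + 1 = 157 steps between checkpoints.

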
@dataclass
class HFArgs:
    """The HPO setting.

    Args:
        output_dir (str): data root directory for outputting the log, etc.
        model_path (str, optional, defaults to "facebook/muppet-roberta-base"): A string,
            the path of the language model file, either a path from the huggingface
            model card huggingface.co/models, or a local path for the model.
        tokenizer_model_path (str, optional, defaults to None): A string, the path of
            the tokenizer model.
        fp16 (bool, optional, defaults to True): A bool, whether to use FP16.
        max_seq_length (int, optional, defaults to 128): An integer, the max length of the sequence.
        pad_to_max_length (bool, optional, defaults to True): A bool, whether to pad all
            samples to the model's maximum sequence length.
        ckpt_per_epoch (int, optional, defaults to 1): An integer, the number of checkpoints per epoch.
        per_device_eval_batch_size (int, optional, defaults to 1): An integer, the per-GPU
            evaluation batch size.
    """

    output_dir: str = field(
        default="data/output/", metadata={"help": "data dir", "required": True}
    )
    model_path: str = field(
        default="facebook/muppet-roberta-base",
        metadata={"help": "model path for HPO"},
    )
    tokenizer_model_path: str = field(
        default=None,
        metadata={"help": "tokenizer model path for HPO"},
    )
    fp16: bool = field(default=True, metadata={"help": "whether to use the FP16 mode"})
    max_seq_length: int = field(default=128, metadata={"help": "max seq length"})
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to the model's maximum sentence length. "
            "If False, samples are padded dynamically when batching to the maximum length "
            "in the batch. More efficient on GPU but very bad for TPU."
        },
    )
    ckpt_per_epoch: int = field(default=1, metadata={"help": "checkpoint per epoch"})
    per_device_eval_batch_size: int = field(
        default=1,
        metadata={"help": "per gpu evaluation batch size"},
    )

    @staticmethod
    def load_args():
        from dataclasses import fields

        arg_parser = argparse.ArgumentParser()
        for each_field in fields(HFArgs):
            arg_parser.add_argument(
                "--" + each_field.name,
                type=each_field.type,
                help=each_field.metadata["help"],
                required=each_field.metadata["required"]
                if "required" in each_field.metadata
                else False,
                choices=each_field.metadata["choices"]
                if "choices" in each_field.metadata
                else None,
                default=each_field.default,
            )
        console_args, unknown = arg_parser.parse_known_args()
        return console_args


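# Hypothetical command line consumed by load_args():
#   python train.py --output_dir data/output/ --model_path roberta-base \
#       --max_seq_length 256 --ckpt_per_epoch 2
# Unknown flags are tolerated because parse_known_args() is used.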