mirror of
https://github.com/microsoft/autogen.git
synced 2025-07-25 01:41:01 +00:00
478 lines
16 KiB
Python
478 lines
16 KiB
Python
from itertools import chain
|
|
from typing import Dict, Any
|
|
import numpy as np
|
|
|
|
from ..data import (
|
|
SUMMARIZATION,
|
|
SEQREGRESSION,
|
|
SEQCLASSIFICATION,
|
|
MULTICHOICECLASSIFICATION,
|
|
TOKENCLASSIFICATION,
|
|
NLG_TASKS,
|
|
)
|
|
|
|
|
|
def load_default_huggingface_metric_for_task(task):
    """Return the name of the default metric for an NLP task.

    Args:
        task: Task identifier, compared by equality against the task
            constants imported from ``..data``.

    Returns:
        The metric name as a string, or ``None`` when ``task`` matches none
        of the tasks listed below.
    """
    default_metrics = (
        (SEQCLASSIFICATION, "accuracy"),
        (SEQREGRESSION, "r2"),
        (SUMMARIZATION, "rouge1"),
        (MULTICHOICECLASSIFICATION, "accuracy"),
        (TOKENCLASSIFICATION, "seqeval"),
    )
    # Linear scan with ``==`` (rather than a dict lookup) so that unhashable
    # ``task`` values still fall through to ``None`` instead of raising.
    for known_task, metric_name in default_metrics:
        if task == known_task:
            return metric_name
    return None
|
|
|
|
|
|
def tokenize_text(X, Y=None, task=None, hf_args=None, tokenizer=None):
    """Dispatch to the task-specific tokenization routine.

    Args:
        X: Input features handed to the task-specific tokenizer function.
        Y: Optional labels; only forwarded for token classification and the
            tasks in ``NLG_TASKS``.
        task: Task identifier selecting the routine.
        hf_args: Argument object forwarded unchanged to the routine.
        tokenizer: Tokenizer forwarded unchanged to the routine.

    Returns:
        A ``(X_tokenized, y_tokenized)`` pair as produced by the selected
        routine (``y_tokenized`` is ``None`` for sequence classification,
        sequence regression and multiple choice). ``None`` is returned when
        ``task`` matches none of the handled tasks.
    """
    if task in (SEQCLASSIFICATION, SEQREGRESSION):
        features = tokenize_onedataframe(
            X,
            tokenizer=tokenizer,
            task=task,
            hf_args=hf_args,
            prefix_str="",
        )
        return features, None

    if task == TOKENCLASSIFICATION:
        return tokenize_text_tokclassification(
            X, Y, tokenizer=tokenizer, hf_args=hf_args
        )

    if task in NLG_TASKS:
        return tokenize_seq2seq(X, Y, tokenizer=tokenizer, task=task, hf_args=hf_args)

    if task == MULTICHOICECLASSIFICATION:
        return tokenize_text_multiplechoice(X, tokenizer=tokenizer, hf_args=hf_args)

    return None
|
|
|
|
|
|
def tokenize_seq2seq(X, Y, tokenizer, task=None, hf_args=None):
    """Tokenize inputs and (optionally) targets for a sequence-to-sequence task.

    Args:
        X: DataFrame of input texts; tokenized with ``prefix_str="summarize: "``.
        Y: Optional target series (must support ``to_frame()``); tokenized
            with an empty prefix.
        tokenizer: Tokenizer; its ``pad_token_id`` is used to mask padding
            in the targets.
        task: Task identifier forwarded to ``tokenize_onedataframe``.
        hf_args: Argument object forwarded to ``tokenize_onedataframe``.

    Returns:
        ``(model_inputs, labels)``. ``labels`` is ``None`` when ``Y`` is
        ``None``; otherwise it is the tokenized target frame with a
        ``"label"`` column added and the ``attention_mask``, ``input_ids``
        and ``decoder_input_ids`` columns dropped.
    """
    model_inputs = tokenize_onedataframe(
        X,
        tokenizer=tokenizer,
        task=task,
        hf_args=hf_args,
        prefix_str="summarize: ",
    )
    if Y is None:
        return model_inputs, None

    target_frame = tokenize_onedataframe(
        Y.to_frame(),
        tokenizer=tokenizer,
        task=task,
        hf_args=hf_args,
        prefix_str="",
    )
    # Replace padding token ids with -100 in the target ids.
    masked_targets = []
    for token_ids in target_frame["input_ids"]:
        masked_targets.append(
            [
                (-100 if token_id == tokenizer.pad_token_id else token_id)
                for token_id in token_ids
            ]
        )
    target_frame["label"] = masked_targets
    labels = target_frame.drop(
        columns=["attention_mask", "input_ids", "decoder_input_ids"]
    )
    return model_inputs, labels
|
|
|
|
|
|
def tokenize_and_align_labels(
    examples,
    tokenizer,
    hf_args=None,
    X_sent_key=None,
    Y_sent_key=None,
    return_column_name=False,
):
    """Tokenize one pre-split sentence and align word-level labels to tokens.

    Args:
        examples: Mapping-like row; ``examples[X_sent_key]`` is the list of
            words and ``examples[Y_sent_key]`` the per-word labels.
        tokenizer: Callable tokenizer whose result supports
            ``word_ids(batch_index=0)``, ``keys()`` and item access.
        hf_args: Object providing ``pad_to_max_length`` and
            ``max_seq_length``. Although it defaults to ``None``, both
            attributes are read unconditionally, so it must be supplied.
        X_sent_key: Key of the word list in ``examples``.
        Y_sent_key: Key of the label list in ``examples``; when ``None`` no
            ``"labels"`` entry is produced.
        return_column_name: When true, also return the sorted key names.

    Returns:
        A list with one entry per tokenizer output key (in sorted key
        order), or ``(that_list, sorted_keys)`` when ``return_column_name``
        is true. Every entry except ``"labels"`` is unwrapped from its
        single-sentence batch.
    """
    # Pad to max length only when requested, to be consistent with
    # https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner.py#L394
    padding_mode = "max_length" if hf_args.pad_to_max_length else False
    encoding = tokenizer(
        [list(examples[X_sent_key])],
        padding=padding_mode,
        truncation=True,
        max_length=hf_args.max_seq_length,
        # The texts are lists of words (with a label for each word).
        is_split_into_words=True,
    )

    if Y_sent_key is not None:
        import numbers

        aligned_labels = []
        for word_position in encoding.word_ids(batch_index=0):
            if word_position is None:
                # Special tokens have no word id; -100 makes the loss
                # function ignore them.
                aligned_labels.append(-100)
                continue
            # The first token of a word and its continuation tokens are
            # treated alike: each receives the word's label. Labels that are
            # not numeric are skipped (nothing is appended for them).
            word_label = examples[Y_sent_key][word_position]
            if isinstance(word_label, numbers.Number):
                aligned_labels.append(word_label)
        encoding["labels"] = aligned_labels

    column_names = sorted(encoding.keys())
    # Tokenizer outputs are batched (batch of one); labels are already flat.
    columns = [
        encoding[name] if name == "labels" else encoding[name][0]
        for name in column_names
    ]

    if return_column_name:
        return columns, column_names
    return columns
|
|
|
|
|
|
def tokenize_text_tokclassification(X, Y, tokenizer, hf_args=None):
    """Tokenize a token-classification dataset row by row.

    Args:
        X: DataFrame whose first column holds the word lists.
        Y: Optional label series (must support ``to_frame()``); its first
            column holds the per-word labels.
        tokenizer: Tokenizer forwarded to ``tokenize_and_align_labels``.
        hf_args: Argument object forwarded to ``tokenize_and_align_labels``.

    Returns:
        ``(X_tokenized, y_tokenized)``: a DataFrame with one column per
        non-label tokenizer output, and the aligned-label column (``None``
        when ``Y`` is ``None``).
    """
    import pandas as pd

    has_labels = Y is not None
    text_key = list(X.keys())[0]
    if has_labels:
        frame = pd.concat([X, Y.to_frame()], axis=1)
        label_key = list(Y.to_frame().keys())[0]
    else:
        frame = X
        label_key = None

    # Probe the first row only to learn the (sorted) output column names.
    _, column_names = tokenize_and_align_labels(
        frame.iloc[0],
        tokenizer=tokenizer,
        hf_args=hf_args,
        X_sent_key=text_key,
        Y_sent_key=label_key,
        return_column_name=True,
    )
    tokenized = frame.apply(
        lambda row: tokenize_and_align_labels(
            row,
            tokenizer=tokenizer,
            hf_args=hf_args,
            X_sent_key=text_key,
            Y_sent_key=label_key,
        ),
        axis=1,
        result_type="expand",
    )

    if has_labels:
        # Split the expanded frame into the label column and everything else.
        label_position = column_names.index("labels")
        feature_positions = [
            pos for pos in range(len(column_names)) if pos != label_position
        ]
        feature_names = [column_names[pos] for pos in feature_positions]
        feature_values = tokenized.iloc[:, feature_positions]
        y_tokenized = tokenized.iloc[:, label_position]
    else:
        feature_names = column_names
        feature_values = tokenized
        y_tokenized = None

    X_tokenized = pd.DataFrame(columns=feature_names)
    X_tokenized[feature_names] = feature_values
    return X_tokenized, y_tokenized
|
|
|
|
|
|
def tokenize_onedataframe(
    X,
    tokenizer,
    task=None,
    hf_args=None,
    prefix_str=None,
):
    """Tokenize every row of ``X`` and collect the encodings in a DataFrame.

    Args:
        X: DataFrame of text columns; each row is passed to ``tokenize_row``.
        tokenizer: Tokenizer providing ``as_target_tokenizer()``; forwarded
            to ``tokenize_row``.
        task: Task identifier. The prefix is only applied when it equals
            ``SUMMARIZATION``.
        hf_args: Argument object forwarded to ``tokenize_row``.
        prefix_str: Text prepended to the first column for summarization.

    Returns:
        A DataFrame with one column per tokenizer output key (sorted by
        name) and one row per row of ``X``.
    """
    import pandas

    # Compare by value, not identity: ``task is SUMMARIZATION`` only works
    # when the caller happens to pass the very same string object, so an
    # equal but distinct string would silently lose the prefix.
    prefix = (prefix_str,) if task == SUMMARIZATION else None

    with tokenizer.as_target_tokenizer():
        # Probe call: the encoded values are discarded, only the sorted
        # output column names are kept.
        _, tokenized_column_names = tokenize_row(
            dict(X.iloc[0]),
            tokenizer,
            prefix=prefix,
            task=task,
            hf_args=hf_args,
            return_column_name=True,
        )
        d = X.apply(
            lambda x: tokenize_row(
                x,
                tokenizer,
                prefix=prefix,
                task=task,
                hf_args=hf_args,
            ),
            axis=1,
            result_type="expand",
        )
        X_tokenized = pandas.DataFrame(columns=tokenized_column_names)
        X_tokenized[tokenized_column_names] = d
        return X_tokenized
|
|
|
|
|
|
def postprocess_text(preds, labels):
    """Strip texts and put each sentence on its own line.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        ``(preds, labels)`` as two lists of strings in which the sentences
        found by ``nltk.sent_tokenize`` are joined with newlines.
    """
    import nltk

    # Requests the "punkt" resource on every call.
    nltk.download("punkt")

    def _one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(nltk.sent_tokenize(text))

    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    split_preds = [_one_sentence_per_line(text) for text in stripped_preds]
    split_labels = [_one_sentence_per_line(text) for text in stripped_labels]
    return split_preds, split_labels
|
|
|
|
|
|
def tokenize_row(
    this_row,
    tokenizer,
    prefix=None,
    task=None,
    hf_args=None,
    return_column_name=False,
):
    """Tokenize the items of a single row.

    Args:
        this_row: Iterable whose items are unpacked as positional arguments
            of ``tokenizer``.
        tokenizer: Callable tokenizer returning a mapping-like encoding.
        prefix: Optional iterable of strings; when truthy, each prefix is
            concatenated with the row item at the same position (the row is
            cut to the length of ``prefix`` by ``zip``).
        task: Task identifier; for tasks in ``NLG_TASKS`` the ``input_ids``
            are also stored under ``decoder_input_ids``.
        hf_args: Optional object providing ``max_seq_length``; when falsy,
            ``max_length=None`` is passed to the tokenizer.
        return_column_name: When true, also return the sorted key names.

    Returns:
        The encoding values ordered by sorted key name, or
        ``(values, sorted_keys)`` when ``return_column_name`` is true.
    """
    if prefix:
        this_row = tuple("".join(pair) for pair in zip(prefix, this_row))

    # tokenizer.pad_token = tokenizer.eos_token
    encoding = tokenizer(
        *this_row,
        padding="max_length",
        max_length=hf_args.max_seq_length if hf_args else None,
        truncation=True,
    )
    if task in NLG_TASKS:
        encoding["decoder_input_ids"] = encoding["input_ids"]

    column_names = sorted(encoding.keys())
    values = [encoding[name] for name in column_names]
    if return_column_name:
        return values, column_names
    return values
|
|
|
|
|
|
def tokenize_text_multiplechoice(X, tokenizer, hf_args=None):
    """Tokenize a multiple-choice dataset row by row.

    Args:
        X: DataFrame containing the columns ``sent1``, ``sent2`` and
            ``ending0`` .. ``ending3``.
        tokenizer: Tokenizer forwarded to ``tokenize_swag``.
        hf_args: Argument object forwarded to ``tokenize_swag``.

    Returns:
        ``(output, None)`` where ``output`` is the frame of tokenizer
        outputs joined with the original ``X``.
    """
    import pandas

    choice_columns = X[["sent1", "sent2", "ending0", "ending1", "ending2", "ending3"]]

    # Probe the first row only to learn the (sorted) output column names.
    _, column_names = tokenize_swag(
        choice_columns.iloc[0],
        tokenizer=tokenizer,
        hf_args=hf_args,
        return_column_name=True,
    )
    expanded = choice_columns.apply(
        lambda row: tokenize_swag(row, tokenizer=tokenizer, hf_args=hf_args),
        axis=1,
        result_type="expand",
    )

    X_tokenized = pandas.DataFrame(columns=column_names)
    X_tokenized[column_names] = expanded
    return X_tokenized.join(X), None
|
|
|
|
|
|
def tokenize_swag(this_row, tokenizer, hf_args=None, return_column_name=False):
    """Tokenize one multiple-choice row as four sentence pairs.

    Args:
        this_row: Mapping-like row with the keys ``sent1``, ``sent2`` and
            ``ending0`` .. ``ending3``.
        tokenizer: Callable taking the list of first sentences and the list
            of second sentences, returning a mapping-like encoding.
        hf_args: Optional object providing ``max_seq_length``; when falsy,
            ``max_length=None`` is passed to the tokenizer.
        return_column_name: When true, also return the sorted key names.

    Returns:
        The encoding values ordered by sorted key name, or
        ``(values, sorted_keys)`` when ``return_column_name`` is true.
    """
    ending_keys = ("ending0", "ending1", "ending2", "ending3")

    # The context sentence is repeated once per candidate ending.
    contexts = [this_row["sent1"]] * 4
    # "sent2" is the start of the second sentence; each candidate ending
    # completes it.
    header = this_row["sent2"]
    candidates = [header + " " + this_row[key] for key in ending_keys]

    encoding = tokenizer(
        contexts,
        candidates,
        truncation=True,
        max_length=hf_args.max_seq_length if hf_args else None,
        padding=False,
    )
    column_names = sorted(encoding.keys())
    values = [encoding[name] for name in column_names]

    if return_column_name:
        return values, column_names
    return values
|
|
|
|
|
|
def is_a_list_of_str(this_obj):
    """Tell whether ``this_obj`` is a list or numpy array holding only strings.

    Args:
        this_obj: Any object.

    Returns:
        ``True`` when ``this_obj`` is a ``list`` or ``np.ndarray`` and every
        element is a ``str`` (an empty container qualifies), else ``False``.
    """
    if not isinstance(this_obj, (list, np.ndarray)):
        return False
    return all(isinstance(item, str) for item in this_obj)
|
|
|
|
|
|
def _clean_value(value: Any) -> str:
    """Render ``value`` as a short string without path separators.

    Floats are formatted with 5 significant digits; everything else is
    converted with ``str`` and has every "/" replaced by "_".
    """
    if isinstance(value, float):
        return "{:.5}".format(value)
    else:
        return str(value).replace("/", "_")


def format_vars(resolved_vars: Dict) -> str:
    """Formats the resolved variable dict into a single string.

    Args:
        resolved_vars: Mapping from a variable path to its value. A path is
            either a sequence of keys (e.g. ``("config", "lr")``) or a plain
            string, which is treated as a one-element path.

    Returns:
        Comma-separated ``name=value`` entries in sorted key order. For a
        sequence path, ``name`` is the last non-integer key followed by any
        integer keys that come after it, joined with "_".
    """
    out = []
    for path, value in sorted(resolved_vars.items()):
        if isinstance(path, str):
            # A plain string key is one name, not a sequence of keys. Without
            # this, indexing and reversing would walk its characters and
            # "learning_rate" would be reported as just "e". String keys
            # were never skipped by the filter below, and still are not.
            path = (path,)
        elif path[0] in ["run", "env", "resources_per_trial"]:
            continue  # TrialRunner already has these in the experiment_tag
        pieces = []
        last_string = True
        # Walk from the innermost key outwards: keep every integer key, but
        # only the first (innermost) non-integer key.
        for k in path[::-1]:
            if isinstance(k, int):
                pieces.append(str(k))
            elif last_string:
                last_string = False
                pieces.append(k)
        pieces.reverse()
        out.append(_clean_value("_".join(pieces)) + "=" + _clean_value(value))
    return ",".join(out)
|
|
|
|
|
|
# Module-level counter, initialised to zero.
# NOTE(review): nothing visible in this file reads or updates this name
# (trial numbering uses the class attribute ``Counter.counter``) — confirm
# there are no external users before removing it.
counter = 0
|
|
|
|
|
|
def date_str():
    """Return the current local date and time as ``YYYY-MM-DD_HH-MM-SS``."""
    from datetime import datetime

    timestamp_format = "%Y-%m-%d_%H-%M-%S"
    current_time = datetime.today()
    return current_time.strftime(timestamp_format)
|
|
|
|
|
|
def _generate_dirname(experiment_tag, trial_id):
    """Build a directory name for one trial.

    The name is ``train_<trial_id>_<experiment_tag>`` cut to at most 130
    characters, followed by "_" and the current timestamp from
    ``date_str()``; every "/" in the result is replaced by "_".

    Args:
        experiment_tag: Tag describing the trial.
        trial_id: Trial identifier; converted with ``str``.

    Returns:
        The directory name as a string.
    """
    # Truncate before appending the timestamp so the timestamp is never cut.
    stem = f"train_{trial_id!s}_{experiment_tag}"[:130]
    return f"{stem}_{date_str()}".replace("/", "_")
|
|
|
|
|
|
def get_logdir_name(dirname, local_dir):
    """Join ``dirname`` onto ``local_dir`` after expanding a leading ``~``.

    Args:
        dirname: Name of the directory to place under ``local_dir``.
        local_dir: Base directory; ``~`` is expanded with
            ``os.path.expanduser``.

    Returns:
        The combined path as a string. Nothing is created on disk.
    """
    import os

    base_dir = os.path.expanduser(local_dir)
    return os.path.join(base_dir, dirname)
|
|
|
|
|
|
class Counter:
    """Hands out numbered log-directory names for trials."""

    # Number of directory names generated so far; shared by all callers
    # because it lives on the class.
    counter = 0

    @staticmethod
    def get_trial_fold_name(local_dir, trial_config, trial_id):
        """Return the log directory path for the next trial.

        Increments ``Counter.counter`` and builds the experiment tag
        ``<counter>_<format_vars(trial_config)>``.

        Args:
            local_dir: Base directory for the logs.
            trial_config: Trial configuration passed to ``format_vars``.
            trial_id: Trial identifier used in the directory name.

        Returns:
            The path produced by ``get_logdir_name``.
        """
        Counter.counter += 1
        tag = f"{Counter.counter}_{format_vars(trial_config)}"
        dirname = _generate_dirname(tag, trial_id=trial_id)
        return get_logdir_name(dirname, local_dir)
|
|
|
|
|
|
def load_model(checkpoint_path, task, num_labels=None):
    """Load a Hugging Face model for ``task`` from ``checkpoint_path``.

    Args:
        checkpoint_path: Checkpoint location passed to ``from_pretrained``.
        task: Task identifier selecting the ``AutoModelFor...`` class.
        num_labels: Number of labels for sequence and token classification.
            Sequence regression always uses one label.

    Returns:
        The loaded model, or ``None`` when ``task`` matches none of the
        handled tasks.
    """
    import transformers

    # Restrict transformers logging to errors.
    transformers.logging.set_verbosity_error()

    from transformers import AutoConfig
    from .huggingface.switch_head_auto import (
        AutoSeqClassificationHead,
        MODEL_CLASSIFICATION_HEAD_MAPPING,
    )
    from ..data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION

    def _instantiate(model_config):
        """Load the checkpoint with the auto-model class matching ``task``."""
        from transformers import AutoModelForSequenceClassification
        from transformers import AutoModelForSeq2SeqLM
        from transformers import AutoModelForMultipleChoice
        from transformers import AutoModelForTokenClassification

        if task in (SEQCLASSIFICATION, SEQREGRESSION):
            model_class = AutoModelForSequenceClassification
        elif task == TOKENCLASSIFICATION:
            model_class = AutoModelForTokenClassification
        elif task in NLG_TASKS:
            model_class = AutoModelForSeq2SeqLM
        elif task == MULTICHOICECLASSIFICATION:
            model_class = AutoModelForMultipleChoice
        else:
            return None
        return model_class.from_pretrained(checkpoint_path, config=model_config)

    def _load_config(label_count):
        """Load the config; ``label_count`` is only used for label-based tasks."""
        if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION):
            return AutoConfig.from_pretrained(
                checkpoint_path,
                num_labels=label_count,
            )
        return AutoConfig.from_pretrained(checkpoint_path)

    pretrained_config = AutoConfig.from_pretrained(checkpoint_path)
    model_type = pretrained_config.model_type
    vocab_size = pretrained_config.vocab_size

    if task != SEQCLASSIFICATION:
        if task == SEQREGRESSION:
            return _instantiate(_load_config(1))
        if task == TOKENCLASSIFICATION:
            return _instantiate(_load_config(num_labels))
        # For the remaining tasks the label count is never read.
        return _instantiate(_load_config(None))

    pretrained_num_labels = pretrained_config.num_labels
    has_known_head = model_type in MODEL_CLASSIFICATION_HEAD_MAPPING

    # Model types listed in the head mapping are first loaded with the
    # checkpoint's own label count; others directly with the requested one.
    model_config = _load_config(
        pretrained_num_labels if has_known_head else num_labels
    )
    model = _instantiate(model_config)

    if has_known_head and num_labels != pretrained_num_labels:
        # Swap in a classification head built for the requested label count.
        model_config.num_labels = num_labels
        model.num_labels = num_labels
        model.classifier = AutoSeqClassificationHead.from_model_type_and_config(
            model_type, model_config
        )

    model.resize_token_embeddings(vocab_size)
    return model
|