import argparse from dataclasses import dataclass, field from typing import Dict, Any from ..data import SUMMARIZATION, SEQREGRESSION, SEQCLASSIFICATION, NLG_TASKS def load_default_huggingface_metric_for_task(task): if task == SEQCLASSIFICATION: return "accuracy", "max" elif task == SEQREGRESSION: return "rmse", "max" elif task == SUMMARIZATION: return "rouge", "max" # TODO: elif task == your task, return the default metric name for your task, # e.g., if task == MULTIPLECHOICE, return "accuracy" # notice this metric name has to be in ['accuracy', 'bertscore', 'bleu', 'bleurt', # 'cer', 'chrf', 'code_eval', 'comet', 'competition_math', 'coval', 'cuad', # 'f1', 'gleu', 'glue', 'google_bleu', 'indic_glue', 'matthews_correlation', # 'meteor', 'pearsonr', 'precision', 'recall', 'rouge', 'sacrebleu', 'sari', # 'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'ter', 'wer', # 'wiki_split', 'xnli'] global tokenized_column_names def tokenize_text(X, Y=None, task=None, custom_hpo_args=None): if task in (SEQCLASSIFICATION, SEQREGRESSION): X_tokenized, _ = tokenize_onedataframe( X, this_tokenizer=None, task=task, custom_hpo_args=custom_hpo_args ) return X_tokenized, None elif task in NLG_TASKS: return tokenize_seq2seq(X, Y, task=task, custom_hpo_args=custom_hpo_args) def tokenize_seq2seq(X, Y, task=None, custom_hpo_args=None): model_inputs, tokenizer = tokenize_onedataframe( X, this_tokenizer=None, task=task, custom_hpo_args=custom_hpo_args, ) labels = None if Y is not None: labels, _ = tokenize_onedataframe( Y.to_frame(), this_tokenizer=tokenizer, task=task, custom_hpo_args=custom_hpo_args, ) labels["label"] = [ [(each_l if each_l != tokenizer.pad_token_id else -100) for each_l in label] for label in labels["input_ids"] ] labels = labels.drop( columns=["attention_mask", "input_ids", "decoder_input_ids"] ) return model_inputs, labels def tokenize_onedataframe( X, this_tokenizer=None, task=None, custom_hpo_args=None, ): from transformers import AutoTokenizer import pandas global tokenized_column_names if this_tokenizer: with this_tokenizer.as_target_tokenizer(): d = X.apply( lambda x: tokenize_row( x, this_tokenizer, prefix=("",) if task is SUMMARIZATION else None, task=task, custom_hpo_args=custom_hpo_args, ), axis=1, result_type="expand", ) else: this_tokenizer = AutoTokenizer.from_pretrained( custom_hpo_args.model_path, use_fast=True ) d = X.apply( lambda x: tokenize_row( x, this_tokenizer, prefix=("summarize: ",) if task is SUMMARIZATION else None, task=task, custom_hpo_args=custom_hpo_args, ), axis=1, result_type="expand", ) X_tokenized = pandas.DataFrame(columns=tokenized_column_names) X_tokenized[tokenized_column_names] = d return X_tokenized, this_tokenizer def postprocess_text(preds, labels): import nltk nltk.download("punkt") preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels def tokenize_row( this_row, this_tokenizer, prefix=None, task=None, custom_hpo_args=None ): global tokenized_column_names assert ( "max_seq_length" in custom_hpo_args.__dict__ ), "max_seq_length must be provided for glue" if prefix: this_row = tuple(["".join(x) for x in zip(prefix, this_row)]) tokenized_example = this_tokenizer( *tuple(this_row), padding="max_length", max_length=custom_hpo_args.max_seq_length, truncation=True, ) if task in NLG_TASKS: tokenized_example["decoder_input_ids"] = tokenized_example["input_ids"] tokenized_column_names = sorted(tokenized_example.keys()) return [tokenized_example[x] for x in tokenized_column_names] def separate_config(config, task): if task in NLG_TASKS: from transformers import Seq2SeqTrainingArguments, TrainingArguments trainargs_class_list = [Seq2SeqTrainingArguments, TrainingArguments] else: from transformers import TrainingArguments trainargs_class_list = [TrainingArguments] training_args_config = {} per_model_config = {} for key, val in config.items(): is_in_training_args = any(key in x.__dict__ for x in trainargs_class_list) if is_in_training_args: training_args_config[key] = val else: per_model_config[key] = val return training_args_config, per_model_config def get_num_labels(task, y_train): from ..data import SEQCLASSIFICATION, SEQREGRESSION if task == SEQREGRESSION: return 1 elif task == SEQCLASSIFICATION: return len(set(y_train)) else: return None def _clean_value(value: Any) -> str: if isinstance(value, float): return "{:.5}".format(value) else: return str(value).replace("/", "_") def format_vars(resolved_vars: Dict) -> str: """Formats the resolved variable dict into a single string.""" out = [] for path, value in sorted(resolved_vars.items()): if path[0] in ["run", "env", "resources_per_trial"]: continue # TrialRunner already has these in the experiment_tag pieces = [] last_string = True for k in path[::-1]: if isinstance(k, int): pieces.append(str(k)) elif last_string: last_string = False pieces.append(k) pieces.reverse() out.append(_clean_value("_".join(pieces)) + "=" + _clean_value(value)) return ",".join(out) counter = 0 def date_str(): from datetime import datetime return datetime.today().strftime("%Y-%m-%d_%H-%M-%S") def _generate_dirname(experiment_tag, trial_id): generated_dirname = f"train_{str(trial_id)}_{experiment_tag}" generated_dirname = generated_dirname[:130] generated_dirname += f"_{date_str()}" return generated_dirname.replace("/", "_") def get_logdir_name(dirname, local_dir): import os local_dir = os.path.expanduser(local_dir) logdir = os.path.join(local_dir, dirname) return logdir def get_trial_fold_name(local_dir, trial_config, trial_id): global counter counter = counter + 1 experiment_tag = "{0}_{1}".format(str(counter), format_vars(trial_config)) logdir = get_logdir_name( _generate_dirname(experiment_tag, trial_id=trial_id), local_dir ) return logdir def load_model(checkpoint_path, task, num_labels, per_model_config=None): from transformers import AutoConfig from .huggingface.switch_head_auto import ( AutoSeqClassificationHead, MODEL_CLASSIFICATION_HEAD_MAPPING, ) from ..data import SEQCLASSIFICATION, SEQREGRESSION this_model_type = AutoConfig.from_pretrained(checkpoint_path).model_type this_vocab_size = AutoConfig.from_pretrained(checkpoint_path).vocab_size def get_this_model(task): from transformers import AutoModelForSequenceClassification from transformers import AutoModelForSeq2SeqLM if task in (SEQCLASSIFICATION, SEQREGRESSION): return AutoModelForSequenceClassification.from_pretrained( checkpoint_path, config=model_config ) elif task in NLG_TASKS: return AutoModelForSeq2SeqLM.from_pretrained( checkpoint_path, config=model_config ) def is_pretrained_model_in_classification_head_list(model_type): return model_type in MODEL_CLASSIFICATION_HEAD_MAPPING def _set_model_config(checkpoint_path): if task in (SEQCLASSIFICATION, SEQREGRESSION): if per_model_config: model_config = AutoConfig.from_pretrained( checkpoint_path, num_labels=model_config_num_labels, **per_model_config, ) else: model_config = AutoConfig.from_pretrained( checkpoint_path, num_labels=model_config_num_labels ) return model_config else: if per_model_config: model_config = AutoConfig.from_pretrained( checkpoint_path, **per_model_config, ) else: model_config = AutoConfig.from_pretrained(checkpoint_path) return model_config if task == SEQCLASSIFICATION: num_labels_old = AutoConfig.from_pretrained(checkpoint_path).num_labels if is_pretrained_model_in_classification_head_list(this_model_type): model_config_num_labels = num_labels_old else: model_config_num_labels = num_labels model_config = _set_model_config(checkpoint_path) if is_pretrained_model_in_classification_head_list(this_model_type): if num_labels != num_labels_old: this_model = get_this_model(task) model_config.num_labels = num_labels this_model.num_labels = num_labels this_model.classifier = ( AutoSeqClassificationHead.from_model_type_and_config( this_model_type, model_config ) ) else: this_model = get_this_model(task) else: this_model = get_this_model(task) this_model.resize_token_embeddings(this_vocab_size) return this_model else: if task == SEQREGRESSION: model_config_num_labels = 1 model_config = _set_model_config(checkpoint_path) this_model = get_this_model(task) return this_model def compute_checkpoint_freq( train_data_size, custom_hpo_args, num_train_epochs, batch_size, ): ckpt_step_freq = ( int( min(num_train_epochs, 1) * train_data_size / batch_size / custom_hpo_args.ckpt_per_epoch ) + 1 ) return ckpt_step_freq @dataclass class HPOArgs: """The HPO setting. Args: output_dir (str): data root directory for outputing the log, etc. model_path (str, optional, defaults to "facebook/muppet-roberta-base"): A string, the path of the language model file, either a path from huggingface model card huggingface.co/models, or a local path for the model. fp16 (bool, optional, defaults to "False"): A bool, whether to use FP16. max_seq_length (int, optional, defaults to 128): An integer, the max length of the sequence. ckpt_per_epoch (int, optional, defaults to 1): An integer, the number of checkpoints per epoch. """ output_dir: str = field( default="data/output/", metadata={"help": "data dir", "required": True} ) model_path: str = field( default="facebook/muppet-roberta-base", metadata={"help": "model path model for HPO"}, ) fp16: bool = field(default=True, metadata={"help": "whether to use the FP16 mode"}) max_seq_length: int = field(default=128, metadata={"help": "max seq length"}) ckpt_per_epoch: int = field(default=1, metadata={"help": "checkpoint per epoch"}) @staticmethod def load_args(): from dataclasses import fields arg_parser = argparse.ArgumentParser() for each_field in fields(HPOArgs): print(each_field) arg_parser.add_argument( "--" + each_field.name, type=each_field.type, help=each_field.metadata["help"], required=each_field.metadata["required"] if "required" in each_field.metadata else False, choices=each_field.metadata["choices"] if "choices" in each_field.metadata else None, default=each_field.default, ) console_args, unknown = arg_parser.parse_known_args() return console_args