import json
import os
import time

import numpy as np

try:
    import ray
    import transformers
    from transformers import TrainingArguments
    import datasets
    from .dataset.task_auto import get_default_task
    from .result_analysis.azure_utils import JobID
    from .huggingface.trainer import TrainerForAutoTransformers
except ImportError:
    print("To use the nlp component in flaml, run pip install flaml[nlp]")

task_list = [
    "seq-classification",
    "regression",
    "question-answering",
]


class AutoTransformers:
    """The AutoTransformers class.

    Example:

        .. code-block:: python

            autohf = AutoTransformers()
            autohf_settings = {
                "resources_per_trial": {"cpu": 1, "gpu": 1},
                "num_samples": -1,
                "time_budget": 60,
            }
            validation_metric, analysis = autohf.fit(**autohf_settings)

    """

    @staticmethod
    def _convert_dict_to_ray_tune_space(config_json, mode="grid"):
        search_space = {}

        if mode == "grid":
            # TODO add test
            for each_hp in config_json.keys():
                this_config = config_json[each_hp]
                assert isinstance(this_config, (dict, list)), \
                    "config of " + each_hp + " must be dict or list for grid search"
                search_space[each_hp] = ray.tune.grid_search(this_config)
        else:
            for each_hp in config_json.keys():
                this_config = config_json[each_hp]
                assert isinstance(this_config, (dict, list)), \
                    "config of " + each_hp + " must be dict or list"
                if isinstance(this_config, dict):
                    lower = this_config["l"]
                    upper = this_config["u"]
                    space = this_config["space"]
                    if space == "log":
                        search_space[each_hp] = ray.tune.loguniform(lower, upper)
                    elif space == "linear":
                        search_space[each_hp] = ray.tune.uniform(lower, upper)
                    elif space == "quniform":
                        search_space[each_hp] = ray.tune.quniform(lower, upper, this_config["interval"])
                else:
                    search_space[each_hp] = ray.tune.choice(this_config)

        return search_space

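    # A minimal usage sketch for _convert_dict_to_ray_tune_space. The
    # hyperparameter names and bounds below are illustrative only, not the
    # default search space shipped with flaml:
    #
    #     space = AutoTransformers._convert_dict_to_ray_tune_space(
    #         {
    #             "learning_rate": {"l": 1e-6, "u": 1e-4, "space": "log"},
    #             "warmup_ratio": {"l": 0.0, "u": 0.2, "space": "linear"},
    #             "num_train_epochs": [1, 2, 3],
    #         },
    #         mode="hpo",
    #     )
    #     # -> {"learning_rate": ray.tune.loguniform(1e-6, 1e-4),
    #     #     "warmup_ratio": ray.tune.uniform(0.0, 0.2),
    #     #     "num_train_epochs": ray.tune.choice([1, 2, 3])}
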
    def _set_search_space(self, **custom_hpo_args):
        from .hpo.hpo_searchspace import AutoHPOSearchSpace

        search_space_hpo_json = AutoHPOSearchSpace.from_model_and_dataset_name(
            self.jobid_config.spa,
            self.jobid_config.pre,
            self.jobid_config.presz,
            self.jobid_config.dat,
            self.jobid_config.subdat,
            **custom_hpo_args)
        self._search_space_hpo = AutoTransformers._convert_dict_to_ray_tune_space(
            search_space_hpo_json,
            mode=self.jobid_config.mod)

    @staticmethod
    def _get_split_name(data_raw, fold_name=None):
        # TODO coverage
        if fold_name:
            return fold_name
        fold_keys = data_raw.keys()
        if fold_keys == {"train", "validation", "test"}:
            return "train", "validation", "test"
        for each_key in fold_keys:
            for each_split_name in {"train", "validation", "test"}:
                assert not (each_key.startswith(each_split_name) and each_key != each_split_name), \
                    "The dataset splits are {}; non-standard split names must be explicitly specified in " \
                    "dataset_config, e.g., 'fold_name': ['train', 'validation_matched', 'test_matched']. " \
                    "Please refer to the example in the documentation of " \
                    "AutoTransformers.prepare_data()".format(",".join(fold_keys))
        return "train", "validation", "test"

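    # Sketch of how _get_split_name resolves split names (the dataset values
    # below are stand-ins; only .keys() is consulted). MNLI-style matched
    # splits are one real case where fold_name must be given explicitly:
    #
    #     AutoTransformers._get_split_name({"train": ..., "validation": ..., "test": ...})
    #     # -> ("train", "validation", "test")
    #     AutoTransformers._get_split_name(
    #         data_raw, fold_name=("train", "validation_matched", "test_matched"))
    #     # -> ("train", "validation_matched", "test_matched")
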
    def prepare_data(self,
                     data_root_path,
                     jobid_config=None,
                     is_wandb_on=False,
                     server_name=None,
                     max_seq_length=128,
                     fold_name=None,
                     resplit_portion=None,
                     **custom_data_args):
        """Prepare data

        Example:

            .. code-block:: python

                preparedata_setting = {"server_name": "tmdev", "data_root_path": "data/", "max_seq_length": 128,
                                       "jobid_config": jobid_config, "is_wandb_on": False,
                                       "resplit_portion": {"source": ["train", "validation"],
                                       "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}}

                autohf.prepare_data(**preparedata_setting)

        Args:
            server_name:
                A string variable, which can be tmdev or azureml
            data_root_path:
                The root path for storing the checkpoints and output results, e.g., "data/"
            jobid_config:
                A JobID object describing the profile of the job
            is_wandb_on:
                A boolean variable indicating whether wandb is used
            max_seq_length (optional):
                The maximum sequence length for the huggingface tokenizer; this hyperparameter
                must be specified at the data processing step
            fold_name (optional):
                The names of the train/validation/test splits, e.g.,
                ['train', 'validation_matched', 'test_matched']
            resplit_portion:
                The proportion for resplitting the train and dev data when the split mode
                is "rspt" (resplit); required in that mode
        """
        from .dataset.dataprocess_auto import AutoEncodeText
        from transformers import AutoTokenizer
        from datasets import load_dataset
        from .utils import PathUtils
        from .utils import load_dft_args

        self._max_seq_length = max_seq_length
        self._server_name = server_name if server_name is not None else "tmdev"

        # Load the jobid config from console args.
        console_args = load_dft_args()
        self.jobid_config = JobID(console_args)
        if jobid_config:
            self.jobid_config = jobid_config
        if len(custom_data_args) > 0:
            self.jobid_config.set_jobid_from_console_args(console_args=custom_data_args)
        if is_wandb_on:
            from .result_analysis.wandb_utils import WandbUtils
            self.wandb_utils = WandbUtils(is_wandb_on=is_wandb_on,
                                          wandb_key_path=console_args.key_path,
                                          jobid_config=self.jobid_config)
            self.wandb_utils.set_wandb_per_run()
        else:
            self.wandb_utils = None

        self.path_utils = PathUtils(self.jobid_config, hpo_data_root_path=data_root_path)

        if self.jobid_config.spt == "rspt":
            assert resplit_portion, "If split mode is 'rspt', resplit_portion must be provided. Please " \
                                    "refer to the example in the documentation of AutoTransformers.prepare_data()"
        if self.jobid_config.subdat:
            data_raw = load_dataset(JobID.dataset_list_to_str(self.jobid_config.dat),
                                    self.jobid_config.subdat)
        else:
            data_raw = load_dataset(*self.jobid_config.dat)

        self._train_name, self._dev_name, self._test_name = AutoTransformers._get_split_name(
            data_raw,
            fold_name=fold_name)
        auto_tokentoids_config = {"max_seq_length": self._max_seq_length}
        self._tokenizer = AutoTokenizer.from_pretrained(self.jobid_config.pre_full, use_fast=True)

        def autoencodetext_from_model_and_dataset_name():
            return AutoEncodeText.from_model_and_dataset_name(
                data_raw,
                self.jobid_config.pre_full,
                self.jobid_config.dat,
                self.jobid_config.subdat,
                **auto_tokentoids_config)

        data_encoded = autoencodetext_from_model_and_dataset_name()
        # Shrink max_seq_length to the actual longest tokenized sequence (which is at
        # most the user-defined value, since tokenization truncates), rounded up to a
        # multiple of 16, then re-encode with the tightened value.
        self._max_seq_length = 0
        for each_fold in data_encoded.keys():
            self._max_seq_length = max(self._max_seq_length,
                                       max([sum(data_encoded[each_fold][x]['attention_mask']) for x in
                                            range(len(data_encoded[each_fold]))]))
        self._max_seq_length = int((self._max_seq_length + 15) / 16) * 16
        data_encoded = autoencodetext_from_model_and_dataset_name()

        if self.jobid_config.spt == "rspt":
            all_folds_from_source = []
            assert "source" in resplit_portion.keys(), "Must specify the source for resplitting the dataset in " \
                                                       "resplit_portion, which is a list of fold names, e.g., " \
                                                       "resplit_portion = {'source': ['train']}"

            source_fold_names = resplit_portion['source']
            for each_fold_name in source_fold_names:
                this_fold_dataset = data_encoded[each_fold_name]
                all_folds_from_source.append(this_fold_dataset)

            merged_folds_from_source = datasets.concatenate_datasets(all_folds_from_source)
            merged_folds_from_source = merged_folds_from_source.shuffle(seed=self.jobid_config.sddt)

            assert "train" in resplit_portion.keys() and "validation" in resplit_portion.keys() \
                   and "test" in resplit_portion.keys(), "train, validation, test must exist in resplit_portion"

            for key in ["train", "validation", "test"]:
                target_fold_start, target_fold_end = \
                    int(resplit_portion[key][0] * len(merged_folds_from_source)), \
                    int(resplit_portion[key][1] * len(merged_folds_from_source))
                subfold_dataset = merged_folds_from_source.select(
                    [x for x in range(target_fold_start, target_fold_end)]).flatten_indices()
                if key == "train":
                    self.train_dataset = subfold_dataset
                elif key == "validation":
                    self.eval_dataset = subfold_dataset
                else:
                    self.test_dataset = subfold_dataset
        else:
            self.train_dataset, self.eval_dataset, self.test_dataset \
                = data_encoded[self._train_name], data_encoded[self._dev_name], data_encoded[self._test_name]

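    # Resplit arithmetic sketch (numbers illustrative): with 1,000 merged source
    # examples and resplit_portion = {"train": [0, 0.8], "validation": [0.8, 0.9],
    # "test": [0.9, 1.0]}, prepare_data selects indices [0, 800), [800, 900) and
    # [900, 1000) for the train, validation and test folds respectively, after
    # shuffling with seed jobid_config.sddt.
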
    def _load_model(self,
                    checkpoint_path=None,
                    per_model_config=None):
        from transformers import AutoConfig
        from .huggingface.switch_head_auto import AutoSeqClassificationHead, MODEL_CLASSIFICATION_HEAD_MAPPING

        this_task = get_default_task(self.jobid_config.dat,
                                     self.jobid_config.subdat)
        if this_task == "seq-classification":
            self._num_labels = len(self.train_dataset.features["label"].names)
        elif this_task == "regression":
            self._num_labels = 1

        if not checkpoint_path:
            checkpoint_path = self.jobid_config.pre_full

        def get_this_model():
            from transformers import AutoModelForSequenceClassification
            return AutoModelForSequenceClassification.from_pretrained(checkpoint_path, config=model_config)

        def is_pretrained_model_in_classification_head_list():
            return self.jobid_config.pre in MODEL_CLASSIFICATION_HEAD_MAPPING.keys()

        def _set_model_config():
            if per_model_config and len(per_model_config) > 0:
                model_config = AutoConfig.from_pretrained(
                    checkpoint_path,
                    num_labels=model_config_num_labels,
                    **per_model_config)
            else:
                model_config = AutoConfig.from_pretrained(
                    checkpoint_path,
                    num_labels=model_config_num_labels)
            return model_config

        if this_task == "seq-classification":
            num_labels_old = AutoConfig.from_pretrained(checkpoint_path).num_labels
            if is_pretrained_model_in_classification_head_list():
                model_config_num_labels = num_labels_old
            else:
                model_config_num_labels = self._num_labels
            model_config = _set_model_config()

            if is_pretrained_model_in_classification_head_list():
                # TODO coverage
                if self._num_labels != num_labels_old:
                    this_model = get_this_model()
                    model_config.num_labels = self._num_labels
                    this_model.num_labels = self._num_labels
                    this_model.classifier = AutoSeqClassificationHead \
                        .from_model_type_and_config(self.jobid_config.pre,
                                                    model_config)
                else:
                    this_model = get_this_model()
            else:
                this_model = get_this_model()

            this_model.resize_token_embeddings(len(self._tokenizer))
            return this_model
        elif this_task == "regression":
            # TODO add test
            model_config_num_labels = 1
            model_config = _set_model_config()
            this_model = get_this_model()
            return this_model

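    # Hedged usage sketch: after prepare_data(), _load_model rebuilds the model
    # for a trial, optionally overriding model-config attributes. The config key
    # below is illustrative, not a required hyperparameter:
    #
    #     model = autohf._load_model(
    #         per_model_config={"attention_probs_dropout_prob": 0.2})
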
    def _get_metric_func(self):
        data_name = JobID.dataset_list_to_str(self.jobid_config.dat)
        if data_name in ("glue", "super_glue"):
            metric = datasets.load.load_metric(data_name, self.jobid_config.subdat)
        # TODO delete
        elif data_name in ("squad", "squad_v2"):
            metric = datasets.load.load_metric(data_name)
        else:
            metric = datasets.load.load_metric(self.metric_name)
        return metric

    def _compute_metrics_by_dataset_name(self, eval_pred):
        # TODO coverage
        predictions, labels = eval_pred
        predictions = np.squeeze(predictions) \
            if self.task_name == "regression" else np.argmax(predictions, axis=1)
        metric_func = self._get_metric_func()
        return metric_func.compute(predictions=predictions, references=labels)

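    # Sketch: eval_pred is the (predictions, labels) pair handed over by the
    # huggingface Trainer. For classification, logits are argmax-ed before
    # scoring (values illustrative; metric keys depend on the loaded metric):
    #
    #     eval_pred = (np.array([[0.1, 0.9], [0.8, 0.2]]), np.array([1, 0]))
    #     autohf._compute_metrics_by_dataset_name(eval_pred)
    #     # -> {"accuracy": 1.0} for an accuracy-scored glue subtask
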
    def _compute_checkpoint_freq(self,
                                 num_train_epochs,
                                 batch_size):
        # TODO coverage
        if "gpu" in self._resources_per_trial:
            ckpt_step_freq = int(min(num_train_epochs, 1) * len(self.train_dataset) / batch_size
                                 / self._resources_per_trial["gpu"] / self.ckpt_per_epoch) + 1
        else:
            ckpt_step_freq = int(min(num_train_epochs, 1) * len(self.train_dataset) / batch_size
                                 / self._resources_per_trial["cpu"] / self.ckpt_per_epoch) + 1

        return ckpt_step_freq

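    # Worked example (illustrative numbers): with num_train_epochs=3 (capped at
    # 1 by the min), len(train_dataset)=8000, batch_size=32, one GPU and
    # ckpt_per_epoch=2, the frequency is int(1 * 8000 / 32 / 1 / 2) + 1 = 126,
    # i.e., a checkpoint and evaluation every 126 steps.
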
    @staticmethod
    def _separate_config(config):
        training_args_config = {}
        per_model_config = {}

        for key in config.keys():
            if key in TrainingArguments.__dict__.keys():
                training_args_config[key] = config[key]
            else:
                per_model_config[key] = config[key]

        return training_args_config, per_model_config

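    # Sketch: keys that TrainingArguments defines go to the trainer; the rest
    # are treated as model-config overrides (key names illustrative):
    #
    #     training_args_config, per_model_config = AutoTransformers._separate_config(
    #         {"learning_rate": 3e-5, "hidden_dropout_prob": 0.1})
    #     # -> ({"learning_rate": 3e-5}, {"hidden_dropout_prob": 0.1})
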
    def _objective(self, config, reporter, checkpoint_dir=None):
        # TODO add test
        from transformers.trainer_utils import set_seed
        self._set_transformers_verbosity(self._transformers_verbose)

        def model_init():
            return self._load_model()

        set_seed(config["seed"])

        training_args_config, per_model_config = AutoTransformers._separate_config(config)
        this_model = self._load_model(per_model_config=per_model_config)

        trial_id = reporter.trial_id
        self.path_utils.make_dir_per_trial(trial_id)

        ckpt_freq = self._compute_checkpoint_freq(
            num_train_epochs=config["num_train_epochs"],
            batch_size=config["per_device_train_batch_size"])

        assert self.path_utils.ckpt_dir_per_trial

        if transformers.__version__.startswith("3"):
            training_args = TrainingArguments(
                output_dir=self.path_utils.ckpt_dir_per_trial,
                do_eval=True,
                per_device_eval_batch_size=32,
                eval_steps=ckpt_freq,
                evaluate_during_training=True,
                save_steps=ckpt_freq,
                save_total_limit=0,
                fp16=self._fp16,
                **training_args_config,
            )
        else:
            from transformers import IntervalStrategy
            training_args = TrainingArguments(
                output_dir=self.path_utils.ckpt_dir_per_trial,
                do_eval=True,
                per_device_eval_batch_size=32,
                eval_steps=ckpt_freq,
                evaluation_strategy=IntervalStrategy.STEPS,
                save_steps=ckpt_freq,
                save_total_limit=0,
                fp16=self._fp16,
                **training_args_config,
            )

        trainer = TrainerForAutoTransformers(
            model=this_model,
            args=training_args,
            model_init=model_init,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self._tokenizer,
            compute_metrics=self._compute_metrics_by_dataset_name,
        )
        trainer.trial_id = reporter.trial_id

        # Create a wandb run; if os.environ["WANDB_MODE"] == "offline", run is None.
        if self.wandb_utils:
            run = self.wandb_utils.set_wandb_per_trial()
            import wandb
            for each_hp in config:
                wandb.log({each_hp: config[each_hp]})
        else:
            run = None

        trainer.train()
        trainer.evaluate(self.eval_dataset)
        # If a wandb run was created, close it after train and evaluate finish.
        if run:
            run.finish()

    def _verify_init_config(self, **custom_hpo_args):
        for key in custom_hpo_args.keys():
            if key == "points_to_evaluate":
                for each_init_config in custom_hpo_args[key]:
                    for each_hp in each_init_config.keys():
                        assert each_hp in self._search_space_hpo.keys(), \
                            "points_to_evaluate hp must be within the search space"

                        assert isinstance(each_init_config[each_hp], (int, float, str, bool)), \
                            "points_to_evaluate must be a scalar"

                        assert isinstance(self._search_space_hpo[each_hp],
                                          (ray.tune.sample.Categorical,
                                           ray.tune.sample.Float,
                                           ray.tune.sample.Integer)), \
                            "Every hp space must either be categorical, integer or float"

                        if isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Categorical):
                            assert each_init_config[each_hp] in self._search_space_hpo[each_hp].categories, \
                                f"points_to_evaluate {each_hp} value must be within the search space"
                        else:
                            assert self._search_space_hpo[each_hp].lower <= each_init_config[each_hp] <= \
                                   self._search_space_hpo[each_hp].upper, \
                                f"points_to_evaluate {each_hp} value must be within the search space"

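    # Hedged example of a valid points_to_evaluate entry (the hyperparameter
    # names are illustrative and must exist in the generated search space):
    #
    #     autohf._verify_init_config(points_to_evaluate=[
    #         {"learning_rate": 3e-5, "num_train_epochs": 3}])
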
    def _get_search_algo(self,
                         search_algo_name,
                         search_algo_args_mode,
                         time_budget,
                         metric_name,
                         metric_mode_name,
                         **custom_hpo_args):
        from .hpo.searchalgo_auto import AutoSearchAlgorithm

        if search_algo_name in ("bs", "cfo"):
            self._verify_init_config(**custom_hpo_args)
        search_algo = AutoSearchAlgorithm.from_method_name(
            search_algo_name,
            search_algo_args_mode,
            self._search_space_hpo,
            time_budget,
            metric_name,
            metric_mode_name,
            **custom_hpo_args)
        return search_algo

    @staticmethod
    def _recover_checkpoint(tune_checkpoint_dir):
        assert tune_checkpoint_dir
        # Get the subdirectory used for huggingface.
        subdirs = [
            os.path.join(tune_checkpoint_dir, name)
            for name in os.listdir(tune_checkpoint_dir)
            if os.path.isdir(os.path.join(tune_checkpoint_dir, name))
        ]
        # There should only be one subdirectory.
        assert len(subdirs) == 1, subdirs
        return subdirs[0]

    def _save_ckpt_json(self, best_ckpt):
        json.dump({"best_ckpt": best_ckpt},
                  open(os.path.join(self.path_utils.result_dir_per_run,
                                    "save_ckpt_" + self.jobid_config.to_jobid_string() + ".json"), "w"))

    def _save_output_metric(self, output_metrics):
        json.dump(output_metrics, open(
            os.path.join(self.path_utils.result_dir_per_run,
                         "output_metric_" + self.jobid_config.to_jobid_string() + ".json"), "w"))

    def _load_ckpt_json(self, ckpt_dir=None, **kwargs):
        if not ckpt_dir:
            ckpt_dir = os.path.join(self.path_utils.result_dir_per_run,
                                    "save_ckpt_" + self.jobid_config.to_jobid_string() + ".json")
        try:
            ckpt_json = json.load(open(ckpt_dir))
            return ckpt_json["best_ckpt"]
        except FileNotFoundError as err:
            print("Saved checkpoint not found. Please make sure checkpoint is stored under {}".format(ckpt_dir))
            raise err

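    # The checkpoint record written by _save_ckpt_json and read back here is a
    # one-key JSON file named save_ckpt_<jobid>.json (path value illustrative):
    #
    #     {"best_ckpt": "<path to the best trial checkpoint>"}
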
    def _set_metric(self, custom_metric_name=None, custom_metric_mode_name=None):
        from .dataset.metric_auto import get_default_and_alternative_metric
        from .utils import _variable_override_default_alternative

        default_metric, default_mode, all_metrics, all_modes = \
            get_default_and_alternative_metric(
                dataset_name_list=self.jobid_config.dat,
                subdataset_name=self.jobid_config.subdat,
                custom_metric_name=custom_metric_name,
                custom_metric_mode_name=custom_metric_mode_name)
        _variable_override_default_alternative(self,
                                               "metric_name",
                                               default_metric,
                                               all_metrics,
                                               custom_metric_name)
        _variable_override_default_alternative(self,
                                               "metric_mode_name",
                                               default_mode,
                                               all_modes,
                                               custom_metric_mode_name)
        self._all_metrics = all_metrics
        self._all_modes = all_modes

    def _set_task(self):
        self.task_name = get_default_task(self.jobid_config.dat,
                                          self.jobid_config.subdat)

    def fit_hf(self,
               resources_per_trial,
               num_samples,
               time_budget,
               custom_metric_name=None,
               custom_metric_mode_name=None,
               _fp16=True,
               **custom_hpo_args
               ):
        """Fine tuning the huggingface model using HF's API
        Transformers.hyperparameter_search (for comparative purposes).

        Transformers.hyperparameter_search has the following disadvantages:
        (1) it does not return a ray tune.analysis.Analysis result;
        (2) it is inconvenient to develop on top of Transformers.hyperparameter_search, whose trainable function,
        search space, etc. are defined inside of Transformers.hyperparameter_search.

        An example:

            autohf_settings = {"resources_per_trial": {"cpu": 1},
                               "num_samples": 1,
                               "time_budget": 100000,
                               }
            validation_metric = autohf.fit_hf(**autohf_settings)

        Args:
            resources_per_trial:
                A dict showing the resources used by each trial,
                e.g., {"gpu": 4, "cpu": 4}
            num_samples:
                An int variable of the maximum number of trials
            time_budget:
                An int variable of the maximum time budget
            custom_metric_name:
                A string of the metric name or a function,
                e.g., 'accuracy', 'f1', 'loss'
            custom_metric_mode_name:
                A string of the mode name,
                e.g., "max", "min", "last", "all"
            _fp16:
                A boolean, default = True | whether to use fp16
            custom_hpo_args:
                The additional keyword arguments, e.g.,
                custom_hpo_args = {"points_to_evaluate": [{
                                   "num_train_epochs": 1,
                                   "per_device_train_batch_size": 128, }]}

        Returns:
            validation_metric: A dict storing the validation score
        """
        # TODO remove?
        from transformers.trainer_utils import HPSearchBackend

        def model_init():
            return self._load_model()

        def ray_hp_space(trial):
            return {
                "learning_rate": ray.tune.loguniform(1e-6, 1e-4),
                "num_train_epochs": ray.tune.choice(list(range(1, 6))),
                "seed": ray.tune.quniform(1, 41, 1),
                "per_device_train_batch_size": ray.tune.choice([4, 8, 16, 32, 64]),
            }

        self._set_metric(custom_metric_name, custom_metric_mode_name)
        self._set_task()

        training_args = TrainingArguments(
            output_dir=self.path_utils.hpo_ckpt_path,
            fp16=_fp16,
        )
        this_model = self._load_model()

        trainer = TrainerForAutoTransformers(
            this_model,
            training_args,
            model_init=model_init,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self._tokenizer,
            compute_metrics=self._compute_metrics_by_dataset_name,
        )
        self.path_utils.make_dir_per_run()

        start_time = time.time()
        best_run = trainer.hyperparameter_search(
            n_trials=num_samples,
            time_budget_s=time_budget,
            # hp_space=ray_hp_space,
            backend=HPSearchBackend.RAY,
            resources_per_trial=resources_per_trial)
        duration = time.time() - start_time
        self.last_run_duration = duration
        print("Total running time: {} seconds".format(duration))

        hp_dict = best_run.hyperparameters
        hp_dict["seed"] = int(hp_dict["seed"])

        best_training_args = TrainingArguments(
            output_dir=self.path_utils.hpo_ckpt_path,
            fp16=_fp16,
            **hp_dict,
        )

        best_trainer = TrainerForAutoTransformers(
            this_model,
            best_training_args,
            model_init=model_init,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self._tokenizer,
            compute_metrics=self._compute_metrics_by_dataset_name,
        )

        best_model_checkpoint_path = os.path.join(self.path_utils.hpo_ckpt_path, "hpo_hf")
        if not os.path.exists(best_model_checkpoint_path):
            os.mkdir(best_model_checkpoint_path)
        best_trainer.train()
        best_trainer.save_model(best_model_checkpoint_path)
        self._save_ckpt_json(best_model_checkpoint_path)
        validation_metric = best_trainer.evaluate()

        return validation_metric

    def _set_transformers_verbosity(self, transformers_verbose):
        # TODO coverage
        if transformers_verbose == transformers.logging.ERROR:
            transformers.logging.set_verbosity_error()
        elif transformers_verbose == transformers.logging.WARNING:
            transformers.logging.set_verbosity_warning()
        elif transformers_verbose == transformers.logging.INFO:
            transformers.logging.set_verbosity_info()
        elif transformers_verbose == transformers.logging.DEBUG:
            transformers.logging.set_verbosity_debug()
        else:
            raise Exception("transformers_verbose must be set to ERROR, WARNING, INFO or DEBUG")

    def fit(self,
            num_samples,
            time_budget,
            custom_metric_name=None,
            custom_metric_mode_name=None,
            ckpt_per_epoch=1,
            fp16=True,
            ray_verbose=1,
            transformers_verbose=10,
            resources_per_trial=None,
            ray_local_mode=False,
            **custom_hpo_args):
        """Fine tuning the huggingface model using the hpo setting

        Example:

            .. code-block:: python

                autohf_settings = {"resources_per_trial": {"cpu": 1},
                                   "num_samples": 1,
                                   "time_budget": 100000,
                                   "ckpt_per_epoch": 1,
                                   "fp16": False,
                                   }

                validation_metric, analysis = autohf.fit(**autohf_settings)

        Args:
            resources_per_trial:
                A dict showing the resources used by each trial,
                e.g., {"gpu": 4, "cpu": 4}
            num_samples:
                An int variable of the maximum number of trials
            time_budget:
                An int variable of the maximum time budget
            custom_metric_name:
                A string of the metric name or a function,
                e.g., 'accuracy', 'f1', 'loss'
            custom_metric_mode_name:
                A string of the mode name,
                e.g., "max", "min", "last", "all"
            ckpt_per_epoch:
                An integer value of number of checkpoints per epoch, default = 1
            ray_verbose:
                An integer, default = 1 | the verbosity of ray tune
            transformers_verbose:
                An integer, default = transformers.logging.DEBUG (10) | the verbosity of transformers;
                must be one of transformers.logging.ERROR, transformers.logging.WARNING,
                transformers.logging.INFO or transformers.logging.DEBUG
            fp16:
                A boolean, default = True | whether to use fp16
            ray_local_mode:
                A boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
            custom_hpo_args:
                The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{
                "num_train_epochs": 1, "per_device_train_batch_size": 128, }]}

        Returns:

            validation_metric: A dict storing the validation score

            analysis: A ray.tune.analysis.Analysis object storing the analysis results from tune.run
        """
        from .hpo.scheduler_auto import AutoScheduler
        self._transformers_verbose = transformers_verbose

        # Set the remaining jobid configs from custom_hpo_args, e.g., if the search
        # algorithm was not specified previously, it can be specified here.
        if len(custom_hpo_args) > 0:
            self.jobid_config.set_jobid_from_console_args(console_args=custom_hpo_args)

        self._resources_per_trial = resources_per_trial
        self._set_metric(custom_metric_name, custom_metric_mode_name)
        self._set_task()
        self._fp16 = fp16
        ray.shutdown()
        ray.init(local_mode=ray_local_mode)
        self._set_search_space(**custom_hpo_args)

        search_algo = self._get_search_algo(self.jobid_config.alg,
                                            self.jobid_config.arg,
                                            time_budget,
                                            self.metric_name,
                                            self.metric_mode_name,
                                            **custom_hpo_args)
        scheduler = AutoScheduler.from_scheduler_name(self.jobid_config.pru)
        self.ckpt_per_epoch = ckpt_per_epoch
        self.path_utils.make_dir_per_run()

        assert self.path_utils.ckpt_dir_per_run
        start_time = time.time()

        tune_config = self._search_space_hpo
        tune_config["seed"] = self.jobid_config.sdhf

        analysis = ray.tune.run(
            self._objective,
            metric=self.metric_name,
            mode=self.metric_mode_name,
            name="ray_result",
            resources_per_trial=resources_per_trial,
            config=tune_config,
            verbose=ray_verbose,
            local_dir=self.path_utils.ckpt_dir_per_run,
            num_samples=num_samples,
            time_budget_s=time_budget,
            keep_checkpoints_num=1,
            scheduler=scheduler,
            search_alg=search_algo,
        )
        duration = time.time() - start_time
        self.last_run_duration = duration
        print("Total running time: {} seconds".format(duration))

        ray.shutdown()

        best_trial = analysis.get_best_trial(scope="all", metric=self.metric_name, mode=self.metric_mode_name)
        validation_metric = {"eval_" + self.metric_name:
                             best_trial.metric_analysis[self.metric_name][self.metric_mode_name]}
        for x in range(len(self._all_metrics)):
            validation_metric["eval_" + self._all_metrics[x]] \
                = best_trial.metric_analysis[self._all_metrics[x]][self._all_modes[x]]

        get_best_ckpt = analysis.get_best_checkpoint(best_trial, metric=self.metric_name, mode=self.metric_mode_name)
        best_ckpt = AutoTransformers._recover_checkpoint(get_best_ckpt)

        self._save_ckpt_json(best_ckpt)

        return validation_metric, analysis

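    # End-to-end sketch (paths and jobid settings are illustrative; a
    # jobid_config is built as shown in prepare_data's documentation):
    #
    #     autohf = AutoTransformers()
    #     autohf.prepare_data(data_root_path="data/", jobid_config=jobid_config)
    #     validation_metric, analysis = autohf.fit(
    #         num_samples=-1, time_budget=300,
    #         resources_per_trial={"cpu": 1, "gpu": 1})
    #     predictions, test_metric = autohf.predict()
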
    def predict(self,
                ckpt_json_dir=None,
                **kwargs):
        """Predict labels for the test data.

        An example:

            predictions, test_metric = autohf.predict()

        Args:
            ckpt_json_dir:
                The checkpoint JSON path for the fine-tuned huggingface model, if you wish to override
                the checkpoint saved during the training stage under self.path_utils.result_dir_per_run

        Returns:
            predictions: A numpy array of shape n x 1; each element is a predicted class
            label for an instance.

            output_metric: The test metric, or None when the original split is used
            (test labels are unavailable).
        """
        best_checkpoint = self._load_ckpt_json(ckpt_json_dir, **kwargs)
        best_model = self._load_model(checkpoint_path=best_checkpoint)
        training_args = TrainingArguments(per_device_eval_batch_size=1,
                                          output_dir=self.path_utils.result_dir_per_run)
        test_trainer = TrainerForAutoTransformers(best_model, training_args)

        if self.jobid_config.spt == "ori":
            # TODO add test
            if "label" in self.test_dataset.features.keys():
                self.test_dataset.remove_columns_("label")
                print("Removed the existing label column from the test data")

        test_dataloader = test_trainer.get_test_dataloader(self.test_dataset)
        predictions, labels, _ = test_trainer.prediction_loop(test_dataloader, description="Prediction")
        predictions = np.squeeze(predictions) \
            if get_default_task(self.jobid_config.dat,
                                self.jobid_config.subdat) == "regression" \
            else np.argmax(predictions, axis=1)

        if self.jobid_config.spt == "rspt":
            assert labels is not None
            metric = self._get_metric_func()
            output_metric = metric.compute(predictions=predictions, references=labels)
            self._save_output_metric(output_metric)
            return predictions, output_metric
        else:
            return predictions, None

    def output_prediction(self,
                          predictions=None,
                          output_prediction_path=None,
                          output_zip_file_name=None):
        """When using the original GLUE split, output the predictions on the test data,
        and prepare the .zip file for submission.

        Example:

            local_archive_path = self.autohf.output_prediction(predictions,
                                 output_prediction_path=self.console_args.data_root_dir + "result/",
                                 output_zip_file_name=azure_save_file_name)

        Args:
            predictions:
                A list of predictions, which is the output of AutoTransformers.predict()
            output_prediction_path:
                Output path for the prediction
            output_zip_file_name:
                A string, which is the name of the output zip file

        Returns:
            The path of the output .zip file
        """
        from .dataset.submission_auto import auto_output_prediction
        return auto_output_prediction(self.jobid_config.dat,
                                      output_prediction_path,
                                      output_zip_file_name,
                                      predictions,
                                      self.train_dataset,
                                      self._dev_name,
                                      self.jobid_config.subdat)