import json import os import numpy as np import time try: import ray import transformers from transformers import TrainingArguments import datasets from .dataset.task_auto import get_default_task from .result_analysis.azure_utils import JobID from .huggingface.trainer import TrainerForAutoTransformers except ImportError: print("To use the nlp component in flaml, run pip install flaml[nlp]") task_list = [ "seq-classification", "regression", "question-answering" ] class AutoTransformers: '''The AutoTransformers class Example: .. code-block:: python autohf = AutoTransformers() autohf_settings = { "resources_per_trial": {"cpu": 1, "gpu": 1}, "num_samples": -1, "time_budget": 60, } validation_metric, analysis = autohf.fit(**autohf_settings) ''' @staticmethod def _convert_dict_to_ray_tune_space(config_json, mode="grid"): search_space = {} if mode == "grid": # TODO add test for each_hp in config_json.keys(): this_config = config_json[each_hp] assert isinstance(this_config, dict) or isinstance(this_config, list), \ "config of " + each_hp + " must be dict or list for grid search" search_space[each_hp] = ray.tune.grid_search(this_config) else: for each_hp in config_json.keys(): this_config = config_json[each_hp] assert isinstance(this_config, dict) or isinstance(this_config, list), \ "config of " + each_hp + " must be dict or list" if isinstance(this_config, dict): lower = this_config["l"] upper = this_config["u"] space = this_config["space"] if space == "log": search_space[each_hp] = ray.tune.loguniform(lower, upper) elif space == "linear": search_space[each_hp] = ray.tune.uniform(lower, upper) elif space == "quniform": search_space[each_hp] = ray.tune.quniform(lower, upper, this_config["interval"]) else: search_space[each_hp] = ray.tune.choice(this_config) return search_space def _set_search_space(self, **custom_hpo_args): from .hpo.hpo_searchspace import AutoHPOSearchSpace search_space_hpo_json \ = AutoHPOSearchSpace.from_model_and_dataset_name(self.jobid_config.spa, self.jobid_config.pre, self.jobid_config.presz, self.jobid_config.dat, self.jobid_config.subdat, **custom_hpo_args) self._search_space_hpo = AutoTransformers._convert_dict_to_ray_tune_space( search_space_hpo_json, mode=self.jobid_config.mod) @staticmethod def _get_split_name(data_raw, fold_name=None): # TODO coverage if fold_name: return fold_name fold_keys = data_raw.keys() if fold_keys == {"train", "validation", "test"}: return "train", "validation", "test" for each_key in fold_keys: for each_split_name in {"train", "validation", "test"}: assert not (each_key.startswith(each_split_name) and each_key != each_split_name), \ "Dataset split must be within {}, must be explicitly specified in dataset_config, e.g.," \ "'fold_name': ['train','validation_matched','test_matched']. Please refer to the example in the " \ "documentation of AutoTransformers.prepare_data()".format(",".join(fold_keys)) return "train", "validation", "test" def prepare_data(self, data_root_path, jobid_config=None, is_wandb_on=False, server_name=None, max_seq_length=128, fold_name=None, resplit_portion=None, **custom_data_args): """Prepare data Example: .. code-block:: python preparedata_setting = {"server_name": "tmdev", "data_root_path": "data/", "max_seq_length": 128, "jobid_config": jobid_config, "wandb_utils": wandb_utils, "resplit_portion": {"source": ["train", "validation"], "train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}} autohf.prepare_data(**preparedata_setting) Args: server_name: A string variable, which can be tmdev or azureml data_root_path: The root path for storing the checkpoints and output results, e.g., "data/" jobid_config: A JobID object describing the profile of job wandb_utils: A WandbUtils object for wandb operations max_seq_length (optional): Max_seq_lckpt_per_epochength for the huggingface, this hyperparameter must be specified at the data processing step resplit_portion: The proportion for resplitting the train and dev data when split_mode="resplit". If args.resplit_mode = "rspt", resplit_portion is required is_wandb_on: A boolean variable indicating whether wandb is used """ from .dataset.dataprocess_auto import AutoEncodeText from transformers import AutoTokenizer from datasets import load_dataset from .utils import PathUtils from .utils import load_dft_args self._max_seq_length = max_seq_length self._server_name = server_name if server_name is not None else "tmdev" """ loading the jobid config from console args """ console_args = load_dft_args() self.jobid_config = JobID(console_args) if jobid_config: self.jobid_config = jobid_config if len(custom_data_args) > 0: self.jobid_config.set_jobid_from_console_args(console_args=custom_data_args) if is_wandb_on: from .result_analysis.wandb_utils import WandbUtils self.wandb_utils = WandbUtils(is_wandb_on=is_wandb_on, wandb_key_path=console_args.key_path, jobid_config=self.jobid_config) self.wandb_utils.set_wandb_per_run() else: self.wandb_utils = None self.path_utils = PathUtils(self.jobid_config, hpo_data_root_path=data_root_path) if self.jobid_config.spt == "rspt": assert resplit_portion, "If split mode is 'rspt', the resplit_portion must be provided. Please " \ "refer to the example in the documentation of AutoTransformers.prepare_data()" if self.jobid_config.subdat: data_raw = load_dataset(JobID.dataset_list_to_str(self.jobid_config.dat), self.jobid_config.subdat) else: data_raw = load_dataset(*self.jobid_config.dat) self._train_name, self._dev_name, self._test_name = AutoTransformers._get_split_name( data_raw, fold_name=fold_name) auto_tokentoids_config = {"max_seq_length": self._max_seq_length} self._tokenizer = AutoTokenizer.from_pretrained(self.jobid_config.pre_full, use_fast=True) def autoencodetext_from_model_and_dataset_name(): return AutoEncodeText.from_model_and_dataset_name( data_raw, self.jobid_config.pre_full, self.jobid_config.dat, self.jobid_config.subdat, **auto_tokentoids_config) data_encoded = autoencodetext_from_model_and_dataset_name() self._max_seq_length = 0 """ Update the max_seq_length to the minimum of the actual max seq length and the user defined max_seq_length """ for each_fold in data_encoded.keys(): self._max_seq_length = max(self._max_seq_length, max([sum(data_encoded[each_fold][x]['attention_mask']) for x in range(len(data_encoded[each_fold]))])) self._max_seq_length = int((self._max_seq_length + 15) / 16) * 16 data_encoded = autoencodetext_from_model_and_dataset_name() if self.jobid_config.spt == "rspt": all_folds_from_source = [] assert "source" in resplit_portion.keys(), "Must specify the source for resplitting the dataset in" \ "resplit_portion, which is a list of folder names, e.g., " \ "resplit_portion = {'source': ['train']}" source_fold_names = resplit_portion['source'] for each_fold_name in source_fold_names: this_fold_dataset = data_encoded[each_fold_name] all_folds_from_source.append(this_fold_dataset) merged_folds_from_source = datasets.concatenate_datasets(all_folds_from_source) merged_folds_from_source = merged_folds_from_source.shuffle(seed=self.jobid_config.sddt) assert "train" in resplit_portion.keys() and "validation" in resplit_portion.keys() \ and "test" in resplit_portion.keys(), "train, validation, test must exist in resplit_portion" for key in ["train", "validation", "test"]: target_fold_start, target_fold_end = \ int(resplit_portion[key][0] * len(merged_folds_from_source)), \ int(resplit_portion[key][1] * len(merged_folds_from_source)) subfold_dataset = merged_folds_from_source.select( [x for x in range(target_fold_start, target_fold_end)]).flatten_indices() if key == "train": self.train_dataset = subfold_dataset elif key == "validation": self.eval_dataset = subfold_dataset else: self.test_dataset = subfold_dataset else: self.train_dataset, self.eval_dataset, self.test_dataset \ = data_encoded[self._train_name], data_encoded[self._dev_name], data_encoded[self._test_name] def _load_model(self, checkpoint_path=None, per_model_config=None): from transformers import AutoConfig from .huggingface.switch_head_auto import AutoSeqClassificationHead, MODEL_CLASSIFICATION_HEAD_MAPPING this_task = get_default_task(self.jobid_config.dat, self.jobid_config.subdat) if this_task == "seq-classification": self._num_labels = len(self.train_dataset.features["label"].names) elif this_task == "regression": self._num_labels = 1 if not checkpoint_path: checkpoint_path = self.jobid_config.pre_full def get_this_model(): from transformers import AutoModelForSequenceClassification return AutoModelForSequenceClassification.from_pretrained(checkpoint_path, config=model_config) def is_pretrained_model_in_classification_head_list(): return self.jobid_config.pre in MODEL_CLASSIFICATION_HEAD_MAPPING.keys() def _set_model_config(): if per_model_config and len(per_model_config) > 0: model_config = AutoConfig.from_pretrained( checkpoint_path, num_labels=model_config_num_labels, **per_model_config) else: model_config = AutoConfig.from_pretrained( checkpoint_path, num_labels=model_config_num_labels) return model_config if this_task == "seq-classification": num_labels_old = AutoConfig.from_pretrained(checkpoint_path).num_labels if is_pretrained_model_in_classification_head_list(): model_config_num_labels = num_labels_old else: model_config_num_labels = self._num_labels model_config = _set_model_config() if is_pretrained_model_in_classification_head_list(): # TODO coverage if self._num_labels != num_labels_old: this_model = get_this_model() model_config.num_labels = self._num_labels this_model.num_labels = self._num_labels this_model.classifier = AutoSeqClassificationHead \ .from_model_type_and_config(self.jobid_config.pre, model_config) else: this_model = get_this_model() else: this_model = get_this_model() this_model.resize_token_embeddings(len(self._tokenizer)) return this_model elif this_task == "regression": # TODO add test model_config_num_labels = 1 model_config = _set_model_config() this_model = get_this_model() return this_model def _get_metric_func(self): data_name = JobID.dataset_list_to_str(self.jobid_config.dat) if data_name in ("glue", "super_glue"): metric = datasets.load.load_metric(data_name, self.jobid_config.subdat) # TODO delete elif data_name in ("squad", "squad_v2"): metric = datasets.load.load_metric(data_name) else: metric = datasets.load.load_metric(self.metric_name) return metric def _compute_metrics_by_dataset_name(self, eval_pred): # TODO coverage predictions, labels = eval_pred predictions = np.squeeze(predictions) \ if self.task_name == "regression" else np.argmax(predictions, axis=1) metric_func = self._get_metric_func() return metric_func.compute(predictions=predictions, references=labels) def _compute_checkpoint_freq(self, num_train_epochs, batch_size): # TODO coverage if "gpu" in self._resources_per_trial: ckpt_step_freq = int(min(num_train_epochs, 1) * len(self.train_dataset) / batch_size / self._resources_per_trial["gpu"] / self.ckpt_per_epoch) + 1 else: ckpt_step_freq = int(min(num_train_epochs, 1) * len(self.train_dataset) / batch_size / self._resources_per_trial["cpu"] / self.ckpt_per_epoch) + 1 return ckpt_step_freq @staticmethod def _separate_config(config): training_args_config = {} per_model_config = {} for key in config.keys(): if key in TrainingArguments.__dict__.keys(): training_args_config[key] = config[key] else: per_model_config[key] = config[key] return training_args_config, per_model_config def _objective(self, config, reporter, checkpoint_dir=None): # TODO add test from transformers.trainer_utils import set_seed self._set_transformers_verbosity(self._transformers_verbose) def model_init(): return self._load_model() set_seed(config["seed"]) training_args_config, per_model_config = AutoTransformers._separate_config(config) this_model = self._load_model(per_model_config=per_model_config) trial_id = reporter.trial_id self.path_utils.make_dir_per_trial(trial_id) ckpt_freq = self._compute_checkpoint_freq( num_train_epochs=config["num_train_epochs"], batch_size=config["per_device_train_batch_size"]) assert self.path_utils.ckpt_dir_per_trial if transformers.__version__.startswith("3"): training_args = TrainingArguments( output_dir=self.path_utils.ckpt_dir_per_trial, do_eval=True, per_device_eval_batch_size=32, eval_steps=ckpt_freq, evaluate_during_training=True, save_steps=ckpt_freq, save_total_limit=0, fp16=self._fp16, **training_args_config, ) else: from transformers import IntervalStrategy training_args = TrainingArguments( output_dir=self.path_utils.ckpt_dir_per_trial, do_eval=True, per_device_eval_batch_size=32, eval_steps=ckpt_freq, evaluation_strategy=IntervalStrategy.STEPS, save_steps=ckpt_freq, save_total_limit=0, fp16=self._fp16, **training_args_config, ) trainer = TrainerForAutoTransformers( model=this_model, args=training_args, model_init=model_init, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, tokenizer=self._tokenizer, compute_metrics=self._compute_metrics_by_dataset_name, ) trainer.trial_id = reporter.trial_id """ create a wandb run. If os.environ["WANDB_MODE"] == "offline", run = None """ if self.wandb_utils: run = self.wandb_utils.set_wandb_per_trial() import wandb for each_hp in config: wandb.log({each_hp: config[each_hp]}) else: run = None trainer.train() trainer.evaluate(self.eval_dataset) """ If a wandb run was created, close the run after train and evaluate finish """ if run: run.finish() def _verify_init_config(self, **custom_hpo_args): for key in custom_hpo_args.keys(): if key == "points_to_evaluate": for each_init_config in custom_hpo_args[key]: for each_hp in each_init_config.keys(): assert each_hp in self._search_space_hpo.keys(), \ "points_to_evaluate hp must be within the search space" assert isinstance(each_init_config[each_hp], int) or \ isinstance(each_init_config[each_hp], float) or \ isinstance(each_init_config[each_hp], str) or \ isinstance(each_init_config[each_hp], bool), " points_to_evaluate must be a scalar" assert isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Categorical) or \ isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Float) or \ isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Integer), \ "Every hp space must either be categorical, integer or float" if isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Categorical): assert each_init_config[each_hp] in self._search_space_hpo[each_hp].categories, \ "points_to_evaluate {each_hp} value must be within the search space" else: assert self._search_space_hpo[each_hp].lower <= each_init_config[each_hp] <= \ self._search_space_hpo[each_hp].upper, \ "points_to_evaluate {each_hp} value must be within the search space" def _get_search_algo(self, search_algo_name, search_algo_args_mode, time_budget, metric_name, metric_mode_name, **custom_hpo_args): from .hpo.searchalgo_auto import AutoSearchAlgorithm if search_algo_name in ("bs", "cfo"): self._verify_init_config(**custom_hpo_args) search_algo = AutoSearchAlgorithm.from_method_name( search_algo_name, search_algo_args_mode, self._search_space_hpo, time_budget, metric_name, metric_mode_name, **custom_hpo_args) return search_algo @staticmethod def _recover_checkpoint(tune_checkpoint_dir): assert tune_checkpoint_dir # Get subdirectory used for Huggingface. subdirs = [ os.path.join(tune_checkpoint_dir, name) for name in os.listdir(tune_checkpoint_dir) if os.path.isdir(os.path.join(tune_checkpoint_dir, name)) ] # There should only be 1 subdir. assert len(subdirs) == 1, subdirs return subdirs[0] def _save_ckpt_json(self, best_ckpt): json.dump({"best_ckpt": best_ckpt}, open(os.path.join(self.path_utils.result_dir_per_run, "save_ckpt_" + self.jobid_config.to_jobid_string() + ".json"), "w")) def _save_output_metric(self, output_metrics): json.dump(output_metrics, open( os.path.join(self.path_utils.result_dir_per_run, "output_metric_" + self.jobid_config.to_jobid_string() + ".json"), "w")) def _load_ckpt_json(self, ckpt_dir=None, **kwargs): if not ckpt_dir: ckpt_dir = os.path.join(self.path_utils.result_dir_per_run, "save_ckpt_" + self.jobid_config.to_jobid_string() + ".json") try: ckpt_json = json.load(open(ckpt_dir)) return ckpt_json["best_ckpt"] except FileNotFoundError as err: print("Saved checkpoint not found. Please make sure checkpoint is stored under {}".format(ckpt_dir)) raise err def _set_metric(self, custom_metric_name=None, custom_metric_mode_name=None): from .dataset.metric_auto import get_default_and_alternative_metric from .utils import _variable_override_default_alternative default_metric, default_mode, all_metrics, all_modes = \ get_default_and_alternative_metric( dataset_name_list=self.jobid_config.dat, subdataset_name=self.jobid_config.subdat, custom_metric_name=custom_metric_name, custom_metric_mode_name=custom_metric_mode_name) _variable_override_default_alternative(self, "metric_name", default_metric, all_metrics, custom_metric_name) _variable_override_default_alternative(self, "metric_mode_name", default_mode, all_modes, custom_metric_mode_name) self._all_metrics = all_metrics self._all_modes = all_modes def _set_task(self): self.task_name = get_default_task(self.jobid_config.dat, self.jobid_config.subdat) def fit_hf(self, resources_per_trial, num_samples, time_budget, custom_metric_name=None, custom_metric_mode_name=None, _fp16=True, **custom_hpo_args ): # TODO remove? from transformers.trainer_utils import HPSearchBackend '''Fine tuning the huggingface using HF's API Transformers.hyperparameter_search (for comparitive purpose). Transformers.hyperparameter_search has the following disadvantages: (1) it does not return tune.analysis.Analysis result, what is analysis used for (2) it is inconvenient to develop on top of Transformers.hyperparameter_search, whose trainable function, search space, etc. are defined inside of Transformers.hyperparameter_search. An example: autohf_settings = {"resources_per_trial": {"cpu": 1}, "num_samples": 1, "time_budget": 100000, "ckpt_per_epoch": 1, "fp16": False, } validation_metric, analysis = autohf.fit(**autohf_settings,) Args: resources_per_trial: A dict showing the resources used by each trial, e.g., {"gpu": 4, "cpu": 4} num_samples: An int variable of the maximum number of trials time_budget: An int variable of the maximum time budget custom_metric_name: A string of the dataset name or a function, e.g., 'accuracy', 'f1', 'loss', custom_metric_mode_name: A string of the mode name, e.g., "max", "min", "last", "all" fp16: boolean, default = True | whether to use fp16 custom_hpo_args: The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{ "num_train_epochs": 1, "per_device_train_batch_size": 128, }]} Returns: validation_metric: a dict storing the validation score ''' def model_init(): return self._load_model() def ray_hp_space(trial): return { "learning_rate": ray.tune.loguniform(1e-6, 1e-4), "num_train_epochs": ray.tune.choice(list(range(1, 6))), "seed": ray.tune.quniform(1, 41, 1), "per_device_train_batch_size": ray.tune.choice([4, 8, 16, 32, 64]), } self._set_metric(custom_metric_name, custom_metric_mode_name) self._set_task() training_args = TrainingArguments( output_dir=self.path_utils.hpo_ckpt_path, fp16=_fp16, ) this_model = self._load_model() trainer = TrainerForAutoTransformers( this_model, training_args, model_init=model_init, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, tokenizer=self._tokenizer, compute_metrics=self._compute_metrics_by_dataset_name, ) self.path_utils.make_dir_per_run() start_time = time.time() best_run = trainer.hyperparameter_search( n_trials=num_samples, time_budget_s=time_budget, # hp_space=ray_hp_space, backend=HPSearchBackend.RAY, resources_per_trial=resources_per_trial) duration = time.time() - start_time self.last_run_duration = duration print("Total running time: {} seconds".format(duration)) hp_dict = best_run.hyperparameters hp_dict["seed"] = int(hp_dict["seed"]) best_training_args = TrainingArguments( output_dir=self.path_utils.hpo_ckpt_path, fp16=_fp16, **hp_dict, ) best_trainer = TrainerForAutoTransformers( this_model, best_training_args, model_init=model_init, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, tokenizer=self._tokenizer, compute_metrics=self._compute_metrics_by_dataset_name, ) best_model_checkpoint_path = os.path.join(self.path_utils.hpo_ckpt_path, "hpo_hf") if not os.path.exists(best_model_checkpoint_path): os.mkdir(best_model_checkpoint_path) best_trainer.train() best_trainer.save_model(best_model_checkpoint_path) self._save_ckpt_json(best_model_checkpoint_path) validation_metric = best_trainer.evaluate() return validation_metric def _set_transformers_verbosity(self, transformers_verbose): # TODO coverage if transformers_verbose == transformers.logging.ERROR: transformers.logging.set_verbosity_error() elif transformers_verbose == transformers.logging.WARNING: transformers.logging.set_verbosity_warning() elif transformers_verbose == transformers.logging.INFO: transformers.logging.set_verbosity_info() elif transformers_verbose == transformers.logging.DEBUG: transformers.logging.set_verbosity_debug() else: raise Exception("transformers_verbose must be set to ERROR, WARNING, INFO or DEBUG") def fit(self, num_samples, time_budget, custom_metric_name=None, custom_metric_mode_name=None, ckpt_per_epoch=1, fp16=True, ray_verbose=1, transformers_verbose=10, resources_per_trial=None, ray_local_mode=False, **custom_hpo_args): """Fine tuning the huggingface using the hpo setting Example: .. code-block:: python autohf_settings = {"resources_per_trial": {"cpu": 1}, "num_samples": 1, "time_budget": 100000, "ckpt_per_epoch": 1, "fp16": False, } validation_metric, analysis = autohf.fit(**autohf_settings) Args: resources_per_trial: A dict showing the resources used by each trial, e.g., {"gpu": 4, "cpu": 4} num_samples: An int variable of the maximum number of trials time_budget: An int variable of the maximum time budget custom_metric_name: A string of the dataset name or a function, e.g., 'accuracy', 'f1', 'loss' custom_metric_mode_name: A string of the mode name, e.g., "max", "min", "last", "all" ckpt_per_epoch: An integer value of number of checkpoints per epoch, default = 1 ray_verbose: An integer, default=1 | verbosit of ray, transformers_verbose: An integer, default=transformers.logging.INFO | verbosity of transformers, must be chosen from one of transformers.logging.ERROR, transformers.logging.INFO, transformers.logging.WARNING, or transformers.logging.DEBUG fp16: A boolean, default = True | whether to use fp16 ray_local_mode: A boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run custom_hpo_args: The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{ "num_train_epochs": 1, "per_device_train_batch_size": 128, }]} Returns: validation_metric: A dict storing the validation score analysis: A ray.tune.analysis.Analysis object storing the analysis results from tune.run """ from .hpo.scheduler_auto import AutoScheduler self._transformers_verbose = transformers_verbose """ Specify the other parse of jobid configs from custom_hpo_args, e.g., if the search algorithm was not specified previously, can specify the algorithm here """ if len(custom_hpo_args) > 0: self.jobid_config.set_jobid_from_console_args(console_args=custom_hpo_args) self._resources_per_trial = resources_per_trial self._set_metric(custom_metric_name, custom_metric_mode_name) self._set_task() self._fp16 = fp16 ray.shutdown() ray.init(local_mode=ray_local_mode) self._set_search_space(**custom_hpo_args) search_algo = self._get_search_algo(self.jobid_config.alg, self.jobid_config.arg, time_budget, self.metric_name, self.metric_mode_name, **custom_hpo_args) scheduler = AutoScheduler.from_scheduler_name(self.jobid_config.pru) self.ckpt_per_epoch = ckpt_per_epoch self.path_utils.make_dir_per_run() assert self.path_utils.ckpt_dir_per_run start_time = time.time() tune_config = self._search_space_hpo tune_config["seed"] = self.jobid_config.sdhf analysis = ray.tune.run( self._objective, metric=self.metric_name, mode=self.metric_mode_name, name="ray_result", resources_per_trial=resources_per_trial, config=tune_config, verbose=ray_verbose, local_dir=self.path_utils.ckpt_dir_per_run, num_samples=num_samples, time_budget_s=time_budget, keep_checkpoints_num=1, scheduler=scheduler, search_alg=search_algo, ) duration = time.time() - start_time self.last_run_duration = duration print("Total running time: {} seconds".format(duration)) ray.shutdown() best_trial = analysis.get_best_trial(scope="all", metric=self.metric_name, mode=self.metric_mode_name) validation_metric = {"eval_" + self.metric_name : best_trial.metric_analysis[self.metric_name][self.metric_mode_name]} for x in range(len(self._all_metrics)): validation_metric["eval_" + self._all_metrics[x]] \ = best_trial.metric_analysis[self._all_metrics[x]][self._all_modes[x]] get_best_ckpt = analysis.get_best_checkpoint(best_trial, metric=self.metric_name, mode=self.metric_mode_name) best_ckpt = AutoTransformers._recover_checkpoint(get_best_ckpt) self._save_ckpt_json(best_ckpt) return validation_metric, analysis def predict(self, ckpt_json_dir=None, **kwargs): '''Predict label for test data. An example: predictions, test_metric = autohf.predict() Args: ckpt_json_dir: the checkpoint for the fine-tuned huggingface if you wish to override the saved checkpoint in the training stage under self.path_utils._result_dir_per_run Returns: A numpy array of shape n * 1 - - each element is a predicted class label for an instance. ''' best_checkpoint = self._load_ckpt_json(ckpt_json_dir, **kwargs) best_model = self._load_model(checkpoint_path=best_checkpoint) training_args = TrainingArguments(per_device_eval_batch_size=1, output_dir=self.path_utils.result_dir_per_run) test_trainer = TrainerForAutoTransformers(best_model, training_args) if self.jobid_config.spt == "ori": # TODO add test if "label" in self.test_dataset.features.keys(): self.test_dataset.remove_columns_("label") print("Cleaning the existing label column from test data") test_dataloader = test_trainer.get_test_dataloader(self.test_dataset) predictions, labels, _ = test_trainer.prediction_loop(test_dataloader, description="Prediction") predictions = np.squeeze(predictions) \ if get_default_task(self.jobid_config.dat, self.jobid_config.subdat) == "regression" \ else np.argmax(predictions, axis=1) if self.jobid_config.spt == "rspt": assert labels is not None metric = self._get_metric_func() output_metric = metric.compute(predictions=predictions, references=labels) self._save_output_metric(output_metric) return predictions, output_metric else: return predictions, None def output_prediction(self, predictions=None, output_prediction_path=None, output_zip_file_name=None): """ When using the original GLUE split, output the prediction on test data, and prepare the .zip file for submission Example: local_archive_path = self.autohf.output_prediction(predictions, output_prediction_path= self.console_args.data_root_dir + "result/", output_zip_file_name=azure_save_file_name) Args: predictions: A list of predictions, which is the output of AutoTransformers.predict() output_prediction_path: Output path for the prediction output_zip_file_name: An string, which is the name of the output zip file Returns: The path of the output .zip file """ from .dataset.submission_auto import auto_output_prediction return auto_output_prediction(self.jobid_config.dat, output_prediction_path, output_zip_file_name, predictions, self.train_dataset, self._dev_name, self.jobid_config.subdat)