# autogen/flaml/nlp/autotransformers.py

import json
import os
import numpy as np
import time
try:
import ray
import transformers
from transformers import TrainingArguments
import datasets
from .dataset.task_auto import get_default_task
from .result_analysis.azure_utils import JobID
from .huggingface.trainer import TrainerForAutoTransformers
except ImportError:
print("To use the nlp component in flaml, run pip install flaml[nlp]")
task_list = [
"seq-classification",
"regression",
"question-answering"
]
class AutoTransformers:
'''The AutoTransformers class
Example:
.. code-block:: python
autohf = AutoTransformers()
autohf_settings = {
"resources_per_trial": {"cpu": 1, "gpu": 1},
"num_samples": -1,
"time_budget": 60,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
'''
@staticmethod
def _convert_dict_to_ray_tune_space(config_json, mode="grid"):
search_space = {}
if mode == "grid":
# TODO add test
for each_hp in config_json.keys():
this_config = config_json[each_hp]
assert isinstance(this_config, dict) or isinstance(this_config, list), \
"config of " + each_hp + " must be dict or list for grid search"
search_space[each_hp] = ray.tune.grid_search(this_config)
else:
for each_hp in config_json.keys():
this_config = config_json[each_hp]
assert isinstance(this_config, dict) or isinstance(this_config, list), \
"config of " + each_hp + " must be dict or list"
if isinstance(this_config, dict):
lower = this_config["l"]
upper = this_config["u"]
space = this_config["space"]
if space == "log":
search_space[each_hp] = ray.tune.loguniform(lower, upper)
elif space == "linear":
search_space[each_hp] = ray.tune.uniform(lower, upper)
elif space == "quniform":
search_space[each_hp] = ray.tune.quniform(lower, upper, this_config["interval"])
else:
search_space[each_hp] = ray.tune.choice(this_config)
return search_space
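    # A minimal sketch of the conversion (the values below are illustrative only;
    # any non-"grid" mode takes the hpo branch):
    # _convert_dict_to_ray_tune_space(
    #     {"learning_rate": {"l": 1e-6, "u": 1e-4, "space": "log"},
    #      "num_train_epochs": [1, 2, 3]}, mode="hpo")
    # returns {"learning_rate": ray.tune.loguniform(1e-6, 1e-4),
    #          "num_train_epochs": ray.tune.choice([1, 2, 3])}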
def _set_search_space(self,
**custom_hpo_args):
from .hpo.hpo_searchspace import AutoHPOSearchSpace
search_space_hpo_json \
= AutoHPOSearchSpace.from_model_and_dataset_name(self.jobid_config.spa,
self.jobid_config.pre,
self.jobid_config.presz,
self.jobid_config.dat,
self.jobid_config.subdat,
**custom_hpo_args)
self._search_space_hpo = AutoTransformers._convert_dict_to_ray_tune_space(
search_space_hpo_json,
mode=self.jobid_config.mod)
@staticmethod
def _get_split_name(data_raw, fold_name=None):
# TODO coverage
if fold_name:
return fold_name
fold_keys = data_raw.keys()
if fold_keys == {"train", "validation", "test"}:
return "train", "validation", "test"
for each_key in fold_keys:
for each_split_name in {"train", "validation", "test"}:
                assert not (each_key.startswith(each_split_name) and each_key != each_split_name), \
                    "Dataset splits must be within {}; non-standard split names must be explicitly " \
                    "specified via 'fold_name', e.g., 'fold_name': ['train','validation_matched'," \
                    "'test_matched']. Please refer to the example in the documentation of " \
                    "AutoTransformers.prepare_data()".format(",".join(fold_keys))
return "train", "validation", "test"
def prepare_data(self,
data_root_path,
jobid_config=None,
is_wandb_on=False,
server_name=None,
max_seq_length=128,
fold_name=None,
resplit_portion=None,
**custom_data_args):
"""Prepare data
Example:
.. code-block:: python
preparedata_setting = {"server_name": "tmdev", "data_root_path": "data/", "max_seq_length": 128,
"jobid_config": jobid_config, "wandb_utils": wandb_utils,
"resplit_portion": {"source": ["train", "validation"],
"train": [0, 0.8], "validation": [0.8, 0.9], "test": [0.9, 1.0]}}
autohf.prepare_data(**preparedata_setting)
Args:
server_name:
A string variable, which can be tmdev or azureml
data_root_path:
The root path for storing the checkpoints and output results, e.g., "data/"
jobid_config:
A JobID object describing the profile of job
            fold_name (optional):
                A tuple of three strings, the train/validation/test fold names, required when the
                raw fold names are not the standard ("train", "validation", "test")
            max_seq_length (optional):
                The maximum sequence length for the huggingface tokenizer; this hyperparameter must
                be specified at the data processing step
            resplit_portion:
                The proportions for resplitting the train and dev data when the split mode is
                "rspt" (resplit); required in that case
is_wandb_on:
A boolean variable indicating whether wandb is used
"""
from .dataset.dataprocess_auto import AutoEncodeText
from transformers import AutoTokenizer
from datasets import load_dataset
from .utils import PathUtils
from .utils import load_dft_args
self._max_seq_length = max_seq_length
self._server_name = server_name if server_name is not None else "tmdev"
"""
loading the jobid config from console args
"""
console_args = load_dft_args()
self.jobid_config = JobID(console_args)
if jobid_config:
self.jobid_config = jobid_config
if len(custom_data_args) > 0:
self.jobid_config.set_jobid_from_console_args(console_args=custom_data_args)
if is_wandb_on:
from .result_analysis.wandb_utils import WandbUtils
self.wandb_utils = WandbUtils(is_wandb_on=is_wandb_on,
wandb_key_path=console_args.key_path,
jobid_config=self.jobid_config)
self.wandb_utils.set_wandb_per_run()
else:
self.wandb_utils = None
self.path_utils = PathUtils(self.jobid_config, hpo_data_root_path=data_root_path)
if self.jobid_config.spt == "rspt":
assert resplit_portion, "If split mode is 'rspt', the resplit_portion must be provided. Please " \
"refer to the example in the documentation of AutoTransformers.prepare_data()"
if self.jobid_config.subdat:
data_raw = load_dataset(JobID.dataset_list_to_str(self.jobid_config.dat),
self.jobid_config.subdat)
else:
data_raw = load_dataset(*self.jobid_config.dat)
self._train_name, self._dev_name, self._test_name = AutoTransformers._get_split_name(
data_raw,
fold_name=fold_name)
auto_tokentoids_config = {"max_seq_length": self._max_seq_length}
self._tokenizer = AutoTokenizer.from_pretrained(self.jobid_config.pre_full, use_fast=True)
def autoencodetext_from_model_and_dataset_name():
return AutoEncodeText.from_model_and_dataset_name(
data_raw,
self.jobid_config.pre_full,
self.jobid_config.dat,
self.jobid_config.subdat,
**auto_tokentoids_config)
data_encoded = autoencodetext_from_model_and_dataset_name()
self._max_seq_length = 0
"""
Update the max_seq_length to the minimum of the actual max seq length and the user defined max_seq_length
"""
for each_fold in data_encoded.keys():
self._max_seq_length = max(self._max_seq_length,
max([sum(data_encoded[each_fold][x]['attention_mask']) for x in
range(len(data_encoded[each_fold]))]))
self._max_seq_length = int((self._max_seq_length + 15) / 16) * 16
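        # e.g., an observed maximum length of 121 tokens rounds up to 128; padding the sequence
        # length to a multiple of 16 is typically efficient for fp16/tensor-core execution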
        auto_tokentoids_config["max_seq_length"] = self._max_seq_length
        data_encoded = autoencodetext_from_model_and_dataset_name()
if self.jobid_config.spt == "rspt":
all_folds_from_source = []
assert "source" in resplit_portion.keys(), "Must specify the source for resplitting the dataset in" \
"resplit_portion, which is a list of folder names, e.g., " \
"resplit_portion = {'source': ['train']}"
source_fold_names = resplit_portion['source']
for each_fold_name in source_fold_names:
this_fold_dataset = data_encoded[each_fold_name]
all_folds_from_source.append(this_fold_dataset)
merged_folds_from_source = datasets.concatenate_datasets(all_folds_from_source)
merged_folds_from_source = merged_folds_from_source.shuffle(seed=self.jobid_config.sddt)
assert "train" in resplit_portion.keys() and "validation" in resplit_portion.keys() \
and "test" in resplit_portion.keys(), "train, validation, test must exist in resplit_portion"
for key in ["train", "validation", "test"]:
target_fold_start, target_fold_end = \
int(resplit_portion[key][0] * len(merged_folds_from_source)), \
int(resplit_portion[key][1] * len(merged_folds_from_source))
                subfold_dataset = merged_folds_from_source.select(
                    list(range(target_fold_start, target_fold_end))).flatten_indices()
if key == "train":
self.train_dataset = subfold_dataset
elif key == "validation":
self.eval_dataset = subfold_dataset
else:
self.test_dataset = subfold_dataset
else:
self.train_dataset, self.eval_dataset, self.test_dataset \
= data_encoded[self._train_name], data_encoded[self._dev_name], data_encoded[self._test_name]
def _load_model(self,
checkpoint_path=None,
per_model_config=None):
from transformers import AutoConfig
from .huggingface.switch_head_auto import AutoSeqClassificationHead, MODEL_CLASSIFICATION_HEAD_MAPPING
this_task = get_default_task(self.jobid_config.dat,
self.jobid_config.subdat)
if this_task == "seq-classification":
self._num_labels = len(self.train_dataset.features["label"].names)
elif this_task == "regression":
self._num_labels = 1
if not checkpoint_path:
checkpoint_path = self.jobid_config.pre_full
def get_this_model():
from transformers import AutoModelForSequenceClassification
return AutoModelForSequenceClassification.from_pretrained(checkpoint_path, config=model_config)
def is_pretrained_model_in_classification_head_list():
return self.jobid_config.pre in MODEL_CLASSIFICATION_HEAD_MAPPING.keys()
def _set_model_config():
if per_model_config and len(per_model_config) > 0:
model_config = AutoConfig.from_pretrained(
checkpoint_path,
num_labels=model_config_num_labels,
**per_model_config)
else:
model_config = AutoConfig.from_pretrained(
checkpoint_path,
num_labels=model_config_num_labels)
return model_config
if this_task == "seq-classification":
num_labels_old = AutoConfig.from_pretrained(checkpoint_path).num_labels
if is_pretrained_model_in_classification_head_list():
model_config_num_labels = num_labels_old
else:
model_config_num_labels = self._num_labels
model_config = _set_model_config()
if is_pretrained_model_in_classification_head_list():
# TODO coverage
if self._num_labels != num_labels_old:
this_model = get_this_model()
model_config.num_labels = self._num_labels
this_model.num_labels = self._num_labels
this_model.classifier = AutoSeqClassificationHead \
.from_model_type_and_config(self.jobid_config.pre,
model_config)
else:
this_model = get_this_model()
else:
this_model = get_this_model()
this_model.resize_token_embeddings(len(self._tokenizer))
return this_model
elif this_task == "regression":
# TODO add test
model_config_num_labels = 1
model_config = _set_model_config()
this_model = get_this_model()
return this_model
def _get_metric_func(self):
data_name = JobID.dataset_list_to_str(self.jobid_config.dat)
if data_name in ("glue", "super_glue"):
metric = datasets.load.load_metric(data_name, self.jobid_config.subdat)
# TODO delete
elif data_name in ("squad", "squad_v2"):
metric = datasets.load.load_metric(data_name)
else:
metric = datasets.load.load_metric(self.metric_name)
return metric
def _compute_metrics_by_dataset_name(self,
eval_pred):
# TODO coverage
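        # e.g., for seq-classification eval_pred is a pair (logits of shape (n, num_labels),
        # labels of shape (n,)); np.argmax converts the logits to class predictions before
        # scoring, while regression predictions are squeezed to shape (n,)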
predictions, labels = eval_pred
predictions = np.squeeze(predictions) \
if self.task_name == "regression" else np.argmax(predictions, axis=1)
metric_func = self._get_metric_func()
return metric_func.compute(predictions=predictions, references=labels)
def _compute_checkpoint_freq(self,
num_train_epochs,
batch_size):
# TODO coverage
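        # for illustration: with 8,000 training examples, batch_size = 32, num_train_epochs = 3,
        # one GPU and ckpt_per_epoch = 1, a checkpoint is saved every
        # int(min(3, 1) * 8000 / 32 / 1 / 1) + 1 = 251 steps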
if "gpu" in self._resources_per_trial:
ckpt_step_freq = int(min(num_train_epochs, 1) * len(self.train_dataset) / batch_size
/ self._resources_per_trial["gpu"] / self.ckpt_per_epoch) + 1
else:
ckpt_step_freq = int(min(num_train_epochs, 1) * len(self.train_dataset) / batch_size
/ self._resources_per_trial["cpu"] / self.ckpt_per_epoch) + 1
return ckpt_step_freq
@staticmethod
def _separate_config(config):
training_args_config = {}
per_model_config = {}
for key in config.keys():
if key in TrainingArguments.__dict__.keys():
training_args_config[key] = config[key]
else:
per_model_config[key] = config[key]
return training_args_config, per_model_config
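    # e.g., {"learning_rate": 1e-5, "attention_probs_dropout_prob": 0.1} is split into
    # training_args_config = {"learning_rate": 1e-5} and per_model_config =
    # {"attention_probs_dropout_prob": 0.1}, because only the former is an attribute of
    # transformers.TrainingArguments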
def _objective(self, config, reporter, checkpoint_dir=None):
# TODO add test
from transformers.trainer_utils import set_seed
self._set_transformers_verbosity(self._transformers_verbose)
def model_init():
return self._load_model()
set_seed(config["seed"])
training_args_config, per_model_config = AutoTransformers._separate_config(config)
this_model = self._load_model(per_model_config=per_model_config)
trial_id = reporter.trial_id
self.path_utils.make_dir_per_trial(trial_id)
ckpt_freq = self._compute_checkpoint_freq(
num_train_epochs=config["num_train_epochs"],
batch_size=config["per_device_train_batch_size"])
assert self.path_utils.ckpt_dir_per_trial
if transformers.__version__.startswith("3"):
training_args = TrainingArguments(
output_dir=self.path_utils.ckpt_dir_per_trial,
do_eval=True,
per_device_eval_batch_size=32,
eval_steps=ckpt_freq,
evaluate_during_training=True,
save_steps=ckpt_freq,
save_total_limit=0,
fp16=self._fp16,
**training_args_config,
)
else:
from transformers import IntervalStrategy
training_args = TrainingArguments(
output_dir=self.path_utils.ckpt_dir_per_trial,
do_eval=True,
per_device_eval_batch_size=32,
eval_steps=ckpt_freq,
evaluation_strategy=IntervalStrategy.STEPS,
save_steps=ckpt_freq,
save_total_limit=0,
fp16=self._fp16,
**training_args_config,
)
trainer = TrainerForAutoTransformers(
model=this_model,
args=training_args,
model_init=model_init,
train_dataset=self.train_dataset,
eval_dataset=self.eval_dataset,
tokenizer=self._tokenizer,
compute_metrics=self._compute_metrics_by_dataset_name,
)
trainer.trial_id = reporter.trial_id
"""
create a wandb run. If os.environ["WANDB_MODE"] == "offline", run = None
"""
if self.wandb_utils:
run = self.wandb_utils.set_wandb_per_trial()
import wandb
for each_hp in config:
wandb.log({each_hp: config[each_hp]})
else:
run = None
trainer.train()
trainer.evaluate(self.eval_dataset)
"""
If a wandb run was created, close the run after train and evaluate finish
"""
if run:
run.finish()
def _verify_init_config(self,
**custom_hpo_args):
for key in custom_hpo_args.keys():
if key == "points_to_evaluate":
for each_init_config in custom_hpo_args[key]:
for each_hp in each_init_config.keys():
assert each_hp in self._search_space_hpo.keys(), \
"points_to_evaluate hp must be within the search space"
                    assert isinstance(each_init_config[each_hp], (int, float, str, bool)), \
                        "points_to_evaluate values must be scalars (int, float, str or bool)"
                    assert isinstance(self._search_space_hpo[each_hp],
                                      (ray.tune.sample.Categorical,
                                       ray.tune.sample.Float,
                                       ray.tune.sample.Integer)), \
                        "Every hp space must either be categorical, integer or float"
                    if isinstance(self._search_space_hpo[each_hp], ray.tune.sample.Categorical):
                        assert each_init_config[each_hp] in self._search_space_hpo[each_hp].categories, \
                            "points_to_evaluate {} value must be within the search space".format(each_hp)
                    else:
                        assert self._search_space_hpo[each_hp].lower <= each_init_config[each_hp] <= \
                            self._search_space_hpo[each_hp].upper, \
                            "points_to_evaluate {} value must be within the search space".format(each_hp)
def _get_search_algo(self,
search_algo_name,
search_algo_args_mode,
time_budget,
metric_name,
metric_mode_name,
**custom_hpo_args):
from .hpo.searchalgo_auto import AutoSearchAlgorithm
if search_algo_name in ("bs", "cfo"):
self._verify_init_config(**custom_hpo_args)
search_algo = AutoSearchAlgorithm.from_method_name(
search_algo_name,
search_algo_args_mode,
self._search_space_hpo,
time_budget,
metric_name,
metric_mode_name,
**custom_hpo_args)
return search_algo
@staticmethod
def _recover_checkpoint(tune_checkpoint_dir):
assert tune_checkpoint_dir
# Get subdirectory used for Huggingface.
subdirs = [
os.path.join(tune_checkpoint_dir, name)
for name in os.listdir(tune_checkpoint_dir)
if os.path.isdir(os.path.join(tune_checkpoint_dir, name))
]
# There should only be 1 subdir.
assert len(subdirs) == 1, subdirs
return subdirs[0]
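    # e.g., a ray tune checkpoint directory usually wraps a single huggingface checkpoint
    # subdirectory such as "checkpoint-500", whose full path is returned here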
def _save_ckpt_json(self,
best_ckpt):
json.dump({"best_ckpt": best_ckpt},
open(os.path.join(self.path_utils.result_dir_per_run,
"save_ckpt_" + self.jobid_config.to_jobid_string() + ".json"), "w"))
def _save_output_metric(self,
output_metrics):
json.dump(output_metrics, open(
os.path.join(self.path_utils.result_dir_per_run,
"output_metric_" + self.jobid_config.to_jobid_string() + ".json"), "w"))
def _load_ckpt_json(self,
ckpt_dir=None,
**kwargs):
if not ckpt_dir:
ckpt_dir = os.path.join(self.path_utils.result_dir_per_run,
"save_ckpt_" + self.jobid_config.to_jobid_string() + ".json")
try:
ckpt_json = json.load(open(ckpt_dir))
return ckpt_json["best_ckpt"]
except FileNotFoundError as err:
print("Saved checkpoint not found. Please make sure checkpoint is stored under {}".format(ckpt_dir))
raise err
def _set_metric(self, custom_metric_name=None, custom_metric_mode_name=None):
from .dataset.metric_auto import get_default_and_alternative_metric
from .utils import _variable_override_default_alternative
default_metric, default_mode, all_metrics, all_modes = \
get_default_and_alternative_metric(
dataset_name_list=self.jobid_config.dat,
subdataset_name=self.jobid_config.subdat,
custom_metric_name=custom_metric_name,
custom_metric_mode_name=custom_metric_mode_name)
_variable_override_default_alternative(self,
"metric_name",
default_metric,
all_metrics,
custom_metric_name)
_variable_override_default_alternative(self,
"metric_mode_name",
default_mode,
all_modes,
custom_metric_mode_name)
self._all_metrics = all_metrics
self._all_modes = all_modes
def _set_task(self):
self.task_name = get_default_task(self.jobid_config.dat,
self.jobid_config.subdat)
def fit_hf(self,
resources_per_trial,
num_samples,
time_budget,
custom_metric_name=None,
custom_metric_mode_name=None,
_fp16=True,
**custom_hpo_args
):
        '''Fine-tune the huggingface model using HF's transformers.Trainer.hyperparameter_search API
        (for comparative purposes only).
        Transformers.hyperparameter_search has the following disadvantages:
        (1) it does not return a ray.tune Analysis object, so the analysis results cannot be examined;
(2) it is inconvenient to develop on top of Transformers.hyperparameter_search, whose trainable function,
search space, etc. are defined inside of Transformers.hyperparameter_search.
An example:
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
            validation_metric = autohf.fit_hf(**autohf_settings)
Args:
resources_per_trial:
A dict showing the resources used by each trial,
e.g., {"gpu": 4, "cpu": 4}
num_samples:
An int variable of the maximum number of trials
time_budget:
An int variable of the maximum time budget
custom_metric_name:
A string of the dataset name or a function,
e.g., 'accuracy', 'f1', 'loss',
custom_metric_mode_name:
A string of the mode name,
e.g., "max", "min", "last", "all"
            _fp16:
                A boolean, default = True | whether to use fp16
custom_hpo_args:
The additional keyword arguments, e.g.,
custom_hpo_args = {"points_to_evaluate": [{
"num_train_epochs": 1,
"per_device_train_batch_size": 128, }]}
Returns:
validation_metric:
a dict storing the validation score
        '''
        # TODO remove?
        from transformers.trainer_utils import HPSearchBackend
def model_init():
return self._load_model()
def ray_hp_space(trial):
return {
"learning_rate": ray.tune.loguniform(1e-6, 1e-4),
"num_train_epochs": ray.tune.choice(list(range(1, 6))),
"seed": ray.tune.quniform(1, 41, 1),
"per_device_train_batch_size": ray.tune.choice([4, 8, 16, 32, 64]),
}
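        # note: ray_hp_space above is an illustrative search space; it stays unused as long as the
        # hp_space argument is commented out in the hyperparameter_search call below, in which case
        # the backend falls back to its default ray search space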
self._set_metric(custom_metric_name, custom_metric_mode_name)
self._set_task()
training_args = TrainingArguments(
output_dir=self.path_utils.hpo_ckpt_path,
fp16=_fp16,
)
this_model = self._load_model()
trainer = TrainerForAutoTransformers(
this_model,
training_args,
model_init=model_init,
train_dataset=self.train_dataset,
eval_dataset=self.eval_dataset,
tokenizer=self._tokenizer,
compute_metrics=self._compute_metrics_by_dataset_name,
)
self.path_utils.make_dir_per_run()
start_time = time.time()
best_run = trainer.hyperparameter_search(
n_trials=num_samples,
time_budget_s=time_budget,
# hp_space=ray_hp_space,
backend=HPSearchBackend.RAY,
resources_per_trial=resources_per_trial)
duration = time.time() - start_time
self.last_run_duration = duration
print("Total running time: {} seconds".format(duration))
hp_dict = best_run.hyperparameters
hp_dict["seed"] = int(hp_dict["seed"])
best_training_args = TrainingArguments(
output_dir=self.path_utils.hpo_ckpt_path,
fp16=_fp16,
**hp_dict,
)
best_trainer = TrainerForAutoTransformers(
this_model,
best_training_args,
model_init=model_init,
train_dataset=self.train_dataset,
eval_dataset=self.eval_dataset,
tokenizer=self._tokenizer,
compute_metrics=self._compute_metrics_by_dataset_name,
)
best_model_checkpoint_path = os.path.join(self.path_utils.hpo_ckpt_path, "hpo_hf")
if not os.path.exists(best_model_checkpoint_path):
os.mkdir(best_model_checkpoint_path)
best_trainer.train()
best_trainer.save_model(best_model_checkpoint_path)
self._save_ckpt_json(best_model_checkpoint_path)
validation_metric = best_trainer.evaluate()
return validation_metric
def _set_transformers_verbosity(self, transformers_verbose):
# TODO coverage
if transformers_verbose == transformers.logging.ERROR:
transformers.logging.set_verbosity_error()
elif transformers_verbose == transformers.logging.WARNING:
transformers.logging.set_verbosity_warning()
elif transformers_verbose == transformers.logging.INFO:
transformers.logging.set_verbosity_info()
elif transformers_verbose == transformers.logging.DEBUG:
transformers.logging.set_verbosity_debug()
        else:
            raise ValueError("transformers_verbose must be set to transformers.logging.ERROR, "
                             "transformers.logging.WARNING, transformers.logging.INFO or "
                             "transformers.logging.DEBUG")
def fit(self,
num_samples,
time_budget,
custom_metric_name=None,
custom_metric_mode_name=None,
ckpt_per_epoch=1,
fp16=True,
ray_verbose=1,
transformers_verbose=10,
resources_per_trial=None,
ray_local_mode=False,
**custom_hpo_args):
"""Fine tuning the huggingface using the hpo setting
Example:
.. code-block:: python
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": 1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
Args:
resources_per_trial:
A dict showing the resources used by each trial,
e.g., {"gpu": 4, "cpu": 4}
num_samples:
An int variable of the maximum number of trials
time_budget:
An int variable of the maximum time budget
custom_metric_name:
A string of the dataset name or a function,
e.g., 'accuracy', 'f1', 'loss'
custom_metric_mode_name:
A string of the mode name,
e.g., "max", "min", "last", "all"
ckpt_per_epoch:
An integer value of number of checkpoints per epoch, default = 1
            ray_verbose:
                An integer, default = 1 | verbosity of ray
            transformers_verbose:
                An integer, default = 10 (transformers.logging.DEBUG) | verbosity of transformers,
                must be one of transformers.logging.ERROR, transformers.logging.WARNING,
                transformers.logging.INFO or transformers.logging.DEBUG
fp16:
A boolean, default = True | whether to use fp16
ray_local_mode:
A boolean, default = False | whether to use the local mode (debugging mode) for ray tune.run
custom_hpo_args:
The additional keyword arguments, e.g., custom_hpo_args = {"points_to_evaluate": [{
"num_train_epochs": 1, "per_device_train_batch_size": 128, }]}
Returns:
validation_metric: A dict storing the validation score
analysis: A ray.tune.analysis.Analysis object storing the analysis results from tune.run
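        A minimal end-to-end sketch (assuming prepare_data has already been called with a valid
        jobid_config; the budget values below are illustrative only):
        .. code-block:: python
            validation_metric, analysis = autohf.fit(num_samples=4,
                                                     time_budget=300,
                                                     resources_per_trial={"cpu": 1, "gpu": 1})
            predictions, test_metric = autohf.predict()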
"""
from .hpo.scheduler_auto import AutoScheduler
self._transformers_verbose = transformers_verbose
"""
Specify the other parse of jobid configs from custom_hpo_args, e.g., if the search algorithm was not specified
previously, can specify the algorithm here
"""
if len(custom_hpo_args) > 0:
self.jobid_config.set_jobid_from_console_args(console_args=custom_hpo_args)
self._resources_per_trial = resources_per_trial
self._set_metric(custom_metric_name, custom_metric_mode_name)
self._set_task()
self._fp16 = fp16
ray.shutdown()
ray.init(local_mode=ray_local_mode)
self._set_search_space(**custom_hpo_args)
search_algo = self._get_search_algo(self.jobid_config.alg,
self.jobid_config.arg,
time_budget,
self.metric_name,
self.metric_mode_name,
**custom_hpo_args)
scheduler = AutoScheduler.from_scheduler_name(self.jobid_config.pru)
self.ckpt_per_epoch = ckpt_per_epoch
self.path_utils.make_dir_per_run()
assert self.path_utils.ckpt_dir_per_run
start_time = time.time()
tune_config = self._search_space_hpo
tune_config["seed"] = self.jobid_config.sdhf
analysis = ray.tune.run(
self._objective,
metric=self.metric_name,
mode=self.metric_mode_name,
name="ray_result",
resources_per_trial=resources_per_trial,
config=tune_config,
verbose=ray_verbose,
local_dir=self.path_utils.ckpt_dir_per_run,
num_samples=num_samples,
time_budget_s=time_budget,
keep_checkpoints_num=1,
scheduler=scheduler,
search_alg=search_algo,
)
duration = time.time() - start_time
self.last_run_duration = duration
print("Total running time: {} seconds".format(duration))
ray.shutdown()
best_trial = analysis.get_best_trial(scope="all", metric=self.metric_name, mode=self.metric_mode_name)
validation_metric = {"eval_" + self.metric_name
: best_trial.metric_analysis[self.metric_name][self.metric_mode_name]}
for x in range(len(self._all_metrics)):
validation_metric["eval_" + self._all_metrics[x]] \
= best_trial.metric_analysis[self._all_metrics[x]][self._all_modes[x]]
get_best_ckpt = analysis.get_best_checkpoint(best_trial, metric=self.metric_name, mode=self.metric_mode_name)
best_ckpt = AutoTransformers._recover_checkpoint(get_best_ckpt)
self._save_ckpt_json(best_ckpt)
return validation_metric, analysis
def predict(self,
ckpt_json_dir=None,
**kwargs):
'''Predict label for test data.
An example:
predictions, test_metric = autohf.predict()
Args:
ckpt_json_dir:
the checkpoint for the fine-tuned huggingface if you wish to override
the saved checkpoint in the training stage under self.path_utils._result_dir_per_run
        Returns:
            predictions:
                A numpy array of shape n * 1 -- each element is a predicted class label for an instance
            output_metric:
                A dict of the test metric when the split mode is "rspt" (labels available), otherwise None
        '''
best_checkpoint = self._load_ckpt_json(ckpt_json_dir, **kwargs)
best_model = self._load_model(checkpoint_path=best_checkpoint)
training_args = TrainingArguments(per_device_eval_batch_size=1,
output_dir=self.path_utils.result_dir_per_run)
test_trainer = TrainerForAutoTransformers(best_model, training_args)
if self.jobid_config.spt == "ori":
# TODO add test
if "label" in self.test_dataset.features.keys():
self.test_dataset.remove_columns_("label")
print("Cleaning the existing label column from test data")
test_dataloader = test_trainer.get_test_dataloader(self.test_dataset)
predictions, labels, _ = test_trainer.prediction_loop(test_dataloader, description="Prediction")
predictions = np.squeeze(predictions) \
if get_default_task(self.jobid_config.dat,
self.jobid_config.subdat) == "regression" \
else np.argmax(predictions, axis=1)
if self.jobid_config.spt == "rspt":
assert labels is not None
metric = self._get_metric_func()
output_metric = metric.compute(predictions=predictions, references=labels)
self._save_output_metric(output_metric)
return predictions, output_metric
else:
return predictions, None
def output_prediction(self,
predictions=None,
output_prediction_path=None,
output_zip_file_name=None):
"""
When using the original GLUE split, output the prediction on test data,
and prepare the .zip file for submission
Example:
local_archive_path = self.autohf.output_prediction(predictions,
output_prediction_path= self.console_args.data_root_dir + "result/",
output_zip_file_name=azure_save_file_name)
Args:
predictions:
A list of predictions, which is the output of AutoTransformers.predict()
output_prediction_path:
Output path for the prediction
            output_zip_file_name:
                A string, which is the name of the output zip file
Returns:
The path of the output .zip file
"""
from .dataset.submission_auto import auto_output_prediction
return auto_output_prediction(self.jobid_config.dat,
output_prediction_path,
output_zip_file_name,
predictions,
self.train_dataset,
self._dev_name,
self.jobid_config.subdat)