diff --git a/python/nn4k/nn4k/executor/base.py b/python/nn4k/nn4k/executor/base.py
index 00294797..2abf643c 100644
--- a/python/nn4k/nn4k/executor/base.py
+++ b/python/nn4k/nn4k/executor/base.py
@@ -71,7 +71,7 @@ class NNExecutor(ABC):
             f"{self.__class__.__name__} does not support batch inference."
         )
 
-    def inference(self, data, args=None, **kwargs):
+    def inference(self, inputs, **kwargs):
         """
         The entry point of inference. Usually for local invokers or model services.
         """
@@ -248,3 +248,103 @@ class NNAdapterModelArgs(NNModelArgs):
 
     def __post_init__(self):
         super().__post_init__()
+
+
+@dataclass
+class NNInferenceArgs:
+    max_input_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Controls the maximum input length used by the truncation/padding parameters. "
+            "In HuggingFace executors, this is known as max_length in the tokenize call config."
+        },
+    )
+    max_output_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum number of tokens to generate. In HuggingFace executors, this arg will be treated as "
+            "max_new_tokens."
+        },
+    )
+    return_input_text: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether to return input texts together with output texts."},
+    )
+    stop_sequence: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Generation will stop when the stop sequence is encountered in the output."
+        },
+    )
+    do_sample: bool = field(
+        default=False,
+        metadata={
+            "help": "If False, generation runs in greedy search mode; otherwise tokens are sampled."
+        },
+    )
+    temperature: float = field(
+        default=1.0,
+        metadata={"help": "Sampling temperature; higher values give more creative and diverse output."},
+    )
+    top_k: Optional[int] = field(
+        default=50,
+        metadata={
+            "help": "In top-k sampling, the model only samples from the top_k (count) most probable "
+            "tokens."
+        },
+    )
+    top_p: Optional[float] = field(
+        default=1.0,
+        metadata={
+            "help": "In nucleus sampling, the model only samples from the smallest token set whose cumulative probability exceeds top_p (percentage)."
+        },
+    )
+    repetition_penalty: Optional[float] = field(
+        default=1.0,
+        metadata={"help": "Repetition penalty; the default of 1.0 means no penalty."},
+    )
+
+    generate_config: dict = field(
+        default_factory=lambda: {},
+        metadata={"help": "Config dict that will be used in model generation."},
+    )
+
+    tokenize_return_tensors: str = field(
+        default="pt",
+        metadata={
+            "help": "Tokenizer return tensor type; will be merged into tokenize_config and passed into the tokenize function."
+        },
+    )
+    tokenize_config: dict = field(
+        default_factory=lambda: {},
+        metadata={
+            "help": "Tokenize function config; will be passed into the tokenize function."
+        },
+    )
+
+    decode_config: dict = field(
+        default_factory=lambda: {},
+        metadata={"help": "Configs to be passed into the tokenizer.decode function."},
+    )
+
+    def update_if_not_none(self, from_key, to_dict, to_key=None):
+        to_key = to_key or from_key
+        from_value = self.__getattribute__(from_key)
+        value_in_to_dict = self.__getattribute__(to_dict).get(to_key, None)
+        if value_in_to_dict is None and from_value is not None:
+            self.__getattribute__(to_dict)[to_key] = from_value
+
+    def __post_init__(self):
+        # merge generation args
+        self.update_if_not_none("max_output_length", "generate_config")
+        self.update_if_not_none("do_sample", "generate_config")
+        self.update_if_not_none("temperature", "generate_config")
+        self.update_if_not_none("top_k", "generate_config")
+        self.update_if_not_none("top_p", "generate_config")
+        self.update_if_not_none("repetition_penalty", "generate_config")
"generate_config") + + # merging tokenize args + self.update_if_not_none("max_input_length", "tokenize_config", "max_length") + self.update_if_not_none( + "tokenize_return_tensors", "tokenize_config", "return_tensors" + ) diff --git a/python/nn4k/nn4k/executor/huggingface/base/hf_args.py b/python/nn4k/nn4k/executor/huggingface/base/hf_args.py index e93ef4ab..419b8304 100644 --- a/python/nn4k/nn4k/executor/huggingface/base/hf_args.py +++ b/python/nn4k/nn4k/executor/huggingface/base/hf_args.py @@ -15,6 +15,7 @@ from typing import Optional from transformers import TrainingArguments from nn4k.executor import NNAdapterModelArgs +from nn4k.executor.base import NNInferenceArgs @dataclass @@ -52,6 +53,13 @@ class HFModelArgs(NNAdapterModelArgs): "help": " Load the model weights from a TensorFlow checkpoint save file, default to False" }, ) + padding_side: Optional[str] = field( + default=None, + metadata={ + "help": "Padding side of the tokenizer when padding batch inputs", + "choices": [None, "left", "right"], + }, + ) def __post_init__(self): super().__post_init__() @@ -105,3 +113,45 @@ class HFSftArgs(HFModelArgs, TrainingArguments): print( f"a eval_dataset_path is set but do_eval flag is not set, automatically set do_eval to True" ) + + +@dataclass +class HFInferArgs(NNInferenceArgs): + delete_heading_new_lines: bool = field( + default=False, + metadata={ + "help": "an additional question mark or new line marks sometimes occurs at the beginning of output." + "Try to get rid of these marks by setting this parameter to True. Different model may have different " + "behavior, please check the result carefully." + }, + ) + + tokenize_config: dict = field( + default_factory=lambda: { + "add_special_tokens": False, + "padding": False, + "truncation": False, + }, + metadata={ + "help": "padding: https://huggingface.co/docs/transformers/pad_truncation#padding-and-truncation" + }, + ) + + decode_config: dict = field( + default_factory=lambda: { + "skip_special_tokens": True, + "clean_up_tokenization_spaces": True, + }, + metadata={ + "help": "check https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__" + }, + ) + + def __post_init__(self): + super().__post_init__() + + # HF specific map + self.update_if_not_none( + "max_output_length", "generate_config", "max_new_tokens" + ) + self.update_if_not_none("max_input_length", "tokenize_config", "max_length") diff --git a/python/nn4k/nn4k/executor/huggingface/base/hf_llm_executor.py b/python/nn4k/nn4k/executor/huggingface/base/hf_llm_executor.py index d86ee6cb..5be7ec4b 100644 --- a/python/nn4k/nn4k/executor/huggingface/base/hf_llm_executor.py +++ b/python/nn4k/nn4k/executor/huggingface/base/hf_llm_executor.py @@ -18,8 +18,9 @@ from torch.utils.data import Dataset from transformers import AutoConfig, AutoTokenizer, Trainer from nn4k.executor import LLMExecutor -from .hf_args import HFSftArgs, HFModelArgs +from .hf_args import HFInferArgs, HFSftArgs, HFModelArgs from nn4k.executor.huggingface.nn_hf_trainer import NNHFTrainer +from nn4k.utils.args_utils import ArgsUtils class HFLLMExecutor(LLMExecutor): @@ -188,57 +189,85 @@ class HFLLMExecutor(LLMExecutor): if self.model_mode == mode and self._model is not None: return + args = args or self._init_args + from transformers import HfArgumentParser from nn4k.executor.huggingface import HFModelArgs parser = HfArgumentParser(HFModelArgs) + hf_model_args: HFModelArgs hf_model_args, *_ = parser.parse_dict(args, allow_extra_keys=True) self.model_mode = mode 
         self._tokenizer = self._hf_tokenizer_loader(hf_model_args)
         self._model = self._hf_model_loader(
-            hf_model_args, mode, hf_model_args.nn_device
+            args=hf_model_args, mode=mode, device=hf_model_args.nn_device
         )
 
         if self.tokenizer.eos_token_id is None:
             self.tokenizer.eos_token_id = self.model.config.eos_token_id
         if self.tokenizer.pad_token_id is None:
             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        if hf_model_args.padding_side is not None:
+            self.tokenizer.padding_side = hf_model_args.padding_side
+
+    def inference(self, inputs, **kwargs):
+        infer_args = ArgsUtils.update_args(self.init_args, kwargs)
+
+        from transformers import HfArgumentParser
+
+        parser = HfArgumentParser(HFInferArgs)
+        hf_infer_args: HFInferArgs
+        hf_infer_args, *_ = parser.parse_dict(infer_args, allow_extra_keys=True)
 
-    def inference(
-        self,
-        data,
-        max_input_length: int = 1024,
-        max_output_length: int = 1024,
-        do_sample: bool = False,
-        **kwargs,
-    ):
         model = self.model
         tokenizer = self.tokenizer
+
         input_ids = tokenizer(
-            data,
-            padding=True,
-            return_token_type_ids=False,
-            return_tensors="pt",
-            truncation=True,
-            max_length=max_input_length,
+            inputs,
+            **hf_infer_args.tokenize_config,
         ).to(model.device)
+
+        if hf_infer_args.stop_sequence is not None:
+            stop_sequence = hf_infer_args.stop_sequence
+            stop_sequence_ids = self.tokenizer.encode(
+                stop_sequence, add_special_tokens=False
+            )
+            if len(stop_sequence_ids) > 1:
+                print(  # TODO: use logger instead
+                    "Warning: stopping on a multi-token sequence is not yet supported by transformers. "
+                    "Only the first token of the stop sequence will be used as the stop token in the interim."
+                )
+            hf_infer_args.generate_config["eos_token_id"] = stop_sequence_ids[0]
+
         output_ids = model.generate(
             **input_ids,
-            max_new_tokens=max_output_length,
-            do_sample=do_sample,
-            eos_token_id=tokenizer.eos_token_id,
-            pad_token_id=tokenizer.pad_token_id,
-            **kwargs,
+            **hf_infer_args.generate_config,
         )
-        outputs = [
-            tokenizer.decode(
-                output_id[len(input_ids["input_ids"][idx]) :], skip_special_tokens=True
+        output_texts = []
+        for idx, output_id in enumerate(output_ids):
+            if not hf_infer_args.return_input_text:
+                output_id = output_id[len(input_ids["input_ids"][idx]) :]
+            output_text = self.tokenizer.decode(
+                output_id, **hf_infer_args.decode_config
             )
-            for idx, output_id in enumerate(output_ids)
-        ]
-        return outputs
+
+            if (
+                not hf_infer_args.return_input_text
+                and hf_infer_args.delete_heading_new_lines
+            ):
+                import re
+
+                match = re.search("(\\n)+", output_text)
+                if match is not None:
+                    start_index = match.end()
+                    if start_index < len(output_text) - 1:
+                        output_text = output_text[start_index:]
+
+            output_texts.append(output_text)
+
+        return output_texts
 
     @abstractmethod
     def _hf_model_loader(
diff --git a/python/nn4k/nn4k/executor/huggingface/default_config/decodeonly/local_infer.json5 b/python/nn4k/nn4k/executor/huggingface/default_config/decodeonly/local_infer.json5
new file mode 100644
index 00000000..5b0c66ce
--- /dev/null
+++ b/python/nn4k/nn4k/executor/huggingface/default_config/decodeonly/local_infer.json5
@@ -0,0 +1,14 @@
+
+{
+    // -- base model info
+    "nn_model_path": "/Path/to/model_dir", // local model path
+    "nn_invoker": "nn4k.invoker.base.LLMInvoker", // invoker to use
+    "nn_executor": "nn4k.executor.huggingface.hf_decode_only_executor.HFDecodeOnlyExecutor", // executor to use
+    // the following are optional
+    "adapter_name": "adapter_name", // adapter_name must be given to enable the adapter; adapter_path alone has no effect!
+    "adapter_path": "/path/to/adapter",
+    "generate_config": {
+        "temperature": 0.2,
+        "do_sample": true
+    }
+}
diff --git a/python/nn4k/nn4k/executor/huggingface/default_config/decodeonly/task_entry.py b/python/nn4k/nn4k/executor/huggingface/default_config/decodeonly/task_entry.py
index 3a8808ed..87a365b0 100644
--- a/python/nn4k/nn4k/executor/huggingface/default_config/decodeonly/task_entry.py
+++ b/python/nn4k/nn4k/executor/huggingface/default_config/decodeonly/task_entry.py
@@ -13,9 +13,22 @@ from nn4k.invoker.base import NNInvoker
 
 
 def main():
-    NNInvoker.from_config("local_sft.json5").local_sft()
-    # Inference example, not implemented yet.
-    # NNInvoker.from_config("inferece_args.json").local_inference("你是谁")
+    # example for local SFT
+    # NNInvoker.from_config("local_sft.json5").local_sft()
+
+    # example for local inference
+    invoker = NNInvoker.from_config("local_infer.json5")
+    answer = invoker.local_inference(
+        "What could LLM do for human?",
+        tokenize_config={"padding": True},
+        delete_heading_new_lines=True,
+    )
+    # Reuse the invoker to avoid loading the model every time; it already loaded the model on the first call.
+    answer2 = invoker.local_inference(
+        "What could LLM do for a programmer",
+        tokenize_config={"padding": True},
+        delete_heading_new_lines=True,
+    )
 
 
 if __name__ == "__main__":
diff --git a/python/nn4k/nn4k/executor/huggingface/hf_embedding_executor.py b/python/nn4k/nn4k/executor/huggingface/hf_embedding_executor.py
index 34047258..96d19097 100644
--- a/python/nn4k/nn4k/executor/huggingface/hf_embedding_executor.py
+++ b/python/nn4k/nn4k/executor/huggingface/hf_embedding_executor.py
@@ -53,7 +53,7 @@ class HFEmbeddingExecutor(LLMExecutor):
         )
         self._model = model
 
-    def inference(self, data, args=None, **kwargs):
+    def inference(self, inputs, **kwargs):
         model = self.model
-        embeddings = model.encode(data)
+        embeddings = model.encode(inputs)
         return embeddings
diff --git a/python/nn4k/nn4k/invoker/base.py b/python/nn4k/nn4k/invoker/base.py
index 6358acc7..86da1456 100644
--- a/python/nn4k/nn4k/invoker/base.py
+++ b/python/nn4k/nn4k/invoker/base.py
@@ -14,7 +14,10 @@ from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Union
 
-from nn4k.executor import NNExecutor
+from nn4k.utils.class_importing import dynamic_import_class
+
+from nn4k.executor import LLMExecutor
+from nn4k.utils.args_utils import ArgsUtils
 
 
 class SubmitMode(Enum):
@@ -36,6 +39,7 @@ class NNInvoker(ABC):
     def __init__(self, init_args: dict, **kwargs):
         self._init_args = init_args
         self._kwargs = kwargs
+        self.inference_warmed_up = False
 
     @property
     def init_args(self):
@@ -71,7 +75,6 @@ class NNInvoker(ABC):
         from nn4k.consts import NN_INVOKER_KEY, NN_INVOKER_TEXT
         from nn4k.utils.config_parsing import preprocess_config
         from nn4k.utils.config_parsing import get_string_field
-        from nn4k.utils.class_importing import dynamic_import_class
 
         nn_config = preprocess_config(nn_config)
         nn_invoker = nn_config.get(NN_INVOKER_KEY)
@@ -141,9 +144,7 @@ class LLMInvoker(NNInvoker):
         raise NotImplementedError(f"{self.__class__.__name__} does not support SFT.")
 
     def local_sft(self, args: dict = None):
-        sft_args = copy.deepcopy(self.init_args)
-        args = args or {}
-        sft_args.update(args)
+        sft_args = ArgsUtils.update_args(self.init_args, args)
 
         from nn4k.executor import LLMExecutor
 
@@ -161,29 +162,46 @@
         """
         Implement local inference for local invoker.
""" - return self._nn_executor.inference(data, **kwargs) + args = ArgsUtils.handle_dict_config(kwargs) + + if not self.inference_warmed_up: + print( + "warming up the model for inference, only happen for the first time..." + ) + self.warmup_local_model() + self.inference_warmed_up = True + print("inference model is warmed up") + + return self._nn_executor.inference(inputs=data, **args) def warmup_local_model(self): """ Implement local model warming up logic for local invoker. """ + nn_config = self.init_args + from nn4k.nnhub import NNHub - from nn4k.consts import NN_EXECUTOR_KEY, NN_EXECUTOR_TEXT from nn4k.consts import NN_NAME_KEY, NN_NAME_TEXT from nn4k.consts import NN_VERSION_KEY, NN_VERSION_TEXT from nn4k.utils.config_parsing import get_string_field - from nn4k.utils.class_importing import dynamic_import_class + from transformers import HfArgumentParser + from nn4k.executor import NNModelArgs - nn_executor = self.init_args.get(NN_EXECUTOR_KEY) + parser = HfArgumentParser(NNModelArgs) + model_args: NNModelArgs + model_args, *_ = parser.parse_dict(self.init_args, allow_extra_keys=True) + + from nn4k.consts import NN_EXECUTOR_KEY, NN_EXECUTOR_TEXT + + nn_executor = nn_config.get(NN_EXECUTOR_KEY) if nn_executor is not None: - nn_executor = get_string_field( - self.init_args, NN_EXECUTOR_KEY, NN_EXECUTOR_TEXT - ) + from nn4k.executor import NNExecutor + executor_class = dynamic_import_class(nn_executor, NN_EXECUTOR_TEXT) if not issubclass(executor_class, NNExecutor): message = "%r is not an %s class" % (nn_executor, NN_EXECUTOR_TEXT) raise RuntimeError(message) - executor = executor_class.from_config(self.init_args) + executor = executor_class.from_config(nn_config) else: nn_name = get_string_field(self.init_args, NN_NAME_KEY, NN_NAME_TEXT) nn_version = self.init_args.get(NN_VERSION_KEY) @@ -193,13 +211,18 @@ class LLMInvoker(NNInvoker): ) hub = NNHub.get_instance() executor = hub.get_model_executor(nn_name, nn_version) - if executor is None: - message = "model %r version %r " % (nn_name, nn_version) - message += "is not found in the model hub" - raise RuntimeError(message) - self._nn_executor: NNExecutor = executor + + if executor is None: + message = "model %r version %r " % ( + model_args.nn_name, + model_args.nn_version, + ) + message += "is not found in the model hub, you should provide a valid nn_executor class path" + raise RuntimeError(message) + self._nn_executor: LLMExecutor = executor self._nn_executor.load_model(mode="inference") self._nn_executor.warmup_inference() + self.inference_warmed_up = True @classmethod def from_config(cls, nn_config: dict) -> "LLMInvoker": diff --git a/python/nn4k/nn4k/utils/args_utils.py b/python/nn4k/nn4k/utils/args_utils.py new file mode 100644 index 00000000..ab69041e --- /dev/null +++ b/python/nn4k/nn4k/utils/args_utils.py @@ -0,0 +1,63 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + + +class ArgsUtils: + CONFIG_FILE_KEY = "config_file" + + @staticmethod + def update_args(base_args: dict, new_args: dict) -> dict: + """ + update an existing args with a new set of args + :param base_args: args to get updated. 
+        :param new_args: args to update the base args with.
+        :rtype: dict
+        """
+        import copy
+
+        copy_base_args = copy.deepcopy(base_args)
+        new_args = new_args or {}
+        copy_base_args.update(new_args)
+        return copy_base_args
+
+    @staticmethod
+    def handle_dict_config(kwargs: dict) -> dict:
+        if "config_file" in kwargs:
+            configs = ArgsUtils.load_config_dict_from_file(kwargs.get("config_file"))
+        else:
+            configs = kwargs
+
+        return configs
+
+    @staticmethod
+    def load_config_dict_from_file(file_path: str) -> dict:
+        from pathlib import Path
+
+        if file_path.endswith(".json"):
+            import json
+
+            with open(Path(file_path), "r", encoding="utf-8") as open_json_file:
+                data = json.load(open_json_file)
+                nn_config = data
+            return nn_config
+        if file_path.endswith(".json5"):
+            import json5
+
+            with open(Path(file_path), "r", encoding="utf-8") as open_json5_file:
+                data = json5.load(open_json5_file)
+                nn_config = data
+            return nn_config
+        from nn4k.utils.io.file_utils import FileUtils
+
+        raise ValueError(
+            f"Config file with extension type {FileUtils.get_extension(file_path)} is not supported. "
+            f"Use json or json5 instead."
+        )
diff --git a/python/nn4k/nn4k/utils/config_parsing.py b/python/nn4k/nn4k/utils/config_parsing.py
index a9063723..8b58c231 100644
--- a/python/nn4k/nn4k/utils/config_parsing.py
+++ b/python/nn4k/nn4k/utils/config_parsing.py
@@ -32,26 +32,9 @@ def preprocess_config(nn_config: Union[str, dict]) -> dict:
     if isinstance(nn_config, dict):
         return nn_config
     elif isinstance(nn_config, str):
-        if nn_config.endswith(".json"):
-            import json
+        from nn4k.utils.args_utils import ArgsUtils
 
-            with open(Path(nn_config), "r", encoding="utf-8") as open_json_file:
-                data = json.load(open_json_file)
-                nn_config = data
-            return nn_config
-        if nn_config.endswith(".json5"):
-            import json5
-
-            with open(Path(nn_config), "r", encoding="utf-8") as open_json5_file:
-                data = json5.load(open_json5_file)
-                nn_config = data
-            return nn_config
-        from nn4k.utils.io.file_utils import FileUtils
-
-        raise ValueError(
-            f"Config file with extension type {FileUtils.get_extension(nn_config)} is not supported."
-            f"use json or json5 instead."
-        )
+        return ArgsUtils.load_config_dict_from_file(nn_config)
     else:
         raise ValueError(
             f"nn_config could be dict or str, {type(nn_config)} is not yet supported."
diff --git a/python/nn4k/tests/executor/executor_test_stub.py b/python/nn4k/tests/executor/executor_test_stub.py
index 09b8a5c4..e1820b4c 100644
--- a/python/nn4k/tests/executor/executor_test_stub.py
+++ b/python/nn4k/tests/executor/executor_test_stub.py
@@ -22,7 +22,7 @@ class StubExecutor(LLMExecutor):
     def warmup_inference(self, args=None, **kwargs):
         pass
 
-    def inference(self, data, args=None, **kwargs):
+    def inference(self, inputs, args=None, **kwargs):
         pass
 
     @classmethod
diff --git a/python/nn4k/tests/invoker/invoker_test_stub.py b/python/nn4k/tests/invoker/invoker_test_stub.py
index c8f61bd9..b96475fd 100644
--- a/python/nn4k/tests/invoker/invoker_test_stub.py
+++ b/python/nn4k/tests/invoker/invoker_test_stub.py
@@ -37,7 +37,7 @@ class StubExecutor(NNExecutor):
     def warmup_inference(self, args=None, **kwargs):
         self.warmup_inference_called = True
 
-    def inference(self, data, args=None, **kwargs):
+    def inference(self, inputs, args=None, **kwargs):
         return self.inference_result
 
     @classmethod
diff --git a/python/nn4k/tests/invoker/test_base_invoker.py b/python/nn4k/tests/invoker/test_base_invoker.py
index 108cad81..8db4b9aa 100644
--- a/python/nn4k/tests/invoker/test_base_invoker.py
+++ b/python/nn4k/tests/invoker/test_base_invoker.py
@@ -89,7 +89,10 @@ class TestBaseInvoker(unittest.TestCase):
     def testLocalLLMInvokerWithCustomExecutor(self):
         from nn4k.invoker import LLMInvoker
 
-        nn_config = {"nn_executor": "invoker_test_stub.StubExecutor"}
+        nn_config = {
+            "nn_model_path": "/path/to/model",
+            "nn_executor": "invoker_test_stub.StubExecutor",
+        }
         invoker = LLMInvoker.from_config(nn_config)
         self.assertTrue(isinstance(invoker, LLMInvoker))
         self.assertEqual(invoker.init_args, nn_config)
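For readers skimming the diff, here is a minimal sketch (not part of the change set) of how the new inference-argument merging introduced above behaves. It assumes nn4k with these classes and transformers are installed, and that HFInferArgs is importable from the module path shown in the diff.

```python
# Illustrative sketch only; not part of this diff.
from nn4k.executor.huggingface.base.hf_args import HFInferArgs

# Scalar fields are folded into the nested config dicts by __post_init__ via
# update_if_not_none: a value is copied only when the target key is not
# already present in the target dict.
infer_args = HFInferArgs(
    max_input_length=1024,
    max_output_length=256,
    do_sample=True,
    temperature=0.7,
)
print(infer_args.generate_config)  # includes max_new_tokens=256, do_sample=True, temperature=0.7, ...
print(infer_args.tokenize_config)  # includes max_length=1024, return_tensors="pt", padding=False, ...

# Keys set explicitly in generate_config take precedence over the scalar fields.
overridden = HFInferArgs(temperature=0.9, generate_config={"temperature": 0.2})
assert overridden.generate_config["temperature"] == 0.2
```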