From 99a6a340478ca6fb70137f6811a1af282796debc Mon Sep 17 00:00:00 2001 From: Malte Pietsch Date: Tue, 14 Jul 2020 18:53:15 +0200 Subject: [PATCH] Upgrade to new FARM / Transformers / PyTorch versions (#212) --- haystack/reader/farm.py | 73 +- haystack/reader/transformers.py | 8 +- haystack/reader/transformers_utils.py | 1912 +++++++++++++++++++++++++ requirements.txt | 2 +- test/conftest.py | 35 + test/test_finder.py | 8 +- test/test_reader.py | 102 +- tutorials/Tutorial5_Evaluation.py | 4 +- 8 files changed, 2083 insertions(+), 61 deletions(-) create mode 100644 haystack/reader/transformers_utils.py diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py index e36aae3f2..a1ca2e10b 100644 --- a/haystack/reader/farm.py +++ b/haystack/reader/farm.py @@ -6,8 +6,10 @@ import numpy as np from farm.data_handler.data_silo import DataSilo from farm.data_handler.processor import SquadProcessor from farm.data_handler.dataloader import NamedDataLoader -from farm.infer import Inferencer +from farm.data_handler.inputs import QAInput, Question +from farm.infer import QAInferencer from farm.modeling.optimization import initialize_optimizer +from farm.modeling.predictions import QAPred, QACandidate from farm.train import Trainer from farm.eval import Evaluator from farm.utils import set_all_seeds, initialize_device_settings @@ -85,7 +87,7 @@ class FARMReader(BaseReader): else: self.return_no_answers = True self.top_k_per_candidate = top_k_per_candidate - self.inferencer = Inferencer.load(model_name_or_path, batch_size=batch_size, gpu=use_gpu, + self.inferencer = QAInferencer.load(model_name_or_path, batch_size=batch_size, gpu=use_gpu, task_type="question_answering", max_seq_len=max_seq_len, doc_stride=doc_stride, num_processes=num_processes) self.inferencer.model.prediction_heads[0].context_window_size = context_window_size @@ -231,18 +233,16 @@ class FARMReader(BaseReader): """ # convert input to FARM format - input_dicts = [] + inputs = [] for doc in documents: - cur = { - "text": doc.text, - "questions": [question], - "document_id": doc.id - } - input_dicts.append(cur) + cur = QAInput(doc_text=doc.text, + questions=Question(text=question, + uid=doc.id)) + inputs.append(cur) # get answers from QA model - predictions = self.inferencer.inference_from_dicts( - dicts=input_dicts, return_json=True, multiprocessing_chunksize=1 + predictions = self.inferencer.inference_from_objects( + objects=inputs, return_json=False, multiprocessing_chunksize=1 ) # assemble answers from all the different documents & format them. 
# For the "no answer" option, we collect all no_ans_gaps and decide how likely @@ -250,29 +250,28 @@ class FARMReader(BaseReader): answers = [] no_ans_gaps = [] best_score_answer = 0 - # TODO once FARM returns doc ids again we can revert to using them inside the preds and remove - for pred, inp in zip(predictions, input_dicts): + for pred, inp in zip(predictions, inputs): answers_per_document = [] - no_ans_gaps.append(pred["predictions"][0]["no_ans_gap"]) - for ans in pred["predictions"][0]["answers"]: + no_ans_gaps.append(pred.no_answer_gap) + for ans in pred.prediction: # skip "no answers" here if self._check_no_answer(ans): pass else: - cur = {"answer": ans["answer"], - "score": ans["score"], + cur = {"answer": ans.answer, + "score": ans.score, # just a pseudo prob for now - "probability": float(expit(np.asarray([ans["score"]]) / 8)), # type: ignore - "context": ans["context"], - "offset_start": ans["offset_answer_start"] - ans["offset_context_start"], - "offset_end": ans["offset_answer_end"] - ans["offset_context_start"], - "offset_start_in_doc": ans["offset_answer_start"], - "offset_end_in_doc": ans["offset_answer_end"], - "document_id": inp["document_id"]} #TODO revert to ans["docid"] once it is populated + "probability": float(expit(np.asarray([ans.score]) / 8)), # type: ignore + "context": ans.context_window, + "offset_start": ans.offset_answer_start - ans.offset_context_window_start, + "offset_end": ans.offset_answer_end - ans.offset_context_window_start, + "offset_start_in_doc": ans.offset_answer_start, + "offset_end_in_doc": ans.offset_answer_end, + "document_id": pred.id} answers_per_document.append(cur) - if ans["score"] > best_score_answer: - best_score_answer = ans["score"] + if ans.score > best_score_answer: + best_score_answer = ans.score # only take n best candidates. Answers coming back from FARM are sorted with decreasing relevance. 
answers += answers_per_document[:self.top_k_per_candidate] @@ -299,7 +298,7 @@ class FARMReader(BaseReader): Returns a dict containing the following metrics: - "EM": exact match score - "f1": F1-Score - - "top_n_recall": Proportion of predicted answers that overlap with correct answer + - "top_n_accuracy": Proportion of predicted answers that match with correct answer :param data_dir: The directory in which the test set can be found :type data_dir: Path or str @@ -329,7 +328,7 @@ class FARMReader(BaseReader): results = { "EM": eval_results[0]["EM"], "f1": eval_results[0]["f1"], - "top_n_recall": eval_results[0]["top_n_recall"] + "top_n_accuracy": eval_results[0]["top_n_accuracy"] } return results @@ -347,7 +346,7 @@ class FARMReader(BaseReader): Returns a dict containing the following metrics: - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers - "f1": Average overlap between predicted answers and their corresponding correct answers - - "top_n_recall": Proportion of predicted answers that overlap with correct answer + - "top_n_accuracy": Proportion of predicted answers that match with correct answer :param document_store: The ElasticsearchDocumentStore containing the evaluation documents :type document_store: ElasticsearchDocumentStore @@ -404,23 +403,23 @@ class FARMReader(BaseReader): results = { "EM": eval_results[0]["EM"], "f1": eval_results[0]["f1"], - "top_n_recall": eval_results[0]["top_n_recall"] + "top_n_accuracy": eval_results[0]["top_n_accuracy"] } return results @staticmethod - def _check_no_answer(d: dict): + def _check_no_answer(c: QACandidate): # check for correct value in "answer" - if d["offset_answer_start"] == 0 and d["offset_answer_end"] == 0: - assert d["answer"] == "is_impossible", f"Check for no answer is not working" - - # check weather the model thinks there is no answer - if d["answer"] == "is_impossible": + if c.offset_answer_start == 0 and c.offset_answer_end == 0: + if c.answer != "no_answer": + logger.error("Invalid 'no_answer': Got a prediction for position 0, but answer string is not 'no_answer'") + if c.answer == "no_answer": return True else: return False + @staticmethod def _calc_no_answer(no_ans_gaps: List[float], best_score_answer: float): # "no answer" scores and positive answers scores are difficult to compare, because @@ -476,5 +475,5 @@ class FARMReader(BaseReader): are "gpu_tensor_core" (GPUs with tensor core like V100 or T4), "gpu_without_tensor_core" (most other GPUs), and "cpu". 
""" - inferencer = Inferencer.load(model_name_or_path, task_type="question_answering") + inferencer = QAInferencer.load(model_name_or_path, task_type="question_answering") inferencer.model.convert_to_onnx(output_path=Path("onnx-export"), opset_version=opset_version, optimize_for=optimize_for) diff --git a/haystack/reader/transformers.py b/haystack/reader/transformers.py index 40a957df7..197c7a248 100644 --- a/haystack/reader/transformers.py +++ b/haystack/reader/transformers.py @@ -1,6 +1,6 @@ from typing import List, Optional -from transformers import pipeline +from haystack.reader.transformers_utils import pipeline from haystack.database.base import Document from haystack.reader.base import BaseReader @@ -40,10 +40,11 @@ class TransformersReader(BaseReader): :param use_gpu: < 0 -> use cpu >= 0 -> ordinal of the gpu to use """ - self.model = pipeline("question-answering", model=model, tokenizer=tokenizer, device=use_gpu) + self.model = pipeline('question-answering', model=model, tokenizer=tokenizer, device=use_gpu) self.context_window_size = context_window_size self.n_best_per_passage = n_best_per_passage #TODO param to modify bias for no_answer + # TODO context_window_size behaviour different from behavior in FARMReader def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None): """ @@ -76,6 +77,9 @@ class TransformersReader(BaseReader): for doc in documents: query = {"context": doc.text, "question": question} predictions = self.model(query, topk=self.n_best_per_passage) + # for single preds (e.g. via top_k=1) transformers returns a dict instead of a list + if type(predictions) == dict: + predictions = [predictions] # assemble and format all answers for pred in predictions: if pred["answer"]: diff --git a/haystack/reader/transformers_utils.py b/haystack/reader/transformers_utils.py new file mode 100644 index 000000000..eeacd642a --- /dev/null +++ b/haystack/reader/transformers_utils.py @@ -0,0 +1,1912 @@ +# mypy: ignore-errors + +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################## +# Adjusted the original QuestionAnweringPipeline class slightly to cope for https://github.com/huggingface/transformers/issues/5711 +# until it is fixed upstream. 
Everything else is identical to transformers.pipelines.py +############################## + +import csv +import json +import logging +import os +import pickle +import sys +from abc import ABC, abstractmethod +from contextlib import contextmanager +from itertools import chain +from os.path import abspath, exists +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union + +import numpy as np + +from transformers.configuration_auto import AutoConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.data import SquadExample, squad_convert_examples_to_features +from transformers.file_utils import is_tf_available, is_torch_available +from transformers.modelcard import ModelCard +from transformers.tokenization_auto import AutoTokenizer +from transformers.tokenization_bert import BasicTokenizer +from transformers.tokenization_utils import PreTrainedTokenizer + +if is_tf_available(): + import tensorflow as tf + from transformers.modeling_tf_auto import ( + TFAutoModel, + TFAutoModelForSequenceClassification, + TFAutoModelForQuestionAnswering, + TFAutoModelForTokenClassification, + TFAutoModelWithLMHead, + ) + +if is_torch_available(): + import torch + from transformers.modeling_auto import ( + AutoModel, + AutoModelForSequenceClassification, + AutoModelForQuestionAnswering, + AutoModelForTokenClassification, + AutoModelWithLMHead, + AutoModelForSeq2SeqLM, + ) + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.modeling_tf_utils import TFPreTrainedModel + +logger = logging.getLogger(__name__) + + +def get_framework(model=None): + """ Select framework (TensorFlow/PyTorch) to use. + If both frameworks are installed and no specific model is provided, defaults to using PyTorch. + """ + if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): + # Both framework are available but the user supplied a model class instance. + # Try to guess which framework to use from the model classname + framework = "tf" if model.__class__.__name__.startswith("TF") else "pt" + elif not is_tf_available() and not is_torch_available(): + raise RuntimeError( + "At least one of TensorFlow 2.0 or PyTorch should be installed. " + "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " + "To install PyTorch, read the instructions at https://pytorch.org/." 
+ ) + else: + # framework = 'tf' if is_tf_available() else 'pt' + framework = "pt" if is_torch_available() else "tf" + return framework + + +class PipelineException(Exception): + """ + Raised by pipelines when handling __call__ + """ + + def __init__(self, task: str, model: str, reason: str): + super().__init__(reason) + + self.task = task + self.model = model + + +class ArgumentHandler(ABC): + """ + Base interface for handling varargs for each Pipeline + """ + + @abstractmethod + def __call__(self, *args, **kwargs): + raise NotImplementedError() + + +class DefaultArgumentHandler(ArgumentHandler): + """ + Default varargs argument parser handling parameters for each Pipeline + """ + + @staticmethod + def handle_kwargs(kwargs: Dict) -> List: + if len(kwargs) == 1: + output = list(kwargs.values()) + else: + output = list(chain(kwargs.values())) + + return DefaultArgumentHandler.handle_args(output) + + @staticmethod + def handle_args(args: Sequence[Any]) -> List[str]: + + # Only one argument, let's do case by case + if len(args) == 1: + if isinstance(args[0], str): + return [args[0]] + elif not isinstance(args[0], list): + return list(args) + else: + return args[0] + + # Multiple arguments (x1, x2, ...) + elif len(args) > 1: + if all([isinstance(arg, str) for arg in args]): + return list(args) + + # If not instance of list, then it should instance of iterable + elif isinstance(args, Iterable): + return list(chain.from_iterable(chain(args))) + else: + raise ValueError( + "Invalid input type {}. Pipeline supports Union[str, Iterable[str]]".format(type(args)) + ) + else: + return [] + + def __call__(self, *args, **kwargs): + if len(kwargs) > 0 and len(args) > 0: + raise ValueError("Pipeline cannot handle mixed args and kwargs") + + if len(kwargs) > 0: + return DefaultArgumentHandler.handle_kwargs(kwargs) + else: + return DefaultArgumentHandler.handle_args(args) + + +class PipelineDataFormat: + """ + Base class for all the pipeline supported data format both for reading and writing. + Supported data formats currently includes: + - JSON + - CSV + - stdin/stdout (pipe) + + PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns + to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. + """ + + SUPPORTED_FORMATS = ["json", "csv", "pipe"] + + def __init__( + self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, + ): + self.output_path = output_path + self.input_path = input_path + self.column = column.split(",") if column is not None else [""] + self.is_multi_columns = len(self.column) > 1 + + if self.is_multi_columns: + self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column] + + if output_path is not None and not overwrite: + if exists(abspath(self.output_path)): + raise OSError("{} already exists on disk".format(self.output_path)) + + if input_path is not None: + if not exists(abspath(self.input_path)): + raise OSError("{} doesnt exist on disk".format(self.input_path)) + + @abstractmethod + def __iter__(self): + raise NotImplementedError() + + @abstractmethod + def save(self, data: dict): + """ + Save the provided data object with the representation for the current `DataFormat`. + :param data: data to store + :return: + """ + raise NotImplementedError() + + def save_binary(self, data: Union[dict, List[dict]]) -> str: + """ + Save the provided data object as a pickle-formatted binary data on the disk. 
+ :param data: data to store + :return: (str) Path where the data has been saved + """ + path, _ = os.path.splitext(self.output_path) + binary_path = os.path.extsep.join((path, "pickle")) + + with open(binary_path, "wb+") as f_output: + pickle.dump(data, f_output) + + return binary_path + + @staticmethod + def from_str( + format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, + ): + if format == "json": + return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) + elif format == "csv": + return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) + elif format == "pipe": + return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) + else: + raise KeyError("Unknown reader {} (Available reader are json/csv/pipe)".format(format)) + + +class CsvPipelineDataFormat(PipelineDataFormat): + def __init__( + self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, + ): + super().__init__(output_path, input_path, column, overwrite=overwrite) + + def __iter__(self): + with open(self.input_path, "r") as f: + reader = csv.DictReader(f) + for row in reader: + if self.is_multi_columns: + yield {k: row[c] for k, c in self.column} + else: + yield row[self.column[0]] + + def save(self, data: List[dict]): + with open(self.output_path, "w") as f: + if len(data) > 0: + writer = csv.DictWriter(f, list(data[0].keys())) + writer.writeheader() + writer.writerows(data) + + +class JsonPipelineDataFormat(PipelineDataFormat): + def __init__( + self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False, + ): + super().__init__(output_path, input_path, column, overwrite=overwrite) + + with open(input_path, "r") as f: + self._entries = json.load(f) + + def __iter__(self): + for entry in self._entries: + if self.is_multi_columns: + yield {k: entry[c] for k, c in self.column} + else: + yield entry[self.column[0]] + + def save(self, data: dict): + with open(self.output_path, "w") as f: + json.dump(data, f) + + +class PipedPipelineDataFormat(PipelineDataFormat): + """ + Read data from piped input to the python process. + For multi columns data, columns should separated by \t + + If columns are provided, then the output will be a dictionary with {column_x: value_x} + """ + + def __iter__(self): + for line in sys.stdin: + # Split for multi-columns + if "\t" in line: + + line = line.split("\t") + if self.column: + # Dictionary to map arguments + yield {kwargs: l for (kwargs, _), l in zip(self.column, line)} + else: + yield tuple(line) + + # No dictionary to map arguments + else: + yield line + + def save(self, data: dict): + print(data) + + def save_binary(self, data: Union[dict, List[dict]]) -> str: + if self.output_path is None: + raise KeyError( + "When using piped input on pipeline outputting large object requires an output file path. " + "Please provide such output path through --output argument." + ) + + return super().save_binary(data) + + +class _ScikitCompat(ABC): + """ + Interface layer for the Scikit and Keras compatibility. + """ + + @abstractmethod + def transform(self, X): + raise NotImplementedError() + + @abstractmethod + def predict(self, X): + raise NotImplementedError() + + +class Pipeline(_ScikitCompat): + """ + The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across + different pipelines. + + Base class implementing pipelined operations. 
+ Pipeline workflow is defined as a sequence of the following operations: + + Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output + + Pipeline supports running on CPU or GPU through the device argument. Users can specify + device argument as an integer, -1 meaning "CPU", >= 0 referring the CUDA device ordinal. + + Some pipeline, like for instance FeatureExtractionPipeline ('feature-extraction') outputs large + tensor object as nested-lists. In order to avoid dumping such large structure as textual data we + provide the binary_output constructor argument. If set to True, the output will be stored in the + pickle format. + + Arguments: + model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`): + The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. + tokenizer (:obj:`~transformers.PreTrainedTokenizer`): + The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from + :class:`~transformers.PreTrainedTokenizer`. + modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`): + Model card attributed to the model for this pipeline. + framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. + args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): + Reference to the object in charge of parsing supplied pipeline parameters. + device (:obj:`int`, `optional`, defaults to :obj:`-1`): + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. + binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`): + Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text. 
+ + Return: + :obj:`List` or :obj:`Dict`: + Pipeline returns list or dictionary depending on: + + - Whether the user supplied multiple samples + - Whether the pipeline exposes multiple fields in the output object + """ + + default_input_names = None + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + tokenizer: PreTrainedTokenizer, + modelcard: Optional[ModelCard] = None, + framework: Optional[str] = None, + task: str = "", + args_parser: ArgumentHandler = None, + device: int = -1, + binary_output: bool = False, + ): + + if framework is None: + framework = get_framework() + + self.model = model + self.tokenizer = tokenizer + self.modelcard = modelcard + self.framework = framework + self.device = device if framework == "tf" else torch.device("cpu" if device < 0 else "cuda:{}".format(device)) + self.binary_output = binary_output + self._args_parser = args_parser or DefaultArgumentHandler() + + # Special handling + if self.framework == "pt" and self.device.type == "cuda": + self.model = self.model.to(self.device) + + # Update config with task specific parameters + task_specific_params = self.model.config.task_specific_params + if task_specific_params is not None and task in task_specific_params: + self.model.config.update(task_specific_params.get(task)) + + def save_pretrained(self, save_directory): + """ + Save the pipeline's model and tokenizer to the specified save_directory + """ + if os.path.isfile(save_directory): + logger.error("Provided path ({}) should be a directory, not a file".format(save_directory)) + return + os.makedirs(save_directory, exist_ok=True) + + self.model.save_pretrained(save_directory) + self.tokenizer.save_pretrained(save_directory) + if self.modelcard is not None: + self.modelcard.save_pretrained(save_directory) + + def transform(self, X): + """ + Scikit / Keras interface to transformers' pipelines. This method will forward to __call__(). + """ + return self(X=X) + + def predict(self, X): + """ + Scikit / Keras interface to transformers' pipelines. This method will forward to __call__(). + """ + return self(X=X) + + @contextmanager + def device_placement(self): + """ + Context Manager allowing tensor allocation on the user-specified device in framework agnostic way. + example: + # Explicitly ask for tensor allocation on CUDA device :0 + nlp = pipeline(..., device=0) + with nlp.device_placement(): + # Every framework specific tensor allocation will be done on the request device + output = nlp(...) + Returns: + Context manager + """ + if self.framework == "tf": + with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)): + yield + else: + if self.device.type == "cuda": + torch.cuda.set_device(self.device) + + yield + + def ensure_tensor_on_device(self, **inputs): + """ + Ensure PyTorch tensors are on the specified device. + :param inputs: + :return: + """ + return {name: tensor.to(self.device) for name, tensor in inputs.items()} + + def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs): + """ + Parse arguments and tokenize + """ + # Parse arguments + inputs = self._args_parser(*args, **kwargs) + inputs = self.tokenizer( + inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding, + ) + + return inputs + + def __call__(self, *args, **kwargs): + inputs = self._parse_and_tokenize(*args, **kwargs) + return self._forward(inputs) + + def _forward(self, inputs, return_tensors=False): + """ + Internal framework specific forward dispatching. 
+ Args: + inputs: dict holding all the keyworded arguments for required by the model forward method. + return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array. + Returns: + Numpy array + """ + # Encode for forward + with self.device_placement(): + if self.framework == "tf": + # TODO trace model + predictions = self.model(inputs.data, training=False)[0] + else: + with torch.no_grad(): + inputs = self.ensure_tensor_on_device(**inputs) + predictions = self.model(**inputs)[0].cpu() + + if return_tensors: + return predictions + else: + return predictions.numpy() + + +class FeatureExtractionPipeline(Pipeline): + """ + Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer, + which can be used as features in downstream tasks. + + This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using + the following task identifier(s): + + - "feature-extraction", for extracting features of a sequence. + + All models may be used for this pipeline. See a list of all models, including community-contributed models on + `huggingface.co/models `__. + + Arguments: + model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`): + The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. + tokenizer (:obj:`~transformers.PreTrainedTokenizer`): + The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from + :class:`~transformers.PreTrainedTokenizer`. + modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`): + Model card attributed to the model for this pipeline. + framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. + args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): + Reference to the object in charge of parsing supplied pipeline parameters. + device (:obj:`int`, `optional`, defaults to :obj:`-1`): + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. + """ + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + tokenizer: PreTrainedTokenizer, + modelcard: Optional[ModelCard] = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + task: str = "", + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=True, + task=task, + ) + + def __call__(self, *args, **kwargs): + return super().__call__(*args, **kwargs).tolist() + + +class TextGenerationPipeline(Pipeline): + """ + Language generation pipeline using any ModelWithLMHead head. This pipeline predicts the words that will follow a specified text prompt. 
+ + This language generation pipeline can currently be loaded from the :func:`~transformers.pipeline` method using + the following task identifier(s): + + - "text-generation", for generating text from a specified prompt. + + The models that this pipeline can use are models that have been trained with an autoregressive language modeling objective, + which includes the uni-directional models in the library (e.g. gpt2). + See the list of available community models on + `huggingface.co/models `__. + """ + + # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia + # in https://github.com/rusiaaman/XLNet-gen#methodology + # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e + + PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family + (except for Alexei and Maria) are discovered. + The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the + remainder of the story. 1883 Western Siberia, + a young Grigori Rasputin is asked by his father and a group of men to perform magic. + Rasputin has a vision and denounces one of the men as a horse thief. Although his + father initially slaps him for making such an accusation, Rasputin watches as the + man is chased outside and beaten. Twenty years later, Rasputin sees a vision of + the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, + with people, even a bishop, begging for his blessing. """ + + ALLOWED_MODELS = [ + "XLNetLMHeadModel", + "TransfoXLLMHeadModel", + "ReformerModelWithLMHead", + "GPT2LMHeadModel", + "OpenAIGPTLMHeadModel", + "CTRLLMHeadModel", + "TFXLNetLMHeadModel", + "TFTransfoXLLMHeadModel", + "TFGPT2LMHeadModel", + "TFOpenAIGPTLMHeadModel", + "TFCTRLLMHeadModel", + ] + + # overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments + + def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs): + """ + Parse arguments and tokenize + """ + # Parse arguments + if self.model.__class__.__name__ in ["TransfoXLLMHeadModel"]: + tokenizer_kwargs = {"add_space_before_punct_symbol": True} + else: + tokenizer_kwargs = {} + inputs = self._args_parser(*args, **kwargs) + inputs = self.tokenizer( + inputs, + add_special_tokens=add_special_tokens, + return_tensors=self.framework, + padding=padding, + **tokenizer_kwargs, + ) + + return inputs + + def __call__( + self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs + ): + if self.model.__class__.__name__ not in self.ALLOWED_MODELS: + raise NotImplementedError( + "Generation is currently not supported for {}. Please select a model from {} for generation.".format( + self.model.__class__.__name__, self.ALLOWED_MODELS + ) + ) + + text_inputs = self._args_parser(*args) + + results = [] + for prompt_text in text_inputs: + # Manage correct placement of the tensors + with self.device_placement(): + if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]: + # For XLNet and TransformerXL we had an article to the prompt to give more state to the model. + padding_text = self.PADDING_TEXT + self.tokenizer.eos_token + padding = self._parse_and_tokenize(padding_text, padding=False, add_special_tokens=False) + # This impacts max_length and min_length argument that need adjusting. 
+ padding_length = padding["input_ids"].shape[-1] + if "max_length" in generate_kwargs and generate_kwargs["max_length"] is not None: + generate_kwargs["max_length"] += padding_length + if "min_length" in generate_kwargs and generate_kwargs["min_length"] is not None: + generate_kwargs["min_length"] += padding_length + + inputs = self._parse_and_tokenize( + padding_text + prompt_text, padding=False, add_special_tokens=False + ) + else: + inputs = self._parse_and_tokenize(prompt_text, padding=False, add_special_tokens=False) + + # set input_ids to None to allow empty prompt + if inputs["input_ids"].shape[-1] == 0: + inputs["input_ids"] = None + inputs["attention_mask"] = None + + if self.framework == "pt" and inputs["input_ids"] is not None: + inputs = self.ensure_tensor_on_device(**inputs) + + input_ids = inputs["input_ids"] + + # Ensure that batch size = 1 (batch generation not allowed for now) + assert ( + input_ids is None or input_ids.shape[0] == 1 + ), "Batch generation is currently not supported. See https://github.com/huggingface/transformers/issues/3021 for more information." + + output_sequences = self.model.generate(input_ids=input_ids, **generate_kwargs) # BS x SL + + result = [] + for generated_sequence in output_sequences: + generated_sequence = generated_sequence.numpy().tolist() + record = {} + if return_tensors: + record["generated_token_ids"] = generated_sequence + if return_text: + # Decode text + text = self.tokenizer.decode( + generated_sequence, + skip_special_tokens=True, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + + # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used + if input_ids is None: + prompt_length = 0 + else: + prompt_length = len( + self.tokenizer.decode( + input_ids[0], + skip_special_tokens=True, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + ) + + record["generated_text"] = prompt_text + text[prompt_length:] + + result.append(record) + results += [result] + + if len(results) == 1: + return results[0] + + return results + + +class TextClassificationPipeline(Pipeline): + """ + Text classification pipeline using ModelForSequenceClassification head. See the + `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information. + + This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using + the following task identifier(s): + + - "sentiment-analysis", for classifying sequences according to positive or negative sentiments. + + The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. + See the up-to-date list of available models on + `huggingface.co/models `__. + + Arguments: + model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`): + The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. + tokenizer (:obj:`~transformers.PreTrainedTokenizer`): + The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from + :class:`~transformers.PreTrainedTokenizer`. + modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`): + Model card attributed to the model for this pipeline. 
+ framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. + args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): + Reference to the object in charge of parsing supplied pipeline parameters. + device (:obj:`int`, `optional`, defaults to :obj:`-1`): + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. + """ + + def __init__(self, return_all_scores: bool = False, **kwargs): + super().__init__(**kwargs) + + self.return_all_scores = return_all_scores + + def __call__(self, *args, **kwargs): + outputs = super().__call__(*args, **kwargs) + scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) + if self.return_all_scores: + return [ + [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)] + for item in scores + ] + else: + return [ + {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores + ] + + +class FillMaskPipeline(Pipeline): + """ + Masked language modeling prediction pipeline using ModelWithLMHead head. See the + `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information. + + This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using + the following task identifier(s): + + - "fill-mask", for predicting masked tokens in a sequence. + + The models that this pipeline can use are models that have been trained with a masked language modeling objective, + which includes the bi-directional models in the library. + See the up-to-date list of available models on + `huggingface.co/models `__. + + Arguments: + model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`): + The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. + tokenizer (:obj:`~transformers.PreTrainedTokenizer`): + The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from + :class:`~transformers.PreTrainedTokenizer`. + modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`): + Model card attributed to the model for this pipeline. + framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. + args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): + Reference to the object in charge of parsing supplied pipeline parameters. + device (:obj:`int`, `optional`, defaults to :obj:`-1`): + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. 
+ """ + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + tokenizer: PreTrainedTokenizer, + modelcard: Optional[ModelCard] = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + topk=5, + task: str = "", + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=True, + task=task, + ) + + self.topk = topk + + def ensure_exactly_one_mask_token(self, masked_index: np.ndarray): + numel = np.prod(masked_index.shape) + if numel > 1: + raise PipelineException( + "fill-mask", + self.model.base_model_prefix, + f"More than one mask_token ({self.tokenizer.mask_token}) is not supported", + ) + elif numel < 1: + raise PipelineException( + "fill-mask", + self.model.base_model_prefix, + f"No mask_token ({self.tokenizer.mask_token}) found on the input", + ) + + def __call__(self, *args, **kwargs): + inputs = self._parse_and_tokenize(*args, **kwargs) + outputs = self._forward(inputs, return_tensors=True) + + results = [] + batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0) + + for i in range(batch_size): + input_ids = inputs["input_ids"][i] + result = [] + + if self.framework == "tf": + masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy() + + # Fill mask pipeline supports only one ${mask_token} per sample + self.ensure_exactly_one_mask_token(masked_index) + + logits = outputs[i, masked_index.item(), :] + probs = tf.nn.softmax(logits) + topk = tf.math.top_k(probs, k=self.topk) + values, predictions = topk.values.numpy(), topk.indices.numpy() + else: + masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero() + + # Fill mask pipeline supports only one ${mask_token} per sample + self.ensure_exactly_one_mask_token(masked_index.numpy()) + + logits = outputs[i, masked_index.item(), :] + probs = logits.softmax(dim=0) + values, predictions = probs.topk(self.topk) + + for v, p in zip(values.tolist(), predictions.tolist()): + tokens = input_ids.numpy() + tokens[masked_index] = p + # Filter padding out: + tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)] + result.append( + { + "sequence": self.tokenizer.decode(tokens), + "score": v, + "token": p, + "token_str": self.tokenizer.convert_ids_to_tokens(p), + } + ) + + # Append + results += [result] + + if len(results) == 1: + return results[0] + return results + + +class TokenClassificationPipeline(Pipeline): + """ + Named Entity Recognition pipeline using ModelForTokenClassification head. See the + `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information. + + This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using + the following task identifier(s): + + - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous. + + The models that this pipeline can use are models that have been fine-tuned on a token classification task. + See the up-to-date list of available models on + `huggingface.co/models `__. + + Arguments: + model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`): + The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. 
+ tokenizer (:obj:`~transformers.PreTrainedTokenizer`): + The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from + :class:`~transformers.PreTrainedTokenizer`. + modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`): + Model card attributed to the model for this pipeline. + framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. + args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): + Reference to the object in charge of parsing supplied pipeline parameters. + device (:obj:`int`, `optional`, defaults to :obj:`-1`): + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. + """ + + default_input_names = "sequences" + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + tokenizer: PreTrainedTokenizer, + modelcard: Optional[ModelCard] = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + binary_output: bool = False, + ignore_labels=["O"], + task: str = "", + grouped_entities: bool = False, + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=binary_output, + task=task, + ) + + self._basic_tokenizer = BasicTokenizer(do_lower_case=False) + self.ignore_labels = ignore_labels + self.grouped_entities = grouped_entities + + def __call__(self, *args, **kwargs): + inputs = self._args_parser(*args, **kwargs) + answers = [] + for sentence in inputs: + + # Manage correct placement of the tensors + with self.device_placement(): + + tokens = self.tokenizer( + sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True, + ) + + # Forward + if self.framework == "tf": + entities = self.model(tokens.data)[0][0].numpy() + input_ids = tokens["input_ids"].numpy()[0] + else: + with torch.no_grad(): + tokens = self.ensure_tensor_on_device(**tokens) + entities = self.model(**tokens)[0][0].cpu().numpy() + input_ids = tokens["input_ids"].cpu().numpy()[0] + + score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True) + labels_idx = score.argmax(axis=-1) + + entities = [] + entity_groups = [] + entity_group_disagg = [] + # Filter to labels not in `self.ignore_labels` + filtered_labels_idx = [ + (idx, label_idx) + for idx, label_idx in enumerate(labels_idx) + if self.model.config.id2label[label_idx] not in self.ignore_labels + ] + + for idx, label_idx in filtered_labels_idx: + + entity = { + "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])), + "score": score[idx][label_idx].item(), + "entity": self.model.config.id2label[label_idx], + "index": idx, + } + last_idx, _ = filtered_labels_idx[-1] + if self.grouped_entities: + if not entity_group_disagg: + entity_group_disagg += [entity] + if idx == last_idx: + entity_groups += [self.group_entities(entity_group_disagg)] + continue + + # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group + if ( + entity["entity"] == entity_group_disagg[-1]["entity"] + and entity["index"] == 
entity_group_disagg[-1]["index"] + 1 + ): + entity_group_disagg += [entity] + # Group the entities at the last entity + if idx == last_idx: + entity_groups += [self.group_entities(entity_group_disagg)] + # If the current entity is different from the previous entity, aggregate the disaggregated entity group + else: + entity_groups += [self.group_entities(entity_group_disagg)] + entity_group_disagg = [entity] + + entities += [entity] + + # Ensure if an entity is the latest one in the sequence it gets appended to the output + if len(entity_group_disagg) > 0: + entity_groups.append(self.group_entities(entity_group_disagg)) + + # Append + if self.grouped_entities: + answers += [entity_groups] + else: + answers += [entities] + + if len(answers) == 1: + return answers[0] + return answers + + def group_entities(self, entities): + """ + Returns grouped entities + """ + # Get the last entity in the entity group + entity = entities[-1]["entity"] + scores = np.mean([entity["score"] for entity in entities]) + tokens = [entity["word"] for entity in entities] + + entity_group = { + "entity_group": entity, + "score": np.mean(scores), + "word": self.tokenizer.convert_tokens_to_string(tokens), + } + return entity_group + + +NerPipeline = TokenClassificationPipeline + + + +class QuestionAnsweringArgumentHandler(ArgumentHandler): + """ + QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped + to internal SquadExample / SquadFeature structures. + + QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied + arguments. + """ + + def __call__(self, *args, **kwargs): + # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating + if args is not None and len(args) > 0: + if len(args) == 1: + kwargs["X"] = args[0] + else: + kwargs["X"] = list(args) + + # Generic compatibility with sklearn and Keras + # Batched data + if "X" in kwargs or "data" in kwargs: + inputs = kwargs["X"] if "X" in kwargs else kwargs["data"] + + if isinstance(inputs, dict): + inputs = [inputs] + else: + # Copy to avoid overriding arguments + inputs = [i for i in inputs] + + for i, item in enumerate(inputs): + if isinstance(item, dict): + if any(k not in item for k in ["question", "context"]): + raise KeyError("You need to provide a dictionary with keys {question:..., context:...}") + + inputs[i] = QuestionAnsweringPipeline.create_sample(**item) + + elif not isinstance(item, SquadExample): + raise ValueError( + "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format( + "X" if "X" in kwargs else "data" + ) + ) + + # Tabular input + elif "question" in kwargs and "context" in kwargs: + if isinstance(kwargs["question"], str): + kwargs["question"] = [kwargs["question"]] + + if isinstance(kwargs["context"], str): + kwargs["context"] = [kwargs["context"]] + + inputs = [ + QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"]) + ] + else: + raise ValueError("Unknown arguments {}".format(kwargs)) + + if not isinstance(inputs, list): + inputs = [inputs] + + return inputs + + +class QuestionAnsweringPipeline(Pipeline): + """ + Question Answering pipeline using ModelForQuestionAnswering head. See the + `question answering usage <../usage.html#question-answering>`__ examples for more information. 
+ + This question answering can currently be loaded from the :func:`~transformers.pipeline` method using + the following task identifier(s): + + - "question-answering", for answering questions given a context. + + The models that this pipeline can use are models that have been fine-tuned on a question answering task. + See the up-to-date list of available models on + `huggingface.co/models `__. + + Arguments: + model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`): + The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. + tokenizer (:obj:`~transformers.PreTrainedTokenizer`): + The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from + :class:`~transformers.PreTrainedTokenizer`. + modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`): + Model card attributed to the model for this pipeline. + framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. + args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): + Reference to the object in charge of parsing supplied pipeline parameters. + device (:obj:`int`, `optional`, defaults to :obj:`-1`): + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. + """ + + default_input_names = "question,context" + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + tokenizer: PreTrainedTokenizer, + modelcard: Optional[ModelCard] = None, + framework: Optional[str] = None, + device: int = -1, + task: str = "", + **kwargs + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=QuestionAnsweringArgumentHandler(), + device=device, + task=task, + **kwargs, + ) + + @staticmethod + def create_sample( + question: Union[str, List[str]], context: Union[str, List[str]] + ) -> Union[SquadExample, List[SquadExample]]: + """ + QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally. + This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s). + We currently support extractive question answering. + Arguments: + question: (str, List[str]) The question to be ask for the associated context + context: (str, List[str]) The context in which we will look for the answer. + + Returns: + SquadExample initialized with the corresponding question and context. 
+ """ + if isinstance(question, list): + return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] + else: + return SquadExample(None, question, context, None, None, None) + + def __call__(self, *args, **kwargs): + """ + Args: + We support multiple use-cases, the following are exclusive: + X: sequence of SquadExample + data: sequence of SquadExample + question: (str, List[str]), batch of question(s) to map along with context + context: (str, List[str]), batch of context(s) associated with the provided question keyword argument + Returns: + dict: {'answer': str, 'score": float, 'start": int, "end": int} + answer: the textual answer in the intial context + score: the score the current answer scored for the model + start: the character index in the original string corresponding to the beginning of the answer' span + end: the character index in the original string corresponding to the ending of the answer' span + """ + # Set defaults values + kwargs.setdefault("topk", 1) + kwargs.setdefault("doc_stride", 128) + kwargs.setdefault("max_answer_len", 15) + kwargs.setdefault("max_seq_len", 384) + kwargs.setdefault("max_question_len", 64) + kwargs.setdefault("handle_impossible_answer", False) + + if kwargs["topk"] < 1: + raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"])) + + if kwargs["max_answer_len"] < 1: + raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"])) + + # Convert inputs to features + examples = self._args_parser(*args, **kwargs) + features_list = [ + squad_convert_examples_to_features( + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=kwargs["max_seq_len"], + doc_stride=kwargs["doc_stride"], + max_query_length=kwargs["max_question_len"], + is_training=False, + tqdm_enabled=False, + ) + for example in examples + ] + all_answers = [] + for features, example in zip(features_list, examples): + model_input_names = self.tokenizer.model_input_names + ["input_ids"] + fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names} + + # Manage tensor allocation on correct device + with self.device_placement(): + if self.framework == "tf": + fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} + start, end = self.model(fw_args)[:2] + start, end = start.numpy(), end.numpy() + else: + with torch.no_grad(): + # Retrieve the score for the context tokens only (removing question tokens) + fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()} + start, end = self.model(**fw_args)[:2] + start, end = start.cpu().numpy(), end.cpu().numpy() + + min_null_score = 1000000 # large and positive + answers = [] + for (feature, start_, end_) in zip(features, start, end): + # Mask padding and question + start_, end_ = ( + start_ * np.abs(np.array(feature.p_mask) - 1), + end_ * np.abs(np.array(feature.p_mask) - 1), + ) + + # Mask CLS + start_[0] = end_[0] = 0 + + # # Mask SEP Tokens + # sep_pos = np.where(np.array(feature.input_ids) == self.tokenizer.sep_token_id)[0] + # start_[sep_pos] = -10 + # end_[sep_pos] = -10 + + # Normalize logits and spans to retrieve the answer + start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True))) + end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True))) + + if kwargs["handle_impossible_answer"]: + min_null_score = min(min_null_score, (start_[0] * end_[0]).item()) + + starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) + 
char_to_word = np.array(example.char_to_word_offset) + + # Convert the answer (tokens) back to the original text + for s, e, score in zip(starts, ends, scores): + # CUSTOM ADJUSTMENT: We added this try-catch block here to not crash when model predicts + # start / end to be the final [SEP] token (https://github.com/huggingface/transformers/issues/5711) + try: + answers += [ + { + "score": score.item(), + "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + "answer": " ".join( + example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1] + ), + }] + except KeyError as e: + logger.warning( + f"Could not map predicted span ({s},{e}) back to token space. Skipping this prediction ...") + # answers += [ + # { + # "score": score.item(), + # "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + # "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + # "answer": " ".join( + # example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1] + # ), + # } + # for s, e, score in zip(starts, ends, scores) + # ] + if kwargs["handle_impossible_answer"]: + answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) + + answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]] + all_answers += answers + + if len(all_answers) == 1: + return all_answers[0] + return all_answers + + def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: + """ + Take the output of any QuestionAnswering head and will generate probalities for each span to be + the actual answer. + In addition, it filters out some unwanted/impossible cases like answer len being greater than + max_answer_len or answer end position being before the starting position. + The method supports output the k-best answer through the topk argument. + + Args: + start: numpy array, holding individual start probabilities for each token + end: numpy array, holding individual end probabilities for each token + topk: int, indicates how many possible answer span(s) to extract from the model's output + max_answer_len: int, maximum size of the answer to extract from the model's output + """ + # Ensure we have batch axis + if start.ndim == 1: + start = start[None] + + if end.ndim == 1: + end = end[None] + + # Compute the score of each tuple(start, end) to be the real answer + outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) + + # Remove candidate with end < start and end - start > max_answer_len + candidates = np.tril(np.triu(outer), max_answer_len - 1) + + # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) + scores_flat = candidates.flatten() + if topk == 1: + idx_sort = [np.argmax(scores_flat)] + elif len(scores_flat) < topk: + idx_sort = np.argsort(-scores_flat) + else: + idx = np.argpartition(-scores_flat, topk)[0:topk] + idx_sort = idx[np.argsort(-scores_flat[idx])] + + start, end = np.unravel_index(idx_sort, candidates.shape)[1:] + return start, end, candidates[0, start, end] + + def span_to_answer(self, text: str, start: int, end: int): + """ + When decoding from token probalities, this method maps token indexes to actual word in + the initial context. 
+ + Args: + text: str, the actual context to extract the answer from + start: int, starting answer token index + end: int, ending answer token index + + Returns: + dict: {'answer': str, 'start': int, 'end': int} + """ + words = [] + token_idx = char_start_idx = char_end_idx = chars_idx = 0 + + for i, word in enumerate(text.split(" ")): + token = self.tokenizer.tokenize(word) + + # Append words if they are in the span + if start <= token_idx <= end: + if token_idx == start: + char_start_idx = chars_idx + + if token_idx == end: + char_end_idx = chars_idx + len(word) + + words += [word] + + # Stop if we went over the end of the answer + if token_idx > end: + break + + # Append the subtokenization length to the running index + token_idx += len(token) + chars_idx += len(word) + 1 + + # Join text with spaces + return { + "answer": " ".join(words), + "start": max(0, char_start_idx), + "end": min(len(text), char_end_idx), + } + + +class SummarizationPipeline(Pipeline): + """ + Summarize news articles and other documents + + Usage:: + + # use bart in pytorch + summarizer = pipeline("summarization") + summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20) + + # use t5 in tf + summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf") + summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20) + + The models that this pipeline can use are models that have been fine-tuned on a summarization task, + which is currently, '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. + See the up-to-date list of available models on + `huggingface.co/models `__. + + Arguments: + model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): + The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string + checkpoint identifier or an actual pre-trained model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. + + If :obj:`None`, the default of the pipeline will be loaded. + tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): + The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, + a string checkpoint identifier or an actual pre-trained tokenizer inheriting from + :class:`~transformers.PreTrainedTokenizer`. + + If :obj:`None`, the default of the pipeline will be loaded. + modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`): + Model card attributed to the model for this pipeline. + framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. + args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): + Reference to the object in charge of parsing supplied pipeline parameters. + device (:obj:`int`, `optional`, defaults to :obj:`-1`): + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. 
+ """ + + def __init__(self, **kwargs): + kwargs.update(task="summarization") + super().__init__(**kwargs) + + def __call__( + self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, + **generate_kwargs + ): + r""" + Args: + *documents: (list of strings) articles to be summarized + return_text: (bool, default=True) whether to add a decoded "summary_text" to each result + return_tensors: (bool, default=False) whether to return the raw "summary_token_ids" to each result + + clean_up_tokenization_spaces: (`optional`) bool whether to include extra spaces in the output + **generate_kwargs: extra kwargs passed to `self.model.generate`_ + + Returns: + list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize + + .. _`self.model.generate`: + https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate + + """ + assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True" + assert len(documents) > 0, "Please provide a document to summarize" + + if self.framework == "tf" and "BartForConditionalGeneration" in self.model.__class__.__name__: + raise NotImplementedError( + "Tensorflow is not yet supported for Bart. Please consider using T5, e.g. `t5-base`" + ) + + prefix = self.model.config.prefix if self.model.config.prefix is not None else "" + + if isinstance(documents[0], list): + assert ( + self.tokenizer.pad_token_id is not None + ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" + + documents = ([prefix + document for document in documents[0]],) + padding = True + + elif isinstance(documents[0], str): + documents = (prefix + documents[0],) + padding = False + else: + raise ValueError( + " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format( + documents[0] + ) + ) + + with self.device_placement(): + inputs = self._parse_and_tokenize(*documents, padding=padding) + + if self.framework == "pt": + inputs = self.ensure_tensor_on_device(**inputs) + input_length = inputs["input_ids"].shape[-1] + elif self.framework == "tf": + input_length = tf.shape(inputs["input_ids"])[-1].numpy() + + min_length = generate_kwargs.get("min_length", self.model.config.min_length) + if input_length < min_length // 2: + logger.warning( + "Your min_length is set to {}, but you input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format( + min_length, input_length + ) + ) + + max_length = generate_kwargs.get("max_length", self.model.config.max_length) + if input_length < max_length: + logger.warning( + "Your max_length is set to {}, but you input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format( + max_length, input_length + ) + ) + + summaries = self.model.generate( + inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs, + ) + + results = [] + for summary in summaries: + record = {} + if return_tensors: + record["summary_token_ids"] = summary + if return_text: + record["summary_text"] = self.tokenizer.decode( + summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + results.append(record) + return results + + +class TranslationPipeline(Pipeline): + """ + Translates from one language to another. 
+ + Usage:: + en_fr_translator = pipeline("translation_en_to_fr") + en_fr_translator("How old are you?") + + The models that this pipeline can use are models that have been fine-tuned on a translation task, + currently: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b" + See the up-to-date list of available models on + `huggingface.co/models `__. + + Arguments: + model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): + The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string + checkpoint identifier or an actual pre-trained model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. + If :obj:`None`, the default of the pipeline will be loaded. + tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): + The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, + a string checkpoint identifier or an actual pre-trained tokenizer inheriting from + :class:`~transformers.PreTrainedTokenizer`. + If :obj:`None`, the default of the pipeline will be loaded. + modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`): + Model card attributed to the model for this pipeline. + framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. + args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`): + Reference to the object in charge of parsing supplied pipeline parameters. + device (:obj:`int`, `optional`, defaults to :obj:`-1`): + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. + """ + + def __call__( + self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs + ): + r""" + Args: + *args: (list of strings) texts to be translated + return_text: (bool, default=True) whether to add a decoded "translation_text" to each result + return_tensors: (bool, default=False) whether to return the raw "translation_token_ids" to each result + + **generate_kwargs: extra kwargs passed to `self.model.generate`_ + + Returns: + list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate + .. _`self.model.generate`: + https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate + """ + assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True" + + prefix = self.model.config.prefix if self.model.config.prefix is not None else "" + + if isinstance(args[0], list): + assert ( + self.tokenizer.pad_token_id is not None + ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" + args = ([prefix + text for text in args[0]],) + padding = True + + elif isinstance(args[0], str): + args = (prefix + args[0],) + padding = False + else: + raise ValueError( + " `documents[0]`: {} have the wrong format. 
The should be either of type `str` or type `list`".format( + args[0] + ) + ) + + with self.device_placement(): + inputs = self._parse_and_tokenize(*args, padding=padding) + + if self.framework == "pt": + inputs = self.ensure_tensor_on_device(**inputs) + input_length = inputs["input_ids"].shape[-1] + + elif self.framework == "tf": + input_length = tf.shape(inputs["input_ids"])[-1].numpy() + + max_length = generate_kwargs.get("max_length", self.model.config.max_length) + if input_length > 0.9 * max_length: + logger.warning( + "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format( + input_length, max_length + ) + ) + + translations = self.model.generate( + inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs, + ) + results = [] + for translation in translations: + record = {} + if return_tensors: + record["translation_token_ids"] = translation + if return_text: + record["translation_text"] = self.tokenizer.decode( + translation, + skip_special_tokens=True, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + results.append(record) + return results + + +# Register all the supported tasks here +SUPPORTED_TASKS = { + "feature-extraction": { + "impl": FeatureExtractionPipeline, + "tf": TFAutoModel if is_tf_available() else None, + "pt": AutoModel if is_torch_available() else None, + "default": {"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}}, + }, + "sentiment-analysis": { + "impl": TextClassificationPipeline, + "tf": TFAutoModelForSequenceClassification if is_tf_available() else None, + "pt": AutoModelForSequenceClassification if is_torch_available() else None, + "default": { + "model": { + "pt": "distilbert-base-uncased-finetuned-sst-2-english", + "tf": "distilbert-base-uncased-finetuned-sst-2-english", + }, + }, + }, + "ner": { + "impl": TokenClassificationPipeline, + "tf": TFAutoModelForTokenClassification if is_tf_available() else None, + "pt": AutoModelForTokenClassification if is_torch_available() else None, + "default": { + "model": { + "pt": "dbmdz/bert-large-cased-finetuned-conll03-english", + "tf": "dbmdz/bert-large-cased-finetuned-conll03-english", + }, + }, + }, + "question-answering": { + "impl": QuestionAnsweringPipeline, + "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None, + "pt": AutoModelForQuestionAnswering if is_torch_available() else None, + "default": { + "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"}, + }, + }, + "fill-mask": { + "impl": FillMaskPipeline, + "tf": TFAutoModelWithLMHead if is_tf_available() else None, + "pt": AutoModelWithLMHead if is_torch_available() else None, + "default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}}, + }, + "summarization": { + "impl": SummarizationPipeline, + "tf": TFAutoModelWithLMHead if is_tf_available() else None, + "pt": AutoModelForSeq2SeqLM if is_torch_available() else None, + "default": {"model": {"pt": "sshleifer/distilbart-cnn-12-6", "tf": "t5-small"}}, + }, + "translation_en_to_fr": { + "impl": TranslationPipeline, + "tf": TFAutoModelWithLMHead if is_tf_available() else None, + "pt": AutoModelWithLMHead if is_torch_available() else None, + "default": {"model": {"pt": "t5-base", "tf": "t5-base"}}, + }, + "translation_en_to_de": { + "impl": TranslationPipeline, + "tf": TFAutoModelWithLMHead if is_tf_available() else None, + "pt": AutoModelWithLMHead if 
is_torch_available() else None, + "default": {"model": {"pt": "t5-base", "tf": "t5-base"}}, + }, + "translation_en_to_ro": { + "impl": TranslationPipeline, + "tf": TFAutoModelWithLMHead if is_tf_available() else None, + "pt": AutoModelWithLMHead if is_torch_available() else None, + "default": {"model": {"pt": "t5-base", "tf": "t5-base"}}, + }, + "text-generation": { + "impl": TextGenerationPipeline, + "tf": TFAutoModelWithLMHead if is_tf_available() else None, + "pt": AutoModelWithLMHead if is_torch_available() else None, + "default": {"model": {"pt": "gpt2", "tf": "gpt2"}}, + }, +} + + +def pipeline( + task: str, + model: Optional = None, + config: Optional[Union[str, PretrainedConfig]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + framework: Optional[str] = None, + **kwargs +) -> Pipeline: + """ + Utility factory method to build a pipeline. + + Pipeline are made of: + + - A Tokenizer instance in charge of mapping raw textual input to token + - A Model instance + - Some (optional) post processing for enhancing model's output + + + Args: + task (:obj:`str`): + The task defining which pipeline will be returned. Currently accepted tasks are: + + - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline` + - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline` + - "ner": will return a :class:`~transformers.TokenClassificationPipeline` + - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline` + - "fill-mask": will return a :class:`~transformers.FillMaskPipeline` + - "summarization": will return a :class:`~transformers.SummarizationPipeline` + - "translation_xx_to_yy": will return a :class:`~transformers.TranslationPipeline` + - "text-generation": will return a :class:`~transformers.TextGenerationPipeline` + model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): + The model that will be used by the pipeline to make predictions. This can be :obj:`None`, + a model identifier or an actual pre-trained model inheriting from + :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for + TensorFlow. + + If :obj:`None`, the default for this pipeline will be loaded. + config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`): + The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`, + a model identifier or an actual pre-trained model configuration inheriting from + :class:`~transformers.PretrainedConfig`. + + If :obj:`None`, the default for this pipeline will be loaded. + tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`): + The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`, + a model identifier or an actual pre-trained tokenizer inheriting from + :class:`~transformers.PreTrainedTokenizer`. + + If :obj:`None`, the default for this pipeline will be loaded. + framework (:obj:`str`, `optional`, defaults to :obj:`None`): + The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified + and both frameworks are installed, will default to PyTorch. 
+ + Returns: + :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to + the task. + + Examples:: + + from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer + + # Sentiment analysis pipeline + pipeline('sentiment-analysis') + + # Question answering pipeline, specifying the checkpoint identifier + pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased') + + # Named entity recognition pipeline, passing in a specific model and tokenizer + model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + pipeline('ner', model=model, tokenizer=tokenizer) + """ + # Retrieve the task + if task not in SUPPORTED_TASKS: + raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) + + framework = framework or get_framework(model) + + targeted_task = SUPPORTED_TASKS[task] + task_class, model_class = targeted_task["impl"], targeted_task[framework] + + # Use default model/config/tokenizer for the task if no model is provided + if model is None: + model = targeted_task["default"]["model"][framework] + + # Try to infer tokenizer from model or config name (if provided as str) + if tokenizer is None: + if isinstance(model, str): + tokenizer = model + elif isinstance(config, str): + tokenizer = config + else: + # Impossible to guess which tokenizer to use here + raise Exception( + "Impossible to guess which tokenizer to use. " + "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer." + ) + + modelcard = None + # Try to infer modelcard from model or config name (if provided as str) + if isinstance(model, str): + modelcard = model + elif isinstance(config, str): + modelcard = config + + # Instantiate tokenizer if needed + if isinstance(tokenizer, (str, tuple)): + if isinstance(tokenizer, tuple): + # For tuple we have (tokenizer name, {kwargs}) + tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1]) + else: + tokenizer = AutoTokenizer.from_pretrained(tokenizer) + + # Instantiate config if needed + if isinstance(config, str): + config = AutoConfig.from_pretrained(config) + + # Instantiate modelcard if needed + if isinstance(modelcard, str): + modelcard = ModelCard.from_pretrained(modelcard) + + # Instantiate model if needed + if isinstance(model, str): + # Handle transparent TF/PT model conversion + model_kwargs = {} + if framework == "pt" and model.endswith(".h5"): + model_kwargs["from_tf"] = True + logger.warning( + "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. " + "Trying to load the model with PyTorch." + ) + elif framework == "tf" and model.endswith(".bin"): + model_kwargs["from_pt"] = True + logger.warning( + "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. " + "Trying to load the model with TensorFlow." 
+ ) + model = model_class.from_pretrained(model, config=config, **model_kwargs) + + return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs) diff --git a/requirements.txt b/requirements.txt index 533661e02..9a409b94e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -farm==0.4.5 +farm==0.4.6 --find-links=https://download.pytorch.org/whl/torch_stable.html fastapi uvicorn diff --git a/test/conftest.py b/test/conftest.py index b286fb71c..7a18a4488 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,3 +1,4 @@ + import tarfile import time import urllib.request @@ -10,6 +11,7 @@ from elasticsearch import Elasticsearch from haystack.reader.farm import FARMReader from haystack.reader.transformers import TransformersReader +from haystack.database.base import Document from haystack.database.sql import SQLDocumentStore from haystack.database.memory import InMemoryDocumentStore from haystack.database.elasticsearch import ElasticsearchDocumentStore @@ -72,6 +74,39 @@ def reader(request): use_gpu=-1) +# TODO Fix bug in test_no_answer_output when using +# @pytest.fixture(params=["farm", "transformers"]) +@pytest.fixture(params=["farm"]) +def no_answer_reader(request): + if request.param == "farm": + return FARMReader(model_name_or_path="deepset/roberta-base-squad2", + use_gpu=False, top_k_per_sample=5, no_ans_boost=0, num_processes=0) + if request.param == "transformers": + return TransformersReader(model="deepset/roberta-base-squad2", + tokenizer="deepset/roberta-base-squad2", + use_gpu=-1, n_best_per_passage=5) + + +@pytest.fixture() +def prediction(reader, test_docs_xs): + docs = [] + for d in test_docs_xs: + doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"]) + docs.append(doc) + prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5) + return prediction + + +@pytest.fixture() +def no_answer_prediction(no_answer_reader, test_docs_xs): + docs = [] + for d in test_docs_xs: + doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"]) + docs.append(doc) + prediction = no_answer_reader.predict(question="What is the meaning of life?", documents=docs, top_k=5) + return prediction + + @pytest.fixture(params=["sql", "memory", "elasticsearch"]) def document_store_with_docs(request, test_docs_xs, elasticsearch_fixture): if request.param == "sql": diff --git a/test/test_finder.py b/test/test_finder.py index 236210ad7..7bbfde5dd 100644 --- a/test/test_finder.py +++ b/test/test_finder.py @@ -28,17 +28,17 @@ def test_finder_offsets(reader, document_store_with_docs): top_k_reader=5) assert prediction["answers"][0]["offset_start"] == 11 - #TODO enable again when FARM is upgraded incl. 
the new offset calc -# assert prediction["answers"][0]["offset_end"] == 16 + assert prediction["answers"][0]["offset_end"] == 16 start = prediction["answers"][0]["offset_start"] end = prediction["answers"][0]["offset_end"] - #assert prediction["answers"][0]["context"][start:end] == prediction["answers"][0]["answer"] + assert prediction["answers"][0]["context"][start:end] == prediction["answers"][0]["answer"] def test_finder_get_answers_single_result(reader, document_store_with_docs): retriever = TfidfRetriever(document_store=document_store_with_docs) finder = Finder(reader, retriever) - prediction = finder.get_answers(question="testing finder", top_k_retriever=1, + query = "testing finder" + prediction = finder.get_answers(question=query, top_k_retriever=1, top_k_reader=1) assert prediction is not None assert len(prediction["answers"]) == 1 diff --git a/test/test_reader.py b/test/test_reader.py index b0482282e..e6cdd062f 100644 --- a/test/test_reader.py +++ b/test/test_reader.py @@ -1,7 +1,10 @@ -import pytest +import math -from haystack.reader.base import BaseReader from haystack.database.base import Document +from haystack.reader.base import BaseReader +from haystack.reader.farm import FARMReader +from haystack.reader.transformers import TransformersReader + def test_reader_basic(reader): @@ -9,20 +12,89 @@ def test_reader_basic(reader): assert isinstance(reader, BaseReader) -def test_output(reader, test_docs_xs): +def test_output(prediction): + assert prediction is not None + assert prediction["question"] == "Who lives in Berlin?" + assert prediction["answers"][0]["answer"] == "Carla" + assert prediction["answers"][0]["offset_start"] == 11 + assert prediction["answers"][0]["offset_end"] == 16 + assert prediction["answers"][0]["probability"] <= 1 + assert prediction["answers"][0]["probability"] >= 0 + assert prediction["answers"][0]["context"] == "My name is Carla and I live in Berlin" + assert prediction["answers"][0]["document_id"] == "filename1" + assert len(prediction["answers"]) == 5 + + +def test_no_answer_output(no_answer_prediction): + assert no_answer_prediction is not None + assert no_answer_prediction["question"] == "What is the meaning of life?" + assert math.isclose(no_answer_prediction["no_ans_gap"], -14.4729533, rel_tol=0.0001) + assert no_answer_prediction["answers"][0]["answer"] is None + assert no_answer_prediction["answers"][0]["offset_start"] == 0 + assert no_answer_prediction["answers"][0]["offset_end"] == 0 + assert no_answer_prediction["answers"][0]["probability"] <= 1 + assert no_answer_prediction["answers"][0]["probability"] >= 0 + assert no_answer_prediction["answers"][0]["context"] == None + assert no_answer_prediction["answers"][0]["document_id"] == None + answers = [x["answer"] for x in no_answer_prediction["answers"]] + assert answers.count(None) == 1 + assert len(no_answer_prediction["answers"]) == 5 + +# TODO Directly compare farm and transformers reader outputs +# TODO checks to see that model is responsive to input arguments e.g. 
context_window_size - topk + +def test_prediction_attributes(prediction): + # TODO FARM's prediction also has no_ans_gap + attributes_gold = ["question", "answers"] + for ag in attributes_gold: + assert ag in prediction + + +def test_answer_attributes(prediction): + # TODO Transformers answer also has meta key + # TODO FARM answer has offset_start_in_doc, offset_end_in_doc + answer = prediction["answers"][0] + attributes_gold = ['answer', 'score', 'probability', 'context', 'offset_start', 'offset_end', 'document_id'] + for ag in attributes_gold: + assert ag in answer + + +def test_context_window_size(test_docs_xs): + # TODO parametrize window_size and farm/transformers reader using pytest docs = [] for d in test_docs_xs: doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"]) docs.append(doc) - results = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5) - assert results is not None - assert results["question"] == "Who lives in Berlin?" - assert results["answers"][0]["answer"] == "Carla" - assert results["answers"][0]["offset_start"] == 11 - #TODO enable again when FARM is upgraded incl. the new offset calc - # assert results["answers"][0]["offset_end"] == 16 - assert results["answers"][0]["probability"] <= 1 - assert results["answers"][0]["probability"] >= 0 - assert results["answers"][0]["context"] == "My name is Carla and I live in Berlin" - assert results["answers"][0]["document_id"] == "filename1" - assert len(results["answers"]) == 5 + for window_size in [10, 15, 20]: + farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", + use_gpu=False, top_k_per_sample=5, no_ans_boost=None, context_window_size=window_size) + prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5) + for answer in prediction["answers"]: + # If the extracted answer is larger than the context window, the context window is expanded. 
+ # If the extracted answer is odd in length, the resulting context window is one less than context_window_size + # due to rounding (see FARM's QACandidate) + # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different + if len(answer["answer"]) <= window_size: + assert len(answer["context"]) in [window_size, window_size-1] + else: + assert len(answer["answer"]) == len(answer["context"]) + + # TODO Need to test transformers reader + # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different + + + def test_top_k(test_docs_xs): + # TODO parametrize top_k and farm/transformers reader using pytest + # TODO transformers reader was crashing when tested on this + docs = [] + for d in test_docs_xs: + doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"]) + docs.append(doc) + farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", + use_gpu=False, top_k_per_sample=4, no_ans_boost=None, top_k_per_candidate=4) + for top_k in [2, 5, 10]: + prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k) + assert len(prediction["answers"]) == top_k + + + + diff --git a/tutorials/Tutorial5_Evaluation.py b/tutorials/Tutorial5_Evaluation.py index 483cec829..33c5b2006 100644 --- a/tutorials/Tutorial5_Evaluation.py +++ b/tutorials/Tutorial5_Evaluation.py @@ -76,8 +76,8 @@ if eval_reader_only: # Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch #reader_eval_results = reader.eval_on_file("../data/natural_questions", "dev_subset.json", device=device) - ## Reader Top-N-Recall is the proportion of predicted answers that overlap with their corresponding correct answer - print("Reader Top-N-Recall:", reader_eval_results["top_n_recall"]) + ## Reader Top-N-Accuracy is the proportion of predicted answers that match their corresponding correct answer + print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"]) ## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer print("Reader Exact Match:", reader_eval_results["EM"]) ## Reader F1-Score is the average overlap between the predicted answers and the correct answers
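
For reference, the reader output that the new tests assert can be consumed roughly as follows. This is a minimal sketch, not part of the patch: the model name and example document are the ones used in conftest.py and test_reader.py, the num_processes=0 setting mirrors the no_answer_reader fixture, and the offset check mirrors the assertion re-enabled in test_finder.py.

    from haystack.database.base import Document
    from haystack.reader.farm import FARMReader

    # Same model and document as in the tests above
    reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
                        use_gpu=False, num_processes=0)
    docs = [Document(id="filename1", text="My name is Carla and I live in Berlin", meta={"name": "filename1"})]

    prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=3)
    for answer in prediction["answers"]:
        if answer["answer"] is None:  # the "no answer" candidate carries no span or context
            continue
        start, end = answer["offset_start"], answer["offset_end"]
        # With the new FARM offset calculation the context slice reproduces the answer text
        assert answer["context"][start:end] == answer["answer"]
        print(answer["answer"], answer["score"], answer["document_id"])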
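
The span decoding vendored in transformers_utils.py scores every (start, end) pair as the product of the start and end probabilities, then uses np.triu/np.tril to discard spans that end before they start or run longer than max_answer_len. A toy, standalone sketch of that trick with made-up probabilities (illustration only, not code from the patch):

    import numpy as np

    # Toy start/end probabilities over 5 tokens (already normalized), batch size 1
    start = np.array([[0.1, 0.6, 0.1, 0.1, 0.1]])
    end = np.array([[0.1, 0.1, 0.2, 0.5, 0.1]])
    max_answer_len = 3

    # Score every (start, end) pair as the product of the two probabilities
    outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
    # Keep only spans with end >= start and at most max_answer_len tokens
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    best_start, best_end = np.unravel_index(np.argmax(candidates), candidates.shape)[1:]
    # best_start == 1, best_end == 3: tokens 1..3 form the highest-scoring admissible span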