# haystack/haystack/reader/transformers.py

from typing import List, Optional

from transformers import pipeline

from haystack.database.base import Document
from haystack.reader.base import BaseReader


class TransformersReader(BaseReader):
    """
    Transformer-based model for extractive Question Answering using Hugging Face's transformers framework
    (https://github.com/huggingface/transformers).
    While the underlying model can vary (BERT, RoBERTa, DistilBERT, ...), the interface remains the same.

    With this reader, you can:
    - directly get predictions via predict()
    """
    def __init__(
        self,
        model: str = "distilbert-base-uncased-distilled-squad",
        tokenizer: str = "distilbert-base-uncased",
        context_window_size: int = 30,
        use_gpu: int = 0,
        n_best_per_passage: int = 2,
    ):
        """
        Load a QA model from Transformers.
        Available models include:
        - distilbert-base-uncased-distilled-squad
        - bert-large-cased-whole-word-masking-finetuned-squad
        - bert-large-uncased-whole-word-masking-finetuned-squad

        See https://huggingface.co/models for a full list of available QA models.

        :param model: name of the model
        :param tokenizer: name of the tokenizer (usually the same as the model)
        :param context_window_size: number of chars (before and after the answer) to return as "context" for each answer.
                                    The context usually helps users to understand if the answer really makes sense.
        :param use_gpu: < 0 -> use cpu
                        >= 0 -> ordinal of the gpu to use
        :param n_best_per_passage: number of answer candidates to extract per passage
        """
        self.model = pipeline("question-answering", model=model, tokenizer=tokenizer, device=use_gpu)
        self.context_window_size = context_window_size
        self.n_best_per_passage = n_best_per_passage
        # TODO: add a param to modify the bias for no_answer
    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None):
        """
        Use the loaded QA model to find answers for a question in the supplied list of Documents.

        Returns a dict containing the question and a list of answers, sorted by (descending) probability.
        Example:

            {'question': 'Who is the father of Arya Stark?',
             'answers': [
                 {'answer': 'Eddard,',
                  'context': " She travels with her father, Eddard, to King's Landing when he is ",
                  'offset_answer_start': 147,
                  'offset_answer_end': 154,
                  'probability': 0.9787139466668613,
                  'score': None,
                  'document_id': None
                  },
                 ...
             ]}

        :param question: question string
        :param documents: list of Documents in which to search for the answer
        :param top_k: the maximum number of answers to return
        :return: dict containing the question and answers
        """
        # get top answers for each candidate passage
        answers = []
        for doc in documents:
            query = {"context": doc.text, "question": question}
            predictions = self.model(query, topk=self.n_best_per_passage)

            # assemble and format all answers
            for pred in predictions:
                if pred["answer"]:
                    context_start = max(0, pred["start"] - self.context_window_size)
                    context_end = min(len(doc.text), pred["end"] + self.context_window_size)
                    answers.append({
                        "answer": pred["answer"],
                        "context": doc.text[context_start:context_end],
                        "offset_answer_start": pred["start"],
                        "offset_answer_end": pred["end"],
                        "probability": pred["score"],
                        "score": None,
                        "document_id": doc.id,
                        "meta": doc.meta
                    })

        # sort answers by their `probability` and select top-k
        answers = sorted(answers, key=lambda k: k["probability"], reverse=True)
        answers = answers[:top_k]

        results = {"question": question, "answers": answers}

        return results
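

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): illustrates how the
# reader described in the docstrings above could be constructed and queried.
# The Document constructor arguments (`id`, `text`, `meta`) are assumptions
# based on the fields accessed in predict(); adjust them if the actual
# haystack.database.base.Document signature differs.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,  # -1 runs the pipeline on CPU; 0 or higher selects a GPU
    )

    docs = [
        Document(
            id="1",
            text="Arya Stark is the third child of Eddard and Catelyn Stark. "
                 "She travels with her father, Eddard, to King's Landing.",
            meta={"name": "got_wiki"},
        )
    ]

    result = reader.predict(question="Who is the father of Arya Stark?", documents=docs, top_k=1)
    for answer in result["answers"]:
        print(answer["answer"], "--", answer["context"])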