2020-12-03 10:27:06 +01:00
|
|
|
from typing import Any, Optional, Dict, List
|
2020-08-03 16:20:17 +02:00
|
|
|
from uuid import uuid4
|
2020-08-07 14:25:08 +02:00
|
|
|
import numpy as np
|
2021-03-11 12:47:10 +01:00
|
|
|
from abc import abstractmethod
|
2020-01-22 15:53:04 +01:00
|
|
|
|
2021-04-30 12:23:29 +02:00
|
|
|
|
2020-07-31 11:34:06 +02:00
|
|
|
class Document:
    def __init__(self, text: str,
                 id: Optional[str] = None,
                 score: Optional[float] = None,
                 probability: Optional[float] = None,
                 question: Optional[str] = None,
                 meta: Optional[Dict[str, Any]] = None,
                 embedding: Optional[np.ndarray] = None):
        """
        Object used to represent documents / passages in a standardized way within Haystack.
        For example, this is what the retriever will return from the DocumentStore,
        regardless if it's ElasticsearchDocumentStore or InMemoryDocumentStore.

        Note that there can be multiple Documents originating from one file (e.g. PDF),
        if you split the text into smaller passages. We'll have one Document per passage in this case.

        :param text: Text of the document
        :param id: ID used within the DocumentStore. Any falsy value (None, "", 0) is replaced
                   by a freshly generated uuid4 string; other values are converted with str().
        :param score: Retriever's query score for a retrieved document
        :param probability: a pseudo probability by scaling score in the range 0 to 1
        :param question: Question text for FAQs.
        :param meta: Meta fields for a document like name, url, or author. Defaults to an empty dict.
        :param embedding: Vector encoding of the text
        """

        self.text = text
        # Create a unique ID (either new one, or one from user input)
        if id:
            self.id = str(id)
        else:
            self.id = str(uuid4())

        self.score = score
        self.probability = probability
        self.question = question
        self.meta = meta or {}
        self.embedding = embedding

    def to_dict(self, field_map: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        """
        Export the attributes of this Document as a new dict.

        :param field_map: Mapping of custom field name -> Document attribute name
                          (the same orientation that from_dict() accepts). Attributes listed
                          as values are exported under the corresponding custom key.
        :return: dict of all instance attributes, with keys renamed according to field_map.
        """
        # Invert the mapping so it can be looked up by attribute name.
        inv_field_map = {v: k for k, v in (field_map or {}).items()}
        return {inv_field_map.get(k, k): v for k, v in self.__dict__.items()}

    @classmethod
    def from_dict(cls, dict, field_map: Optional[Dict[str, str]] = None):
        """
        Build a Document from a dict.

        Keys that are neither __init__ arguments nor listed in field_map are moved into "meta".
        The supplied dict and its "meta" entry are left unmodified.

        :param dict: Source mapping with the document fields.
        :param field_map: Mapping of custom field name -> Document attribute name, used to
                          translate custom keys of the source dict into __init__ arguments.
        :return: a new Document instance.
        """
        field_map = field_map or {}
        _doc = dict.copy()
        init_args = ["text", "id", "score", "probability", "question", "meta", "embedding"]

        # Work on a private copy of "meta": the copy above is shallow, so writing extra
        # fields into the original nested dict would leak into the caller's data.
        # A missing or None "meta" becomes an empty dict.
        source_meta = _doc.get("meta")
        meta = {**source_meta} if source_meta else {}
        _doc["meta"] = meta

        _new_doc = {}
        for k, v in _doc.items():
            if k in init_args:
                _new_doc[k] = v
            elif k in field_map:
                # translate the custom key into the matching __init__ argument
                _new_doc[field_map[k]] = v
            else:
                # additional field: keep it, but only inside "meta"
                meta[k] = v

        return cls(**_new_doc)

    def __repr__(self):
        return str(self.to_dict())

    def __str__(self):
        return str(self.to_dict())
|
2020-07-31 11:34:06 +02:00
|
|
|
|
2021-02-15 10:48:59 +01:00
|
|
|
|
2020-07-31 11:34:06 +02:00
|
|
|
class Label:
    def __init__(self, question: str,
                 answer: str,
                 is_correct_answer: bool,
                 is_correct_document: bool,
                 origin: str,
                 id: Optional[str] = None,
                 document_id: Optional[str] = None,
                 offset_start_in_doc: Optional[int] = None,
                 no_answer: Optional[bool] = None,
                 model_id: Optional[int] = None,
                 created_at: Optional[str] = None,
                 updated_at: Optional[str] = None):
        """
        Object used to represent label/feedback in a standardized way within Haystack.
        This includes labels from dataset like SQuAD, annotations from labeling tools,
        or, user-feedback from the Haystack REST API.

        :param question: the question (or query) for finding answers.
        :param answer: the answer string.
        :param is_correct_answer: whether the sample is positive or negative.
        :param is_correct_document: in case of negative sample (is_correct_answer is False), there could be two cases;
                                    incorrect answer but correct document & incorrect document. This flag denotes if
                                    the returned document was correct.
        :param origin: the source for the labels. It can be used later for filtering.
        :param id: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically.
        :param document_id: the document_store's ID for the returned answer document.
        :param offset_start_in_doc: the answer start offset in the document.
        :param no_answer: whether the question is unanswerable.
        :param model_id: model_id used for prediction (in-case of user feedback).
        :param created_at: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
                           Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
        :param updated_at: Timestamp of update with format yyyy-MM-dd HH:mm:ss.
                           Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S")
        """

        # Create a unique ID (either new one, or one from user input)
        if id:
            self.id = str(id)
        else:
            self.id = str(uuid4())

        # NOTE: the assignment order below defines the key order of to_dict() / repr().
        self.created_at = created_at
        self.updated_at = updated_at
        self.question = question
        self.answer = answer
        self.is_correct_answer = is_correct_answer
        self.is_correct_document = is_correct_document
        self.origin = origin
        self.document_id = document_id
        self.offset_start_in_doc = offset_start_in_doc
        self.no_answer = no_answer
        self.model_id = model_id

    @classmethod
    def from_dict(cls, dict):
        """
        Build a Label from a dict whose keys match the __init__ arguments.

        :param dict: mapping of __init__ argument name -> value.
        :return: a new Label instance.
        """
        return cls(**dict)

    def to_dict(self):
        """
        Export the attributes of this Label.

        :return: a new (shallow) dict copy of the instance attributes, so that editing the
                 returned dict does not alter the Label itself.
        """
        return dict(self.__dict__)

    # define __eq__ and __hash__ functions to deduplicate Label Objects
    def __eq__(self, other):
        """
        Two Labels are equal when every field except `id` matches.
        """
        return (isinstance(other, self.__class__) and
                getattr(other, 'question', None) == self.question and
                getattr(other, 'answer', None) == self.answer and
                getattr(other, 'is_correct_answer', None) == self.is_correct_answer and
                getattr(other, 'is_correct_document', None) == self.is_correct_document and
                getattr(other, 'origin', None) == self.origin and
                getattr(other, 'document_id', None) == self.document_id and
                getattr(other, 'offset_start_in_doc', None) == self.offset_start_in_doc and
                getattr(other, 'no_answer', None) == self.no_answer and
                getattr(other, 'model_id', None) == self.model_id and
                getattr(other, 'created_at', None) == self.created_at and
                getattr(other, 'updated_at', None) == self.updated_at)

    def __hash__(self):
        """
        Hash over a subset of the fields compared in __eq__ (timestamps and `id` are left out).
        """
        # Hash the raw values as a tuple instead of a concatenation of str() results:
        # values that compare equal (e.g. 1 and 1.0) then also hash equal, as __eq__ requires,
        # and a None answer no longer breaks the string concatenation.
        return hash((self.question,
                     self.answer,
                     self.is_correct_answer,
                     self.is_correct_document,
                     self.origin,
                     self.document_id,
                     self.offset_start_in_doc,
                     self.no_answer,
                     self.model_id))

    def __repr__(self):
        return str(self.to_dict())

    def __str__(self):
        return str(self.to_dict())
|
2020-08-07 11:24:41 +02:00
|
|
|
|
|
|
|
|
class MultiLabel:
    def __init__(self, question: str,
                 multiple_answers: List[str],
                 is_correct_answer: bool,
                 is_correct_document: bool,
                 origin: str,
                 multiple_document_ids: List[Any],
                 multiple_offset_start_in_docs: List[Any],
                 no_answer: Optional[bool] = None,
                 model_id: Optional[int] = None):
        """
        Object used to aggregate multiple possible answers for the same question

        :param question: the question (or query) for finding answers.
        :param multiple_answers: list of possible answer strings
        :param is_correct_answer: whether the sample is positive or negative.
        :param is_correct_document: in case of negative sample (is_correct_answer is False), there could be two cases;
                                    incorrect answer but correct document & incorrect document. This flag denotes if
                                    the returned document was correct.
        :param origin: the source for the labels. It can be used later for filtering.
        :param multiple_document_ids: the document_store's IDs for the returned answer documents.
        :param multiple_offset_start_in_docs: the answer start offsets in the document.
        :param no_answer: whether the question is unanswerable.
        :param model_id: model_id used for prediction (in-case of user feedback).
        """
        # NOTE: the assignment order below defines the key order of to_dict() / repr().
        self.question = question
        self.multiple_answers = multiple_answers
        self.is_correct_answer = is_correct_answer
        self.is_correct_document = is_correct_document
        self.origin = origin
        self.multiple_document_ids = multiple_document_ids
        self.multiple_offset_start_in_docs = multiple_offset_start_in_docs
        self.no_answer = no_answer
        self.model_id = model_id

    @classmethod
    def from_dict(cls, dict):
        """
        Build a MultiLabel from a dict whose keys match the __init__ arguments.

        :param dict: mapping of __init__ argument name -> value.
        :return: a new MultiLabel instance.
        """
        return cls(**dict)

    def to_dict(self):
        """
        Export the attributes of this MultiLabel.

        :return: a new dict copy of the instance attributes, so that adding, removing or
                 reassigning keys on the result does not alter the MultiLabel. The copy is
                 shallow: the list values are the same objects the instance holds.
        """
        return dict(self.__dict__)

    def __repr__(self):
        return str(self.to_dict())

    def __str__(self):
        return str(self.to_dict())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BaseComponent:
    """
    Common parent class for the nodes that make up a Pipeline.
    """

    outgoing_edges: int
    # Registry of every subclass, keyed by class name; filled by __init_subclass__.
    subclasses: dict = {}
    # Class-level default; set_config() replaces it with a per-instance dict on first use.
    pipeline_config: dict = {}

    def __init_subclass__(cls, **kwargs):
        """
        Register each newly defined subclass under its class name.

        The resulting registry is what load_from_args() uses to look up a component by name.
        """
        super().__init_subclass__(**kwargs)
        registry = cls.subclasses
        registry[cls.__name__] = cls

    @classmethod
    def load_from_args(cls, component_type: str, **kwargs):
        """
        Instantiate the registered component class called `component_type`.

        :param component_type: name of the component class to load.
        :param kwargs: parameters forwarded to the __init__() of that component class.
        :return: the newly created component instance.
        :raises Exception: if no subclass with that name has been registered.
        """
        component_cls = cls.subclasses.get(component_type)
        if component_cls is None:
            raise Exception(f"Haystack component with the name '{component_type}' does not exist.")
        return component_cls(**kwargs)

    @abstractmethod
    def run(self, *args: Any, **kwargs: Any):
        """
        Entry point that is executed when this node of the graph is called.

        The accepted arguments differ between node types
        (e.g. retriever nodes expect different args than a reader node).
        See an example for an implementation in haystack/reader/base/BaseReader.py

        :param kwargs:
        :return:
        """

    def set_config(self, **kwargs):
        """
        Record the init parameters of a component so they can later be used when exporting
        the YAML configuration of a Pipeline.

        Parameters whose value is None are not recorded. For a parameter that is itself a
        BaseComponent, that component's pipeline_config is stored in place of the object.

        :param kwargs: all parameters passed to the __init__() of the Component.
        """
        if not self.pipeline_config:
            # First call on this instance: start a fresh config instead of the class default.
            self.pipeline_config = {"params": {}, "type": type(self).__name__}

        for name, value in kwargs.items():
            if value is None:
                continue
            recorded = value.pipeline_config if isinstance(value, BaseComponent) else value
            self.pipeline_config["params"][name] = recorded
|