import logging
from abc import abstractmethod, ABC
from pathlib import Path
from typing import Optional, Dict, List, Union

import numpy as np

from haystack import Document, Label, MultiLabel
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl

logger = logging.getLogger(__name__)


class BaseDocumentStore(ABC):
    """
    Base class for implementing Document Stores.
    """
    index: Optional[str]
    label_index: Optional[str]
    similarity: Optional[str]

    @abstractmethod
    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Indexes documents for later queries.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally, include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>", "author": "somebody", ...}}.
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: Optional name of the index where the documents shall be written to.
                      If None, the DocumentStore's default index (self.index) will be used.

        :return: None
        """
        pass

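    # A minimal usage sketch (hypothetical; assumes a concrete DocumentStore
    # subclass has been instantiated as `document_store` elsewhere):
    #
    #   document_store.write_documents([
    #       {"text": "Haystack is a framework for building search systems.",
    #        "meta": {"name": "intro.txt", "author": "somebody"}},
    #   ])
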
    @abstractmethod
    def get_all_documents(
        self,
        index: Optional[str] = None,
        filters: Optional[Dict[str, List[str]]] = None,
        return_embedding: Optional[bool] = None
    ) -> List[Document]:
        """
        Get documents from the document store.

        :param index: Name of the index to get the documents from. If None, the
                      DocumentStore's default index (self.index) will be used.
        :param filters: Optional filters to narrow down the documents to return.
                        Example: {"name": ["some", "more"], "category": ["only_one"]}
        :param return_embedding: Whether to return the document embeddings.
        """
        pass

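    # Filter semantics sketch (illustrative values): within one key the listed
    # values are OR-ed, across keys the conditions are AND-ed. So
    #   filters={"name": ["some", "more"], "category": ["only_one"]}
    # keeps documents whose meta "name" is "some" or "more" AND whose
    # "category" is "only_one".
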
    @abstractmethod
    def get_all_labels(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
        pass

    def get_all_labels_aggregated(self,
                                  index: Optional[str] = None,
                                  filters: Optional[Dict[str, List[str]]] = None) -> List[MultiLabel]:
        """
        Return labels aggregated into MultiLabel objects: all correct answers
        to the same question are merged into a single MultiLabel.
        """
        aggregated_labels = []
        all_labels = self.get_all_labels(index=index, filters=filters)

        # Collect all answers to a question in a dict
        question_ans_dict: dict = {}
        for l in all_labels:
            # only aggregate labels with correct answers, as only those can currently be used in evaluation
            if not l.is_correct_answer:
                continue

            if l.question in question_ans_dict:
                question_ans_dict[l.question].append(l)
            else:
                question_ans_dict[l.question] = [l]

        # Aggregate labels
        for q, ls in question_ans_dict.items():
            ls = list(set(ls))  # get rid of exact duplicates
            # check if both a text answer and a "no answer" label are present
            t_present = False
            no_present = False
            no_idx = []
            for idx, l in enumerate(ls):
                if len(l.answer) == 0:
                    no_present = True
                    no_idx.append(idx)
                else:
                    t_present = True
            # if both text and no answer are present, remove the no answer labels
            if t_present and no_present:
                logger.warning(
                    f"Both a text label and a 'no answer possible' label are present for question: {ls[0].question}")
                for remove_idx in no_idx[::-1]:
                    ls.pop(remove_idx)

            # construct the aggregated MultiLabel
            for i, l in enumerate(ls):
                if i == 0:
                    agg_label = MultiLabel(question=l.question,
                                           multiple_answers=[l.answer],
                                           is_correct_answer=l.is_correct_answer,
                                           is_correct_document=l.is_correct_document,
                                           origin=l.origin,
                                           multiple_document_ids=[l.document_id],
                                           multiple_offset_start_in_docs=[l.offset_start_in_doc],
                                           no_answer=l.no_answer,
                                           model_id=l.model_id,
                                           )
                else:
                    agg_label.multiple_answers.append(l.answer)
                    agg_label.multiple_document_ids.append(l.document_id)
                    agg_label.multiple_offset_start_in_docs.append(l.offset_start_in_doc)
            aggregated_labels.append(agg_label)
        return aggregated_labels

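    # Aggregation sketch (illustrative values): two correct labels for the same
    # question, e.g. answer="Goethe" from document "d1" and answer="J. W. Goethe"
    # from document "d2", are merged into a single MultiLabel with
    # multiple_answers=["Goethe", "J. W. Goethe"] and
    # multiple_document_ids=["d1", "d2"].
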
    @abstractmethod
    def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
        pass

    @abstractmethod
    def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
        pass

    @abstractmethod
    def query_by_embedding(self,
                           query_emb: np.ndarray,
                           filters: Optional[Dict[str, List[str]]] = None,
                           top_k: int = 10,
                           index: Optional[str] = None,
                           return_embedding: Optional[bool] = None) -> List[Document]:
        pass

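    # A minimal usage sketch (hypothetical; the retriever and its embed_queries()
    # call stand in for whatever produced the query embedding):
    #
    #   query_emb = retriever.embed_queries(texts=["Who wrote Faust?"])[0]
    #   docs = document_store.query_by_embedding(query_emb, top_k=5)
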
    @abstractmethod
    def get_label_count(self, index: Optional[str] = None) -> int:
        pass

    @abstractmethod
    def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None):
        pass

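    # A minimal usage sketch (hypothetical; Label's exact constructor arguments
    # may differ between Haystack versions, so treat this as illustrative):
    #
    #   label = Label(question="Who wrote Faust?", answer="Goethe",
    #                 is_correct_answer=True, is_correct_document=True,
    #                 origin="gold_label")
    #   document_store.write_labels([label], index="label")
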
    def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
                      batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None,
                      max_docs: Optional[int] = None):
        """
        Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
        If a jsonl file and a batch_size are passed to the function, documents are loaded batchwise
        from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.

        :param filename: Name of the file containing evaluation data (json or jsonl)
        :param doc_index: Elasticsearch index where evaluation documents should be stored
        :param label_index: Elasticsearch index where labeled questions should be stored
        :param batch_size: Optional number of documents that are loaded and processed at a time.
                           When set to None (default) all documents are processed at once.
        :param preprocessor: Optional PreProcessor to preprocess evaluation documents.
                             It can be used for splitting documents into passages (and assigning labels to corresponding passages).
                             Currently the PreProcessor does not support split_by='sentence', cleaning, or split_overlap != 0.
                             When set to None (default) preprocessing is disabled.
        :param max_docs: Optional number of documents that will be loaded.
                         When set to None (default) all available eval documents are used.
        """
        # TODO improve support for PreProcessor when adding eval data
        if preprocessor is not None:
            assert preprocessor.split_by != "sentence", \
                "Split by sentence not supported.\n" \
                "Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
            assert preprocessor.split_overlap == 0, \
                "Overlapping documents are currently not supported when adding eval data.\n" \
                "Please set 'split_overlap=0' in the supplied PreProcessor."
            assert not preprocessor.clean_empty_lines, \
                "clean_empty_lines is currently not supported when adding eval data.\n" \
                "Please set 'clean_empty_lines=False' in the supplied PreProcessor."
            assert not preprocessor.clean_whitespace, \
                "clean_whitespace is currently not supported when adding eval data.\n" \
                "Please set 'clean_whitespace=False' in the supplied PreProcessor."
            assert not preprocessor.clean_header_footer, \
                "clean_header_footer is currently not supported when adding eval data.\n" \
                "Please set 'clean_header_footer=False' in the supplied PreProcessor."

        file_path = Path(filename)
        if file_path.suffix == ".json":
            if batch_size is None:
                docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
                self.write_documents(docs, index=doc_index)
                self.write_labels(labels, index=label_index)
            else:
                jsonl_filename = (file_path.parent / (file_path.stem + '.jsonl')).as_posix()
                logger.info(f"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
                            f"Converting json to jsonl to: {jsonl_filename}")
                squad_json_to_jsonl(filename, jsonl_filename)
                # pass preprocessor and max_docs along so they are not silently dropped on the recursive call
                self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size,
                                   preprocessor=preprocessor, max_docs=max_docs)

        elif file_path.suffix == ".jsonl":
            for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
                if docs:
                    self.write_documents(docs, index=doc_index)
                if labels:
                    self.write_labels(labels, index=label_index)

        else:
            logger.error("File needs to be in json or jsonl format.")

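    # A minimal usage sketch (hypothetical file path): index a SQuAD dev set for
    # evaluation, loading at most 100 documents in batches of 10.
    #
    #   document_store.add_eval_data("data/squad_dev.jsonl", doc_index="eval_document",
    #                                label_index="label", batch_size=10, max_docs=100)
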
    @abstractmethod
    def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
        pass