mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-24 21:48:52 +00:00
Deprecate Tags for Document Stores (#286)
This commit is contained in:
parent
6a103252ef
commit
5937f9cf16
@ -9,7 +9,6 @@ class Document:
|
||||
query_score: Optional[float] = None,
|
||||
question: Optional[str] = None,
|
||||
meta: Dict[str, Any] = None,
|
||||
tags: Optional[Dict[str, Any]] = None,
|
||||
embedding: Optional[List[float]] = None):
|
||||
"""
|
||||
Object used to represent documents / passages in a standardized way within Haystack.
|
||||
@ -24,7 +23,6 @@ class Document:
|
||||
:param query_score: Retriever's query score for a retrieved document
|
||||
:param question: Question text for FAQs.
|
||||
:param meta: Meta fields for a document like name, url, or author.
|
||||
:param tags: Tags that allow filtering of the data
|
||||
:param embedding: Vector encoding of the text
|
||||
"""
|
||||
|
||||
@ -38,7 +36,6 @@ class Document:
|
||||
self.query_score = query_score
|
||||
self.question = question
|
||||
self.meta = meta
|
||||
self.tags = tags # deprecate?
|
||||
self.embedding = embedding
|
||||
|
||||
def to_dict(self):
|
||||
@ -47,7 +44,7 @@ class Document:
|
||||
@classmethod
|
||||
def from_dict(cls, dict):
|
||||
_doc = dict.copy()
|
||||
init_args = ["text", "id", "query_score", "question", "meta", "tags", "embedding"]
|
||||
init_args = ["text", "id", "query_score", "question", "meta", "embedding"]
|
||||
if "meta" not in _doc.keys():
|
||||
_doc["meta"] = {}
|
||||
# copy additional fields into "meta"
|
||||
@ -110,14 +107,15 @@ class BaseDocumentStore(ABC):
|
||||
Base class for implementing Document Stores.
|
||||
"""
|
||||
index: Optional[str]
|
||||
label_index: Optional[str]
|
||||
|
||||
@abstractmethod
|
||||
def write_documents(self, documents: List[dict], index: Optional[str] = None):
|
||||
"""
|
||||
Indexes documents for later queries.
|
||||
|
||||
:param documents: List of dictionaries.
|
||||
Default format: {"text": "<the-actual-text>"}
|
||||
:param documents: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
|
||||
Optionally: Include meta data via {"text": "<the-actual-text>",
|
||||
"meta":{"name": "<some-document-name>", "author": "somebody", ...}}
|
||||
It can be used for filtering and is accessible in the responses of the Finder.
|
||||
@ -129,21 +127,17 @@ class BaseDocumentStore(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_all_documents(self, index: Optional[str] = None) -> List[Document]:
|
||||
def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_all_labels(self, index: str = "label", filters: Optional[dict] = None) -> List[Label]:
|
||||
def get_all_labels(self, index: str = "label", filters: Optional[Optional[Dict[str, List[str]]]] = None) -> List[Label]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_document_ids_by_tags(self, tag, index) -> List[str]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_document_count(self, index: Optional[str] = None) -> int:
|
||||
pass
|
||||
@ -151,7 +145,7 @@ class BaseDocumentStore(ABC):
|
||||
@abstractmethod
|
||||
def query_by_embedding(self,
|
||||
query_emb: List[float],
|
||||
filters: Optional[dict] = None,
|
||||
filters: Optional[Optional[Dict[str, List[str]]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None) -> List[Document]:
|
||||
pass
|
||||
|
||||
@ -9,6 +9,7 @@ import numpy as np
|
||||
|
||||
from haystack.database.base import BaseDocumentStore, Document, Label
|
||||
from haystack.indexing.utils import eval_data_from_file
|
||||
from haystack.retriever.base import BaseRetriever
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -89,7 +90,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
self.index: str = index
|
||||
|
||||
self._create_label_index(label_index)
|
||||
self.label_index = label_index
|
||||
self.label_index: str = label_index
|
||||
self.update_existing_documents = update_existing_documents
|
||||
|
||||
def _create_document_index(self, index_name):
|
||||
@ -136,17 +137,6 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
document = self._convert_es_hit_to_document(result[0]) if result else None
|
||||
return document
|
||||
|
||||
def get_document_ids_by_tags(self, tags: dict, index: Optional[str]) -> List[str]:
|
||||
index = index or self.index
|
||||
term_queries = [{"terms": {key: value}} for key, value in tags.items()]
|
||||
query = {"query": {"bool": {"must": term_queries}}}
|
||||
logger.debug(f"Tag filter query: {query}")
|
||||
result = self.client.search(index=index, body=query, size=10000)["hits"]["hits"]
|
||||
doc_ids = []
|
||||
for hit in result:
|
||||
doc_ids.append(hit["_id"])
|
||||
return doc_ids
|
||||
|
||||
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
|
||||
"""
|
||||
Indexes documents for later queries in Elasticsearch.
|
||||
@ -198,7 +188,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
documents_to_index.append(_doc)
|
||||
bulk(self.client, documents_to_index, request_timeout=300, refresh="wait_for")
|
||||
|
||||
def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = "label"):
|
||||
def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[str] = None):
|
||||
index = index or self.label_index
|
||||
if index and not self.client.indices.exists(index=index):
|
||||
self._create_label_index(index)
|
||||
|
||||
@ -230,7 +221,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
def get_label_count(self, index: Optional[str] = None) -> int:
|
||||
return self.get_document_count(index=index)
|
||||
|
||||
def get_all_documents(self, index: Optional[str] = None, filters: Optional[dict] = None) -> List[Document]:
|
||||
def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
|
||||
if index is None:
|
||||
index = self.index
|
||||
|
||||
@ -239,12 +230,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
|
||||
return documents
|
||||
|
||||
def get_all_labels(self, index: str = "label", filters: Optional[dict] = None) -> List[Label]:
|
||||
def get_all_labels(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
|
||||
index = index or self.label_index
|
||||
result = self.get_all_documents_in_index(index=index, filters=filters)
|
||||
labels = [Label.from_dict(hit["_source"]) for hit in result]
|
||||
return labels
|
||||
|
||||
def get_all_documents_in_index(self, index: str, filters: Optional[dict] = None) -> List[dict]:
|
||||
def get_all_documents_in_index(self, index: str, filters: Optional[Dict[str, List[str]]] = None) -> List[dict]:
|
||||
body = {
|
||||
"query": {
|
||||
"bool": {
|
||||
@ -346,7 +338,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
|
||||
def query_by_embedding(self,
|
||||
query_emb: np.array,
|
||||
filters: Optional[dict] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None) -> List[Document]:
|
||||
if index is None:
|
||||
@ -392,7 +384,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
|
||||
def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> Document:
|
||||
# We put all additional data of the doc into meta_data and return it in the API
|
||||
meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field, "tags")}
|
||||
meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
|
||||
meta_data["name"] = meta_data.pop(self.name_field, None)
|
||||
|
||||
document = Document(
|
||||
@ -401,7 +393,6 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
meta=meta_data,
|
||||
query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
|
||||
question=hit["_source"].get(self.faq_question_field),
|
||||
tags=hit["_source"].get("tags"),
|
||||
embedding=hit["_source"].get(self.embedding_field)
|
||||
)
|
||||
return document
|
||||
@ -420,12 +411,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
}
|
||||
return stats
|
||||
|
||||
def update_embeddings(self, retriever, index=None):
|
||||
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
|
||||
"""
|
||||
Updates the embeddings in the document store using the encoding model specified in the retriever.
|
||||
This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
|
||||
|
||||
:param retriever: Retriever
|
||||
:param index: Index name to update
|
||||
:return: None
|
||||
"""
|
||||
if index is None:
|
||||
@ -439,7 +431,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
|
||||
#TODO Index embeddings every X batches to avoid OOM for huge document collections
|
||||
logger.info(f"Updating embeddings for {len(passages)} docs ...")
|
||||
embeddings = retriever.embed_passages(passages)
|
||||
embeddings = retriever.embed_passages(passages) # type: ignore
|
||||
|
||||
assert len(docs) == len(embeddings)
|
||||
|
||||
|
||||
@ -12,7 +12,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
|
||||
def __init__(self, embedding_field: Optional[str] = None):
|
||||
self.doc_tags: Dict[str, Any] = {}
|
||||
self.indexes: Dict[str, Dict] = defaultdict(dict)
|
||||
self.index: str = "document"
|
||||
self.label_index: str = "label"
|
||||
@ -22,10 +21,11 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
Indexes documents for later queries.
|
||||
|
||||
|
||||
:param documents: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
:param documents: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
|
||||
Optionally, you can also supply "tags": ["one-tag", "another-one"]
|
||||
or additional meta data via "meta": {"name": "<some-document-name>, "author": "someone", "url":"some-url" ...}
|
||||
Optionally: Include meta data via {"text": "<the-actual-text>",
|
||||
"meta": {"name": "<some-document-name>", "author": "somebody", ...}}
|
||||
It can be used for filtering and is accessible in the responses of the Finder.
|
||||
:param index: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a
|
||||
separate index than the documents for search.
|
||||
:return: None
|
||||
@ -37,10 +37,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
for document in documents_objects:
|
||||
self.indexes[index][document.id] = document
|
||||
|
||||
#TODO fix tags after id refactoring
|
||||
tags = document.tags
|
||||
self._map_tags_to_ids(document.id, tags)
|
||||
|
||||
def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
|
||||
index = index or self.label_index
|
||||
label_objects = [Label.from_dict(l) if isinstance(l, dict) else l for l in labels]
|
||||
@ -49,21 +45,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
label_id = str(uuid4())
|
||||
self.indexes[index][label_id] = label
|
||||
|
||||
def _map_tags_to_ids(self, hash: str, tags: List[str]):
|
||||
if isinstance(tags, list):
|
||||
for tag in tags:
|
||||
if isinstance(tag, dict):
|
||||
tag_keys = tag.keys()
|
||||
for tag_key in tag_keys:
|
||||
tag_values = tag.get(tag_key, [])
|
||||
if tag_values:
|
||||
for tag_value in tag_values:
|
||||
comp_key = str((tag_key, tag_value))
|
||||
if comp_key in self.doc_tags:
|
||||
self.doc_tags[comp_key].append(hash)
|
||||
else:
|
||||
self.doc_tags[comp_key] = [hash]
|
||||
|
||||
def get_document_by_id(self, id: str, index: Optional[str] = None) -> Document:
|
||||
index = index or self.index
|
||||
return self.indexes[index][id]
|
||||
@ -79,7 +60,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
|
||||
def query_by_embedding(self,
|
||||
query_emb: List[float],
|
||||
filters: Optional[dict] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None) -> List[Document]:
|
||||
|
||||
@ -116,44 +97,36 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
#TODO
|
||||
raise NotImplementedError("update_embeddings() is not yet implemented for this DocumentStore")
|
||||
|
||||
def get_document_ids_by_tags(self, tags: Union[List[Dict[str, Union[str, List[str]]]], Dict[str, Union[str, List[str]]]], index: Optional[str] = None) -> List[str]:
|
||||
"""
|
||||
The format for the dict is {"tag-1": "value-1", "tag-2": "value-2" ...}
|
||||
The format for the dict is {"tag-1": ["value-1","value-2"], "tag-2": ["value-3]" ...}
|
||||
"""
|
||||
index = index or self.index
|
||||
if not isinstance(tags, list):
|
||||
tags = [tags]
|
||||
result = self._find_ids_by_tags(tags, index=index)
|
||||
return result
|
||||
|
||||
def _find_ids_by_tags(self, tags: List[Dict[str, Union[str, List[str]]]], index: str):
|
||||
result = []
|
||||
for tag in tags:
|
||||
tag_keys = tag.keys()
|
||||
for tag_key in tag_keys:
|
||||
tag_values = tag.get(tag_key, None)
|
||||
if tag_values:
|
||||
for tag_value in tag_values:
|
||||
comp_key = str((tag_key, tag_value))
|
||||
doc_ids = self.doc_tags.get(comp_key, [])
|
||||
for doc_id in doc_ids:
|
||||
result.append(self.indexes[index].get(doc_id))
|
||||
return result
|
||||
|
||||
def get_document_count(self, index=None) -> int:
|
||||
def get_document_count(self, index: Optional[str] = None) -> int:
|
||||
index = index or self.index
|
||||
return len(self.indexes[index].items())
|
||||
|
||||
def get_label_count(self, index=None) -> int:
|
||||
def get_label_count(self, index: Optional[str] = None) -> int:
|
||||
index = index or self.label_index
|
||||
return len(self.indexes[index].items())
|
||||
|
||||
def get_all_documents(self, index=None) -> List[Document]:
|
||||
def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
|
||||
index = index or self.index
|
||||
return list(self.indexes[index].values())
|
||||
documents = list(self.indexes[index].values())
|
||||
filtered_documents = []
|
||||
|
||||
def get_all_labels(self, index=None, filters=None) -> List[Label]:
|
||||
if filters:
|
||||
for doc in documents:
|
||||
is_hit = True
|
||||
for key, values in filters.items():
|
||||
if doc.meta.get(key):
|
||||
if doc.meta[key] not in values:
|
||||
is_hit = False
|
||||
else:
|
||||
is_hit = False
|
||||
if is_hit:
|
||||
filtered_documents.append(doc)
|
||||
else:
|
||||
filtered_documents = documents
|
||||
|
||||
return filtered_documents
|
||||
|
||||
def get_all_labels(self, index: str = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
|
||||
index = index or self.label_index
|
||||
|
||||
if filters:
|
||||
@ -172,7 +145,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
|
||||
return result
|
||||
|
||||
def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "label"):
|
||||
def add_eval_data(self, filename: str, doc_index: Optional[str] = None, label_index: Optional[str] = None):
|
||||
"""
|
||||
Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
|
||||
|
||||
@ -185,10 +158,12 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
|
||||
docs, labels = eval_data_from_file(filename)
|
||||
doc_index = doc_index or self.index
|
||||
label_index = label_index or self.label_index
|
||||
self.write_documents(docs, index=doc_index)
|
||||
self.write_labels(labels, index=label_index)
|
||||
|
||||
def delete_all_documents(self, index=None):
|
||||
def delete_all_documents(self, index: Optional[str] = None):
|
||||
"""
|
||||
Delete all documents in an index.
|
||||
|
||||
@ -197,4 +172,4 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
|
||||
index = index or self.index
|
||||
self.indexes[index] = {}
|
||||
self.indexes[index] = {}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
from typing import Any, Dict, Union, List, Optional
|
||||
from uuid import uuid4
|
||||
|
||||
from sqlalchemy import create_engine, Column, Integer, String, DateTime, func, ForeignKey, PickleType, Boolean
|
||||
from sqlalchemy import create_engine, Column, Integer, String, DateTime, func, ForeignKey, Boolean
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import relationship, sessionmaker
|
||||
|
||||
@ -14,7 +14,7 @@ Base = declarative_base() # type: Any
|
||||
class ORMBase(Base):
|
||||
__abstract__ = True
|
||||
|
||||
id = Column(String, primary_key=True)
|
||||
id = Column(String, default=lambda: str(uuid4()), primary_key=True)
|
||||
created = Column(DateTime, server_default=func.now())
|
||||
updated = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
|
||||
|
||||
@ -24,25 +24,24 @@ class DocumentORM(ORMBase):
|
||||
|
||||
text = Column(String, nullable=False)
|
||||
index = Column(String, nullable=False)
|
||||
meta_data = Column(PickleType)
|
||||
|
||||
tags = relationship("TagORM", secondary="document_tag", backref="Document")
|
||||
meta = relationship("MetaORM", secondary="document_meta", backref="Document")
|
||||
|
||||
|
||||
class TagORM(ORMBase):
|
||||
__tablename__ = "tag"
|
||||
class MetaORM(ORMBase):
|
||||
__tablename__ = "meta"
|
||||
|
||||
name = Column(String)
|
||||
value = Column(String)
|
||||
|
||||
documents = relationship(DocumentORM, secondary="document_tag", backref="Tag")
|
||||
documents = relationship(DocumentORM, secondary="document_meta", backref="Meta")
|
||||
|
||||
|
||||
class DocumentTagORM(ORMBase):
|
||||
__tablename__ = "document_tag"
|
||||
class DocumentMetaORM(ORMBase):
|
||||
__tablename__ = "document_meta"
|
||||
|
||||
document_id = Column(String, ForeignKey("document.id"), nullable=False)
|
||||
tag_id = Column(Integer, ForeignKey("tag.id"), nullable=False)
|
||||
meta_id = Column(Integer, ForeignKey("meta.id"), nullable=False)
|
||||
|
||||
|
||||
class LabelORM(ORMBase):
|
||||
@ -75,11 +74,17 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
document = document_row or self._convert_sql_row_to_document(document_row)
|
||||
return document
|
||||
|
||||
def get_all_documents(self, index=None) -> List[Document]:
|
||||
def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
|
||||
index = index or self.index
|
||||
document_rows = self.session.query(DocumentORM).filter_by(index=index).all()
|
||||
documents = [self._convert_sql_row_to_document(row) for row in document_rows]
|
||||
|
||||
if filters:
|
||||
for key, values in filters.items():
|
||||
results = self.session.query(DocumentORM).filter(DocumentORM.meta.any(MetaORM.name.in_([key]))).\
|
||||
filter(DocumentORM.meta.any(MetaORM.value.in_(values))).all()
|
||||
else:
|
||||
results = self.session.query(DocumentORM).filter_by(index=index).all()
|
||||
|
||||
documents = [self._convert_sql_row_to_document(row) for row in results]
|
||||
return documents
|
||||
|
||||
def get_all_labels(self, index=None, filters: Optional[dict] = None):
|
||||
@ -89,45 +94,15 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
|
||||
return labels
|
||||
|
||||
def get_document_ids_by_tags(self, tags: Dict[str, Union[str, List]], index: Optional[str] = None) -> List[str]:
|
||||
"""
|
||||
Get list of document ids that have tags from the given list of tags.
|
||||
|
||||
:param tags: limit scope to documents having the given tags and their corresponding values.
|
||||
The format for the dict is {"tag-1": "value-1", "tag-2": "value-2" ...}
|
||||
"""
|
||||
if not tags:
|
||||
raise Exception("No tag supplied for filtering the documents")
|
||||
|
||||
if index:
|
||||
raise Exception("'index' parameter is not supported in SQLDocumentStore.get_document_ids_by_tags().")
|
||||
|
||||
query = """
|
||||
SELECT id FROM document WHERE id in (
|
||||
SELECT dt.document_id
|
||||
FROM document_tag dt JOIN
|
||||
tag t
|
||||
ON t.id = dt.tag_id
|
||||
GROUP BY dt.document_id
|
||||
"""
|
||||
tag_filters = []
|
||||
for tag in tags:
|
||||
tag_filters.append(f"SUM(CASE WHEN t.value='{tag}' THEN 1 ELSE 0 END) > 0")
|
||||
|
||||
final_query = f"{query} HAVING {' AND '.join(tag_filters)});"
|
||||
query_results = self.session.execute(final_query)
|
||||
|
||||
doc_ids = [row[0] for row in query_results]
|
||||
return doc_ids
|
||||
|
||||
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
|
||||
"""
|
||||
Indexes documents for later queries.
|
||||
|
||||
:param documents: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
:param documents: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
|
||||
Optionally, you can also supply "tags": ["one-tag", "another-one"]
|
||||
or additional meta data via "meta": {"name": "<some-document-name>, "author": "someone", "url":"some-url" ...}
|
||||
Optionally: Include meta data via {"text": "<the-actual-text>",
|
||||
"meta":{"name": "<some-document-name>", "author": "somebody", ...}}
|
||||
It can be used for filtering and is accessible in the responses of the Finder.
|
||||
:param index: add an optional index attribute to documents. It can be later used for filtering. For instance,
|
||||
documents for evaluation can be indexed in a separate index than the documents for search.
|
||||
|
||||
@ -135,11 +110,12 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
|
||||
# Make sure we comply with the Document class format
|
||||
documents = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
|
||||
document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
|
||||
index = index or self.index
|
||||
for doc in documents:
|
||||
row = DocumentORM(id=doc.id, text=doc.text, meta_data=doc.meta, index=index) # type: ignore
|
||||
self.session.add(row)
|
||||
for doc in document_objects:
|
||||
meta_orms = [MetaORM(name=key, value=value) for key, value in doc.meta.items()]
|
||||
doc_orm = DocumentORM(id=doc.id, text=doc.text, meta=meta_orms, index=index)
|
||||
self.session.add(doc_orm)
|
||||
self.session.commit()
|
||||
|
||||
def write_labels(self, labels, index=None):
|
||||
@ -148,7 +124,6 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
index = index or self.index
|
||||
for label in labels:
|
||||
label_orm = LabelORM(
|
||||
id=str(uuid4()),
|
||||
document_id=label.document_id,
|
||||
no_answer=label.no_answer,
|
||||
origin=label.origin,
|
||||
@ -163,7 +138,7 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
self.session.add(label_orm)
|
||||
self.session.commit()
|
||||
|
||||
def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "label"):
|
||||
def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label"):
|
||||
"""
|
||||
Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
|
||||
|
||||
@ -191,8 +166,7 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
document = Document(
|
||||
id=row.id,
|
||||
text=row.text,
|
||||
meta=row.meta_data,
|
||||
tags=row.tags
|
||||
meta={meta.name: meta.value for meta in row.meta}
|
||||
)
|
||||
return document
|
||||
|
||||
|
||||
@ -32,8 +32,8 @@ class Finder:
|
||||
:param question: the question string
|
||||
:param top_k_reader: number of answers returned by the reader
|
||||
:param top_k_retriever: number of text units to be retrieved
|
||||
:param filters: limit scope to documents having the given tags and their corresponding values.
|
||||
The format for the dict is {"tag-1": ["value-1","value-2"], "tag-2": ["value-3]" ...}
|
||||
:param filters: limit scope to documents having the given meta data values.
|
||||
The format for the dict is {"key-1": ["value-1", "value-2"], "key-2": ["value-3"], ...}
|
||||
:return:
|
||||
"""
|
||||
|
||||
@ -71,8 +71,8 @@ class Finder:
|
||||
|
||||
:param question: the question string
|
||||
:param top_k_retriever: number of text units to be retrieved
|
||||
:param filters: limit scope to documents having the given tags and their corresponding values.
|
||||
The format for the dict is {"tag-1": "value-1", "tag-2": "value-2" ...}
|
||||
:param filters: limit scope to documents having the given meta data values.
|
||||
The format for the dict is {"key-1": ["value-1", "value-2"], "key-2": ["value-3"], ...}
|
||||
:return:
|
||||
"""
|
||||
|
||||
|
||||
@ -1,20 +1,19 @@
|
||||
|
||||
import os
|
||||
import tarfile
|
||||
import time
|
||||
import urllib.request
|
||||
from subprocess import Popen, PIPE, STDOUT, run
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
from haystack.database.base import Document
|
||||
from haystack.database.elasticsearch import ElasticsearchDocumentStore
|
||||
from haystack.database.memory import InMemoryDocumentStore
|
||||
from haystack.database.sql import SQLDocumentStore
|
||||
from haystack.reader.farm import FARMReader
|
||||
from haystack.reader.transformers import TransformersReader
|
||||
|
||||
from haystack.database.base import Document
|
||||
from haystack.database.sql import SQLDocumentStore
|
||||
from haystack.database.memory import InMemoryDocumentStore
|
||||
from haystack.database.elasticsearch import ElasticsearchDocumentStore
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
def elasticsearch_dir(tmpdir_factory):
|
||||
@ -122,7 +121,6 @@ def document_store_with_docs(request, test_docs_xs, elasticsearch_fixture):
|
||||
document_store = ElasticsearchDocumentStore(index="haystack_test")
|
||||
assert document_store.get_document_count() == 0
|
||||
document_store.write_documents(test_docs_xs)
|
||||
time.sleep(2)
|
||||
|
||||
return document_store
|
||||
|
||||
|
||||
@ -1,21 +1,45 @@
|
||||
import pytest
|
||||
import time
|
||||
|
||||
from haystack.database.base import Document
|
||||
|
||||
|
||||
def test_get_all_documents(document_store_with_docs):
|
||||
def test_get_all_documents_without_filters(document_store_with_docs):
|
||||
documents = document_store_with_docs.get_all_documents()
|
||||
assert all(isinstance(d, Document) for d in documents)
|
||||
assert len(documents) == 3
|
||||
assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3"}
|
||||
assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3"}
|
||||
|
||||
|
||||
def test_get_all_documents_with_correct_filters(document_store_with_docs):
|
||||
documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
|
||||
assert len(documents) == 1
|
||||
assert documents[0].meta["name"] == "filename2"
|
||||
|
||||
documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]})
|
||||
assert len(documents) == 2
|
||||
assert {d.meta["name"] for d in documents} == {"filename1", "filename3"}
|
||||
assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"}
|
||||
|
||||
|
||||
def test_get_all_documents_with_incorrect_filter_name(document_store_with_docs):
|
||||
documents = document_store_with_docs.get_all_documents(filters={"incorrect_meta_field": ["test2"]})
|
||||
assert len(documents) == 0
|
||||
|
||||
|
||||
def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs):
|
||||
documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["incorrect_value"]})
|
||||
assert len(documents) == 0
|
||||
|
||||
|
||||
def test_get_documents_by_id(document_store_with_docs):
|
||||
documents = document_store_with_docs.get_all_documents()
|
||||
doc = document_store_with_docs.get_document_by_id(documents[0].id)
|
||||
assert doc.id == documents[0].id
|
||||
assert doc.text == documents[0].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
|
||||
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
|
||||
def test_elasticsearch_update_meta(document_store_with_docs):
|
||||
document = document_store_with_docs.query(query=None, filters={"name": ["filename1"]})[0]
|
||||
document_store_with_docs.update_document_meta(document.id, meta={"meta_field": "updated_meta"})
|
||||
|
||||
@ -1,85 +0,0 @@
|
||||
from haystack import Finder
|
||||
from haystack.reader.transformers import TransformersReader
|
||||
from haystack.retriever.sparse import TfidfRetriever
|
||||
|
||||
|
||||
def test_finder_get_answers_with_in_memory_store():
|
||||
test_docs = [
|
||||
{"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}},
|
||||
{"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': 'url'}},
|
||||
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}}
|
||||
]
|
||||
|
||||
from haystack.database.memory import InMemoryDocumentStore
|
||||
document_store = InMemoryDocumentStore()
|
||||
document_store.write_documents(test_docs)
|
||||
|
||||
retriever = TfidfRetriever(document_store=document_store)
|
||||
reader = TransformersReader(model="distilbert-base-uncased-distilled-squad",
|
||||
tokenizer="distilbert-base-uncased", use_gpu=-1)
|
||||
finder = Finder(reader, retriever)
|
||||
prediction = finder.get_answers(question="testing finder", top_k_retriever=10,
|
||||
top_k_reader=5)
|
||||
assert prediction is not None
|
||||
|
||||
|
||||
def test_memory_store_get_by_tags():
|
||||
test_docs = [
|
||||
{"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}},
|
||||
{"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}},
|
||||
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}}
|
||||
]
|
||||
|
||||
from haystack.database.memory import InMemoryDocumentStore
|
||||
document_store = InMemoryDocumentStore()
|
||||
document_store.write_documents(test_docs)
|
||||
|
||||
docs = document_store.get_document_ids_by_tags({'has_url': 'false'})
|
||||
|
||||
assert docs == []
|
||||
|
||||
|
||||
def test_memory_store_get_by_tag_lists_union():
|
||||
test_docs = [
|
||||
{"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag2': ["1"]}]},
|
||||
{"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag1': ['1']}]},
|
||||
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]}
|
||||
]
|
||||
|
||||
from haystack.database.memory import InMemoryDocumentStore
|
||||
document_store = InMemoryDocumentStore()
|
||||
document_store.write_documents(test_docs)
|
||||
|
||||
docs = document_store.get_document_ids_by_tags({'tag2': ["1"]})
|
||||
assert docs[0].text == 'testing the finder with pyhton unit test 1'
|
||||
assert docs[1].text == 'testing the finder with pyhton unit test 3'
|
||||
assert docs[1].text == 'testing the finder with pyhton unit test 3'
|
||||
assert docs[1].tags[0] == {"tag2": ["1", "2"]}
|
||||
|
||||
def test_memory_store_get_by_tag_lists_non_existent_tag():
|
||||
test_docs = [
|
||||
{"text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url', "name": "testing the finder 1"}, 'tags': [{'tag1': ["1"]}]},
|
||||
]
|
||||
from haystack.database.memory import InMemoryDocumentStore
|
||||
document_store = InMemoryDocumentStore()
|
||||
document_store.write_documents(test_docs)
|
||||
docs = document_store.get_document_ids_by_tags({'tag1': ["3"]})
|
||||
assert docs == []
|
||||
|
||||
|
||||
def test_memory_store_get_by_tag_lists_disjoint():
|
||||
test_docs = [
|
||||
{"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
|
||||
{"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag2': ['1']}]},
|
||||
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]},
|
||||
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 4", 'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]}
|
||||
]
|
||||
|
||||
from haystack.database.memory import InMemoryDocumentStore
|
||||
document_store = InMemoryDocumentStore()
|
||||
document_store.write_documents(test_docs)
|
||||
|
||||
docs = document_store.get_document_ids_by_tags({'tag3': ["3"]})
|
||||
assert len(docs) == 1
|
||||
assert docs[0].text == 'testing the finder with pyhton unit test 3'
|
||||
assert docs[0].tags[0] == {"tag3": ["1", "3"]}
|
||||
Loading…
x
Reference in New Issue
Block a user