Move document_name attribute to meta (#217)

This commit is contained in:
Tanay Soni 2020-07-14 09:53:31 +02:00 committed by GitHub
parent 4c21556a79
commit b886e054a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 110 additions and 104 deletions

View File

@ -12,10 +12,9 @@ class Document(BaseModel):
description="id for the source file the document was created from. In the case when a large file is divided "
"across multiple Elasticsearch documents, this id can be used to reference original source file.",
)
# name: Optional[str] = Field(None, description="Title of the document")
question: Optional[str] = Field(None, description="Question text for FAQs.")
query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
meta: Dict[str, Any] = Field({}, description="")
meta: Dict[str, Any] = Field({}, description="Meta fields for a document like name, url, or author.")
tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data")
@ -30,8 +29,11 @@ class BaseDocumentStore(ABC):
"""
Indexes documents for later queries.
:param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
Optionally, further fields can be supplied depending on the child class.
:param documents: List of dictionaries.
Default format: {"text": "<the-actual-text>"}
Optionally: Include meta data via {"text": "<the-actual-text>",
"meta":{"name": "<some-document-name>", "author": "somebody", ...}}
It can be used for filtering and is accessible in the responses of the Finder.
:return: None
"""

View File

@ -117,9 +117,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
Indexes documents for later queries in Elasticsearch.
:param documents: List of dictionaries.
Default format: {"name": "<some-document-name>, "text": "<the-actual-text>"}
Optionally: Include meta data via {"name": "<some-document-name>,
"text": "<the-actual-text>", "meta":{"author": "somebody", ...}}
Default format: {"text": "<the-actual-text>"}
Optionally: Include meta data via {"text": "<the-actual-text>",
"meta":{"name": "<some-document-name>", "author": "somebody", ...}}
It can be used for filtering and is accessible in the responses of the Finder.
Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
should be changed to what you have set for self.text_field and self.name_field.

View File

@ -18,9 +18,9 @@ class InMemoryDocumentStore(BaseDocumentStore):
"""
Indexes documents for later queries.
:param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
:param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
Optionally, you can also supply "tags": ["one-tag", "another-one"]
or additional meta data via "meta": {"author": "someone", "url":"some-url" ...}
or additional meta data via "meta": {"name": "<some-document-name>", "author": "someone", "url":"some-url" ...}
:return: None
"""
@ -30,19 +30,21 @@ class InMemoryDocumentStore(BaseDocumentStore):
return
for document in documents:
name = document.get("name", None)
text = document.get("text", None)
text = document["text"]
if "meta" not in document.keys():
document["meta"] = {}
for k, v in document.items(): # put additional fields other than text in meta
if k not in ["text", "meta", "tags"]:
document["meta"][k] = v
if name is None or text is None:
continue
if not text:
raise Exception("A document cannot have empty text field.")
signature = name + text
hash = hashlib.md5(signature.encode("utf-8")).hexdigest()
hash = hashlib.md5(text.encode("utf-8")).hexdigest()
self.docs[hash] = document
tags = document.get('tags', [])
tags = document.get("tags", [])
self._map_tags_to_ids(hash, tags)
@ -65,12 +67,12 @@ class InMemoryDocumentStore(BaseDocumentStore):
document = self._convert_memory_hit_to_document(self.docs[id], doc_id=id)
return document
def _convert_memory_hit_to_document(self, hit: Tuple[Any, Any], doc_id: Optional[str] = None) -> Document:
def _convert_memory_hit_to_document(self, hit: Dict[str, Any], doc_id: Optional[str] = None) -> Document:
document = Document(
id=doc_id,
text=hit[0].get('text', None),
meta=hit[0].get('meta', {}),
query_score=hit[1],
text=hit.get("text", None),
meta=hit.get("meta", {}),
query_score=hit.get("query_score", None),
)
return document
@ -89,14 +91,21 @@ class InMemoryDocumentStore(BaseDocumentStore):
"use a different DocumentStore (e.g. ElasticsearchDocumentStore).")
if self.embedding_field is None:
return []
raise Exception(
"To use query_by_embedding() 'embedding field' must "
"be specified when initializing the document store."
)
if query_emb is None:
return []
candidate_docs = [self._convert_memory_hit_to_document(
(doc, dot(query_emb, doc[self.embedding_field]) / (norm(query_emb) * norm(doc[self.embedding_field]))), doc_id=idx) for idx, doc in self.docs.items()
]
candidate_docs = []
for idx, hit in self.docs.items():
hit["query_score"] = dot(query_emb, hit[self.embedding_field]) / (
norm(query_emb) * norm(hit[self.embedding_field])
)
_doc = self._convert_memory_hit_to_document(hit=hit, doc_id=idx)
candidate_docs.append(_doc)
return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]
@ -139,4 +148,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
return len(self.docs.items())
def get_all_documents(self) -> List[Document]:
return [Document(id=item[0], text=item[1]['text'], name=item[1]['name'], meta=item[1].get('meta', {})) for item in self.docs.items()]
return [
Document(id=item[0], text=item[1]["text"], meta=item[1].get("meta", {}))
for item in self.docs.items()
]

View File

@ -20,7 +20,6 @@ class ORMBase(Base):
class Document(ORMBase):
__tablename__ = "document"
name = Column(String)
text = Column(String)
meta_data = Column(PickleType)
@ -96,14 +95,19 @@ class SQLDocumentStore(BaseDocumentStore):
"""
Indexes documents for later queries.
:param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
:param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
Optionally, you can also supply meta data via "meta": {"author": "someone", "url":"some-url" ...}
:return: None
"""
for doc in documents:
row = Document(name=doc["name"], text=doc["text"], meta_data=doc.get("meta", {}))
if "meta" not in doc.keys():
doc["meta"] = {}
for k, v in doc.items(): # put additional fields other than text in meta
if k not in ["text", "meta", "tags"]:
doc["meta"][k] = v
row = Document(text=doc["text"], meta_data=doc.get("meta", {}))
self.session.add(row)
self.session.commit()

View File

@ -48,9 +48,9 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
documents.append({"name": path.name, "text": para})
documents.append({"text": para, "meta": {"name": path.name}})
else:
documents.append({"name": path.name, "text": text})
documents.append({"text": text, "meta": {"name": path.name}})
return documents

View File

@ -54,9 +54,10 @@ def xpdf_fixture():
@pytest.fixture()
def test_docs_xs():
return [
{"name": "filename1", "text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1"}},
{"name": "filename2", "text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2"}},
{"name": "filename3", "text": "My name is Christelle and I live in Paris", "meta": {"meta_field": "test3"}}
{"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
{"text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2", "name": "filename2"}},
{"text": "My name is Christelle and I live in Paris", "meta_field": "test3", "meta": {"name": "filename3"}}
# last doc has meta_field at the top level for backward compatibility
]

View File

@ -1,27 +1,12 @@
from time import sleep
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.utils import convert_files_to_dicts
from haystack.database.base import Document
def test_sql_write_read():
sql_document_store = SQLDocumentStore()
documents = convert_files_to_dicts(dir_path="samples/docs")
sql_document_store.write_documents(documents)
documents = sql_document_store.get_all_documents()
assert len(documents) == 2
doc = sql_document_store.get_document_by_id("1")
assert doc.id
assert doc.text
def test_elasticsearch_write_read(elasticsearch_fixture):
document_store = ElasticsearchDocumentStore()
documents = convert_files_to_dicts(dir_path="samples/docs")
document_store.write_documents(documents)
sleep(2) # wait for documents to be available for query
documents = document_store.get_all_documents()
assert len(documents) == 2
assert documents[0].id
assert documents[0].text
def test_get_all_documents(document_store_with_docs):
documents = document_store_with_docs.get_all_documents()
assert all(isinstance(d, Document) for d in documents)
assert len(documents) == 3
assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3"}
assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3"}
doc = document_store_with_docs.get_document_by_id(documents[0].id)
assert doc.id == documents[0].id
assert doc.text == documents[0].text

View File

@ -1,20 +1,8 @@
from haystack.retriever.dpr_utils import download_dpr
def test_dpr_passage_encoder():
from haystack.retriever.dense import DensePassageRetriever
passage = ["Let's encode this one"]
retriever = DensePassageRetriever(document_store=None, embedding_model="dpr-bert-base-nq", gpu=False)
emb = retriever.embed_passages(passage)[0]
assert(emb.shape[0] == 768)
assert(emb[0]-0.52872 < 0.001)
from haystack.database.memory import InMemoryDocumentStore
from haystack.retriever.dense import DensePassageRetriever
def test_dpr_inmemory_retrieval():
from haystack.database.memory import InMemoryDocumentStore
from haystack.retriever.dense import DensePassageRetriever
document_store = InMemoryDocumentStore(embedding_field="embedding")
documents = [
@ -27,8 +15,13 @@ def test_dpr_inmemory_retrieval():
embedded = []
for doc in documents:
doc['embedding'] = retriever.embed_passages([doc['text']])[0]
embedding = retriever.embed_passages([doc['text']])[0]
doc['embedding'] = embedding
embedded.append(doc)
assert (embedding.shape[0] == 768)
assert (embedding[0] - 0.52872 < 0.001)
document_store.write_documents(embedded)
res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")

View File

@ -9,17 +9,17 @@ def test_faq_retriever_in_memory_store():
document_store = InMemoryDocumentStore(embedding_field="embedding")
documents = [
{'name': 'How to test this library?', 'text': 'By running tox in the command line!', 'meta': {'question': 'How to test this library?'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'How to test this library?', 'question': 'How to test this library?'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
]
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False)

View File

@ -5,9 +5,9 @@ from haystack.retriever.sparse import TfidfRetriever
def test_finder_get_answers_with_in_memory_store():
test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}},
{"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': 'url'}},
{"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
{"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}},
{"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': 'url'}},
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}}
]
from haystack.database.memory import InMemoryDocumentStore
@ -25,9 +25,9 @@ def test_finder_get_answers_with_in_memory_store():
def test_memory_store_get_by_tags():
test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}},
{"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}},
{"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
{"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}},
{"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}},
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}}
]
from haystack.database.memory import InMemoryDocumentStore
@ -41,9 +41,9 @@ def test_memory_store_get_by_tags():
def test_memory_store_get_by_tag_lists_union():
test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1"]}]},
{"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag1': ['1']}]},
{"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]}
{"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag2': ["1"]}]},
{"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag1': ['1']}]},
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]}
]
from haystack.database.memory import InMemoryDocumentStore
@ -53,14 +53,14 @@ def test_memory_store_get_by_tag_lists_union():
docs = document_store.get_document_ids_by_tags({'tag2': ["1"]})
assert docs == [
{'name': 'testing the finder 1', 'text': 'testing the finder with pyhton unit test 1', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1']}]},
{'name': 'testing the finder 3', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]}
{'text': 'testing the finder with pyhton unit test 1', 'meta': {'name': 'testing the finder 1', 'url': 'url'}, 'tags': [{'tag2': ['1']}]},
{'text': 'testing the finder with pyhton unit test 3', 'meta': {'name': 'testing the finder 3', 'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]}
]
def test_memory_store_get_by_tag_lists_non_existent_tag():
test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
{"text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url', "name": "testing the finder 1"}, 'tags': [{'tag1': ["1"]}]},
]
from haystack.database.memory import InMemoryDocumentStore
document_store = InMemoryDocumentStore()
@ -71,10 +71,10 @@ def test_memory_store_get_by_tag_lists_non_existent_tag():
def test_memory_store_get_by_tag_lists_disjoint():
test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
{"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag2': ['1']}]},
{"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]},
{"name": "testing the finder 4", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]}
{"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
{"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag2': ['1']}]},
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]},
{"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 4", 'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]}
]
from haystack.database.memory import InMemoryDocumentStore
@ -83,4 +83,4 @@ def test_memory_store_get_by_tag_lists_disjoint():
docs = document_store.get_document_ids_by_tags({'tag3': ["3"]})
assert docs == [{'name': 'testing the finder 4', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}]
assert docs == [{'text': 'testing the finder with pyhton unit test 3', 'meta': {'name': 'testing the finder 4', 'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}]

View File

@ -12,7 +12,7 @@ def test_reader_basic(reader):
def test_output(reader, test_docs_xs):
docs = []
for d in test_docs_xs:
doc = Document(id=d["name"], text=d["text"], meta=d["meta"])
doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"])
docs.append(doc)
results = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
assert results is not None

View File

@ -16,4 +16,13 @@ def test_tfidf_retriever():
retriever = TfidfRetriever(document_store)
retriever.fit()
assert retriever.retrieve("godzilla", top_k=1) == [Document(id='0', text='godzilla says hello', external_source_id=None, question=None, query_score=None, meta={})]
assert retriever.retrieve("godzilla", top_k=1) == [
Document(
id='0',
text='godzilla says hello',
external_source_id=None,
question=None,
query_score=None,
meta={"name": "testing the finder 1"},
)
]