Move document_name attribute to meta (#217)

This commit is contained in:
Tanay Soni 2020-07-14 09:53:31 +02:00 committed by GitHub
parent 4c21556a79
commit b886e054a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 110 additions and 104 deletions

View File

@ -12,10 +12,9 @@ class Document(BaseModel):
description="id for the source file the document was created from. In the case when a large file is divided " description="id for the source file the document was created from. In the case when a large file is divided "
"across multiple Elasticsearch documents, this id can be used to reference original source file.", "across multiple Elasticsearch documents, this id can be used to reference original source file.",
) )
# name: Optional[str] = Field(None, description="Title of the document")
question: Optional[str] = Field(None, description="Question text for FAQs.") question: Optional[str] = Field(None, description="Question text for FAQs.")
query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document") query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
meta: Dict[str, Any] = Field({}, description="") meta: Dict[str, Any] = Field({}, description="Meta fields for a document like name, url, or author.")
tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data") tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data")
@ -30,8 +29,11 @@ class BaseDocumentStore(ABC):
""" """
Indexes documents for later queries. Indexes documents for later queries.
:param documents: List of dictionaries in the format {"name": "<some-document-name>", "text": "<the-actual-text>"}. :param documents: List of dictionaries.
Optionally, further fields can be supplied depending on the child class. Default format: {"text": "<the-actual-text>"}
Optionally: Include meta data via {"text": "<the-actual-text>",
"meta":{"name": "<some-document-name>", "author": "somebody", ...}}
It can be used for filtering and is accessible in the responses of the Finder.
:return: None :return: None
""" """

View File

@ -117,9 +117,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
Indexes documents for later queries in Elasticsearch. Indexes documents for later queries in Elasticsearch.
:param documents: List of dictionaries. :param documents: List of dictionaries.
Default format: {"name": "<some-document-name>", "text": "<the-actual-text>"} Default format: {"text": "<the-actual-text>"}
Optionally: Include meta data via {"name": "<some-document-name>", Optionally: Include meta data via {"text": "<the-actual-text>",
"text": "<the-actual-text>", "meta":{"author": "somebody", ...}} "meta":{"name": "<some-document-name>", "author": "somebody", ...}}
It can be used for filtering and is accessible in the responses of the Finder. It can be used for filtering and is accessible in the responses of the Finder.
Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
should be changed to what you have set for self.text_field and self.name_field. should be changed to what you have set for self.text_field and self.name_field.

View File

@ -18,9 +18,9 @@ class InMemoryDocumentStore(BaseDocumentStore):
""" """
Indexes documents for later queries. Indexes documents for later queries.
:param documents: List of dictionaries in the format {"name": "<some-document-name>", "text": "<the-actual-text>"}. :param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
Optionally, you can also supply "tags": ["one-tag", "another-one"] Optionally, you can also supply "tags": ["one-tag", "another-one"]
or additional meta data via "meta": {"author": "someone", "url":"some-url" ...} or additional meta data via "meta": {"name": "<some-document-name>", "author": "someone", "url":"some-url" ...}
:return: None :return: None
""" """
@ -30,19 +30,21 @@ class InMemoryDocumentStore(BaseDocumentStore):
return return
for document in documents: for document in documents:
name = document.get("name", None) text = document["text"]
text = document.get("text", None) if "meta" not in document.keys():
document["meta"] = {}
for k, v in document.items(): # put additional fields other than text in meta
if k not in ["text", "meta", "tags"]:
document["meta"][k] = v
if name is None or text is None: if not text:
continue raise Exception("A document cannot have empty text field.")
signature = name + text hash = hashlib.md5(text.encode("utf-8")).hexdigest()
hash = hashlib.md5(signature.encode("utf-8")).hexdigest()
self.docs[hash] = document self.docs[hash] = document
tags = document.get('tags', []) tags = document.get("tags", [])
self._map_tags_to_ids(hash, tags) self._map_tags_to_ids(hash, tags)
@ -65,12 +67,12 @@ class InMemoryDocumentStore(BaseDocumentStore):
document = self._convert_memory_hit_to_document(self.docs[id], doc_id=id) document = self._convert_memory_hit_to_document(self.docs[id], doc_id=id)
return document return document
def _convert_memory_hit_to_document(self, hit: Tuple[Any, Any], doc_id: Optional[str] = None) -> Document: def _convert_memory_hit_to_document(self, hit: Dict[str, Any], doc_id: Optional[str] = None) -> Document:
document = Document( document = Document(
id=doc_id, id=doc_id,
text=hit[0].get('text', None), text=hit.get("text", None),
meta=hit[0].get('meta', {}), meta=hit.get("meta", {}),
query_score=hit[1], query_score=hit.get("query_score", None),
) )
return document return document
@ -89,14 +91,21 @@ class InMemoryDocumentStore(BaseDocumentStore):
"use a different DocumentStore (e.g. ElasticsearchDocumentStore).") "use a different DocumentStore (e.g. ElasticsearchDocumentStore).")
if self.embedding_field is None: if self.embedding_field is None:
return [] raise Exception(
"To use query_by_embedding() 'embedding field' must "
"be specified when initializing the document store."
)
if query_emb is None: if query_emb is None:
return [] return []
candidate_docs = [self._convert_memory_hit_to_document( candidate_docs = []
(doc, dot(query_emb, doc[self.embedding_field]) / (norm(query_emb) * norm(doc[self.embedding_field]))), doc_id=idx) for idx, doc in self.docs.items() for idx, hit in self.docs.items():
] hit["query_score"] = dot(query_emb, hit[self.embedding_field]) / (
norm(query_emb) * norm(hit[self.embedding_field])
)
_doc = self._convert_memory_hit_to_document(hit=hit, doc_id=idx)
candidate_docs.append(_doc)
return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k] return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]
@ -139,4 +148,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
return len(self.docs.items()) return len(self.docs.items())
def get_all_documents(self) -> List[Document]: def get_all_documents(self) -> List[Document]:
return [Document(id=item[0], text=item[1]['text'], name=item[1]['name'], meta=item[1].get('meta', {})) for item in self.docs.items()] return [
Document(id=item[0], text=item[1]["text"], meta=item[1].get("meta", {}))
for item in self.docs.items()
]

View File

@ -20,7 +20,6 @@ class ORMBase(Base):
class Document(ORMBase): class Document(ORMBase):
__tablename__ = "document" __tablename__ = "document"
name = Column(String)
text = Column(String) text = Column(String)
meta_data = Column(PickleType) meta_data = Column(PickleType)
@ -96,14 +95,19 @@ class SQLDocumentStore(BaseDocumentStore):
""" """
Indexes documents for later queries. Indexes documents for later queries.
:param documents: List of dictionaries in the format {"name": "<some-document-name>", "text": "<the-actual-text>"}. :param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
Optionally, you can also supply meta data via "meta": {"author": "someone", "url":"some-url" ...} Optionally, you can also supply meta data via "meta": {"author": "someone", "url":"some-url" ...}
:return: None :return: None
""" """
for doc in documents: for doc in documents:
row = Document(name=doc["name"], text=doc["text"], meta_data=doc.get("meta", {})) if "meta" not in doc.keys():
doc["meta"] = {}
for k, v in doc.items(): # put additional fields other than text in meta
if k not in ["text", "meta", "tags"]:
doc["meta"][k] = v
row = Document(text=doc["text"], meta_data=doc.get("meta", {}))
self.session.add(row) self.session.add(row)
self.session.commit() self.session.commit()

View File

@ -48,9 +48,9 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
for para in text.split("\n\n"): for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs if not para.strip(): # skip empty paragraphs
continue continue
documents.append({"name": path.name, "text": para}) documents.append({"text": para, "meta": {"name": path.name}})
else: else:
documents.append({"name": path.name, "text": text}) documents.append({"text": text, "meta": {"name": path.name}})
return documents return documents

View File

@ -54,9 +54,10 @@ def xpdf_fixture():
@pytest.fixture() @pytest.fixture()
def test_docs_xs(): def test_docs_xs():
return [ return [
{"name": "filename1", "text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1"}}, {"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
{"name": "filename2", "text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2"}}, {"text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2", "name": "filename2"}},
{"name": "filename3", "text": "My name is Christelle and I live in Paris", "meta": {"meta_field": "test3"}} {"text": "My name is Christelle and I live in Paris", "meta_field": "test3", "meta": {"name": "filename3"}}
# last doc has meta_field at the top level for backward compatibility
] ]

View File

@ -1,27 +1,12 @@
from time import sleep from haystack.database.base import Document
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.utils import convert_files_to_dicts
def test_sql_write_read(): def test_get_all_documents(document_store_with_docs):
sql_document_store = SQLDocumentStore() documents = document_store_with_docs.get_all_documents()
documents = convert_files_to_dicts(dir_path="samples/docs") assert all(isinstance(d, Document) for d in documents)
sql_document_store.write_documents(documents) assert len(documents) == 3
documents = sql_document_store.get_all_documents() assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3"}
assert len(documents) == 2 assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3"}
doc = sql_document_store.get_document_by_id("1") doc = document_store_with_docs.get_document_by_id(documents[0].id)
assert doc.id assert doc.id == documents[0].id
assert doc.text assert doc.text == documents[0].text
def test_elasticsearch_write_read(elasticsearch_fixture):
document_store = ElasticsearchDocumentStore()
documents = convert_files_to_dicts(dir_path="samples/docs")
document_store.write_documents(documents)
sleep(2) # wait for documents to be available for query
documents = document_store.get_all_documents()
assert len(documents) == 2
assert documents[0].id
assert documents[0].text

View File

@ -1,20 +1,8 @@
from haystack.retriever.dpr_utils import download_dpr
def test_dpr_passage_encoder():
from haystack.retriever.dense import DensePassageRetriever
passage = ["Let's encode this one"]
retriever = DensePassageRetriever(document_store=None, embedding_model="dpr-bert-base-nq", gpu=False)
emb = retriever.embed_passages(passage)[0]
assert(emb.shape[0] == 768)
assert(emb[0]-0.52872 < 0.001)
def test_dpr_inmemory_retrieval():
from haystack.database.memory import InMemoryDocumentStore from haystack.database.memory import InMemoryDocumentStore
from haystack.retriever.dense import DensePassageRetriever from haystack.retriever.dense import DensePassageRetriever
def test_dpr_inmemory_retrieval():
document_store = InMemoryDocumentStore(embedding_field="embedding") document_store = InMemoryDocumentStore(embedding_field="embedding")
documents = [ documents = [
@ -27,8 +15,13 @@ def test_dpr_inmemory_retrieval():
embedded = [] embedded = []
for doc in documents: for doc in documents:
doc['embedding'] = retriever.embed_passages([doc['text']])[0] embedding = retriever.embed_passages([doc['text']])[0]
doc['embedding'] = embedding
embedded.append(doc) embedded.append(doc)
assert (embedding.shape[0] == 768)
assert (embedding[0] - 0.52872 < 0.001)
document_store.write_documents(embedded) document_store.write_documents(embedded)
res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?") res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")

View File

@ -9,17 +9,17 @@ def test_faq_retriever_in_memory_store():
document_store = InMemoryDocumentStore(embedding_field="embedding") document_store = InMemoryDocumentStore(embedding_field="embedding")
documents = [ documents = [
{'name': 'How to test this library?', 'text': 'By running tox in the command line!', 'meta': {'question': 'How to test this library?'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'How to test this library?', 'question': 'How to test this library?'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
] ]
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False) retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False)

View File

@ -5,9 +5,9 @@ from haystack.retriever.sparse import TfidfRetriever
def test_finder_get_answers_with_in_memory_store(): def test_finder_get_answers_with_in_memory_store():
test_docs = [ test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}}, {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}},
{"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': 'url'}}, {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': 'url'}},
{"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}} {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}}
] ]
from haystack.database.memory import InMemoryDocumentStore from haystack.database.memory import InMemoryDocumentStore
@ -25,9 +25,9 @@ def test_finder_get_answers_with_in_memory_store():
def test_memory_store_get_by_tags(): def test_memory_store_get_by_tags():
test_docs = [ test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}}, {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}},
{"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}}, {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}},
{"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}} {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}}
] ]
from haystack.database.memory import InMemoryDocumentStore from haystack.database.memory import InMemoryDocumentStore
@ -41,9 +41,9 @@ def test_memory_store_get_by_tags():
def test_memory_store_get_by_tag_lists_union(): def test_memory_store_get_by_tag_lists_union():
test_docs = [ test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1"]}]}, {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag2': ["1"]}]},
{"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag1': ['1']}]}, {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag1': ['1']}]},
{"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]} {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]}
] ]
from haystack.database.memory import InMemoryDocumentStore from haystack.database.memory import InMemoryDocumentStore
@ -53,14 +53,14 @@ def test_memory_store_get_by_tag_lists_union():
docs = document_store.get_document_ids_by_tags({'tag2': ["1"]}) docs = document_store.get_document_ids_by_tags({'tag2': ["1"]})
assert docs == [ assert docs == [
{'name': 'testing the finder 1', 'text': 'testing the finder with pyhton unit test 1', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1']}]}, {'text': 'testing the finder with pyhton unit test 1', 'meta': {'name': 'testing the finder 1', 'url': 'url'}, 'tags': [{'tag2': ['1']}]},
{'name': 'testing the finder 3', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]} {'text': 'testing the finder with pyhton unit test 3', 'meta': {'name': 'testing the finder 3', 'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]}
] ]
def test_memory_store_get_by_tag_lists_non_existent_tag(): def test_memory_store_get_by_tag_lists_non_existent_tag():
test_docs = [ test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]}, {"text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url', "name": "testing the finder 1"}, 'tags': [{'tag1': ["1"]}]},
] ]
from haystack.database.memory import InMemoryDocumentStore from haystack.database.memory import InMemoryDocumentStore
document_store = InMemoryDocumentStore() document_store = InMemoryDocumentStore()
@ -71,10 +71,10 @@ def test_memory_store_get_by_tag_lists_non_existent_tag():
def test_memory_store_get_by_tag_lists_disjoint(): def test_memory_store_get_by_tag_lists_disjoint():
test_docs = [ test_docs = [
{"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]}, {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
{"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag2': ['1']}]}, {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag2': ['1']}]},
{"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]}, {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]},
{"name": "testing the finder 4", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]} {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 4", 'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]}
] ]
from haystack.database.memory import InMemoryDocumentStore from haystack.database.memory import InMemoryDocumentStore
@ -83,4 +83,4 @@ def test_memory_store_get_by_tag_lists_disjoint():
docs = document_store.get_document_ids_by_tags({'tag3': ["3"]}) docs = document_store.get_document_ids_by_tags({'tag3': ["3"]})
assert docs == [{'name': 'testing the finder 4', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}] assert docs == [{'text': 'testing the finder with pyhton unit test 3', 'meta': {'name': 'testing the finder 4', 'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}]

View File

@ -12,7 +12,7 @@ def test_reader_basic(reader):
def test_output(reader, test_docs_xs): def test_output(reader, test_docs_xs):
docs = [] docs = []
for d in test_docs_xs: for d in test_docs_xs:
doc = Document(id=d["name"], text=d["text"], meta=d["meta"]) doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"])
docs.append(doc) docs.append(doc)
results = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5) results = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
assert results is not None assert results is not None

View File

@ -16,4 +16,13 @@ def test_tfidf_retriever():
retriever = TfidfRetriever(document_store) retriever = TfidfRetriever(document_store)
retriever.fit() retriever.fit()
assert retriever.retrieve("godzilla", top_k=1) == [Document(id='0', text='godzilla says hello', external_source_id=None, question=None, query_score=None, meta={})] assert retriever.retrieve("godzilla", top_k=1) == [
Document(
id='0',
text='godzilla says hello',
external_source_id=None,
question=None,
query_score=None,
meta={"name": "testing the finder 1"},
)
]