Move document_name attribute to meta (#217)

2025-10-18 19:38:57 +00:00 · 2020-07-14 09:53:31 +02:00 · 2020-07-14 09:53:31 +02:00 · b886e054a3
commit b886e054a3
parent 4c21556a79
12 changed files with 110 additions and 104 deletions
--- a/haystack/database/base.py
+++ b/haystack/database/base.py
@ -12,10 +12,9 @@ class Document(BaseModel):
        description="id for the source file the document was created from. In the case when a large file is divided "
        "across multiple Elasticsearch documents, this id can be used to reference original source file.",
    )
-    # name: Optional[str] = Field(None, description="Title of the document")
    question: Optional[str] = Field(None, description="Question text for FAQs.")
    query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
-    meta: Dict[str, Any] = Field({}, description="")
+    meta: Dict[str, Any] = Field({}, description="Meta fields for a document like name, url, or author.")
    tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data")


@ -30,8 +29,11 @@ class BaseDocumentStore(ABC):
        """
        Indexes documents for later queries.

-        :param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
-                          Optionally, further fields can be supplied depending on the child class.
+        :param documents: List of dictionaries.
+                          Default format: {"text": "<the-actual-text>"}
+                          Optionally: Include meta data via {"text": "<the-actual-text>",
+                          "meta":{"name": "<some-document-name>", "author": "somebody", ...}}
+                          It can be used for filtering and is accessible in the responses of the Finder.

        :return: None
        """
--- a/haystack/database/elasticsearch.py
+++ b/haystack/database/elasticsearch.py
@ -117,9 +117,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        Indexes documents for later queries in Elasticsearch.

        :param documents: List of dictionaries.
-                          Default format: {"name": "<some-document-name>, "text": "<the-actual-text>"}
-                          Optionally: Include meta data via {"name": "<some-document-name>,
-                          "text": "<the-actual-text>", "meta":{"author": "somebody", ...}}
+                          Default format: {"text": "<the-actual-text>"}
+                          Optionally: Include meta data via {"text": "<the-actual-text>",
+                          "meta":{"name": "<some-document-name>", "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.text_field and self.name_field .
--- a/haystack/database/memory.py
+++ b/haystack/database/memory.py
@ -18,9 +18,9 @@ class InMemoryDocumentStore(BaseDocumentStore):
        """
        Indexes documents for later queries.

-        :param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
+        :param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
                          Optionally, you can also supply "tags": ["one-tag", "another-one"]
-                          or additional meta data via "meta": {"author": "someone", "url":"some-url" ...}
+                          or additional meta data via "meta": {"name": "<some-document-name>", "author": "someone", "url":"some-url" ...}

        :return: None
        """
@ -30,19 +30,21 @@ class InMemoryDocumentStore(BaseDocumentStore):
            return

        for document in documents:
-            name = document.get("name", None)
-            text = document.get("text", None)
+            text = document["text"]
+            if "meta" not in document.keys():
+                document["meta"] = {}
+            for k, v in document.items():  # put additional fields other than text in meta
+                if k not in ["text", "meta", "tags"]:
+                    document["meta"][k] = v

-            if name is None or text is None:
-                continue
+            if not text:
+                raise Exception("A document cannot have empty text field.")

-            signature = name + text
-
-            hash = hashlib.md5(signature.encode("utf-8")).hexdigest()
+            hash = hashlib.md5(text.encode("utf-8")).hexdigest()

            self.docs[hash] = document

-            tags = document.get('tags', [])
+            tags = document.get("tags", [])

            self._map_tags_to_ids(hash, tags)

@ -65,12 +67,12 @@ class InMemoryDocumentStore(BaseDocumentStore):
        document = self._convert_memory_hit_to_document(self.docs[id], doc_id=id)
        return document

-    def _convert_memory_hit_to_document(self, hit: Tuple[Any, Any], doc_id: Optional[str] = None) -> Document:
+    def _convert_memory_hit_to_document(self, hit: Dict[str, Any], doc_id: Optional[str] = None) -> Document:
        document = Document(
            id=doc_id,
-            text=hit[0].get('text', None),
-            meta=hit[0].get('meta', {}),
-            query_score=hit[1],
+            text=hit.get("text", None),
+            meta=hit.get("meta", {}),
+            query_score=hit.get("query_score", None),
        )
        return document

@ -89,14 +91,21 @@ class InMemoryDocumentStore(BaseDocumentStore):
                                      "use a different DocumentStore (e.g. ElasticsearchDocumentStore).")

        if self.embedding_field is None:
-            return []
+            raise Exception(
+                "To use query_by_embedding() 'embedding field' must "
+                "be specified when initializing the document store."
+            )

        if query_emb is None:
            return []

-        candidate_docs = [self._convert_memory_hit_to_document(
-            (doc, dot(query_emb, doc[self.embedding_field]) / (norm(query_emb) * norm(doc[self.embedding_field]))), doc_id=idx) for idx, doc in self.docs.items()
-        ]
+        candidate_docs = []
+        for idx, hit in self.docs.items():
+            hit["query_score"] = dot(query_emb, hit[self.embedding_field]) / (
+                norm(query_emb) * norm(hit[self.embedding_field])
+            )
+            _doc = self._convert_memory_hit_to_document(hit=hit, doc_id=idx)
+            candidate_docs.append(_doc)

        return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]

@ -139,4 +148,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
        return len(self.docs.items())

    def get_all_documents(self) -> List[Document]:
-        return [Document(id=item[0], text=item[1]['text'], name=item[1]['name'], meta=item[1].get('meta', {})) for item in self.docs.items()]
+        return [
+            Document(id=item[0], text=item[1]["text"], meta=item[1].get("meta", {}))
+            for item in self.docs.items()
+        ]
--- a/haystack/database/sql.py
+++ b/haystack/database/sql.py
@ -20,7 +20,6 @@ class ORMBase(Base):
 class Document(ORMBase):
    __tablename__ = "document"

-    name = Column(String)
    text = Column(String)
    meta_data = Column(PickleType)

@ -96,14 +95,19 @@ class SQLDocumentStore(BaseDocumentStore):
        """
        Indexes documents for later queries.

-        :param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
+        :param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
                          Optionally, you can also supply meta data via "meta": {"author": "someone", "url":"some-url" ...}

        :return: None
        """

        for doc in documents:
-            row = Document(name=doc["name"], text=doc["text"], meta_data=doc.get("meta", {}))
+            if "meta" not in doc.keys():
+                doc["meta"] = {}
+            for k, v in doc.items():  # put additional fields other than text in meta
+                if k not in ["text", "meta", "tags"]:
+                    doc["meta"][k] = v
+            row = Document(text=doc["text"], meta_data=doc.get("meta", {}))
            self.session.add(row)
        self.session.commit()

--- a/haystack/indexing/utils.py
+++ b/haystack/indexing/utils.py
@ -48,9 +48,9 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
            for para in text.split("\n\n"):
                if not para.strip():  # skip empty paragraphs
                    continue
-                documents.append({"name": path.name, "text": para})
+                documents.append({"text": para, "meta": {"name": path.name}})
        else:
-            documents.append({"name": path.name, "text": text})
+            documents.append({"text": text, "meta": {"name": path.name}})

    return documents

--- a/test/conftest.py
+++ b/test/conftest.py
@ -54,9 +54,10 @@ def xpdf_fixture():
@pytest.fixture()
 def test_docs_xs():
    return [
-        {"name": "filename1", "text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1"}},
-        {"name": "filename2", "text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2"}},
-        {"name": "filename3", "text": "My name is Christelle and I live in Paris", "meta": {"meta_field": "test3"}}
+        {"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
+        {"text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2", "name": "filename2"}},
+        {"text": "My name is Christelle and I live in Paris", "meta_field": "test3", "meta": {"name": "filename3"}}
+        # last doc has meta_field at the top level for backward compatibility
    ]


--- a/test/test_db.py
+++ b/test/test_db.py
@ -1,27 +1,12 @@
-from time import sleep
-
-from haystack.database.elasticsearch import ElasticsearchDocumentStore
-from haystack.database.sql import SQLDocumentStore
-from haystack.indexing.utils import convert_files_to_dicts
+from haystack.database.base import Document


-def test_sql_write_read():
-    sql_document_store = SQLDocumentStore()
-    documents = convert_files_to_dicts(dir_path="samples/docs")
-    sql_document_store.write_documents(documents)
-    documents = sql_document_store.get_all_documents()
-    assert len(documents) == 2
-    doc = sql_document_store.get_document_by_id("1")
-    assert doc.id
-    assert doc.text
-
-
-def test_elasticsearch_write_read(elasticsearch_fixture):
-    document_store = ElasticsearchDocumentStore()
-    documents = convert_files_to_dicts(dir_path="samples/docs")
-    document_store.write_documents(documents)
-    sleep(2)  # wait for documents to be available for query
-    documents = document_store.get_all_documents()
-    assert len(documents) == 2
-    assert documents[0].id
-    assert documents[0].text
+def test_get_all_documents(document_store_with_docs):
+    documents = document_store_with_docs.get_all_documents()
+    assert all(isinstance(d, Document) for d in documents)
+    assert len(documents) == 3
+    assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3"}
+    assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3"}
+    doc = document_store_with_docs.get_document_by_id(documents[0].id)
+    assert doc.id == documents[0].id
+    assert doc.text == documents[0].text
--- a/test/test_dpr_retriever.py
+++ b/test/test_dpr_retriever.py
@ -1,20 +1,8 @@
-from haystack.retriever.dpr_utils import download_dpr
-
-def test_dpr_passage_encoder():
-    from haystack.retriever.dense import DensePassageRetriever
-
-    passage = ["Let's encode this one"]
-    retriever = DensePassageRetriever(document_store=None, embedding_model="dpr-bert-base-nq", gpu=False)
-    emb = retriever.embed_passages(passage)[0]
-    assert(emb.shape[0] == 768)
-    assert(emb[0]-0.52872 < 0.001)
+from haystack.database.memory import InMemoryDocumentStore
+from haystack.retriever.dense import DensePassageRetriever


 def test_dpr_inmemory_retrieval():
-
-    from haystack.database.memory import InMemoryDocumentStore
-    from haystack.retriever.dense import DensePassageRetriever
-
    document_store = InMemoryDocumentStore(embedding_field="embedding")

    documents = [
@ -27,8 +15,13 @@ def test_dpr_inmemory_retrieval():

    embedded = []
    for doc in documents:
-        doc['embedding'] = retriever.embed_passages([doc['text']])[0]
+        embedding = retriever.embed_passages([doc['text']])[0]
+        doc['embedding'] = embedding
        embedded.append(doc)
+
+        assert (embedding.shape[0] == 768)
+        assert (embedding[0] - 0.52872 < 0.001)
+
    document_store.write_documents(embedded)

    res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")
--- a/test/test_faq_retriever.py
+++ b/test/test_faq_retriever.py
@ -9,17 +9,17 @@ def test_faq_retriever_in_memory_store():
    document_store = InMemoryDocumentStore(embedding_field="embedding")

    documents = [
-        {'name': 'How to test this library?', 'text': 'By running tox in the command line!', 'meta': {'question': 'How to test this library?'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'How to test this library?', 'question': 'How to test this library?'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
    ]

    retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False)
--- a/test/test_in_memory_store.py
+++ b/test/test_in_memory_store.py
@ -5,9 +5,9 @@ from haystack.retriever.sparse import TfidfRetriever

 def test_finder_get_answers_with_in_memory_store():
    test_docs = [
-        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}},
-        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': 'url'}},
-        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
+        {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}},
+        {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': 'url'}},
+        {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}}
    ]

    from haystack.database.memory import InMemoryDocumentStore
@ -25,9 +25,9 @@ def test_finder_get_answers_with_in_memory_store():

 def test_memory_store_get_by_tags():
    test_docs = [
-        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}},
-        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}},
-        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
+        {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}},
+        {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}},
+        {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}}
    ]

    from haystack.database.memory import InMemoryDocumentStore
@ -41,9 +41,9 @@ def test_memory_store_get_by_tags():

 def test_memory_store_get_by_tag_lists_union():
    test_docs = [
-        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1"]}]},
-        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag1': ['1']}]},
-        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]}
+        {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag2': ["1"]}]},
+        {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag1': ['1']}]},
+        {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]}
    ]

    from haystack.database.memory import InMemoryDocumentStore
@ -53,14 +53,14 @@ def test_memory_store_get_by_tag_lists_union():
    docs = document_store.get_document_ids_by_tags({'tag2': ["1"]})

    assert docs == [
-        {'name': 'testing the finder 1', 'text': 'testing the finder with pyhton unit test 1', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1']}]},
-        {'name': 'testing the finder 3', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]}
+        {'text': 'testing the finder with pyhton unit test 1', 'meta': {'name': 'testing the finder 1', 'url': 'url'}, 'tags': [{'tag2': ['1']}]},
+        {'text': 'testing the finder with pyhton unit test 3', 'meta': {'name': 'testing the finder 3', 'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]}
    ]


 def test_memory_store_get_by_tag_lists_non_existent_tag():
    test_docs = [
-        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
+        {"text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url', "name": "testing the finder 1"}, 'tags': [{'tag1': ["1"]}]},
    ]
    from haystack.database.memory import InMemoryDocumentStore
    document_store = InMemoryDocumentStore()
@ -71,10 +71,10 @@ def test_memory_store_get_by_tag_lists_non_existent_tag():

 def test_memory_store_get_by_tag_lists_disjoint():
    test_docs = [
-        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
-        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag2': ['1']}]},
-        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]},
-        {"name": "testing the finder 4", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]}
+        {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
+        {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag2': ['1']}]},
+        {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]},
+        {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 4", 'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]}
    ]

    from haystack.database.memory import InMemoryDocumentStore
@ -83,4 +83,4 @@ def test_memory_store_get_by_tag_lists_disjoint():

    docs = document_store.get_document_ids_by_tags({'tag3': ["3"]})

-    assert docs == [{'name': 'testing the finder 4', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}]
+    assert docs == [{'text': 'testing the finder with pyhton unit test 3', 'meta': {'name': 'testing the finder 4', 'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}]
--- a/test/test_reader.py
+++ b/test/test_reader.py
@ -12,7 +12,7 @@ def test_reader_basic(reader):
 def test_output(reader, test_docs_xs):
    docs = []
    for d in test_docs_xs:
-        doc = Document(id=d["name"], text=d["text"], meta=d["meta"])
+        doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"])
        docs.append(doc)
    results = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
    assert results is not None
--- a/test/test_tfidf_retriever.py
+++ b/test/test_tfidf_retriever.py
@ -16,4 +16,13 @@ def test_tfidf_retriever():

    retriever = TfidfRetriever(document_store)
    retriever.fit()
-    assert retriever.retrieve("godzilla", top_k=1) == [Document(id='0', text='godzilla says hello', external_source_id=None, question=None, query_score=None, meta={})]
+    assert retriever.retrieve("godzilla", top_k=1) == [
+        Document(
+            id='0',
+            text='godzilla says hello',
+            external_source_id=None,
+            question=None,
+            query_score=None,
+            meta={"name": "testing the finder 1"},
+        )
+    ]