From 6e8e3c68d9ed5728c93b6f405ca8c873b40e8207 Mon Sep 17 00:00:00 2001 From: Kristof Herrmann <37148029+ArzelaAscoIi@users.noreply.github.com> Date: Mon, 3 Jan 2022 16:58:19 +0100 Subject: [PATCH] Custom id hashing on documentstore level (#1910) * adding dynamic id hashing * Add latest docstring and tutorial changes * added pr review * Add latest docstring and tutorial changes * fixed tests * fix mypy error * fix mypy issue * ignore typing * fixed correct check * fixed tests * try fixing the tests * set id hash keys only if not none * dont store id_hash_keys * fix tests * Add latest docstring and tutorial changes Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/document_store.md | 21 ++++++++++++ docs/_src/api/api/primitives.md | 7 ++-- haystack/document_stores/base.py | 29 ++++++++++++++-- haystack/document_stores/graphdb.py | 7 ++++ haystack/document_stores/sql.py | 6 ++++ haystack/schema.py | 45 +++++++++++++++++++++---- test/test_document_store.py | 51 ++++++++++++++++++----------- test/test_schema.py | 18 +++++++--- 8 files changed, 148 insertions(+), 36 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index b2eecdf4d..9d0009c0b 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -164,6 +164,27 @@ from disk and also indexed batchwise to the DocumentStore in order to prevent ou same question might be found in different contexts. - `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + +#### run + +```python + | run(documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None) +``` + +Run requests of document stores + +Comment: We will gradually introduce the primitives. The doument stores also accept dicts and parse them to documents. +In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function +is therefore only an interim solution until the run function also accepts documents. + +**Arguments**: + +- `documents`: A list of dicts that are documents. +- `headers`: A list of headers. +- `index`: Optional name of index where the documents shall be written to. + If None, the DocumentStore's default index (self.index) will be used. +- `id_hash_keys`: List of the fields that the hashes of the ids are generated from. + #### get\_batches\_from\_generator diff --git a/docs/_src/api/api/primitives.md b/docs/_src/api/api/primitives.md index 95234e046..9e2a3e3ed 100644 --- a/docs/_src/api/api/primitives.md +++ b/docs/_src/api/api/primitives.md @@ -39,9 +39,10 @@ There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`. In the range of [0,1], where 1 means extremely relevant. - `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed). - `embedding`: Vector encoding of the text -- `id_hash_keys`: Generate the document id from a custom list of strings. +- `id_hash_keys`: Generate the document id from a custom list of strings that refere to the documents attributes. If you want ensure you don't have duplicate documents in your DocumentStore but texts are - not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]. + not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]). + In this case the id will be generated by using the content and the defined metadata. #### to\_dict @@ -71,7 +72,7 @@ dict with content of the Document ```python | @classmethod - | from_dict(cls, dict, field_map={}) + | from_dict(cls, dict, field_map={}, id_hash_keys=None) ``` Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index 385752771..9526abe5a 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -7,6 +7,11 @@ from itertools import islice from abc import abstractmethod from pathlib import Path +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal #type: ignore + from haystack.schema import Document, Label, MultiLabel from haystack.nodes.base import BaseComponent from haystack.errors import DuplicateDocumentError @@ -342,9 +347,29 @@ class BaseDocumentStore(BaseComponent): @abstractmethod def delete_labels(self, index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None): pass + + @abstractmethod + def _create_document_field_map(self) -> Dict: + pass - def run(self, documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None): # type: ignore - self.write_documents(documents=documents, index=index, headers=headers) + def run(self, documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None ): # type: ignore + """ + Run requests of document stores + + Comment: We will gradually introduce the primitives. The doument stores also accept dicts and parse them to documents. + In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function + is therefore only an interim solution until the run function also accepts documents. + + :param documents: A list of dicts that are documents. + :param headers: A list of headers. + :param index: Optional name of index where the documents shall be written to. + If None, the DocumentStore's default index (self.index) will be used. + :param id_hash_keys: List of the fields that the hashes of the ids are generated from. + """ + + field_map = self._create_document_field_map() + doc_objects = [Document.from_dict(d, field_map=field_map, id_hash_keys=id_hash_keys) for d in documents] + self.write_documents(documents=doc_objects, index=index, headers=headers) return {}, "output_1" @abstractmethod diff --git a/haystack/document_stores/graphdb.py b/haystack/document_stores/graphdb.py index fd1ce17f0..a2d851829 100644 --- a/haystack/document_stores/graphdb.py +++ b/haystack/document_stores/graphdb.py @@ -1,3 +1,4 @@ + from typing import Dict, Optional import requests @@ -125,6 +126,12 @@ class GraphDBKnowledgeGraph(BaseKnowledgeGraph): results = self.query(sparql_query=sparql_query, index=index, headers=headers) return results + def _create_document_field_map(self)->Dict: + """ + There is no field mapping required + """ + return {} + def get_all_objects(self, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None): """ Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered. diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index e9c254742..666870238 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -225,6 +225,12 @@ class SQLDocumentStore(BaseDocumentStore): batch_size=batch_size, ) yield from result + + def _create_document_field_map(self)->Dict: + """ + There is no field mapping required + """ + return {} def _query( self, diff --git a/haystack/schema.py b/haystack/schema.py index 51dd85a70..1eed1b82b 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -79,9 +79,10 @@ class Document: In the range of [0,1], where 1 means extremely relevant. :param meta: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed). :param embedding: Vector encoding of the text - :param id_hash_keys: Generate the document id from a custom list of strings. + :param id_hash_keys: Generate the document id from a custom list of strings that refere to the documents attributes. If you want ensure you don't have duplicate documents in your DocumentStore but texts are - not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]. + not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]). + In this case the id will be generated by using the content and the defined metadata. """ if content is None: @@ -92,6 +93,14 @@ class Document: self.score = score self.meta = meta or {} + allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding" ] + + + if id_hash_keys is not None: + if not set(id_hash_keys) <= set(allowed_hash_key_attributes): #type: ignore + raise ValueError(f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See https://github.com/deepset-ai/haystack/pull/1910 for details)") + + if embedding is not None: embedding = np.asarray(embedding) self.embedding = embedding @@ -100,11 +109,30 @@ class Document: if id: self.id: str = str(id) else: - self.id: str = self._get_id(id_hash_keys) + self.id: str = self._get_id(id_hash_keys=id_hash_keys) - def _get_id(self, id_hash_keys): - final_hash_key = ":".join(id_hash_keys) if id_hash_keys else str(self.content) - return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False)) + + def _get_id(self, + id_hash_keys: Optional[List[str]] = None + ): + """ + Generate the id of a document by creating the hash of strings. By default the content of a document is + used to generate the hash. There are two ways of modifying the generated id of a document. Either static keys + or a selection of the content. + :param id_hash_keys: Optional list of fields that should be dynamically used to generate the hash. + """ + + if id_hash_keys is None: + return '{:02x}'.format(mmh3.hash128(str(self.content), signed=False)) + + final_hash_key = "" + for attr in id_hash_keys: + final_hash_key += ":" + str(getattr(self,attr)) + + if final_hash_key == "": + raise ValueError(f"Cant't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta']") + + return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False)) def to_dict(self, field_map={}) -> Dict: """ @@ -131,7 +159,7 @@ class Document: return _doc @classmethod - def from_dict(cls, dict, field_map={}): + def from_dict(cls, dict, field_map={}, id_hash_keys=None): """ Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that @@ -160,6 +188,9 @@ class Document: elif k in field_map: k = field_map[k] _new_doc[k] = v + + if _doc.get("id") is None: + _new_doc["id_hash_keys"]=id_hash_keys # Convert list of rows to pd.DataFrame if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list): diff --git a/test/test_document_store.py b/test/test_document_store.py index adb57cf5e..bb73f0cca 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -40,56 +40,57 @@ def test_init_elastic_client(): def test_write_with_duplicate_doc_ids(document_store): - documents = [ + duplicate_documents = [ Document( content="Doc1", - id_hash_keys=["key1"] + id_hash_keys=["content"] ), Document( - content="Doc2", - id_hash_keys=["key1"] + content="Doc1", + id_hash_keys=["content"] ) ] - document_store.write_documents(documents, duplicate_documents="skip") + document_store.write_documents(duplicate_documents, duplicate_documents="skip") assert len(document_store.get_all_documents()) == 1 with pytest.raises(Exception): - document_store.write_documents(documents, duplicate_documents="fail") + document_store.write_documents(duplicate_documents, duplicate_documents="fail") @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True) def test_write_with_duplicate_doc_ids_custom_index(document_store): - documents = [ + duplicate_documents = [ Document( content="Doc1", - id_hash_keys=["key1"] + id_hash_keys=["content"] ), Document( - content="Doc2", - id_hash_keys=["key1"] + content="Doc1", + id_hash_keys=["content"] ) ] document_store.delete_documents(index="haystack_custom_test") - document_store.write_documents(documents, index="haystack_custom_test", duplicate_documents="skip") + document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="skip") + assert len(document_store.get_all_documents(index="haystack_custom_test")) == 1 with pytest.raises(DuplicateDocumentError): - document_store.write_documents(documents, index="haystack_custom_test", duplicate_documents="fail") + document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="fail") # Weaviate manipulates document objects in-place when writing them to an index. # It generates a uuid based on the provided id and the index name where the document is added to. # We need to get rid of these generated uuids for this test and therefore reset the document objects. # As a result, the documents will receive a fresh uuid based on their id_hash_keys and a different index name. if isinstance(document_store, WeaviateDocumentStore): - documents = [ + duplicate_documents = [ Document( content="Doc1", - id_hash_keys=["key1"] + id_hash_keys=["content"] ), Document( - content="Doc2", - id_hash_keys=["key1"] + content="Doc1", + id_hash_keys=["content"] ) ] # writing to the default, empty index should still work - document_store.write_documents(documents, duplicate_documents="fail") + document_store.write_documents(duplicate_documents, duplicate_documents="fail") def test_get_all_documents_without_filters(document_store_with_docs): @@ -105,17 +106,17 @@ def test_get_all_document_filter_duplicate_text_value(document_store): Document( content="Doc1", meta={"f1": "0"}, - id_hash_keys=["Doc1", "1"] + id_hash_keys=["meta"] ), Document( content="Doc1", meta={"f1": "1", "meta_id": "0"}, - id_hash_keys=["Doc1", "2"] + id_hash_keys=["meta"] ), Document( content="Doc2", meta={"f3": "0"}, - id_hash_keys=["Doc2", "3"] + id_hash_keys=["meta"] ) ] document_store.write_documents(documents) @@ -124,6 +125,16 @@ def test_get_all_document_filter_duplicate_text_value(document_store): assert len(documents) == 1 assert {d.meta["meta_id"] for d in documents} == {"0"} + documents = document_store.get_all_documents(filters={"f1": ["0"]}) + assert documents[0].content == "Doc1" + assert len(documents) == 1 + assert documents[0].meta.get("meta_id") is None + + documents = document_store.get_all_documents(filters={"f3": ["0"]}) + assert documents[0].content == "Doc2" + assert len(documents) == 1 + assert documents[0].meta.get("meta_id") is None + def test_get_all_documents_with_correct_filters(document_store_with_docs): documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]}) diff --git a/test/test_schema.py b/test/test_schema.py index 9faedbfbe..bbba3429c 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -1,4 +1,5 @@ from haystack.schema import Document, Label, Answer, Span +import pytest import numpy as np LABELS = [ @@ -152,9 +153,18 @@ def test_generate_doc_id_using_custom_list(): text1 = "text1" text2 = "text2" - doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1]) - doc2_text1 = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1]) - doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2]) + doc1_meta1_id_by_content = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"]) + doc1_meta2_id_by_content = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content"]) + assert doc1_meta1_id_by_content.id == doc1_meta2_id_by_content.id - assert doc1_text1.id == doc2_text1.id + doc1_meta1_id_by_content_and_meta = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content","meta"]) + doc1_meta2_id_by_content_and_meta = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content", "meta"]) + assert doc1_meta1_id_by_content_and_meta.id != doc1_meta2_id_by_content_and_meta.id + + doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"]) + doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["content"]) assert doc1_text1.id != doc3_text2.id + + + with pytest.raises(ValueError): + _ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content","non_existing_field"]) \ No newline at end of file