From 1168f6365d0e3eab543dec356fddefac7ea7d321 Mon Sep 17 00:00:00 2001
From: tstadel <60758086+tstadel@users.noreply.github.com>
Date: Fri, 24 Jun 2022 09:55:09 +0200
Subject: [PATCH] Fix using id_hash_keys as pipeline params (#2717)

* Fix using id_hash_keys as pipeline params

* Update Documentation & Code Style

* add tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 docs/_src/api/api/file_converter.md   |  6 +++++-
 docs/_src/api/api/preprocessor.md     |  2 +-
 haystack/nodes/file_converter/base.py | 20 ++++++++++++++++----
 haystack/nodes/preprocessor/base.py   |  5 +++++
 test/nodes/test_file_converter.py     | 16 ++++++++++++++++
 test/nodes/test_preprocessor.py       | 14 ++++++++++++++
 6 files changed, 57 insertions(+), 6 deletions(-)
diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md
index f6ef56fc5..a4ac939d6 100644
--- a/docs/_src/api/api/file_converter.md
+++ b/docs/_src/api/api/file_converter.md
@@ -86,7 +86,7 @@ Validate if the language of the text is one of valid languages.
 #### BaseConverter.run
 
 ```python
-def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
+def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None)
 ```
 
 Extract text from a file.
@@ -114,6 +114,10 @@ This option can be used to add test for encoding errors. If the extracted text i
 not one of the valid languages, then it might likely be encoding error resulting
 in garbled text.
 - `encoding`: Select the file encoding (default is `UTF-8`)
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case, the id is generated from the content and the defined metadata.
 
 <a id="docx"></a>
 
diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md
index 852099be9..160b3e51a 100644
--- a/docs/_src/api/api/preprocessor.md
+++ b/docs/_src/api/api/preprocessor.md
@@ -16,7 +16,7 @@ class BasePreProcessor(BaseComponent)
 
 ```python
 @abstractmethod
-def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[Document]
+def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```
 
 Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
diff --git a/haystack/nodes/file_converter/base.py b/haystack/nodes/file_converter/base.py
index feaf66475..cd5e96378 100644
--- a/haystack/nodes/file_converter/base.py
+++ b/haystack/nodes/file_converter/base.py
@@ -137,6 +137,7 @@ class BaseConverter(BaseComponent):
         known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "UTF-8",
+        id_hash_keys: Optional[List[str]] = None,
     ):
         """
         Extract text from a file.
@@ -162,6 +163,10 @@ class BaseConverter(BaseComponent):
                                 not one of the valid languages, then it might likely be encoding error resulting
                                 in garbled text.
         :param encoding: Select the file encoding (default is `UTF-8`)
+        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
+            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+            not unique, you can modify the metadata and pass `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+            In this case, the id is generated from the content and the defined metadata.
         """
 
         if isinstance(file_paths, Path):
@@ -178,6 +183,7 @@ class BaseConverter(BaseComponent):
                 remove_numeric_tables=remove_numeric_tables,
                 valid_languages=valid_languages,
                 encoding=encoding,
+                id_hash_keys=id_hash_keys,
             ):
                 documents.append(doc)
 
@@ -192,14 +198,20 @@ class BaseConverter(BaseComponent):
 
     def run_batch(  # type: ignore
         self,
-        file_paths: Union[Path, List[Path]],  # type: ignore
-        meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,  # type: ignore
-        remove_numeric_tables: Optional[bool] = None,  # type: ignore
-        valid_languages: Optional[List[str]] = None,  # type: ignore
+        file_paths: Union[Path, List[Path]],
+        meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = "UTF-8",
+        id_hash_keys: Optional[List[str]] = None,
     ):
         return self.run(
             file_paths=file_paths,
             meta=meta,
             remove_numeric_tables=remove_numeric_tables,
+            known_ligatures=known_ligatures,
             valid_languages=valid_languages,
+            encoding=encoding,
+            id_hash_keys=id_hash_keys,
         )
diff --git a/haystack/nodes/preprocessor/base.py b/haystack/nodes/preprocessor/base.py
index 774d4a66d..61bd7667e 100644
--- a/haystack/nodes/preprocessor/base.py
+++ b/haystack/nodes/preprocessor/base.py
@@ -20,6 +20,7 @@ class BasePreProcessor(BaseComponent):
         split_length: Optional[int] = 1000,
         split_overlap: Optional[int] = None,
         split_respect_sentence_boundary: Optional[bool] = True,
+        id_hash_keys: Optional[List[str]] = None,
     ) -> List[Document]:
         """
         Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
@@ -59,6 +60,7 @@ class BasePreProcessor(BaseComponent):
         split_length: Optional[int] = None,
         split_overlap: Optional[int] = None,
         split_respect_sentence_boundary: Optional[bool] = None,
+        id_hash_keys: Optional[List[str]] = None,
     ):
         processed_documents = self.process(
             documents=documents,
@@ -69,6 +71,7 @@ class BasePreProcessor(BaseComponent):
             split_length=split_length,
             split_overlap=split_overlap,
             split_respect_sentence_boundary=split_respect_sentence_boundary,
+            id_hash_keys=id_hash_keys,
         )
         result = {"documents": processed_documents}
         return result, "output_1"
@@ -83,6 +86,7 @@ class BasePreProcessor(BaseComponent):
         split_length: Optional[int] = None,
         split_overlap: Optional[int] = None,
         split_respect_sentence_boundary: Optional[bool] = None,
+        id_hash_keys: Optional[List[str]] = None,
     ):
         return self.run(
             documents=documents,
@@ -93,4 +97,5 @@ class BasePreProcessor(BaseComponent):
             split_length=split_length,
             split_overlap=split_overlap,
             split_respect_sentence_boundary=split_respect_sentence_boundary,
+            id_hash_keys=id_hash_keys,
         )
diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py
index 0ea44fc22..6f0d0c9b4 100644
--- a/test/nodes/test_file_converter.py
+++ b/test/nodes/test_file_converter.py
@@ -13,6 +13,7 @@ from haystack.nodes import (
     TikaConverter,
     AzureConverter,
     ParsrConverter,
+    TextConverter,
 )
 
 from ..conftest import SAMPLES_PATH
@@ -172,3 +173,18 @@ def test_parsr_converter():
     assert docs[1].content_type == "text"
     assert docs[1].content.startswith("A sample PDF ﬁle")
     assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")
+
+
+def test_id_hash_keys_from_pipeline_params():
+    """Same file twice with different meta must yield distinct ids when `meta` is hashed."""
+    sample_file = SAMPLES_PATH / "docs" / "doc_1.txt"
+    per_file_meta = [{"key": "a"}, {"key": "b"}]
+
+    result, _ = TextConverter().run(
+        file_paths=[sample_file, sample_file], meta=per_file_meta, id_hash_keys=["content", "meta"]
+    )
+    converted = result["documents"]
+
+    # Both inputs are the same file; only the meta passed alongside differs between them.
+    assert len(converted) == 2
+    assert len({doc.id for doc in converted}) == 2
diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py
index 942d7ec41..685900249 100644
--- a/test/nodes/test_preprocessor.py
+++ b/test/nodes/test_preprocessor.py
@@ -113,3 +113,17 @@ def test_remove_substrings():
     assert "🪲" not in documents[0].content
     assert "whitespace" in documents[0].content
     assert "✨" in documents[0].content
+
+
+def test_id_hash_keys_from_pipeline_params():
+    """Splits of same-content documents get distinct ids when `meta` is part of the hash."""
+    first, second = (Document(content="This is a document.", meta={"key": k}) for k in ("a", "b"))
+    # Precondition: built without custom id_hash_keys, the two inputs share one id.
+    assert first.id == second.id
+
+    splitter = PreProcessor(split_length=2, split_respect_sentence_boundary=False)
+    result, _ = splitter.run(documents=[first, second], id_hash_keys=["content", "meta"])
+    splits = result["documents"]
+
+    assert len(splits) == 4
+    assert len({split.id for split in splits}) == 4