From ec376c7dbd33faa50fce27d5c4a86c1ae20b5bbb Mon Sep 17 00:00:00 2001
From: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Date: Fri, 20 Oct 2023 15:16:06 +0200
Subject: [PATCH] Remove id_hash_keys from DocumentCleaner (#6123)

---
 .../components/preprocessors/text_document_cleaner.py      | 6 ++----
 .../components/preprocessors/test_text_document_cleaner.py | 7 +++----
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py
index 0e6cafd56..63d833a59 100644
--- a/haystack/preview/components/preprocessors/text_document_cleaner.py
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@@ -59,7 +59,7 @@ class DocumentCleaner:
     def run(self, documents: List[Document]):
         """
         Run the DocumentCleaner on the given list of documents.
-        The documents' metadata and id_hash_keys remain unchanged.
+        The documents' metadata remain unchanged.
         """
         if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
             raise TypeError("DocumentCleaner expects a List of Documents as input.")
@@ -85,9 +85,7 @@ class DocumentCleaner:
             if self.remove_repeated_substrings:
                 text = self._remove_repeated_substrings(text)
 
-            cleaned_docs.append(
-                Document(text=text, metadata=deepcopy(doc.metadata), id_hash_keys=deepcopy(doc.id_hash_keys))
-            )
+            cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata)))
 
         return {"documents": cleaned_docs}
 
diff --git a/test/preview/components/preprocessors/test_text_document_cleaner.py b/test/preview/components/preprocessors/test_text_document_cleaner.py
index 17e431e3f..88d671d30 100644
--- a/test/preview/components/preprocessors/test_text_document_cleaner.py
+++ b/test/preview/components/preprocessors/test_text_document_cleaner.py
@@ -125,16 +125,15 @@ class TestDocumentCleaner:
         assert result["documents"][0].text == expected_text
 
     @pytest.mark.unit
-    def test_copy_id_hash_keys_and_metadata(self):
+    def test_copy_metadata(self):
         cleaner = DocumentCleaner()
         documents = [
-            Document(text="Text. ", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
-            Document(text="Text. ", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
+            Document(text="Text. ", metadata={"name": "doc 0"}),
+            Document(text="Text. ", metadata={"name": "doc 1"}),
         ]
         result = cleaner.run(documents=documents)
         assert len(result["documents"]) == 2
         assert result["documents"][0].id != result["documents"][1].id
         for doc, cleaned_doc in zip(documents, result["documents"]):
-            assert doc.id_hash_keys == cleaned_doc.id_hash_keys
             assert doc.metadata == cleaned_doc.metadata
             assert cleaned_doc.text == "Text."