From ec376c7dbd33faa50fce27d5c4a86c1ae20b5bbb Mon Sep 17 00:00:00 2001 From: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Date: Fri, 20 Oct 2023 15:16:06 +0200 Subject: [PATCH] Remove id_hash_keys from DocumentCleaner (#6123) --- .../components/preprocessors/text_document_cleaner.py | 6 ++---- .../components/preprocessors/test_text_document_cleaner.py | 7 +++---- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py index 0e6cafd56..63d833a59 100644 --- a/haystack/preview/components/preprocessors/text_document_cleaner.py +++ b/haystack/preview/components/preprocessors/text_document_cleaner.py @@ -59,7 +59,7 @@ class DocumentCleaner: def run(self, documents: List[Document]): """ Run the DocumentCleaner on the given list of documents. - The documents' metadata and id_hash_keys remain unchanged. + The documents' metadata remain unchanged. """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): raise TypeError("DocumentCleaner expects a List of Documents as input.") @@ -85,9 +85,7 @@ class DocumentCleaner: if self.remove_repeated_substrings: text = self._remove_repeated_substrings(text) - cleaned_docs.append( - Document(text=text, metadata=deepcopy(doc.metadata), id_hash_keys=deepcopy(doc.id_hash_keys)) - ) + cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata))) return {"documents": cleaned_docs} diff --git a/test/preview/components/preprocessors/test_text_document_cleaner.py b/test/preview/components/preprocessors/test_text_document_cleaner.py index 17e431e3f..88d671d30 100644 --- a/test/preview/components/preprocessors/test_text_document_cleaner.py +++ b/test/preview/components/preprocessors/test_text_document_cleaner.py @@ -125,16 +125,15 @@ class TestDocumentCleaner: assert result["documents"][0].text == expected_text @pytest.mark.unit - def test_copy_id_hash_keys_and_metadata(self): + def test_copy_metadata(self): cleaner = DocumentCleaner() documents = [ - Document(text="Text. ", metadata={"name": "doc 0"}, id_hash_keys=["name"]), - Document(text="Text. ", metadata={"name": "doc 1"}, id_hash_keys=["name"]), + Document(text="Text. ", metadata={"name": "doc 0"}), + Document(text="Text. ", metadata={"name": "doc 1"}), ] result = cleaner.run(documents=documents) assert len(result["documents"]) == 2 assert result["documents"][0].id != result["documents"][1].id for doc, cleaned_doc in zip(documents, result["documents"]): - assert doc.id_hash_keys == cleaned_doc.id_hash_keys assert doc.metadata == cleaned_doc.metadata assert cleaned_doc.text == "Text."