fix: TextDocumentSplitter and DocumentCleaner copy id_hash_keys to newly created Documents (#6083)

* copy id_hash_keys in splitter and cleaner * add reno release note
2026-01-07 20:46:31 +00:00 · 2023-10-17 11:03:48 +02:00 · 2023-10-17 11:03:48 +02:00 · 90ddeba579
commit 90ddeba579
parent e963c8acdd
5 changed files with 42 additions and 3 deletions
--- a/haystack/preview/components/preprocessors/text_document_cleaner.py
+++ b/haystack/preview/components/preprocessors/text_document_cleaner.py
@ -58,7 +58,8 @@ class DocumentCleaner:
    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
-        Run the DocumentCleaner on the given list of documents
+        Run the DocumentCleaner on the given list of documents.
+        The documents' metadata and id_hash_keys remain unchanged.
        """
        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
            raise TypeError("DocumentCleaner expects a List of Documents as input.")
@ -84,7 +85,9 @@ class DocumentCleaner:
            if self.remove_repeated_substrings:
                text = self._remove_repeated_substrings(text)

-            cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata)))
+            cleaned_docs.append(
+                Document(text=text, metadata=deepcopy(doc.metadata), id_hash_keys=deepcopy(doc.id_hash_keys))
+            )

        return {"documents": cleaned_docs}

--- a/haystack/preview/components/preprocessors/text_document_splitter.py
+++ b/haystack/preview/components/preprocessors/text_document_splitter.py
@ -39,6 +39,7 @@ class TextDocumentSplitter:
        Splits the documents by split_by after split_length units with an overlap of split_overlap units.
        Returns a list of documents with the split texts.
        A metadata field "source_id" is added to each document to keep track of the original document that was split.
+        Other metadata and id_hash_keys are copied from the original document.
        :param documents: The documents to split.
        :return: A list of documents with the split texts.
        """
@ -54,8 +55,9 @@ class TextDocumentSplitter:
            units = self._split_into_units(doc.text, self.split_by)
            text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
            metadata = deepcopy(doc.metadata)
+            id_hash_keys = deepcopy(doc.id_hash_keys)
            metadata["source_id"] = doc.id
-            split_docs += [Document(text=txt, metadata=metadata) for txt in text_splits]
+            split_docs += [Document(text=txt, metadata=metadata, id_hash_keys=id_hash_keys) for txt in text_splits]
        return {"documents": split_docs}

    def to_dict(self) -> Dict[str, Any]:
--- a/releasenotes/notes/fix-splitter-cleaner-hash-key-3b6f042af7da9ab4.yaml
+++ b/releasenotes/notes/fix-splitter-cleaner-hash-key-3b6f042af7da9ab4.yaml
@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Fixed a bug that caused TextDocumentSplitter and DocumentCleaner to ignore id_hash_keys and create Documents with duplicate ids if the documents differed only in their metadata.
--- a/test/preview/components/preprocessors/test_text_document_cleaner.py
+++ b/test/preview/components/preprocessors/test_text_document_cleaner.py
@ -178,3 +178,18 @@ class TestDocumentCleaner:
        Sed do eiusmod tempor."""
        result = cleaner.run(documents=[Document(text=text)])
        assert result["documents"][0].text == expected_text
+
+    @pytest.mark.unit
+    def test_copy_id_hash_keys_and_metadata(self):
+        cleaner = DocumentCleaner()
+        documents = [
+            Document(text="Text. ", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
+            Document(text="Text. ", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
+        ]
+        result = cleaner.run(documents=documents)
+        assert len(result["documents"]) == 2
+        assert result["documents"][0].id != result["documents"][1].id
+        for doc, cleaned_doc in zip(documents, result["documents"]):
+            assert doc.id_hash_keys == cleaned_doc.id_hash_keys
+            assert doc.metadata == cleaned_doc.metadata
+            assert cleaned_doc.text == "Text."
--- a/test/preview/components/preprocessors/test_text_document_splitter.py
+++ b/test/preview/components/preprocessors/test_text_document_splitter.py
@ -155,3 +155,18 @@ class TestTextDocumentSplitter:
        result = splitter.run(documents=[doc1, doc2])
        assert result["documents"][0].metadata["source_id"] == doc1.id
        assert result["documents"][1].metadata["source_id"] == doc2.id
+
+    @pytest.mark.unit
+    def test_copy_id_hash_keys_and_metadata(self):
+        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        documents = [
+            Document(text="Text.", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
+            Document(text="Text.", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
+        ]
+        result = splitter.run(documents=documents)
+        assert len(result["documents"]) == 2
+        assert result["documents"][0].id != result["documents"][1].id
+        for doc, split_doc in zip(documents, result["documents"]):
+            assert doc.id_hash_keys == split_doc.id_hash_keys
+            assert doc.metadata.items() <= split_doc.metadata.items()
+            assert split_doc.text == "Text."