diff --git a/haystack/preview/components/preprocessors/text_document_cleaner.py b/haystack/preview/components/preprocessors/text_document_cleaner.py index 0c0693d27..b2f39ddaf 100644 --- a/haystack/preview/components/preprocessors/text_document_cleaner.py +++ b/haystack/preview/components/preprocessors/text_document_cleaner.py @@ -58,7 +58,8 @@ class DocumentCleaner: @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): """ - Run the DocumentCleaner on the given list of documents + Run the DocumentCleaner on the given list of documents. + The documents' metadata and id_hash_keys remain unchanged. """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): raise TypeError("DocumentCleaner expects a List of Documents as input.") @@ -84,7 +85,9 @@ class DocumentCleaner: if self.remove_repeated_substrings: text = self._remove_repeated_substrings(text) - cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata))) + cleaned_docs.append( + Document(text=text, metadata=deepcopy(doc.metadata), id_hash_keys=deepcopy(doc.id_hash_keys)) + ) return {"documents": cleaned_docs} diff --git a/haystack/preview/components/preprocessors/text_document_splitter.py b/haystack/preview/components/preprocessors/text_document_splitter.py index 6273f8167..f9f748b9d 100644 --- a/haystack/preview/components/preprocessors/text_document_splitter.py +++ b/haystack/preview/components/preprocessors/text_document_splitter.py @@ -39,6 +39,7 @@ class TextDocumentSplitter: Splits the documents by split_by after split_length units with an overlap of split_overlap units. Returns a list of documents with the split texts. A metadata field "source_id" is added to each document to keep track of the original document that was split. + Other metadata and id_hash_keys are copied from the original document. :param documents: The documents to split. :return: A list of documents with the split texts. """ @@ -54,8 +55,9 @@ class TextDocumentSplitter: units = self._split_into_units(doc.text, self.split_by) text_splits = self._concatenate_units(units, self.split_length, self.split_overlap) metadata = deepcopy(doc.metadata) + id_hash_keys = deepcopy(doc.id_hash_keys) metadata["source_id"] = doc.id - split_docs += [Document(text=txt, metadata=metadata) for txt in text_splits] + split_docs += [Document(text=txt, metadata=metadata, id_hash_keys=id_hash_keys) for txt in text_splits] return {"documents": split_docs} def to_dict(self) -> Dict[str, Any]: diff --git a/releasenotes/notes/fix-splitter-cleaner-hash-key-3b6f042af7da9ab4.yaml b/releasenotes/notes/fix-splitter-cleaner-hash-key-3b6f042af7da9ab4.yaml new file mode 100644 index 000000000..33708f0b4 --- /dev/null +++ b/releasenotes/notes/fix-splitter-cleaner-hash-key-3b6f042af7da9ab4.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Fixed a bug that caused TextDocumentSplitter and DocumentCleaner to ignore id_hash_keys and create Documents with duplicate ids if the documents differed only in their metadata. diff --git a/test/preview/components/preprocessors/test_text_document_cleaner.py b/test/preview/components/preprocessors/test_text_document_cleaner.py index d8106972f..b4404b8c5 100644 --- a/test/preview/components/preprocessors/test_text_document_cleaner.py +++ b/test/preview/components/preprocessors/test_text_document_cleaner.py @@ -178,3 +178,18 @@ class TestDocumentCleaner: Sed do eiusmod tempor.""" result = cleaner.run(documents=[Document(text=text)]) assert result["documents"][0].text == expected_text + + @pytest.mark.unit + def test_copy_id_hash_keys_and_metadata(self): + cleaner = DocumentCleaner() + documents = [ + Document(text="Text. ", metadata={"name": "doc 0"}, id_hash_keys=["name"]), + Document(text="Text. ", metadata={"name": "doc 1"}, id_hash_keys=["name"]), + ] + result = cleaner.run(documents=documents) + assert len(result["documents"]) == 2 + assert result["documents"][0].id != result["documents"][1].id + for doc, cleaned_doc in zip(documents, result["documents"]): + assert doc.id_hash_keys == cleaned_doc.id_hash_keys + assert doc.metadata == cleaned_doc.metadata + assert cleaned_doc.text == "Text." diff --git a/test/preview/components/preprocessors/test_text_document_splitter.py b/test/preview/components/preprocessors/test_text_document_splitter.py index 584afc6b3..900842333 100644 --- a/test/preview/components/preprocessors/test_text_document_splitter.py +++ b/test/preview/components/preprocessors/test_text_document_splitter.py @@ -155,3 +155,18 @@ class TestTextDocumentSplitter: result = splitter.run(documents=[doc1, doc2]) assert result["documents"][0].metadata["source_id"] == doc1.id assert result["documents"][1].metadata["source_id"] == doc2.id + + @pytest.mark.unit + def test_copy_id_hash_keys_and_metadata(self): + splitter = TextDocumentSplitter(split_by="word", split_length=10) + documents = [ + Document(text="Text.", metadata={"name": "doc 0"}, id_hash_keys=["name"]), + Document(text="Text.", metadata={"name": "doc 1"}, id_hash_keys=["name"]), + ] + result = splitter.run(documents=documents) + assert len(result["documents"]) == 2 + assert result["documents"][0].id != result["documents"][1].id + for doc, split_doc in zip(documents, result["documents"]): + assert doc.id_hash_keys == split_doc.id_hash_keys + assert doc.metadata.items() <= split_doc.metadata.items() + assert split_doc.text == "Text."