diff --git a/haystack/components/preprocessors/document_cleaner.py b/haystack/components/preprocessors/document_cleaner.py index 80f281beb..4248862d4 100644 --- a/haystack/components/preprocessors/document_cleaner.py +++ b/haystack/components/preprocessors/document_cleaner.py @@ -40,6 +40,7 @@ class DocumentCleaner: remove_empty_lines: bool = True, remove_extra_whitespaces: bool = True, remove_repeated_substrings: bool = False, + keep_id: bool = False, remove_substrings: Optional[List[str]] = None, remove_regex: Optional[str] = None, ): @@ -53,6 +54,7 @@ class DocumentCleaner: which is supported by `TextFileToDocument` and `AzureOCRDocumentConverter`. :param remove_substrings: List of substrings to remove from the text. :param remove_regex: Regex to match and replace substrings by "". + :param keep_id: Whether to keep the IDs of the original documents. """ self.remove_empty_lines = remove_empty_lines @@ -60,6 +62,7 @@ class DocumentCleaner: self.remove_repeated_substrings = remove_repeated_substrings self.remove_substrings = remove_substrings self.remove_regex = remove_regex + self.keep_id = keep_id @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): @@ -98,7 +101,7 @@ class DocumentCleaner: if self.remove_repeated_substrings: text = self._remove_repeated_substrings(text) - cleaned_docs.append(Document(content=text, meta=deepcopy(doc.meta))) + cleaned_docs.append(Document(content=text, meta=deepcopy(doc.meta), id=doc.id if self.keep_id else "")) return {"documents": cleaned_docs} diff --git a/releasenotes/notes/add-keep-id-to-document-cleaner-2a9854b5f195bb78.yaml b/releasenotes/notes/add-keep-id-to-document-cleaner-2a9854b5f195bb78.yaml new file mode 100644 index 000000000..805c811d5 --- /dev/null +++ b/releasenotes/notes/add-keep-id-to-document-cleaner-2a9854b5f195bb78.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + The `DocumentCleaner` class has a new optional parameter `keep_id`; when set to `True`, the document IDs are kept unchanged 
after cleanup. diff --git a/test/components/preprocessors/test_document_cleaner.py b/test/components/preprocessors/test_document_cleaner.py index 2aaf2f370..0acd9e8e8 100644 --- a/test/components/preprocessors/test_document_cleaner.py +++ b/test/components/preprocessors/test_document_cleaner.py @@ -17,6 +17,7 @@ class TestDocumentCleaner: assert cleaner.remove_repeated_substrings is False assert cleaner.remove_substrings is None assert cleaner.remove_regex is None + assert cleaner.keep_id is False def test_non_text_document(self, caplog): with caplog.at_level(logging.WARNING): @@ -130,3 +131,11 @@ class TestDocumentCleaner: for doc, cleaned_doc in zip(documents, result["documents"]): assert doc.meta == cleaned_doc.meta assert cleaned_doc.content == "Text." + + def test_keep_id_does_not_alter_document_ids(self): + cleaner = DocumentCleaner(keep_id=True) + documents = [Document(content="Text. ", id="1"), Document(content="Text. ", id="2")] + result = cleaner.run(documents=documents) + assert len(result["documents"]) == 2 + assert result["documents"][0].id == "1" + assert result["documents"][1].id == "2"