From d059cf2c23f429e90d79b3a535eebbc0a9204958 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci
Date: Thu, 24 Jul 2025 11:26:11 +0200
Subject: [PATCH] feat: add `skip_empty_documents` init parameter to `DocumentSplitter` (#9649)

* feat: add skip_empty_documents init parameter to DocumentSplitter

* improve test

* fix + relnote
---
 .../test_pdf_content_extraction_pipeline.py   | 82 +++++++++++++++++++
 .../preprocessors/document_splitter.py        | 13 ++-
 ...tter-skip-empty-docs-a031fa7c5ddf8c93.yaml |  6 ++
 .../preprocessors/test_document_splitter.py   | 26 +++++-
 4 files changed, 121 insertions(+), 6 deletions(-)
 create mode 100644 e2e/pipelines/test_pdf_content_extraction_pipeline.py
 create mode 100644 releasenotes/notes/doc-splitter-skip-empty-docs-a031fa7c5ddf8c93.yaml

diff --git a/e2e/pipelines/test_pdf_content_extraction_pipeline.py b/e2e/pipelines/test_pdf_content_extraction_pipeline.py
new file mode 100644
index 000000000..f4ca83a8d
--- /dev/null
+++ b/e2e/pipelines/test_pdf_content_extraction_pipeline.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from haystack import Pipeline
+from haystack.components.converters.pypdf import PyPDFToDocument
+from haystack.components.joiners import DocumentJoiner
+from haystack.components.preprocessors.document_splitter import DocumentSplitter
+from haystack.components.writers.document_writer import DocumentWriter
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack.components.extractors.image.llm_document_content_extractor import LLMDocumentContentExtractor
+from haystack.components.generators.chat.openai import OpenAIChatGenerator
+from haystack.components.routers.document_length_router import DocumentLengthRouter
+
+
+def test_pdf_content_extraction_pipeline():
+    """
+    Test a pipeline that processes PDFs with the following steps:
+    1. Convert PDFs to documents
+    2. Split documents by page
+    3. Route documents by length (short vs long)
+    4. Extract content from short documents using LLM
+    5. Join documents back together
+    6. Write to document store
+    """
+    document_store = InMemoryDocumentStore()
+
+    pdf_converter = PyPDFToDocument(store_full_path=True)
+    pdf_splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
+    doc_length_router = DocumentLengthRouter(threshold=10)
+    content_extractor = LLMDocumentContentExtractor(chat_generator=OpenAIChatGenerator(model="gpt-4o-mini"))
+    final_doc_joiner = DocumentJoiner(sort_by_score=False)
+    document_writer = DocumentWriter(document_store=document_store)
+
+    # Create and configure pipeline
+    indexing_pipe = Pipeline()
+    indexing_pipe.add_component("pdf_converter", pdf_converter)
+    indexing_pipe.add_component("pdf_splitter", pdf_splitter)
+    indexing_pipe.add_component("doc_length_router", doc_length_router)
+    indexing_pipe.add_component("content_extractor", content_extractor)
+    indexing_pipe.add_component("final_doc_joiner", final_doc_joiner)
+    indexing_pipe.add_component("document_writer", document_writer)
+
+    # Connect components
+    indexing_pipe.connect("pdf_converter.documents", "pdf_splitter.documents")
+    indexing_pipe.connect("pdf_splitter.documents", "doc_length_router.documents")
+    # The short PDF pages will be enriched/captioned
+    indexing_pipe.connect("doc_length_router.short_documents", "content_extractor.documents")
+    indexing_pipe.connect("doc_length_router.long_documents", "final_doc_joiner.documents")
+    indexing_pipe.connect("content_extractor.documents", "final_doc_joiner.documents")
+    indexing_pipe.connect("final_doc_joiner.documents", "document_writer.documents")
+
+    # Test with both text-searchable and non-text-searchable PDFs
+    test_files = [
+        "test/test_files/pdf/sample_pdf_1.pdf",  # a PDF with 4 pages
+        "test/test_files/pdf/non_text_searchable.pdf",  # a non-text-searchable PDF with 1 page
+    ]
+
+    # Run the indexing pipeline
+    indexing_result = indexing_pipe.run(data={"sources": test_files})
+
+    assert indexing_result is not None
+    assert "document_writer" in indexing_result
+
+    indexed_documents = document_store.filter_documents()
+
+    # We expect documents from both PDFs
+    # sample_pdf_1.pdf has 4 pages, non_text_searchable.pdf has 1 page
+    assert len(indexed_documents) == 5
+
+    file_paths = {doc.meta["file_path"] for doc in indexed_documents}
+    assert file_paths == set(test_files)
+
+    for doc in indexed_documents:
+        assert hasattr(doc, "content")
+        assert hasattr(doc, "meta")
+        assert "file_path" in doc.meta
+        assert "page_number" in doc.meta
+
+    for doc in indexed_documents:
+        assert isinstance(doc.meta["page_number"], int)
+        assert doc.meta["page_number"] >= 1
diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py
index 7c646176b..fcd58e999 100644
--- a/haystack/components/preprocessors/document_splitter.py
+++ b/haystack/components/preprocessors/document_splitter.py
@@ -62,6 +62,8 @@ class DocumentSplitter:
         language: Language = "en",
         use_split_rules: bool = True,
         extend_abbreviations: bool = True,
+        *,
+        skip_empty_documents: bool = True,
     ):
         """
         Initialize DocumentSplitter.
@@ -87,6 +89,9 @@ class DocumentSplitter:
         :param use_split_rules: Choose whether to use additional split rules when splitting by `sentence`.
         :param extend_abbreviations: Choose whether to extend NLTK's PunktTokenizer abbreviations with a list
             of curated abbreviations, if available. This is currently supported for English ("en") and German ("de").
+        :param skip_empty_documents: Choose whether to skip documents with empty content. Default is True.
+            Set to False when downstream components in the Pipeline (like LLMDocumentContentExtractor) can extract text
+            from non-textual documents.
         """
 
         self.split_by = split_by
@@ -98,6 +103,7 @@ class DocumentSplitter:
         self.language = language
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
+        self.skip_empty_documents = skip_empty_documents
 
         self._init_checks(
             split_by=split_by,
@@ -194,7 +200,7 @@ class DocumentSplitter:
                 raise ValueError(
                     f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                 )
-            if doc.content == "":
+            if doc.content == "" and self.skip_empty_documents:
                 logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
                 continue
 
@@ -287,8 +293,8 @@ class DocumentSplitter:
                 # concatenate the last split with the current one
                 text_splits[-1] += txt
 
-            # NOTE: This line skips documents that have content=""
-            elif len(txt) > 0:
+            # NOTE: If skip_empty_documents is True, this line skips documents that have content=""
+            elif not self.skip_empty_documents or len(txt) > 0:
                 text_splits.append(txt)
                 splits_pages.append(cur_page)
                 splits_start_idxs.append(cur_start_idx)
@@ -375,6 +381,7 @@ class DocumentSplitter:
             language=self.language,
             use_split_rules=self.use_split_rules,
             extend_abbreviations=self.extend_abbreviations,
+            skip_empty_documents=self.skip_empty_documents,
         )
         if self.splitting_function:
             serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function)
diff --git a/releasenotes/notes/doc-splitter-skip-empty-docs-a031fa7c5ddf8c93.yaml b/releasenotes/notes/doc-splitter-skip-empty-docs-a031fa7c5ddf8c93.yaml
new file mode 100644
index 000000000..48c2314eb
--- /dev/null
+++ b/releasenotes/notes/doc-splitter-skip-empty-docs-a031fa7c5ddf8c93.yaml
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Add the init parameter `skip_empty_documents` to the `DocumentSplitter` component. The default value is True.
+    Setting it to False can be useful when downstream components in the Pipeline (like `LLMDocumentContentExtractor`)
+    can extract text from non-textual documents.
diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py
index 91779f606..551b15577 100644
--- a/test/components/preprocessors/test_document_splitter.py
+++ b/test/components/preprocessors/test_document_splitter.py
@@ -444,6 +444,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert serialized["init_parameters"]["split_length"] == 10
         assert serialized["init_parameters"]["split_overlap"] == 2
         assert serialized["init_parameters"]["split_threshold"] == 5
+        assert serialized["init_parameters"]["skip_empty_documents"]
         assert "splitting_function" not in serialized["init_parameters"]
 
     def test_to_dict_with_splitting_function(self):
@@ -457,6 +458,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert serialized["type"] == "haystack.components.preprocessors.document_splitter.DocumentSplitter"
         assert serialized["init_parameters"]["split_by"] == "function"
         assert "splitting_function" in serialized["init_parameters"]
+        assert serialized["init_parameters"]["skip_empty_documents"]
         assert callable(deserialize_callable(serialized["init_parameters"]["splitting_function"]))
 
     def test_from_dict(self):
@@ -465,7 +467,13 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         data = {
             "type": "haystack.components.preprocessors.document_splitter.DocumentSplitter",
-            "init_parameters": {"split_by": "word", "split_length": 10, "split_overlap": 2, "split_threshold": 5},
+            "init_parameters": {
+                "split_by": "word",
+                "split_length": 10,
+                "split_overlap": 2,
+                "split_threshold": 5,
+                "skip_empty_documents": False,
+            },
         }
 
         splitter = DocumentSplitter.from_dict(data)
@@ -474,6 +482,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert splitter.split_overlap == 2
         assert splitter.split_threshold == 5
         assert splitter.splitting_function is None
+        assert splitter.skip_empty_documents is False
 
     def test_from_dict_with_splitting_function(self):
         """
@@ -516,7 +525,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert callable(deserialized_splitter.splitting_function)
         assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]
 
-    def test_run_empty_document(self):
+    def test_run_empty_document_with_skip_empty_documents_true(self):
         """
-        Test if the component runs correctly with an empty document.
+        Test that a document with empty content is skipped when skip_empty_documents is True.
         """
@@ -526,6 +535,17 @@ class TestSplittingByFunctionOrCharacterRegex:
         results = splitter.run([doc])
         assert results["documents"] == []
 
+    def test_run_empty_document_with_skip_empty_documents_false(self):
+        """
+        Test that a document with empty content is kept when skip_empty_documents is False.
+        """
+        splitter = DocumentSplitter(skip_empty_documents=False)
+        doc = Document(content="")
+        splitter.warm_up()
+        results = splitter.run([doc])
+        assert len(results["documents"]) == 1
+        assert results["documents"][0].content == ""
+
     def test_run_document_only_whitespaces(self):
        """
        Test if the component runs correctly with a document containing only whitespaces.
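
A minimal usage sketch of the new parameter follows. It is illustrative only and not part of the patch; it relies solely on the `DocumentSplitter` API and behavior added above (a document with empty content survives splitting when `skip_empty_documents=False`):

    from haystack import Document
    from haystack.components.preprocessors.document_splitter import DocumentSplitter

    # With skip_empty_documents=False, pages with no extractable text (for example,
    # pages of a scanned, non-text-searchable PDF) are passed through instead of
    # being dropped, so a downstream component such as LLMDocumentContentExtractor
    # can still fill in their content.
    splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
    splitter.warm_up()

    docs = [Document(content="a page with text"), Document(content="")]
    result = splitter.run(documents=docs)
    assert len(result["documents"]) == 2  # the empty document is kept, not skipped

With the default `skip_empty_documents=True`, the same run would emit only the non-empty split and log a warning for the empty document, which matches the pre-existing behavior.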