Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-14 08:37:42 +00:00
feat: add skip_empty_documents init parameter to DocumentSplitter (#9649)

* feat: add skip_empty_documents init parameter to DocumentSplitter
* improve test
* fix + relnote
parent 3b9b1ae802
commit d059cf2c23
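In short: DocumentSplitter gains a keyword-only flag that controls whether documents with empty content are dropped during splitting. A minimal sketch of the new behavior, mirroring the test added below:

    from haystack import Document
    from haystack.components.preprocessors.document_splitter import DocumentSplitter

    splitter = DocumentSplitter(skip_empty_documents=False)
    splitter.warm_up()
    results = splitter.run([Document(content="")])
    # With skip_empty_documents=False the empty document is passed through instead of skipped
    assert results["documents"][0].content == ""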
e2e/pipelines/test_pdf_content_extraction_pipeline.py (new file, 82 lines)

@@ -0,0 +1,82 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from haystack import Pipeline
from haystack.components.converters.pypdf import PyPDFToDocument
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors.document_splitter import DocumentSplitter
from haystack.components.writers.document_writer import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.extractors.image.llm_document_content_extractor import LLMDocumentContentExtractor
from haystack.components.generators.chat.openai import OpenAIChatGenerator
from haystack.components.routers.document_length_router import DocumentLengthRouter


def test_pdf_content_extraction_pipeline():
    """
    Test a pipeline that processes PDFs with the following steps:
    1. Convert PDFs to documents
    2. Split documents by page
    3. Route documents by length (short vs long)
    4. Extract content from short documents using LLM
    5. Join documents back together
    6. Write to document store
    """
    document_store = InMemoryDocumentStore()

    pdf_converter = PyPDFToDocument(store_full_path=True)
    pdf_splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
    doc_length_router = DocumentLengthRouter(threshold=10)
    content_extractor = LLMDocumentContentExtractor(chat_generator=OpenAIChatGenerator(model="gpt-4o-mini"))
    final_doc_joiner = DocumentJoiner(sort_by_score=False)
    document_writer = DocumentWriter(document_store=document_store)

    # Create and configure pipeline
    indexing_pipe = Pipeline()
    indexing_pipe.add_component("pdf_converter", pdf_converter)
    indexing_pipe.add_component("pdf_splitter", pdf_splitter)
    indexing_pipe.add_component("doc_length_router", doc_length_router)
    indexing_pipe.add_component("content_extractor", content_extractor)
    indexing_pipe.add_component("final_doc_joiner", final_doc_joiner)
    indexing_pipe.add_component("document_writer", document_writer)

    # Connect components
    indexing_pipe.connect("pdf_converter.documents", "pdf_splitter.documents")
    indexing_pipe.connect("pdf_splitter.documents", "doc_length_router.documents")
    # The short PDF pages will be enriched/captioned
    indexing_pipe.connect("doc_length_router.short_documents", "content_extractor.documents")
    indexing_pipe.connect("doc_length_router.long_documents", "final_doc_joiner.documents")
    indexing_pipe.connect("content_extractor.documents", "final_doc_joiner.documents")
    indexing_pipe.connect("final_doc_joiner.documents", "document_writer.documents")

    # Test with both text-searchable and non-text-searchable PDFs
    test_files = [
        "test/test_files/pdf/sample_pdf_1.pdf",  # a PDF with 4 pages
        "test/test_files/pdf/non_text_searchable.pdf",  # a non-text searchable PDF with 1 page
    ]

    # Run the indexing pipeline
    indexing_result = indexing_pipe.run(data={"sources": test_files})

    assert indexing_result is not None
    assert "document_writer" in indexing_result

    indexed_documents = document_store.filter_documents()

    # We expect documents from both PDFs
    # sample_pdf_1.pdf has 4 pages, non_text_searchable.pdf has 1 page
    assert len(indexed_documents) == 5

    file_paths = {doc.meta["file_path"] for doc in indexed_documents}
    assert file_paths == set(test_files)

    for doc in indexed_documents:
        assert hasattr(doc, "content")
        assert hasattr(doc, "meta")
        assert "file_path" in doc.meta
        assert "page_number" in doc.meta

    for doc in indexed_documents:
        assert isinstance(doc.meta["page_number"], int)
        assert doc.meta["page_number"] >= 1
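Note: since the pipeline instantiates OpenAIChatGenerator, running this e2e test presumably requires a valid OPENAI_API_KEY in the environment, as with other LLM-backed e2e tests.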
haystack/components/preprocessors/document_splitter.py

@@ -62,6 +62,8 @@ class DocumentSplitter:
         language: Language = "en",
         use_split_rules: bool = True,
         extend_abbreviations: bool = True,
+        *,
+        skip_empty_documents: bool = True,
     ):
         """
         Initialize DocumentSplitter.
@@ -87,6 +89,9 @@ class DocumentSplitter:
         :param use_split_rules: Choose whether to use additional split rules when splitting by `sentence`.
         :param extend_abbreviations: Choose whether to extend NLTK's PunktTokenizer abbreviations with a list
             of curated abbreviations, if available. This is currently supported for English ("en") and German ("de").
+        :param skip_empty_documents: Choose whether to skip documents with empty content. Default is True.
+            Set to False when downstream components in the Pipeline (like LLMDocumentContentExtractor) can extract
+            text from non-textual documents.
         """

         self.split_by = split_by
@@ -98,6 +103,7 @@ class DocumentSplitter:
         self.language = language
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
+        self.skip_empty_documents = skip_empty_documents

         self._init_checks(
             split_by=split_by,
@@ -194,7 +200,7 @@
                 raise ValueError(
                     f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                 )
-            if doc.content == "":
+            if doc.content == "" and self.skip_empty_documents:
                 logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
                 continue

@@ -287,8 +293,8 @@
                 # concatenate the last split with the current one
                 text_splits[-1] += txt

-            # NOTE: This line skips documents that have content=""
-            elif len(txt) > 0:
+            # NOTE: If skip_empty_documents is True, this line skips documents that have content=""
+            elif not self.skip_empty_documents or len(txt) > 0:
                 text_splits.append(txt)
                 splits_pages.append(cur_page)
                 splits_start_idxs.append(cur_start_idx)
@@ -375,6 +381,7 @@
             language=self.language,
             use_split_rules=self.use_split_rules,
             extend_abbreviations=self.extend_abbreviations,
+            skip_empty_documents=self.skip_empty_documents,
         )
         if self.splitting_function:
             serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function)
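Two details worth noting in the hunks above. First, the rewritten condition `elif not self.skip_empty_documents or len(txt) > 0` keeps a split when skipping is disabled or when the split is non-empty, so empty splits survive only with skip_empty_documents=False. Second, because to_dict() now serializes the flag, a round trip preserves it; a minimal sketch, not part of the diff:

    from haystack.components.preprocessors.document_splitter import DocumentSplitter

    splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
    data = splitter.to_dict()
    assert data["init_parameters"]["skip_empty_documents"] is False

    # Deserializing restores the same configuration
    restored = DocumentSplitter.from_dict(data)
    assert restored.skip_empty_documents is False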
Release note (new file, 6 lines)

@@ -0,0 +1,6 @@
---
enhancements:
  - |
    Add the init parameter `skip_empty_documents` to the `DocumentSplitter` component. The default value is True.
    Setting it to False can be useful when downstream components in the Pipeline (like `LLMDocumentContentExtractor`)
    can extract text from non-textual documents.
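In practice this means the splitter can hand empty page-level documents to a router instead of silently dropping them. A sketch condensed from the e2e test above (component names match that test):

    from haystack.components.preprocessors.document_splitter import DocumentSplitter
    from haystack.components.routers.document_length_router import DocumentLengthRouter

    # Keep empty page-level splits so they can be routed downstream
    pdf_splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
    # Pages shorter than the threshold (including empty ones) go to the "short_documents"
    # output, which the e2e test wires to LLMDocumentContentExtractor
    doc_length_router = DocumentLengthRouter(threshold=10)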
DocumentSplitter tests

@@ -444,6 +444,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert serialized["init_parameters"]["split_length"] == 10
         assert serialized["init_parameters"]["split_overlap"] == 2
         assert serialized["init_parameters"]["split_threshold"] == 5
+        assert serialized["init_parameters"]["skip_empty_documents"]
         assert "splitting_function" not in serialized["init_parameters"]

     def test_to_dict_with_splitting_function(self):
@@ -457,6 +458,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert serialized["type"] == "haystack.components.preprocessors.document_splitter.DocumentSplitter"
         assert serialized["init_parameters"]["split_by"] == "function"
         assert "splitting_function" in serialized["init_parameters"]
+        assert serialized["init_parameters"]["skip_empty_documents"]
         assert callable(deserialize_callable(serialized["init_parameters"]["splitting_function"]))

     def test_from_dict(self):
@@ -465,7 +467,13 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         data = {
             "type": "haystack.components.preprocessors.document_splitter.DocumentSplitter",
-            "init_parameters": {"split_by": "word", "split_length": 10, "split_overlap": 2, "split_threshold": 5},
+            "init_parameters": {
+                "split_by": "word",
+                "split_length": 10,
+                "split_overlap": 2,
+                "split_threshold": 5,
+                "skip_empty_documents": False,
+            },
         }
         splitter = DocumentSplitter.from_dict(data)

@@ -474,6 +482,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert splitter.split_overlap == 2
         assert splitter.split_threshold == 5
         assert splitter.splitting_function is None
+        assert splitter.skip_empty_documents is False

     def test_from_dict_with_splitting_function(self):
         """
@@ -516,7 +525,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert callable(deserialized_splitter.splitting_function)
         assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]

-    def test_run_empty_document(self):
+    def test_run_empty_document_with_skip_empty_documents_true(self):
         """
         Test if the component runs correctly with an empty document.
         """
@@ -526,6 +535,14 @@ class TestSplittingByFunctionOrCharacterRegex:
         results = splitter.run([doc])
         assert results["documents"] == []

+    def test_run_empty_document_with_skip_empty_documents_false(self):
+        splitter = DocumentSplitter(skip_empty_documents=False)
+        doc = Document(content="")
+        splitter.warm_up()
+        results = splitter.run([doc])
+        assert len(results["documents"]) == 1
+        assert results["documents"][0].content == ""
+
     def test_run_document_only_whitespaces(self):
         """
         Test if the component runs correctly with a document containing only whitespaces.