From d059cf2c23f429e90d79b3a535eebbc0a9204958 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci
Date: Thu, 24 Jul 2025 11:26:11 +0200
Subject: [PATCH] feat: add `skip_empty_documents` init parameter to `DocumentSplitter` (#9649)

* feat: add skip_empty_documents init parameter to DocumentSplitter

* improve test

* fix + relnote
---
 .../test_pdf_content_extraction_pipeline.py   | 82 +++++++++++++++++++
 .../preprocessors/document_splitter.py        | 13 ++-
 ...tter-skip-empty-docs-a031fa7c5ddf8c93.yaml |  6 ++
 .../preprocessors/test_document_splitter.py   | 26 +++++-
 4 files changed, 121 insertions(+), 6 deletions(-)
 create mode 100644 e2e/pipelines/test_pdf_content_extraction_pipeline.py
 create mode 100644 releasenotes/notes/doc-splitter-skip-empty-docs-a031fa7c5ddf8c93.yaml

diff --git a/e2e/pipelines/test_pdf_content_extraction_pipeline.py b/e2e/pipelines/test_pdf_content_extraction_pipeline.py
new file mode 100644
index 000000000..f4ca83a8d
--- /dev/null
+++ b/e2e/pipelines/test_pdf_content_extraction_pipeline.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from haystack import Pipeline
+from haystack.components.converters.pypdf import PyPDFToDocument
+from haystack.components.joiners import DocumentJoiner
+from haystack.components.preprocessors.document_splitter import DocumentSplitter
+from haystack.components.writers.document_writer import DocumentWriter
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack.components.extractors.image.llm_document_content_extractor import LLMDocumentContentExtractor
+from haystack.components.generators.chat.openai import OpenAIChatGenerator
+from haystack.components.routers.document_length_router import DocumentLengthRouter
+
+
+def test_pdf_content_extraction_pipeline():
+    """
+    Test a pipeline that processes PDFs with the following steps:
+    1. Convert PDFs to documents
+    2. Split documents by page
+    3. Route documents by length (short vs long)
+    4. Extract content from short documents using LLM
+    5. Join documents back together
+    6. Write to document store
+    """
+    document_store = InMemoryDocumentStore()
+
+    pdf_converter = PyPDFToDocument(store_full_path=True)
+    pdf_splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
+    doc_length_router = DocumentLengthRouter(threshold=10)
+    content_extractor = LLMDocumentContentExtractor(chat_generator=OpenAIChatGenerator(model="gpt-4o-mini"))
+    final_doc_joiner = DocumentJoiner(sort_by_score=False)
+    document_writer = DocumentWriter(document_store=document_store)
+
+    # Create and configure pipeline
+    indexing_pipe = Pipeline()
+    indexing_pipe.add_component("pdf_converter", pdf_converter)
+    indexing_pipe.add_component("pdf_splitter", pdf_splitter)
+    indexing_pipe.add_component("doc_length_router", doc_length_router)
+    indexing_pipe.add_component("content_extractor", content_extractor)
+    indexing_pipe.add_component("final_doc_joiner", final_doc_joiner)
+    indexing_pipe.add_component("document_writer", document_writer)
+
+    # Connect components
+    indexing_pipe.connect("pdf_converter.documents", "pdf_splitter.documents")
+    indexing_pipe.connect("pdf_splitter.documents", "doc_length_router.documents")
+    # The short PDF pages will be enriched/captioned
+    indexing_pipe.connect("doc_length_router.short_documents", "content_extractor.documents")
+    indexing_pipe.connect("doc_length_router.long_documents", "final_doc_joiner.documents")
+    indexing_pipe.connect("content_extractor.documents", "final_doc_joiner.documents")
+    indexing_pipe.connect("final_doc_joiner.documents", "document_writer.documents")
+
+    # Test with both text-searchable and non-text-searchable PDFs
+    test_files = [
+        "test/test_files/pdf/sample_pdf_1.pdf",  # a PDF with 4 pages
+        "test/test_files/pdf/non_text_searchable.pdf",  # a non-text-searchable PDF with 1 page
+    ]
+
+    # Run the indexing pipeline
+    indexing_result = indexing_pipe.run(data={"sources": test_files})
+
+    assert indexing_result is not None
+    assert "document_writer" in indexing_result
+
+    indexed_documents = document_store.filter_documents()
+
+    # We expect documents from both PDFs
+    # sample_pdf_1.pdf has 4 pages, non_text_searchable.pdf has 1 page
+    assert len(indexed_documents) == 5
+
+    file_paths = {doc.meta["file_path"] for doc in indexed_documents}
+    assert file_paths == set(test_files)
+
+    for doc in indexed_documents:
+        assert hasattr(doc, "content")
+        assert hasattr(doc, "meta")
+        assert "file_path" in doc.meta
+        assert "page_number" in doc.meta
+
+    for doc in indexed_documents:
+        assert isinstance(doc.meta["page_number"], int)
+        assert doc.meta["page_number"] >= 1
diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py
index 7c646176b..fcd58e999 100644
--- a/haystack/components/preprocessors/document_splitter.py
+++ b/haystack/components/preprocessors/document_splitter.py
@@ -62,6 +62,8 @@ class DocumentSplitter:
         language: Language = "en",
         use_split_rules: bool = True,
         extend_abbreviations: bool = True,
+        *,
+        skip_empty_documents: bool = True,
     ):
         """
         Initialize DocumentSplitter.
@@ -87,6 +89,9 @@ class DocumentSplitter:
         :param use_split_rules: Choose whether to use additional split rules when splitting by `sentence`.
         :param extend_abbreviations: Choose whether to extend NLTK's PunktTokenizer abbreviations with a list
             of curated abbreviations, if available. This is currently supported for English ("en") and German ("de").
+        :param skip_empty_documents: Choose whether to skip documents with empty content. Default is True.
+            Set to False when downstream components in the Pipeline (like LLMDocumentContentExtractor) can extract text
+            from non-textual documents.
         """
 
         self.split_by = split_by
@@ -98,6 +103,7 @@ class DocumentSplitter:
         self.language = language
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
+        self.skip_empty_documents = skip_empty_documents
 
         self._init_checks(
             split_by=split_by,
@@ -194,7 +200,7 @@ class DocumentSplitter:
                 raise ValueError(
                     f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                 )
-            if doc.content == "":
+            if doc.content == "" and self.skip_empty_documents:
                 logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
                 continue
 
@@ -287,8 +293,8 @@ class DocumentSplitter:
                 # concatenate the last split with the current one
                 text_splits[-1] += txt
 
-            # NOTE: This line skips documents that have content=""
-            elif len(txt) > 0:
+            # NOTE: If skip_empty_documents is True, this line skips documents that have content=""
+            elif not self.skip_empty_documents or len(txt) > 0:
                 text_splits.append(txt)
                 splits_pages.append(cur_page)
                 splits_start_idxs.append(cur_start_idx)
@@ -375,6 +381,7 @@ class DocumentSplitter:
             language=self.language,
             use_split_rules=self.use_split_rules,
             extend_abbreviations=self.extend_abbreviations,
+            skip_empty_documents=self.skip_empty_documents,
         )
         if self.splitting_function:
             serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function)
diff --git a/releasenotes/notes/doc-splitter-skip-empty-docs-a031fa7c5ddf8c93.yaml b/releasenotes/notes/doc-splitter-skip-empty-docs-a031fa7c5ddf8c93.yaml
new file mode 100644
index 000000000..48c2314eb
--- /dev/null
+++ b/releasenotes/notes/doc-splitter-skip-empty-docs-a031fa7c5ddf8c93.yaml
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Add the init parameter `skip_empty_documents` to the `DocumentSplitter` component. The default value is True.
+    Setting it to False can be useful when downstream components in the Pipeline (like `LLMDocumentContentExtractor`)
+    can extract text from non-textual documents.
diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py
index 91779f606..551b15577 100644
--- a/test/components/preprocessors/test_document_splitter.py
+++ b/test/components/preprocessors/test_document_splitter.py
@@ -444,6 +444,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert serialized["init_parameters"]["split_length"] == 10
         assert serialized["init_parameters"]["split_overlap"] == 2
         assert serialized["init_parameters"]["split_threshold"] == 5
+        assert serialized["init_parameters"]["skip_empty_documents"]
         assert "splitting_function" not in serialized["init_parameters"]
 
     def test_to_dict_with_splitting_function(self):
@@ -457,6 +458,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert serialized["type"] == "haystack.components.preprocessors.document_splitter.DocumentSplitter"
         assert serialized["init_parameters"]["split_by"] == "function"
         assert "splitting_function" in serialized["init_parameters"]
+        assert serialized["init_parameters"]["skip_empty_documents"]
         assert callable(deserialize_callable(serialized["init_parameters"]["splitting_function"]))
 
     def test_from_dict(self):
@@ -465,7 +467,13 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         data = {
             "type": "haystack.components.preprocessors.document_splitter.DocumentSplitter",
-            "init_parameters": {"split_by": "word", "split_length": 10, "split_overlap": 2, "split_threshold": 5},
+            "init_parameters": {
+                "split_by": "word",
+                "split_length": 10,
+                "split_overlap": 2,
+                "split_threshold": 5,
+                "skip_empty_documents": False,
+            },
         }
 
         splitter = DocumentSplitter.from_dict(data)
@@ -474,6 +482,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert splitter.split_overlap == 2
         assert splitter.split_threshold == 5
         assert splitter.splitting_function is None
+        assert splitter.skip_empty_documents is False
 
     def test_from_dict_with_splitting_function(self):
         """
@@ -516,7 +525,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert callable(deserialized_splitter.splitting_function)
         assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]
 
-    def test_run_empty_document(self):
+    def test_run_empty_document_with_skip_empty_documents_true(self):
         """
-        Test if the component runs correctly with an empty document.
+        Test that a document with empty content is skipped when skip_empty_documents is True.
         """
@@ -526,6 +535,17 @@ class TestSplittingByFunctionOrCharacterRegex:
         results = splitter.run([doc])
         assert results["documents"] == []
 
+    def test_run_empty_document_with_skip_empty_documents_false(self):
+        """
+        Test that a document with empty content is kept when skip_empty_documents is False.
+        """
+        splitter = DocumentSplitter(skip_empty_documents=False)
+        doc = Document(content="")
+        splitter.warm_up()
+        results = splitter.run([doc])
+        assert len(results["documents"]) == 1
+        assert results["documents"][0].content == ""
+
     def test_run_document_only_whitespaces(self):
        """
        Test if the component runs correctly with a document containing only whitespaces.
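
A minimal usage sketch of the new parameter follows. It is illustrative only and not part of the patch; it relies solely on the `DocumentSplitter` API and behavior added above (a document with empty content survives splitting when `skip_empty_documents=False`):

    from haystack import Document
    from haystack.components.preprocessors.document_splitter import DocumentSplitter

    # With skip_empty_documents=False, pages with no extractable text (for example,
    # pages of a scanned, non-text-searchable PDF) are passed through instead of
    # being dropped, so a downstream component such as LLMDocumentContentExtractor
    # can still fill in their content.
    splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
    splitter.warm_up()

    docs = [Document(content="a page with text"), Document(content="")]
    result = splitter.run(documents=docs)
    assert len(result["documents"]) == 2  # the empty document is kept, not skipped

With the default `skip_empty_documents=True`, the same run would emit only the non-empty split and log a warning for the empty document, which matches the pre-existing behavior.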