feat: add skip_empty_documents init parameter to DocumentSplitter (#9649)

* feat: add skip_empty_documents init parameter to DocumentSplitter

* improve test

* fix + relnote
Stefano Fiorucci 2025-07-24 11:26:11 +02:00 committed by GitHub
parent 3b9b1ae802
commit d059cf2c23
4 changed files with 117 additions and 5 deletions

@@ -0,0 +1,82 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from haystack import Pipeline
from haystack.components.converters.pypdf import PyPDFToDocument
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors.document_splitter import DocumentSplitter
from haystack.components.writers.document_writer import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.extractors.image.llm_document_content_extractor import LLMDocumentContentExtractor
from haystack.components.generators.chat.openai import OpenAIChatGenerator
from haystack.components.routers.document_length_router import DocumentLengthRouter
def test_pdf_content_extraction_pipeline():
"""
Test a pipeline that processes PDFs with the following steps:
1. Convert PDFs to documents
2. Split documents by page
3. Route documents by length (short vs long)
4. Extract content from short documents using LLM
5. Join documents back together
6. Write to document store
"""
document_store = InMemoryDocumentStore()
pdf_converter = PyPDFToDocument(store_full_path=True)
pdf_splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
doc_length_router = DocumentLengthRouter(threshold=10)
content_extractor = LLMDocumentContentExtractor(chat_generator=OpenAIChatGenerator(model="gpt-4o-mini"))
final_doc_joiner = DocumentJoiner(sort_by_score=False)
document_writer = DocumentWriter(document_store=document_store)
# Create and configure pipeline
indexing_pipe = Pipeline()
indexing_pipe.add_component("pdf_converter", pdf_converter)
indexing_pipe.add_component("pdf_splitter", pdf_splitter)
indexing_pipe.add_component("doc_length_router", doc_length_router)
indexing_pipe.add_component("content_extractor", content_extractor)
indexing_pipe.add_component("final_doc_joiner", final_doc_joiner)
indexing_pipe.add_component("document_writer", document_writer)
# Connect components
indexing_pipe.connect("pdf_converter.documents", "pdf_splitter.documents")
indexing_pipe.connect("pdf_splitter.documents", "doc_length_router.documents")
# The short PDF pages will be enriched/captioned
indexing_pipe.connect("doc_length_router.short_documents", "content_extractor.documents")
indexing_pipe.connect("doc_length_router.long_documents", "final_doc_joiner.documents")
indexing_pipe.connect("content_extractor.documents", "final_doc_joiner.documents")
indexing_pipe.connect("final_doc_joiner.documents", "document_writer.documents")
# Test with both text-searchable and non-text-searchable PDFs
test_files = [
"test/test_files/pdf/sample_pdf_1.pdf", # a PDF with 4 pages
"test/test_files/pdf/non_text_searchable.pdf", # a non-text searchable PDF with 1 page
]
# Run the indexing pipeline
indexing_result = indexing_pipe.run(data={"sources": test_files})
assert indexing_result is not None
assert "document_writer" in indexing_result
indexed_documents = document_store.filter_documents()
# We expect documents from both PDFs
# sample_pdf_1.pdf has 4 pages, non_text_searchable.pdf has 1 page
assert len(indexed_documents) == 5
file_paths = {doc.meta["file_path"] for doc in indexed_documents}
assert file_paths == set(test_files)
for doc in indexed_documents:
assert hasattr(doc, "content")
assert hasattr(doc, "meta")
assert "file_path" in doc.meta
assert "page_number" in doc.meta
for doc in indexed_documents:
assert isinstance(doc.meta["page_number"], int)
assert doc.meta["page_number"] >= 1

@@ -62,6 +62,8 @@ class DocumentSplitter:
language: Language = "en",
use_split_rules: bool = True,
extend_abbreviations: bool = True,
*,
skip_empty_documents: bool = True,
):
"""
Initialize DocumentSplitter.
@@ -87,6 +89,9 @@ class DocumentSplitter:
:param use_split_rules: Choose whether to use additional split rules when splitting by `sentence`.
:param extend_abbreviations: Choose whether to extend NLTK's PunktTokenizer abbreviations with a list
of curated abbreviations, if available. This is currently supported for English ("en") and German ("de").
:param skip_empty_documents: Choose whether to skip documents with empty content. Default is True.
Set to False when downstream components in the Pipeline (like LLMDocumentContentExtractor) can extract text
from non-textual documents.
"""
self.split_by = split_by
@@ -98,6 +103,7 @@ class DocumentSplitter:
self.language = language
self.use_split_rules = use_split_rules
self.extend_abbreviations = extend_abbreviations
self.skip_empty_documents = skip_empty_documents
self._init_checks(
split_by=split_by,
@@ -194,7 +200,7 @@
raise ValueError(
f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
)
if doc.content == "":
if doc.content == "" and self.skip_empty_documents:
logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
continue
@@ -287,8 +293,8 @@
# concatenate the last split with the current one
text_splits[-1] += txt
# NOTE: This line skips documents that have content=""
elif len(txt) > 0:
# NOTE: If skip_empty_documents is True, this line skips documents that have content=""
elif not self.skip_empty_documents or len(txt) > 0:
text_splits.append(txt)
splits_pages.append(cur_page)
splits_start_idxs.append(cur_start_idx)
@@ -375,6 +381,7 @@
language=self.language,
use_split_rules=self.use_split_rules,
extend_abbreviations=self.extend_abbreviations,
skip_empty_documents=self.skip_empty_documents,
)
if self.splitting_function:
serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function)

@@ -0,0 +1,6 @@
---
enhancements:
- |
Add the init parameter `skip_empty_documents` to the `DocumentSplitter` component. The default value is True.
Setting it to False can be useful when downstream components in the Pipeline (like `LLMDocumentContentExtractor`)
can extract text from non-textual documents.
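
A minimal usage sketch of the behavior described above, based on the new unit tests in this PR:

from haystack import Document
from haystack.components.preprocessors.document_splitter import DocumentSplitter

# Default: documents with empty content are skipped with a warning.
splitter = DocumentSplitter()
splitter.warm_up()
assert splitter.run([Document(content="")])["documents"] == []

# With skip_empty_documents=False the empty document is kept, so a downstream
# component such as LLMDocumentContentExtractor can still receive it and
# extract text, for example from a scanned PDF page.
splitter = DocumentSplitter(skip_empty_documents=False)
splitter.warm_up()
docs = splitter.run([Document(content="")])["documents"]
assert len(docs) == 1 and docs[0].content == ""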

@@ -444,6 +444,7 @@ class TestSplittingByFunctionOrCharacterRegex:
assert serialized["init_parameters"]["split_length"] == 10
assert serialized["init_parameters"]["split_overlap"] == 2
assert serialized["init_parameters"]["split_threshold"] == 5
assert serialized["init_parameters"]["skip_empty_documents"]
assert "splitting_function" not in serialized["init_parameters"]
def test_to_dict_with_splitting_function(self):
@@ -457,6 +458,7 @@ class TestSplittingByFunctionOrCharacterRegex:
assert serialized["type"] == "haystack.components.preprocessors.document_splitter.DocumentSplitter"
assert serialized["init_parameters"]["split_by"] == "function"
assert "splitting_function" in serialized["init_parameters"]
assert serialized["init_parameters"]["skip_empty_documents"]
assert callable(deserialize_callable(serialized["init_parameters"]["splitting_function"]))
def test_from_dict(self):
@@ -465,7 +467,13 @@ class TestSplittingByFunctionOrCharacterRegex:
"""
data = {
"type": "haystack.components.preprocessors.document_splitter.DocumentSplitter",
"init_parameters": {"split_by": "word", "split_length": 10, "split_overlap": 2, "split_threshold": 5},
"init_parameters": {
"split_by": "word",
"split_length": 10,
"split_overlap": 2,
"split_threshold": 5,
"skip_empty_documents": False,
},
}
splitter = DocumentSplitter.from_dict(data)
@@ -474,6 +482,7 @@ class TestSplittingByFunctionOrCharacterRegex:
assert splitter.split_overlap == 2
assert splitter.split_threshold == 5
assert splitter.splitting_function is None
assert splitter.skip_empty_documents is False
def test_from_dict_with_splitting_function(self):
"""
@@ -516,7 +525,7 @@ class TestSplittingByFunctionOrCharacterRegex:
assert callable(deserialized_splitter.splitting_function)
assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]
def test_run_empty_document(self):
def test_run_empty_document_with_skip_empty_documents_true(self):
"""
Test if the component runs correctly with an empty document.
"""
@@ -526,6 +535,14 @@ class TestSplittingByFunctionOrCharacterRegex:
results = splitter.run([doc])
assert results["documents"] == []
def test_run_empty_document_with_skip_empty_documents_false(self):
splitter = DocumentSplitter(skip_empty_documents=False)
doc = Document(content="")
splitter.warm_up()
results = splitter.run([doc])
assert len(results["documents"]) == 1
assert results["documents"][0].content == ""
def test_run_document_only_whitespaces(self):
"""
Test if the component runs correctly with a document containing only whitespaces.