Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-14 08:37:42 +00:00
feat: add skip_empty_documents init parameter to DocumentSplitter (#9649)

* feat: add skip_empty_documents init parameter to DocumentSplitter
* improve test
* fix + relnote
parent 3b9b1ae802
commit d059cf2c23
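In short: DocumentSplitter gains a keyword-only flag that controls whether documents with empty content are dropped during splitting. A minimal sketch of the new behavior, mirroring the test added below:

    from haystack import Document
    from haystack.components.preprocessors.document_splitter import DocumentSplitter

    splitter = DocumentSplitter(skip_empty_documents=False)
    splitter.warm_up()
    results = splitter.run([Document(content="")])
    # With skip_empty_documents=False the empty document is passed through instead of skipped
    assert results["documents"][0].content == ""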
e2e/pipelines/test_pdf_content_extraction_pipeline.py (new file, 82 lines)

@@ -0,0 +1,82 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from haystack import Pipeline
from haystack.components.converters.pypdf import PyPDFToDocument
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors.document_splitter import DocumentSplitter
from haystack.components.writers.document_writer import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.extractors.image.llm_document_content_extractor import LLMDocumentContentExtractor
from haystack.components.generators.chat.openai import OpenAIChatGenerator
from haystack.components.routers.document_length_router import DocumentLengthRouter


def test_pdf_content_extraction_pipeline():
    """
    Test a pipeline that processes PDFs with the following steps:
    1. Convert PDFs to documents
    2. Split documents by page
    3. Route documents by length (short vs long)
    4. Extract content from short documents using LLM
    5. Join documents back together
    6. Write to document store
    """
    document_store = InMemoryDocumentStore()

    pdf_converter = PyPDFToDocument(store_full_path=True)
    pdf_splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
    doc_length_router = DocumentLengthRouter(threshold=10)
    content_extractor = LLMDocumentContentExtractor(chat_generator=OpenAIChatGenerator(model="gpt-4o-mini"))
    final_doc_joiner = DocumentJoiner(sort_by_score=False)
    document_writer = DocumentWriter(document_store=document_store)

    # Create and configure pipeline
    indexing_pipe = Pipeline()
    indexing_pipe.add_component("pdf_converter", pdf_converter)
    indexing_pipe.add_component("pdf_splitter", pdf_splitter)
    indexing_pipe.add_component("doc_length_router", doc_length_router)
    indexing_pipe.add_component("content_extractor", content_extractor)
    indexing_pipe.add_component("final_doc_joiner", final_doc_joiner)
    indexing_pipe.add_component("document_writer", document_writer)

    # Connect components
    indexing_pipe.connect("pdf_converter.documents", "pdf_splitter.documents")
    indexing_pipe.connect("pdf_splitter.documents", "doc_length_router.documents")
    # The short PDF pages will be enriched/captioned
    indexing_pipe.connect("doc_length_router.short_documents", "content_extractor.documents")
    indexing_pipe.connect("doc_length_router.long_documents", "final_doc_joiner.documents")
    indexing_pipe.connect("content_extractor.documents", "final_doc_joiner.documents")
    indexing_pipe.connect("final_doc_joiner.documents", "document_writer.documents")

    # Test with both text-searchable and non-text-searchable PDFs
    test_files = [
        "test/test_files/pdf/sample_pdf_1.pdf",  # a PDF with 4 pages
        "test/test_files/pdf/non_text_searchable.pdf",  # a non-text searchable PDF with 1 page
    ]

    # Run the indexing pipeline
    indexing_result = indexing_pipe.run(data={"sources": test_files})

    assert indexing_result is not None
    assert "document_writer" in indexing_result

    indexed_documents = document_store.filter_documents()

    # We expect documents from both PDFs
    # sample_pdf_1.pdf has 4 pages, non_text_searchable.pdf has 1 page
    assert len(indexed_documents) == 5

    file_paths = {doc.meta["file_path"] for doc in indexed_documents}
    assert file_paths == set(test_files)

    for doc in indexed_documents:
        assert hasattr(doc, "content")
        assert hasattr(doc, "meta")
        assert "file_path" in doc.meta
        assert "page_number" in doc.meta

    for doc in indexed_documents:
        assert isinstance(doc.meta["page_number"], int)
        assert doc.meta["page_number"] >= 1
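Note: since the pipeline instantiates OpenAIChatGenerator, running this e2e test presumably requires a valid OPENAI_API_KEY in the environment, as with other LLM-backed e2e tests.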
haystack/components/preprocessors/document_splitter.py

@@ -62,6 +62,8 @@ class DocumentSplitter:
         language: Language = "en",
         use_split_rules: bool = True,
         extend_abbreviations: bool = True,
+        *,
+        skip_empty_documents: bool = True,
     ):
         """
         Initialize DocumentSplitter.
@@ -87,6 +89,9 @@ class DocumentSplitter:
         :param use_split_rules: Choose whether to use additional split rules when splitting by `sentence`.
         :param extend_abbreviations: Choose whether to extend NLTK's PunktTokenizer abbreviations with a list
             of curated abbreviations, if available. This is currently supported for English ("en") and German ("de").
+        :param skip_empty_documents: Choose whether to skip documents with empty content. Default is True.
+            Set to False when downstream components in the Pipeline (like LLMDocumentContentExtractor) can extract
+            text from non-textual documents.
         """

         self.split_by = split_by
@@ -98,6 +103,7 @@ class DocumentSplitter:
         self.language = language
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
+        self.skip_empty_documents = skip_empty_documents

         self._init_checks(
             split_by=split_by,
@@ -194,7 +200,7 @@
                 raise ValueError(
                     f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                 )
-            if doc.content == "":
+            if doc.content == "" and self.skip_empty_documents:
                 logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
                 continue

@@ -287,8 +293,8 @@
                 # concatenate the last split with the current one
                 text_splits[-1] += txt

-            # NOTE: This line skips documents that have content=""
-            elif len(txt) > 0:
+            # NOTE: If skip_empty_documents is True, this line skips documents that have content=""
+            elif not self.skip_empty_documents or len(txt) > 0:
                 text_splits.append(txt)
                 splits_pages.append(cur_page)
                 splits_start_idxs.append(cur_start_idx)
@@ -375,6 +381,7 @@
             language=self.language,
             use_split_rules=self.use_split_rules,
             extend_abbreviations=self.extend_abbreviations,
+            skip_empty_documents=self.skip_empty_documents,
         )
         if self.splitting_function:
             serialized["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function)
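Two details worth noting in the hunks above. First, the rewritten condition `elif not self.skip_empty_documents or len(txt) > 0` keeps a split when skipping is disabled or when the split is non-empty, so empty splits survive only with skip_empty_documents=False. Second, because to_dict() now serializes the flag, a round trip preserves it; a minimal sketch, not part of the diff:

    from haystack.components.preprocessors.document_splitter import DocumentSplitter

    splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
    data = splitter.to_dict()
    assert data["init_parameters"]["skip_empty_documents"] is False

    # Deserializing restores the same configuration
    restored = DocumentSplitter.from_dict(data)
    assert restored.skip_empty_documents is False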
Release note (new file, 6 lines)

@@ -0,0 +1,6 @@
---
enhancements:
  - |
    Add the init parameter `skip_empty_documents` to the `DocumentSplitter` component. The default value is True.
    Setting it to False can be useful when downstream components in the Pipeline (like `LLMDocumentContentExtractor`)
    can extract text from non-textual documents.
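In practice this means the splitter can hand empty page-level documents to a router instead of silently dropping them. A sketch condensed from the e2e test above (component names match that test):

    from haystack.components.preprocessors.document_splitter import DocumentSplitter
    from haystack.components.routers.document_length_router import DocumentLengthRouter

    # Keep empty page-level splits so they can be routed downstream
    pdf_splitter = DocumentSplitter(split_by="page", split_length=1, skip_empty_documents=False)
    # Pages shorter than the threshold (including empty ones) go to the "short_documents"
    # output, which the e2e test wires to LLMDocumentContentExtractor
    doc_length_router = DocumentLengthRouter(threshold=10)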
DocumentSplitter tests

@@ -444,6 +444,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert serialized["init_parameters"]["split_length"] == 10
         assert serialized["init_parameters"]["split_overlap"] == 2
         assert serialized["init_parameters"]["split_threshold"] == 5
+        assert serialized["init_parameters"]["skip_empty_documents"]
         assert "splitting_function" not in serialized["init_parameters"]

     def test_to_dict_with_splitting_function(self):
@@ -457,6 +458,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert serialized["type"] == "haystack.components.preprocessors.document_splitter.DocumentSplitter"
         assert serialized["init_parameters"]["split_by"] == "function"
         assert "splitting_function" in serialized["init_parameters"]
+        assert serialized["init_parameters"]["skip_empty_documents"]
         assert callable(deserialize_callable(serialized["init_parameters"]["splitting_function"]))

     def test_from_dict(self):
@@ -465,7 +467,13 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         data = {
             "type": "haystack.components.preprocessors.document_splitter.DocumentSplitter",
-            "init_parameters": {"split_by": "word", "split_length": 10, "split_overlap": 2, "split_threshold": 5},
+            "init_parameters": {
+                "split_by": "word",
+                "split_length": 10,
+                "split_overlap": 2,
+                "split_threshold": 5,
+                "skip_empty_documents": False,
+            },
         }
         splitter = DocumentSplitter.from_dict(data)

@@ -474,6 +482,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert splitter.split_overlap == 2
         assert splitter.split_threshold == 5
         assert splitter.splitting_function is None
+        assert splitter.skip_empty_documents is False

     def test_from_dict_with_splitting_function(self):
         """
@@ -516,7 +525,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         assert callable(deserialized_splitter.splitting_function)
         assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]

-    def test_run_empty_document(self):
+    def test_run_empty_document_with_skip_empty_documents_true(self):
         """
         Test if the component runs correctly with an empty document.
         """
@@ -526,6 +535,14 @@ class TestSplittingByFunctionOrCharacterRegex:
         results = splitter.run([doc])
         assert results["documents"] == []

+    def test_run_empty_document_with_skip_empty_documents_false(self):
+        splitter = DocumentSplitter(skip_empty_documents=False)
+        doc = Document(content="")
+        splitter.warm_up()
+        results = splitter.run([doc])
+        assert len(results["documents"]) == 1
+        assert results["documents"][0].content == ""
+
     def test_run_document_only_whitespaces(self):
         """
         Test if the component runs correctly with a document containing only whitespaces.