feat: add DocumentLengthRouter (#9636)

This commit is contained in:
Stefano Fiorucci 2025-07-22 14:59:28 +02:00 committed by GitHub
parent 868ea41698
commit c9e43c9ca2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 150 additions and 0 deletions

View File

@ -4,6 +4,7 @@ loaders:
modules:
[
"conditional_router",
"document_length_router",
"document_type_router",
"file_type_router",
"llm_messages_router",

View File

@ -4,6 +4,7 @@ loaders:
modules:
[
"conditional_router",
"document_length_router",
"document_type_router",
"file_type_router",
"llm_messages_router",

View File

@ -9,6 +9,7 @@ from lazy_imports import LazyImporter
_import_structure = {
"conditional_router": ["ConditionalRouter"],
"document_length_router": ["DocumentLengthRouter"],
"document_type_router": ["DocumentTypeRouter"],
"file_type_router": ["FileTypeRouter"],
"llm_messages_router": ["LLMMessagesRouter"],
@ -20,6 +21,7 @@ _import_structure = {
if TYPE_CHECKING:
from .conditional_router import ConditionalRouter as ConditionalRouter
from .document_length_router import DocumentLengthRouter as DocumentLengthRouter
from .document_type_router import DocumentTypeRouter as DocumentTypeRouter
from .file_type_router import FileTypeRouter as FileTypeRouter
from .llm_messages_router import LLMMessagesRouter as LLMMessagesRouter

View File

@ -0,0 +1,76 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List
from haystack import component
from haystack.dataclasses import Document
@component
class DocumentLengthRouter:
    """
    Splits documents into a "short" and a "long" group according to the character count of their `content` field.

    Documents extracted from PDFs sometimes carry little or no text, for example scanned pages or images.
    DocumentLengthRouter lets a pipeline spot such empty or low-content documents and send them to components that
    perform OCR, generate captions, or compute image embeddings.

    ### Usage example

    ```python
    from haystack.components.routers import DocumentLengthRouter
    from haystack.dataclasses import Document

    docs = [
        Document(content="Short"),
        Document(content="Long document "*20),
    ]

    router = DocumentLengthRouter(threshold=10)

    result = router.run(documents=docs)
    print(result)

    # {
    #     "short_documents": [Document(content="Short", ...)],
    #     "long_documents": [Document(content="Long document ...", ...)],
    # }
    ```
    """

    def __init__(self, *, threshold: int = 10) -> None:
        """
        Create a DocumentLengthRouter.

        :param threshold:
            Maximum number of characters a document's `content` may have to still count as short. A document is
            sent to `short_documents` when its `content` is None or has at most `threshold` characters; every other
            document is sent to `long_documents`.
            A negative threshold means that only documents whose `content` is None end up in `short_documents`.
        """
        self.threshold = threshold

    @component.output_types(short_documents=List[Document], long_documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Partition the input documents by the length of their `content` field.

        :param documents:
            The documents to route.

        :returns: A dictionary with the following keys:
            - `short_documents`: Documents whose `content` is None or has at most `threshold` characters.
            - `long_documents`: Documents whose `content` has more than `threshold` characters.
        """
        routed: Dict[str, List[Document]] = {"short_documents": [], "long_documents": []}

        for document in documents:
            text = document.content
            # A document is long only if it has text at all and that text exceeds the threshold;
            # None content and the boundary case (length == threshold) both count as short.
            if text is not None and len(text) > self.threshold:
                routed["long_documents"].append(document)
            else:
                routed["short_documents"].append(document)

        return routed

View File

@ -0,0 +1,8 @@
---
features:
- |
Introduce the DocumentLengthRouter, a component for routing Documents based on the length of the `content` field.
A common use case for DocumentLengthRouter is handling documents obtained from PDFs that contain non-text
content, such as scanned pages or images. This component can detect empty or low-content documents and route them to
components that perform OCR, generate captions, or compute image embeddings.

View File

@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from haystack.components.routers import DocumentLengthRouter
from haystack.core.serialization import component_from_dict, component_to_dict
from haystack.dataclasses import Document
class TestDocumentLengthRouter:
    """Unit tests for DocumentLengthRouter: construction, routing rules, and (de)serialization."""

    def test_init(self):
        """An explicit threshold is stored on the instance."""
        router = DocumentLengthRouter(threshold=20)
        assert router.threshold == 20

    def test_init_default_threshold(self):
        """Without arguments the threshold falls back to its default of 10."""
        router = DocumentLengthRouter()
        assert router.threshold == 10

    def test_run(self):
        """Content shorter than the threshold goes to short_documents, longer content to long_documents."""
        docs = [Document(content="Short"), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 1
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]

    def test_run_with_content_length_equal_to_threshold(self):
        """The comparison is inclusive: length == threshold is short, length == threshold + 1 is long."""
        docs = [Document(content="a" * 10), Document(content="a" * 11)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 1
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]

    def test_run_with_empty_documents_list(self):
        """An empty input still yields both output keys, each with an empty list."""
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=[])

        assert result == {"short_documents": [], "long_documents": []}

    def test_run_with_null_content(self):
        """Documents whose content is None are always treated as short."""
        docs = [Document(content=None), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 1
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]

    def test_run_with_negative_threshold(self):
        """With a negative threshold only None content is short; any string content is long."""
        docs = [Document(content=None), Document(content="Short"), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=-1)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 2
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]
        assert result["long_documents"][1] == docs[2]

    def test_run_with_negative_threshold_and_empty_string(self):
        """An empty string is not None, so with a negative threshold it is routed as long."""
        docs = [Document(content=None), Document(content="")]
        router = DocumentLengthRouter(threshold=-1)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 1
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]

    def test_to_dict(self):
        """Serialization records the fully qualified type and the init parameters."""
        router = DocumentLengthRouter(threshold=10)
        expected_dict = {
            "type": "haystack.components.routers.document_length_router.DocumentLengthRouter",
            "init_parameters": {"threshold": 10},
        }
        assert component_to_dict(router, "router") == expected_dict

    def test_from_dict(self):
        """Deserialization rebuilds a router with the serialized threshold."""
        router_dict = {
            "type": "haystack.components.routers.document_length_router.DocumentLengthRouter",
            "init_parameters": {"threshold": 10},
        }
        loaded_router = component_from_dict(DocumentLengthRouter, router_dict, name="router")

        assert isinstance(loaded_router, DocumentLengthRouter)
        assert loaded_router.threshold == 10