mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-04 19:06:44 +00:00
feat: add DocumentLengthRouter (#9636)
This commit is contained in:
parent
868ea41698
commit
c9e43c9ca2
@ -4,6 +4,7 @@ loaders:
|
||||
modules:
|
||||
[
|
||||
"conditional_router",
|
||||
"document_length_router",
|
||||
"document_type_router",
|
||||
"file_type_router",
|
||||
"llm_messages_router",
|
||||
|
||||
@ -4,6 +4,7 @@ loaders:
|
||||
modules:
|
||||
[
|
||||
"conditional_router",
|
||||
"document_length_router",
|
||||
"document_type_router",
|
||||
"file_type_router",
|
||||
"llm_messages_router",
|
||||
|
||||
@ -9,6 +9,7 @@ from lazy_imports import LazyImporter
|
||||
|
||||
_import_structure = {
|
||||
"conditional_router": ["ConditionalRouter"],
|
||||
"document_length_router": ["DocumentLengthRouter"],
|
||||
"document_type_router": ["DocumentTypeRouter"],
|
||||
"file_type_router": ["FileTypeRouter"],
|
||||
"llm_messages_router": ["LLMMessagesRouter"],
|
||||
@ -20,6 +21,7 @@ _import_structure = {
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .conditional_router import ConditionalRouter as ConditionalRouter
|
||||
from .document_length_router import DocumentLengthRouter as DocumentLengthRouter
|
||||
from .document_type_router import DocumentTypeRouter as DocumentTypeRouter
|
||||
from .file_type_router import FileTypeRouter as FileTypeRouter
|
||||
from .llm_messages_router import LLMMessagesRouter as LLMMessagesRouter
|
||||
|
||||
76
haystack/components/routers/document_length_router.py
Normal file
76
haystack/components/routers/document_length_router.py
Normal file
@ -0,0 +1,76 @@
|
||||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
from haystack import component
|
||||
from haystack.dataclasses import Document
|
||||
|
||||
|
||||
@component
class DocumentLengthRouter:
    """
    Categorizes documents based on the length of the `content` field and routes them to the appropriate output.

    A common use case for DocumentLengthRouter is handling documents obtained from PDFs that contain non-text
    content, such as scanned pages or images. This component can detect empty or low-content documents and route them to
    components that perform OCR, generate captions, or compute image embeddings.

    ### Usage example

    ```python
    from haystack.components.routers import DocumentLengthRouter
    from haystack.dataclasses import Document

    docs = [
        Document(content="Short"),
        Document(content="Long document "*20),
    ]

    router = DocumentLengthRouter(threshold=10)

    result = router.run(documents=docs)
    print(result)

    # {
    #     "short_documents": [Document(content="Short", ...)],
    #     "long_documents": [Document(content="Long document ...", ...)],
    # }
    ```
    """

    def __init__(self, *, threshold: int = 10) -> None:
        """
        Initialize the DocumentLengthRouter component.

        :param threshold:
            The threshold for the number of characters in the document `content` field. Documents where `content` is
            None or whose character count is less than or equal to the threshold will be routed to the `short_documents`
            output. Otherwise, they will be routed to the `long_documents` output.
            To route only documents with None content to `short_documents`, set the threshold to a negative number.
        """
        self.threshold = threshold

    @component.output_types(short_documents=List[Document], long_documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Categorize input documents into groups based on the length of the `content` field.

        :param documents:
            A list of documents to be categorized.

        :returns: A dictionary with the following keys:
            - `short_documents`: A list of documents where `content` is None or the length of `content` is less than or
                equal to the threshold.
            - `long_documents`: A list of documents where the length of `content` is greater than the threshold.
        """
        short_documents: List[Document] = []
        long_documents: List[Document] = []

        for doc in documents:
            # The None check comes first so that `len()` is never called on missing content; the comparison is
            # inclusive, so a document whose length equals the threshold counts as short.
            if doc.content is None or len(doc.content) <= self.threshold:
                short_documents.append(doc)
            else:
                long_documents.append(doc)

        return {"short_documents": short_documents, "long_documents": long_documents}
|
||||
@ -0,0 +1,8 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Introduce the DocumentLengthRouter, a component for routing Documents based on the length of the `content` field.
|
||||
|
||||
A common use case for DocumentLengthRouter is handling documents obtained from PDFs that contain non-text
|
||||
content, such as scanned pages or images. This component can detect empty or low-content documents and route them to
|
||||
components that perform OCR, generate captions, or compute image embeddings.
|
||||
62
test/components/routers/test_document_length_router.py
Normal file
62
test/components/routers/test_document_length_router.py
Normal file
@ -0,0 +1,62 @@
|
||||
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from haystack.components.routers import DocumentLengthRouter
|
||||
from haystack.core.serialization import component_from_dict, component_to_dict
|
||||
from haystack.dataclasses import Document
|
||||
|
||||
|
||||
class TestDocumentLengthRouter:
    """Unit tests for DocumentLengthRouter: construction, routing rules, and (de)serialization."""

    def test_init(self):
        router = DocumentLengthRouter(threshold=20)
        assert router.threshold == 20

    def test_run(self):
        docs = [Document(content="Short"), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 1
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]

    def test_run_with_null_content(self):
        docs = [Document(content=None), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 1
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]

    def test_run_with_negative_threshold(self):
        # With a negative threshold no string can be "short", so only the None-content document is routed there.
        docs = [Document(content=None), Document(content="Short"), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=-1)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 2
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]
        assert result["long_documents"][1] == docs[2]

    def test_run_with_content_length_equal_to_threshold(self):
        # The comparison is inclusive: exactly `threshold` characters is short, one more character is long.
        docs = [Document(content="a" * 10), Document(content="a" * 11)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert result["short_documents"] == [docs[0]]
        assert result["long_documents"] == [docs[1]]

    def test_run_with_empty_documents(self):
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=[])

        assert result == {"short_documents": [], "long_documents": []}

    def test_to_dict(self):
        router = DocumentLengthRouter(threshold=10)
        expected_dict = {
            "type": "haystack.components.routers.document_length_router.DocumentLengthRouter",
            "init_parameters": {"threshold": 10},
        }
        assert component_to_dict(router, "router") == expected_dict

    def test_from_dict(self):
        router_dict = {
            "type": "haystack.components.routers.document_length_router.DocumentLengthRouter",
            "init_parameters": {"threshold": 10},
        }
        loaded_router = component_from_dict(DocumentLengthRouter, router_dict, name="router")

        assert isinstance(loaded_router, DocumentLengthRouter)
        assert loaded_router.threshold == 10
|
||||
Loading…
x
Reference in New Issue
Block a user