haystack/haystack/components/routers/document_length_router.py
2025-07-22 14:59:28 +02:00

77 lines
2.8 KiB
Python

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List
from haystack import component
from haystack.dataclasses import Document
@component
class DocumentLengthRouter:
    """
    Splits documents into "short" and "long" groups according to the length of their `content` field.

    Documents whose `content` is None, or whose `content` has at most `threshold` characters, are sent to the
    `short_documents` output; all other documents are sent to the `long_documents` output.

    A typical application is processing documents extracted from PDFs that include non-text material such as
    scanned pages or images. Empty or nearly empty documents can be detected with this component and forwarded to
    components that perform OCR, generate captions, or compute image embeddings.

    ### Usage example

    ```python
    from haystack.components.routers import DocumentLengthRouter
    from haystack.dataclasses import Document

    docs = [
        Document(content="Short"),
        Document(content="Long document "*20),
    ]

    router = DocumentLengthRouter(threshold=10)

    result = router.run(documents=docs)
    print(result)

    # {
    #     "short_documents": [Document(content="Short", ...)],
    #     "long_documents": [Document(content="Long document ...", ...)],
    # }
    ```
    """

    def __init__(self, *, threshold: int = 10) -> None:
        """
        Create a DocumentLengthRouter.

        :param threshold:
            Maximum number of characters the `content` field may have for a document to still count as short.
            A document is routed to `short_documents` when its `content` is None or its character count is less
            than or equal to this value; otherwise it is routed to `long_documents`.
            Use a negative threshold to route only documents whose `content` is None to `short_documents`.
        """
        self.threshold = threshold

    @component.output_types(short_documents=List[Document], long_documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Route each input document to the short or long output based on the length of its `content` field.

        The relative order of the input documents is preserved within each output list.

        :param documents:
            The documents to route.

        :returns: A dictionary with the following keys:
            - `short_documents`: Documents whose `content` is None or whose `content` length is less than or
                equal to the threshold.
            - `long_documents`: Documents whose `content` length is greater than the threshold.
        """
        routed: Dict[str, List[Document]] = {"short_documents": [], "long_documents": []}

        for document in documents:
            text = document.content
            # A document is long only when it has content at all AND that content exceeds the threshold;
            # the None check must come first so `len` is never called on missing content.
            exceeds_threshold = text is not None and len(text) > self.threshold
            destination = "long_documents" if exceeds_threshold else "short_documents"
            routed[destination].append(document)

        return routed