From c9e43c9ca272f752ddfaf236bdcb983e785cb77d Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Tue, 22 Jul 2025 14:59:28 +0200 Subject: [PATCH] feat: add DocumentLengthRouter (#9636) --- docs/pydoc/config/routers_api.yml | 1 + docs/pydoc/config_docusaurus/routers_api.yml | 1 + haystack/components/routers/__init__.py | 2 + .../routers/document_length_router.py | 76 +++++++++++++++++++ .../doc-length-router-a270eca40dba83ea.yaml | 8 ++ .../routers/test_document_length_router.py | 62 +++++++++++++++ 6 files changed, 150 insertions(+) create mode 100644 haystack/components/routers/document_length_router.py create mode 100644 releasenotes/notes/doc-length-router-a270eca40dba83ea.yaml create mode 100644 test/components/routers/test_document_length_router.py diff --git a/docs/pydoc/config/routers_api.yml b/docs/pydoc/config/routers_api.yml index 100e0c591..0cde13923 100644 --- a/docs/pydoc/config/routers_api.yml +++ b/docs/pydoc/config/routers_api.yml @@ -4,6 +4,7 @@ loaders: modules: [ "conditional_router", + "document_length_router", "document_type_router", "file_type_router", "llm_messages_router", diff --git a/docs/pydoc/config_docusaurus/routers_api.yml b/docs/pydoc/config_docusaurus/routers_api.yml index b09d87470..b77ddb550 100644 --- a/docs/pydoc/config_docusaurus/routers_api.yml +++ b/docs/pydoc/config_docusaurus/routers_api.yml @@ -4,6 +4,7 @@ loaders: modules: [ "conditional_router", + "document_length_router", "document_type_router", "file_type_router", "llm_messages_router", diff --git a/haystack/components/routers/__init__.py b/haystack/components/routers/__init__.py index 5302d7e80..d79c73bcc 100644 --- a/haystack/components/routers/__init__.py +++ b/haystack/components/routers/__init__.py @@ -9,6 +9,7 @@ from lazy_imports import LazyImporter _import_structure = { "conditional_router": ["ConditionalRouter"], + "document_length_router": ["DocumentLengthRouter"], "document_type_router": ["DocumentTypeRouter"], "file_type_router": ["FileTypeRouter"], 
# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

from typing import Dict, List

from haystack import component
from haystack.dataclasses import Document


@component
class DocumentLengthRouter:
    """
    Categorizes documents based on the length of the `content` field and routes them to the appropriate output.

    A common use case for DocumentLengthRouter is handling documents obtained from PDFs that contain non-text
    content, such as scanned pages or images. This component can detect empty or low-content documents and route them to
    components that perform OCR, generate captions, or compute image embeddings.

    ### Usage example

    ```python
    from haystack.components.routers import DocumentLengthRouter
    from haystack.dataclasses import Document

    docs = [
        Document(content="Short"),
        Document(content="Long document "*20),
    ]

    router = DocumentLengthRouter(threshold=10)

    result = router.run(documents=docs)
    print(result)

    # {
    #     "short_documents": [Document(content="Short", ...)],
    #     "long_documents": [Document(content="Long document ...", ...)],
    # }
    ```
    """

    def __init__(self, *, threshold: int = 10) -> None:
        """
        Initialize the DocumentLengthRouter component.

        :param threshold:
            The threshold for the number of characters in the document `content` field. Documents where `content` is
            None or whose character count is less than or equal to the threshold will be routed to the `short_documents`
            output. Otherwise, they will be routed to the `long_documents` output.
            To route only documents with None content to `short_documents`, set the threshold to a negative number.
        """
        self.threshold = threshold

    @component.output_types(short_documents=List[Document], long_documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Categorize input documents into groups based on the length of the `content` field.

        The relative order of the input documents is preserved within each output list.

        :param documents:
            A list of documents to be categorized.

        :returns: A dictionary with the following keys:
            - `short_documents`: A list of documents where `content` is None or the length of `content` is less than or
                equal to the threshold.
            - `long_documents`: A list of documents where the length of `content` is greater than the threshold.
        """
        short_documents: List[Document] = []
        long_documents: List[Document] = []

        for doc in documents:
            # The None check must come first: `len(None)` would raise a TypeError.
            # The comparison is inclusive, so content whose length equals the threshold counts as short.
            if doc.content is None or len(doc.content) <= self.threshold:
                short_documents.append(doc)
            else:
                long_documents.append(doc)

        return {"short_documents": short_documents, "long_documents": long_documents}
# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

from haystack.components.routers import DocumentLengthRouter
from haystack.core.serialization import component_from_dict, component_to_dict
from haystack.dataclasses import Document


class TestDocumentLengthRouter:
    """Unit tests for DocumentLengthRouter: construction, routing rules and (de)serialization."""

    def test_init(self):
        router = DocumentLengthRouter(threshold=20)
        assert router.threshold == 20

    def test_init_default(self):
        # The constructor signature declares `threshold: int = 10`.
        router = DocumentLengthRouter()
        assert router.threshold == 10

    def test_run(self):
        docs = [Document(content="Short"), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 1
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]

    def test_run_with_content_length_equal_to_threshold(self):
        # The comparison is inclusive: exactly `threshold` characters is still "short",
        # one character more is "long".
        docs = [Document(content="a" * 10), Document(content="a" * 11)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert result["short_documents"] == [docs[0]]
        assert result["long_documents"] == [docs[1]]

    def test_run_with_empty_string_content(self):
        # An empty string has length 0, so it is short for any non-negative threshold.
        docs = [Document(content="")]
        router = DocumentLengthRouter(threshold=0)
        result = router.run(documents=docs)

        assert result["short_documents"] == docs
        assert result["long_documents"] == []

    def test_run_with_no_documents(self):
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=[])

        assert result == {"short_documents": [], "long_documents": []}

    def test_run_with_null_content(self):
        docs = [Document(content=None), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=10)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 1
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]

    def test_run_with_negative_threshold(self):
        # With a negative threshold no string can be short; only None content is routed to `short_documents`.
        docs = [Document(content=None), Document(content="Short"), Document(content="Long document " * 20)]
        router = DocumentLengthRouter(threshold=-1)
        result = router.run(documents=docs)

        assert len(result["short_documents"]) == 1
        assert len(result["long_documents"]) == 2
        assert result["short_documents"][0] == docs[0]
        assert result["long_documents"][0] == docs[1]
        assert result["long_documents"][1] == docs[2]

    def test_to_dict(self):
        router = DocumentLengthRouter(threshold=10)
        expected_dict = {
            "type": "haystack.components.routers.document_length_router.DocumentLengthRouter",
            "init_parameters": {"threshold": 10},
        }
        assert component_to_dict(router, "router") == expected_dict

    def test_from_dict(self):
        router_dict = {
            "type": "haystack.components.routers.document_length_router.DocumentLengthRouter",
            "init_parameters": {"threshold": 10},
        }
        loaded_router = component_from_dict(DocumentLengthRouter, router_dict, name="router")

        assert isinstance(loaded_router, DocumentLengthRouter)
        assert loaded_router.threshold == 10