# pyright: reportPrivateUsage=false

"""Tests covering the chunker-dispatch machinery in `unstructured.chunking.dispatch`."""

from __future__ import annotations

from typing import Any, Iterable, Optional

import pytest

from unstructured.chunking import add_chunking_strategy, register_chunking_strategy
from unstructured.chunking.dispatch import _ChunkerSpec, chunk
from unstructured.documents.elements import CompositeElement, Element, Text


class Describe_add_chunking_strategy:
    """Tests for the `unstructured.chunking.add_chunking_strategy()` decorator."""

    def it_dispatches_the_partitioned_elements_to_the_indicated_chunker(self):
        """With `chunking_strategy="basic"` the two fake elements come back as one chunk."""
        partition = add_chunking_strategy(partition_this)

        result = partition(chunking_strategy="basic")

        assert result == [CompositeElement("Lorem ipsum.\n\nSit amet.")]

    def but_it_skips_dispatch_when_no_chunking_strategy_is_specified(self):
        """Without a `chunking_strategy` argument the partitioner output is returned as-is."""
        partition = add_chunking_strategy(partition_this)

        result = partition()

        assert result == [Text("Lorem ipsum."), Text("Sit amet.")]


class Describe_chunk:
    """Tests for the `unstructured.chunking.dispatch.chunk()` function."""

    def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
        """A strategy registered under a custom name is the one `chunk()` invokes."""
        register_chunking_strategy("by_something_else", chunk_by_something_else)

        result = chunk(
            [Text("Lorem"), Text("Ipsum")],
            "by_something_else",
            max_characters=750,
            # -- a keyword the fake chunker does not declare must not cause a problem; in
            # -- general the keyword arguments here are all those used in the partitioning
            # -- call.
            foo="bar",
        )

        expected_text = "chunked 2 elements with `(max_characters=750, whizbang=None)`"
        assert result == [CompositeElement(expected_text)]

    def it_raises_when_the_requested_chunking_strategy_is_not_registered(self):
        """An unknown strategy name is expected to produce a `ValueError`."""
        expected_message = "unrecognized chunking strategy 'foobar'"

        with pytest.raises(ValueError, match=expected_message):
            chunk(elements=[], chunking_strategy="foobar")


class Describe_ChunkerSpec:
    """Tests for `unstructured.chunking.dispatch._ChunkerSpec` objects."""

    def it_provides_access_to_the_chunking_function(self):
        """`.chunker` is the very function object the spec was constructed with."""
        assert _ChunkerSpec(chunk_by_something_else).chunker is chunk_by_something_else

    def it_knows_which_keyword_args_the_chunking_function_can_accept(self):
        """`.kw_arg_names` lists the fake chunker's keyword parameters, in order."""
        chunker_spec = _ChunkerSpec(chunk_by_something_else)

        assert chunker_spec.kw_arg_names == ("max_characters", "whizbang")


# -- MODULE-LEVEL FIXTURES -----------------------------------------------------------------------


def chunk_by_something_else(
    elements: Iterable[Element],
    max_characters: Optional[int] = None,
    whizbang: Optional[float] = None,
) -> list[Element]:
    """Stand-in chunker for tests.

    Consumes `elements` and returns a single `CompositeElement` whose text reports how many
    elements were received along with the `max_characters` and `whizbang` values it was given.
    """
    element_count = len(list(elements))
    settings = f"(max_characters={max_characters}, whizbang={whizbang})"
    return [CompositeElement(f"chunked {element_count} elements with `{settings}`")]


def partition_this(**kwargs: Any) -> list[Element]:
    """Stand-in partitioner for tests.

    Accepts and ignores any keyword arguments, always producing the same two `Text` elements.
    """
    first, second = Text("Lorem ipsum."), Text("Sit amet.")
    return [first, second]