mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-04 23:52:23 +00:00
94 lines
3.2 KiB
Python
94 lines
3.2 KiB
Python
![]() |
# pyright: reportPrivateUsage=false
|
||
|
|
||
|
"""Unit-test suite for the `unstructured.chunking.dispatch` module."""
|
||
|
|
||
|
from __future__ import annotations
|
||
|
|
||
|
from typing import Any, Iterable, Optional
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from unstructured.chunking import add_chunking_strategy, register_chunking_strategy
|
||
|
from unstructured.chunking.dispatch import _ChunkerSpec, chunk
|
||
|
from unstructured.documents.elements import CompositeElement, Element, Text
|
||
|
|
||
|
|
||
|
class Describe_add_chunking_strategy:
    """Behavior checks for the `unstructured.chunking.add_chunking_strategy()` decorator.

    Both tests wrap the module-level `partition_this()` fake partitioner and differ only in
    whether a `chunking_strategy` keyword argument is supplied to the wrapped callable.
    """

    def it_dispatches_the_partitioned_elements_to_the_indicated_chunker(self):
        """With `chunking_strategy="basic"`, the partitioner output comes back chunked."""
        partition = add_chunking_strategy(partition_this)
        # -- the two `Text` elements from the fake partitioner are expected to arrive as a
        # -- single `CompositeElement` whose text joins them with a blank line.
        expected = [CompositeElement("Lorem ipsum.\n\nSit amet.")]

        actual = partition(chunking_strategy="basic")

        assert actual == expected

    def but_it_skips_dispatch_when_no_chunking_strategy_is_specified(self):
        """Without a `chunking_strategy` argument, the partitioner output is unchanged."""
        partition = add_chunking_strategy(partition_this)
        expected = [Text("Lorem ipsum."), Text("Sit amet.")]

        actual = partition()

        assert actual == expected
|
||
|
|
||
|
|
||
|
class Describe_chunk:
    """Behavior checks for the `unstructured.chunking.dispatch.chunk()` function."""

    def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
        """A strategy registered under a custom name is the one `chunk()` invokes."""
        register_chunking_strategy("by_something_else", chunk_by_something_else)
        elements = [Text("Lorem"), Text("Ipsum")]
        expected = [
            CompositeElement("chunked 2 elements with `(max_characters=750, whizbang=None)`")
        ]

        actual = chunk(
            elements,
            "by_something_else",
            max_characters=750,
            # -- a keyword argument the chunker does not declare shouldn't cause a problem;
            # -- in general the keyword arguments will include everything that was passed
            # -- to the partitioning call.
            foo="bar",
        )

        assert actual == expected

    def it_raises_when_the_requested_chunking_strategy_is_not_registered(self):
        """An unknown strategy name is rejected with a `ValueError` naming that strategy."""
        with pytest.raises(ValueError, match="unrecognized chunking strategy 'foobar'"):
            chunk(elements=[], chunking_strategy="foobar")
|
||
|
|
||
|
|
||
|
class Describe_ChunkerSpec:
    """Behavior checks for `unstructured.chunking.dispatch._ChunkerSpec` objects.

    Each test builds a spec around the module-level `chunk_by_something_else()` fake chunker.
    """

    def it_provides_access_to_the_chunking_function(self):
        """`.chunker` is the very function object the spec was constructed with."""
        chunker_spec = _ChunkerSpec(chunk_by_something_else)

        chunker = chunker_spec.chunker

        assert chunker is chunk_by_something_else

    def it_knows_which_keyword_args_the_chunking_function_can_accept(self):
        """`.kw_arg_names` lists the fake chunker's keyword parameters, in declared order."""
        chunker_spec = _ChunkerSpec(chunk_by_something_else)

        kw_arg_names = chunker_spec.kw_arg_names

        # -- the leading `elements` parameter is not expected to appear here --
        assert kw_arg_names == ("max_characters", "whizbang")
|
||
|
|
||
|
|
||
|
# -- MODULE-LEVEL FIXTURES -----------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
def chunk_by_something_else(
    elements: Iterable[Element],
    max_characters: Optional[int] = None,
    whizbang: Optional[float] = None,
) -> list[Element]:
    """Stand-in chunker for tests; reports how it was called instead of chunking anything.

    The result is a one-item list holding a `CompositeElement` whose text records the number
    of elements received along with the `max_characters` and `whizbang` values, so a test can
    verify which arguments reached the chunker.
    """
    # -- materialize first so any iterable (not just a sized sequence) can be counted --
    element_count = len(list(elements))
    summary = (
        f"chunked {element_count} elements with"
        f" `(max_characters={max_characters}, whizbang={whizbang})`"
    )
    return [CompositeElement(summary)]
|
||
|
|
||
|
|
||
|
def partition_this(**kwargs: Any) -> list[Element]:
    """Stand-in partitioner for tests.

    Accepts and ignores any keyword arguments, always producing the same two `Text` elements.
    """
    sentences = ("Lorem ipsum.", "Sit amet.")
    return [Text(sentence) for sentence in sentences]
|