mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	
		
			
	
	
		
			93 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			93 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| 
								 | 
							
								# pyright: reportPrivateUsage=false
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								"""Unit-test suite for the `unstructured.chunking.dispatch` module."""
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from __future__ import annotations
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from typing import Any, Iterable, Optional
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								import pytest
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from unstructured.chunking import add_chunking_strategy, register_chunking_strategy
							 | 
						||
| 
								 | 
							
								from unstructured.chunking.dispatch import _ChunkerSpec, chunk
							 | 
						||
| 
								 | 
							
								from unstructured.documents.elements import CompositeElement, Element, Text
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class Describe_add_chunking_strategy:
								    """Unit-test suite for `unstructured.chunking.add_chunking_strategy()` decorator.

								    Both cases decorate the module-level fake partitioner `partition_this()`.
								    """

								    def it_dispatches_the_partitioned_elements_to_the_indicated_chunker(self):
								        # -- with `chunking_strategy="basic"` the two fake elements come back combined into
								        # -- a single `CompositeElement`.
								        partition = add_chunking_strategy(partition_this)
								        expected = [CompositeElement("Lorem ipsum.\n\nSit amet.")]

								        assert partition(chunking_strategy="basic") == expected

								    def but_it_skips_dispatch_when_no_chunking_strategy_is_specified(self):
								        # -- with no `chunking_strategy` argument the fake partitioner's elements come
								        # -- back unchanged.
								        partition = add_chunking_strategy(partition_this)
								        expected = [Text("Lorem ipsum."), Text("Sit amet.")]

								        assert partition() == expected
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class Describe_chunk:
								    """Unit-test suite for `unstructured.chunking.dispatch.chunk()` function."""

								    def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
								        register_chunking_strategy("by_something_else", chunk_by_something_else)
								        elements = [Text("Lorem"), Text("Ipsum")]
								        # -- `foo` is not a parameter of the registered chunker; an unused kwarg like this
								        # -- shouldn't cause a problem because in general the kwargs will contain all
								        # -- keyword arguments used in the partitioning call.
								        partitioning_kwargs = {"max_characters": 750, "foo": "bar"}

								        chunks = chunk(elements, "by_something_else", **partitioning_kwargs)

								        expected_text = "chunked 2 elements with `(max_characters=750, whizbang=None)`"
								        assert chunks == [CompositeElement(expected_text)]

								    def it_raises_when_the_requested_chunking_strategy_is_not_registered(self):
								        expected_message = "unrecognized chunking strategy 'foobar'"

								        with pytest.raises(ValueError, match=expected_message):
								            chunk(elements=[], chunking_strategy="foobar")
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class Describe_ChunkerSpec:
								    """Unit-test suite for `unstructured.chunking.dispatch._ChunkerSpec` objects.

								    Each case builds a spec around the module-level fake chunker
								    `chunk_by_something_else()`.
								    """

								    def it_provides_access_to_the_chunking_function(self):
								        # -- `.chunker` must be the very same function object the spec was built with --
								        chunker = chunk_by_something_else

								        assert _ChunkerSpec(chunker).chunker is chunker

								    def it_knows_which_keyword_args_the_chunking_function_can_accept(self):
								        # -- the expected names are the two defaulted parameters of the fake chunker --
								        kw_arg_names = _ChunkerSpec(chunk_by_something_else).kw_arg_names

								        assert kw_arg_names == ("max_characters", "whizbang")
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								# -- MODULE-LEVEL FIXTURES -----------------------------------------------------------------------
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def chunk_by_something_else(
								    elements: Iterable[Element],
								    max_characters: Optional[int] = None,
								    whizbang: Optional[float] = None,
								) -> list[Element]:
								    """A "fake" minimal chunker suitable for use in tests.

								    Consumes `elements` only to count them and returns a one-item list holding a
								    `CompositeElement` whose text reports that count together with the values received for
								    `max_characters` and `whizbang`.
								    """
								    # -- materialize first; `elements` may be any iterable, not necessarily sized --
								    element_count = len(list(elements))
								    summary = (
								        f"chunked {element_count} elements with"
								        f" `(max_characters={max_characters}, whizbang={whizbang})`"
								    )
								    return [CompositeElement(summary)]
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def partition_this(**kwargs: Any) -> list[Element]:
								    """A fake partitioner.

								    Accepts any keyword arguments, ignores them, and returns the same two `Text` elements
								    on every call.
								    """
								    texts = ("Lorem ipsum.", "Sit amet.")
								    return [Text(text) for text in texts]
							 |