mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 18:14:51 +00:00 
			
		
		
		
	 b8d894f963
			
		
	
	
		b8d894f963
		
			
		
	
	
	
	
		
			
			It's a pretty basic change: I literally just moved the `category` field to the `Element` class. I can't think of any other changes that are needed here, because I think pretty much everything already expected the category to be directly on the items in the elements list. For local testing, IDEs and linters should see the difference in that `category` is now in `Element`.
		
			
				
	
	
		
			93 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			93 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # pyright: reportPrivateUsage=false
 | |
| 
 | |
| """Unit-test suite for the `unstructured.chunking.dispatch` module."""
 | |
| 
 | |
| from __future__ import annotations
 | |
| 
 | |
| from typing import Any, Iterable, Optional
 | |
| 
 | |
| import pytest
 | |
| 
 | |
| from unstructured.chunking import add_chunking_strategy, register_chunking_strategy
 | |
| from unstructured.chunking.dispatch import _ChunkerSpec, chunk
 | |
| from unstructured.documents.elements import CompositeElement, Element, Text
 | |
| 
 | |
| 
 | |
class Describe_add_chunking_strategy:
    """Unit-test suite for `unstructured.chunking.add_chunking_strategy()` decorator.

    Both tests wrap the module-level `partition_this()` fake partitioner and differ only in
    whether a `chunking_strategy` keyword argument is supplied to the decorated callable.
    """

    def it_dispatches_the_partitioned_elements_to_the_indicated_chunker(self):
        """With `chunking_strategy="basic"` the two partitioned elements come back as one chunk."""
        partition = add_chunking_strategy(partition_this)

        chunked = partition(chunking_strategy="basic")

        # -- both `Text` elements are expected to be combined into a single `CompositeElement`
        # -- whose text joins the originals with a blank line.
        expected_chunks = [CompositeElement("Lorem ipsum.\n\nSit amet.")]
        assert chunked == expected_chunks

    def but_it_skips_dispatch_when_no_chunking_strategy_is_specified(self):
        """Without a `chunking_strategy` argument the partitioner's output is returned as-is."""
        partition = add_chunking_strategy(partition_this)

        unchunked = partition()

        expected_elements = [Text("Lorem ipsum."), Text("Sit amet.")]
        assert unchunked == expected_elements
 | |
| 
 | |
| 
 | |
class Describe_chunk:
    """Unit-test suite for `unstructured.chunking.dispatch.chunk()` function.

    Covers dispatch to a custom-registered chunker and the error raised for an unknown strategy.
    """

    def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
        """A strategy registered by name is the one `chunk()` invokes, with matching kwargs."""
        register_chunking_strategy("by_something_else", chunk_by_something_else)
        # -- `kwargs` generally carries every keyword argument of the partitioning call, so it
        # -- includes names the chunker does not declare ("foo" here). The assertion below
        # -- shows only `max_characters` reaching the fake chunker.
        partition_kwargs = {"max_characters": 750, "foo": "bar"}
        elements = [Text("Lorem"), Text("Ipsum")]

        chunked = chunk(elements, "by_something_else", **partition_kwargs)

        expected_text = "chunked 2 elements with `(max_characters=750, whizbang=None)`"
        assert chunked == [CompositeElement(expected_text)]

    def it_raises_when_the_requested_chunking_strategy_is_not_registered(self):
        """Requesting a strategy name that was never registered raises `ValueError`."""
        expected_message = "unrecognized chunking strategy 'foobar'"

        with pytest.raises(ValueError, match=expected_message):
            chunk(elements=[], chunking_strategy="foobar")
 | |
| 
 | |
| 
 | |
class Describe_ChunkerSpec:
    """Unit-test suite for `unstructured.chunking.dispatch._ChunkerSpec` objects.

    Each test builds a spec around the module-level `chunk_by_something_else()` fake chunker.
    """

    def it_provides_access_to_the_chunking_function(self):
        """`.chunker` is the very function object the spec was constructed with."""
        chunker_spec = _ChunkerSpec(chunk_by_something_else)

        assert chunker_spec.chunker is chunk_by_something_else

    def it_knows_which_keyword_args_the_chunking_function_can_accept(self):
        """`.kw_arg_names` lists the chunker's keyword parameters, excluding `elements`."""
        chunker_spec = _ChunkerSpec(chunk_by_something_else)

        # -- names appear in signature order; the leading `elements` parameter is not included.
        expected_names = ("max_characters", "whizbang")
        assert chunker_spec.kw_arg_names == expected_names
 | |
| 
 | |
| 
 | |
| # -- MODULE-LEVEL FIXTURES -----------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def chunk_by_something_else(
    elements: Iterable[Element],
    max_characters: Optional[int] = None,
    whizbang: Optional[float] = None,
) -> list[Element]:
    """Stand-in chunker for tests that reports the arguments it was called with.

    Args:
        elements: The elements to "chunk"; they are only counted, never inspected.
        max_characters: Echoed verbatim into the resulting chunk text.
        whizbang: Echoed verbatim into the resulting chunk text.

    Returns:
        A one-item list holding a `CompositeElement` whose text states how many elements were
        received and the values of both keyword arguments.
    """
    # -- materialize first because `elements` is only an `Iterable`, which may not support `len()`.
    element_count = len(list(elements))
    received_args = f"(max_characters={max_characters}, whizbang={whizbang})"
    return [CompositeElement(f"chunked {element_count} elements with `{received_args}`")]
 | |
| 
 | |
| 
 | |
def partition_this(**kwargs: Any) -> list[Element]:
    """Stand-in partitioner for tests.

    Accepts arbitrary keyword arguments and ignores all of them. Every call returns the same
    two `Text` elements.
    """
    sentences = ("Lorem ipsum.", "Sit amet.")
    return [Text(sentence) for sentence in sentences]
 |