mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 12:03:15 +00:00 
			
		
		
		
	rfctr(chunking): extract chunking-strategy dispatch (#2545)
**Summary** This is the final step in adding pluggable chunking-strategies. It introduces the `chunk()` function to replace calls to strategy-specific chunkers in the `@add_chunking_strategy` decorator. The `chunk()` function then uses a mapping of chunking-strategy names (e.g. "by_title", "basic") to chunking functions (chunkers) to dispatch the chunking call. This allows other chunkers to be added at runtime rather than requiring a code change, which is what makes chunkers "pluggable". **Additional Information** - Move the `@add_chunking_strategy` decorator to the new `chunking.dispatch` module since it coheres strongly with that operation, but publish it from `chunking(.__init__)` (as it was before) so users don't couple to the way we organize the chunking sub-package. Also remove the third level of nesting in the decorator as it is not required in this case. - Add unit tests for the `@add_chunking_strategy` decorator, which was previously not covered by any direct test.
This commit is contained in:
		
							parent
							
								
									3ff6de4f50
								
							
						
					
					
						commit
						4096a38371
					
				@ -1,7 +1,9 @@
 | 
				
			|||||||
## 0.12.6-dev1 
 | 
					## 0.12.6-dev2
 | 
				
			||||||
 
 | 
					
 | 
				
			||||||
### Enhancements
 | 
					### Enhancements
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					* **Refactor `add_chunking_strategy` decorator to dispatch by name.** Add `chunk()` function to be used by the `add_chunking_strategy` decorator to dispatch the chunking call based on a chunking-strategy name (that can be dynamic at runtime). This decouples chunking dispatch from only those chunkers known at "compile" time and enables runtime registration of custom chunkers.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Features
 | 
					### Features
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Fixes
 | 
					### Fixes
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										93
									
								
								test_unstructured/chunking/test_dispatch.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										93
									
								
								test_unstructured/chunking/test_dispatch.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,93 @@
 | 
				
			|||||||
 | 
					# pyright: reportPrivateUsage=false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					"""Unit-test suite for the `unstructured.chunking.dispatch` module."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from __future__ import annotations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from typing import Any, Iterable, Optional
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from unstructured.chunking import add_chunking_strategy, register_chunking_strategy
 | 
				
			||||||
 | 
					from unstructured.chunking.dispatch import _ChunkerSpec, chunk
 | 
				
			||||||
 | 
					from unstructured.documents.elements import CompositeElement, Element, Text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Describe_add_chunking_strategy:
 | 
				
			||||||
 | 
					    """Unit-test suite for `unstructured.chunking.add_chunking_strategy()` decorator."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def it_dispatches_the_partitioned_elements_to_the_indicated_chunker(self):
 | 
				
			||||||
 | 
					        decorated_partitioner = add_chunking_strategy(partition_this)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        chunks = decorated_partitioner(chunking_strategy="basic")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert chunks == [CompositeElement("Lorem ipsum.\n\nSit amet.")]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def but_it_skips_dispatch_when_no_chunking_strategy_is_specified(self):
 | 
				
			||||||
 | 
					        decorated_partitioner = add_chunking_strategy(partition_this)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        elements = decorated_partitioner()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert elements == [Text("Lorem ipsum."), Text("Sit amet.")]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Describe_chunk:
 | 
				
			||||||
 | 
					    """Unit-test suite for `unstructured.chunking.dispatch.chunk()` function."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        register_chunking_strategy("by_something_else", chunk_by_something_else)
 | 
				
			||||||
 | 
					        kwargs = {
 | 
				
			||||||
 | 
					            "max_characters": 750,
 | 
				
			||||||
 | 
					            # -- unused kwargs shouldn't cause a problem; in general `kwargs` will contain all
 | 
				
			||||||
 | 
					            # -- keyword arguments used in the partitioning call.
 | 
				
			||||||
 | 
					            "foo": "bar",
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        chunks = chunk([Text("Lorem"), Text("Ipsum")], "by_something_else", **kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert chunks == [
 | 
				
			||||||
 | 
					            CompositeElement("chunked 2 elements with `(max_characters=750, whizbang=None)`")
 | 
				
			||||||
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def it_raises_when_the_requested_chunking_strategy_is_not_registered(self):
 | 
				
			||||||
 | 
					        with pytest.raises(
 | 
				
			||||||
 | 
					            ValueError,
 | 
				
			||||||
 | 
					            match="unrecognized chunking strategy 'foobar'",
 | 
				
			||||||
 | 
					        ):
 | 
				
			||||||
 | 
					            chunk(elements=[], chunking_strategy="foobar")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Describe_ChunkerSpec:
 | 
				
			||||||
 | 
					    """Unit-test suite for `unstructured.chunking.dispatch._ChunkerSpec` objects."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def it_provides_access_to_the_chunking_function(self):
 | 
				
			||||||
 | 
					        spec = _ChunkerSpec(chunk_by_something_else)
 | 
				
			||||||
 | 
					        assert spec.chunker is chunk_by_something_else
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def it_knows_which_keyword_args_the_chunking_function_can_accept(self):
 | 
				
			||||||
 | 
					        spec = _ChunkerSpec(chunk_by_something_else)
 | 
				
			||||||
 | 
					        assert spec.kw_arg_names == ("max_characters", "whizbang")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# -- MODULE-LEVEL FIXTURES -----------------------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def chunk_by_something_else(
 | 
				
			||||||
 | 
					    elements: Iterable[Element],
 | 
				
			||||||
 | 
					    max_characters: Optional[int] = None,
 | 
				
			||||||
 | 
					    whizbang: Optional[float] = None,
 | 
				
			||||||
 | 
					) -> list[Element]:
 | 
				
			||||||
 | 
					    """A "fake" minimal chunker suitable for use in tests."""
 | 
				
			||||||
 | 
					    els = list(elements)
 | 
				
			||||||
 | 
					    return [
 | 
				
			||||||
 | 
					        CompositeElement(
 | 
				
			||||||
 | 
					            f"chunked {len(els)} elements with"
 | 
				
			||||||
 | 
					            f" `(max_characters={max_characters}, whizbang={whizbang})`"
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def partition_this(**kwargs: Any) -> list[Element]:
 | 
				
			||||||
 | 
					    """A fake partitioner."""
 | 
				
			||||||
 | 
					    return [Text("Lorem ipsum."), Text("Sit amet.")]
 | 
				
			||||||
@ -1 +1 @@
 | 
				
			|||||||
__version__ = "0.12.6-dev1"  # pragma: no cover
 | 
					__version__ = "0.12.6-dev2"  # pragma: no cover
 | 
				
			||||||
 | 
				
			|||||||
@ -1,100 +1,22 @@
 | 
				
			|||||||
"""Chunking module initializer.
 | 
					"""Chunking module initializer.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Provides the the `@add_chunking_strategy()` decorator.
 | 
					Publishes the public aspects of the chunking sub-package interface.
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from __future__ import annotations
 | 
					from __future__ import annotations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import functools
 | 
					 | 
				
			||||||
import inspect
 | 
					 | 
				
			||||||
from typing import Any, Callable
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from typing_extensions import ParamSpec
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from unstructured.chunking.base import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT
 | 
					from unstructured.chunking.base import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT
 | 
				
			||||||
from unstructured.chunking.basic import chunk_elements
 | 
					from unstructured.chunking.dispatch import (
 | 
				
			||||||
from unstructured.chunking.title import chunk_by_title
 | 
					    Chunker,
 | 
				
			||||||
from unstructured.documents.elements import Element
 | 
					    add_chunking_strategy,
 | 
				
			||||||
 | 
					    register_chunking_strategy,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__all__ = ["CHUNK_MAX_CHARS_DEFAULT", "CHUNK_MULTI_PAGE_DEFAULT", "add_chunking_strategy"]
 | 
					__all__ = [
 | 
				
			||||||
 | 
					    "CHUNK_MAX_CHARS_DEFAULT",
 | 
				
			||||||
_P = ParamSpec("_P")
 | 
					    "CHUNK_MULTI_PAGE_DEFAULT",
 | 
				
			||||||
 | 
					    "add_chunking_strategy",
 | 
				
			||||||
 | 
					    # -- these must be published to allow pluggable chunkers in other code-bases --
 | 
				
			||||||
def add_chunking_strategy() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
 | 
					    "Chunker",
 | 
				
			||||||
    """Decorator for chunking text.
 | 
					    "register_chunking_strategy",
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
    Chunks the element sequence produced by the partitioner it decorates when a `chunking_strategy`
 | 
					 | 
				
			||||||
    argument is present in the partitioner call and it names an available chunking strategy.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
 | 
					 | 
				
			||||||
        # -- Patch the docstring of the decorated function to add chunking strategy and
 | 
					 | 
				
			||||||
        # -- chunking-related argument documentation. This only applies when `chunking_strategy`
 | 
					 | 
				
			||||||
        # -- is an explicit argument of the decorated function and "chunking_strategy" is not
 | 
					 | 
				
			||||||
        # -- already mentioned in the docstring.
 | 
					 | 
				
			||||||
        if func.__doc__ and (
 | 
					 | 
				
			||||||
            "chunking_strategy" in func.__code__.co_varnames
 | 
					 | 
				
			||||||
            and "chunking_strategy" not in func.__doc__
 | 
					 | 
				
			||||||
        ):
 | 
					 | 
				
			||||||
            func.__doc__ += (
 | 
					 | 
				
			||||||
                "\nchunking_strategy"
 | 
					 | 
				
			||||||
                + "\n\tStrategy used for chunking text into larger or smaller elements."
 | 
					 | 
				
			||||||
                + "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'."
 | 
					 | 
				
			||||||
                + "\n\tAdditional Parameters:"
 | 
					 | 
				
			||||||
                + "\n\t\tmultipage_sections"
 | 
					 | 
				
			||||||
                + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
 | 
					 | 
				
			||||||
                + "\n\t\tcombine_text_under_n_chars"
 | 
					 | 
				
			||||||
                + "\n\t\t\tCombines elements (for example a series of titles) until a section"
 | 
					 | 
				
			||||||
                + "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy."
 | 
					 | 
				
			||||||
                + "\n\t\tnew_after_n_chars"
 | 
					 | 
				
			||||||
                + "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max."
 | 
					 | 
				
			||||||
                + "\n\t\tmax_characters"
 | 
					 | 
				
			||||||
                + "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
 | 
					 | 
				
			||||||
                + "\n\t\t\tof length n characters, a hard max."
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        @functools.wraps(func)
 | 
					 | 
				
			||||||
        def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
 | 
					 | 
				
			||||||
            """The decorated function is replaced with this one."""
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            def get_call_args_applying_defaults() -> dict[str, Any]:
 | 
					 | 
				
			||||||
                """Map both explicit and default arguments of decorated func call by param name."""
 | 
					 | 
				
			||||||
                sig = inspect.signature(func)
 | 
					 | 
				
			||||||
                call_args: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
 | 
					 | 
				
			||||||
                for param in sig.parameters.values():
 | 
					 | 
				
			||||||
                    if param.name not in call_args and param.default is not param.empty:
 | 
					 | 
				
			||||||
                        call_args[param.name] = param.default
 | 
					 | 
				
			||||||
                return call_args
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            # -- call the partitioning function to get the elements --
 | 
					 | 
				
			||||||
            elements = func(*args, **kwargs)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            # -- look for a chunking-strategy argument and run the indicated chunker when present --
 | 
					 | 
				
			||||||
            call_args = get_call_args_applying_defaults()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            if call_args.get("chunking_strategy") == "by_title":
 | 
					 | 
				
			||||||
                return chunk_by_title(
 | 
					 | 
				
			||||||
                    elements,
 | 
					 | 
				
			||||||
                    combine_text_under_n_chars=call_args.get("combine_text_under_n_chars"),
 | 
					 | 
				
			||||||
                    max_characters=call_args.get("max_characters"),
 | 
					 | 
				
			||||||
                    multipage_sections=call_args.get("multipage_sections"),
 | 
					 | 
				
			||||||
                    new_after_n_chars=call_args.get("new_after_n_chars"),
 | 
					 | 
				
			||||||
                    overlap=call_args.get("overlap"),
 | 
					 | 
				
			||||||
                    overlap_all=call_args.get("overlap_all"),
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            if call_args.get("chunking_strategy") == "basic":
 | 
					 | 
				
			||||||
                return chunk_elements(
 | 
					 | 
				
			||||||
                    elements,
 | 
					 | 
				
			||||||
                    max_characters=call_args.get("max_characters"),
 | 
					 | 
				
			||||||
                    new_after_n_chars=call_args.get("new_after_n_chars"),
 | 
					 | 
				
			||||||
                    overlap=call_args.get("overlap"),
 | 
					 | 
				
			||||||
                    overlap_all=call_args.get("overlap_all"),
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            return elements
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        return wrapper
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return decorator
 | 
					 | 
				
			||||||
 | 
				
			|||||||
@ -25,6 +25,7 @@ from unstructured.documents.elements import Element
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
def chunk_elements(
 | 
					def chunk_elements(
 | 
				
			||||||
    elements: Iterable[Element],
 | 
					    elements: Iterable[Element],
 | 
				
			||||||
 | 
					    *,
 | 
				
			||||||
    max_characters: Optional[int] = None,
 | 
					    max_characters: Optional[int] = None,
 | 
				
			||||||
    new_after_n_chars: Optional[int] = None,
 | 
					    new_after_n_chars: Optional[int] = None,
 | 
				
			||||||
    overlap: Optional[int] = None,
 | 
					    overlap: Optional[int] = None,
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										138
									
								
								unstructured/chunking/dispatch.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										138
									
								
								unstructured/chunking/dispatch.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,138 @@
 | 
				
			|||||||
 | 
					"""Handles dispatch of elements to a chunking-strategy by name.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Also provides the `@add_chunking_strategy` decorator which is the chief current user of "by-name"
 | 
				
			||||||
 | 
					chunking dispatch.
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from __future__ import annotations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import dataclasses as dc
 | 
				
			||||||
 | 
					import functools
 | 
				
			||||||
 | 
					import inspect
 | 
				
			||||||
 | 
					from typing import Any, Callable, Iterable, Optional, Protocol
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from typing_extensions import ParamSpec
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from unstructured.chunking.basic import chunk_elements
 | 
				
			||||||
 | 
					from unstructured.chunking.title import chunk_by_title
 | 
				
			||||||
 | 
					from unstructured.documents.elements import Element
 | 
				
			||||||
 | 
					from unstructured.utils import lazyproperty
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_P = ParamSpec("_P")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Chunker(Protocol):
 | 
				
			||||||
 | 
					    """Abstract interface for chunking functions."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __call__(
 | 
				
			||||||
 | 
					        self, elements: Iterable[Element], *, max_characters: Optional[int]
 | 
				
			||||||
 | 
					    ) -> list[Element]:
 | 
				
			||||||
 | 
					        """A chunking function must have this signature.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        In particular it must minimally have an `elements` parameter and all chunkers will have a
 | 
				
			||||||
 | 
					        `max_characters` parameter (doesn't need to follow `elements` directly). All others can
 | 
				
			||||||
 | 
					        vary by chunker.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        ...
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def add_chunking_strategy(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
 | 
				
			||||||
 | 
					    """Decorator for chunking text.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Chunks the element sequence produced by the partitioner it decorates when a `chunking_strategy`
 | 
				
			||||||
 | 
					    argument is present in the partitioner call and it names an available chunking strategy.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    # -- Patch the docstring of the decorated function to add chunking strategy and
 | 
				
			||||||
 | 
					    # -- chunking-related argument documentation. This only applies when `chunking_strategy`
 | 
				
			||||||
 | 
					    # -- is an explicit argument of the decorated function and "chunking_strategy" is not
 | 
				
			||||||
 | 
					    # -- already mentioned in the docstring.
 | 
				
			||||||
 | 
					    if func.__doc__ and (
 | 
				
			||||||
 | 
					        "chunking_strategy" in func.__code__.co_varnames and "chunking_strategy" not in func.__doc__
 | 
				
			||||||
 | 
					    ):
 | 
				
			||||||
 | 
					        func.__doc__ += (
 | 
				
			||||||
 | 
					            "\nchunking_strategy"
 | 
				
			||||||
 | 
					            + "\n\tStrategy used for chunking text into larger or smaller elements."
 | 
				
			||||||
 | 
					            + "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'."
 | 
				
			||||||
 | 
					            + "\n\tAdditional Parameters:"
 | 
				
			||||||
 | 
					            + "\n\t\tmultipage_sections"
 | 
				
			||||||
 | 
					            + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
 | 
				
			||||||
 | 
					            + "\n\t\tcombine_text_under_n_chars"
 | 
				
			||||||
 | 
					            + "\n\t\t\tCombines elements (for example a series of titles) until a section"
 | 
				
			||||||
 | 
					            + "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy."
 | 
				
			||||||
 | 
					            + "\n\t\tnew_after_n_chars"
 | 
				
			||||||
 | 
					            + "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max."
 | 
				
			||||||
 | 
					            + "\n\t\tmax_characters"
 | 
				
			||||||
 | 
					            + "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
 | 
				
			||||||
 | 
					            + "\n\t\t\tof length n characters, a hard max."
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @functools.wraps(func)
 | 
				
			||||||
 | 
					    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
 | 
				
			||||||
 | 
					        """The decorated function is replaced with this one."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def get_call_args_applying_defaults() -> dict[str, Any]:
 | 
				
			||||||
 | 
					            """Map both explicit and default arguments of decorated func call by param name."""
 | 
				
			||||||
 | 
					            sig = inspect.signature(func)
 | 
				
			||||||
 | 
					            call_args: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
 | 
				
			||||||
 | 
					            for param in sig.parameters.values():
 | 
				
			||||||
 | 
					                if param.name not in call_args and param.default is not param.empty:
 | 
				
			||||||
 | 
					                    call_args[param.name] = param.default
 | 
				
			||||||
 | 
					            return call_args
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # -- call the partitioning function to get the elements --
 | 
				
			||||||
 | 
					        elements = func(*args, **kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # -- look for a chunking-strategy argument --
 | 
				
			||||||
 | 
					        call_args = get_call_args_applying_defaults()
 | 
				
			||||||
 | 
					        chunking_strategy = call_args.pop("chunking_strategy", None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # -- no chunking-strategy means no chunking --
 | 
				
			||||||
 | 
					        if chunking_strategy is None:
 | 
				
			||||||
 | 
					            return elements
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # -- otherwise, chunk away :) --
 | 
				
			||||||
 | 
					        return chunk(elements, chunking_strategy, **call_args)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return wrapper
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def chunk(elements: Iterable[Element], chunking_strategy: str, **kwargs: Any) -> list[Element]:
 | 
				
			||||||
 | 
					    """Dispatch chunking of `elements` to the chunking function for `chunking_strategy`."""
 | 
				
			||||||
 | 
					    chunker_spec = _chunker_registry.get(chunking_strategy)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if chunker_spec is None:
 | 
				
			||||||
 | 
					        raise ValueError(f"unrecognized chunking strategy {repr(chunking_strategy)}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # -- `kwargs` will in general be an omnibus dict of all keyword arguments to the partitioner;
 | 
				
			||||||
 | 
					    # -- pick out and use only those supported by this chunker.
 | 
				
			||||||
 | 
					    chunking_kwargs = {k: v for k, v in kwargs.items() if k in chunker_spec.kw_arg_names}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return chunker_spec.chunker(elements, **chunking_kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def register_chunking_strategy(name: str, chunker: Chunker) -> None:
 | 
				
			||||||
 | 
					    """Make chunker available by using `name` as `chunking_strategy` arg in partitioner call."""
 | 
				
			||||||
 | 
					    _chunker_registry[name] = _ChunkerSpec(chunker)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@dc.dataclass(frozen=True)
 | 
				
			||||||
 | 
					class _ChunkerSpec:
 | 
				
			||||||
 | 
					    """A registry entry for a chunker."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    chunker: Chunker
 | 
				
			||||||
 | 
					    """The "chunk_by_{x}() function that implements this chunking strategy."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @lazyproperty
 | 
				
			||||||
 | 
					    def kw_arg_names(self) -> tuple[str, ...]:
 | 
				
			||||||
 | 
					        """Keyword arguments supported by this chunker.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        These are all arguments other than the required `elements: list[Element]` first parameter.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        sig = inspect.signature(self.chunker)
 | 
				
			||||||
 | 
					        return tuple(key for key in sig.parameters if key != "elements")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_chunker_registry: dict[str, _ChunkerSpec] = {
 | 
				
			||||||
 | 
					    "basic": _ChunkerSpec(chunk_elements),
 | 
				
			||||||
 | 
					    "by_title": _ChunkerSpec(chunk_by_title),
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@ -26,7 +26,7 @@ DETECTION_ORIGIN: str = "csv"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.CSV)
 | 
					@add_metadata_with_filetype(FileType.CSV)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_csv(
 | 
					def partition_csv(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
 | 
					    file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -16,7 +16,7 @@ from unstructured.partition.docx import partition_docx
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.DOC)
 | 
					@add_metadata_with_filetype(FileType.DOC)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_doc(
 | 
					def partition_doc(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -168,7 +168,7 @@ def convert_and_partition_docx(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.DOCX)
 | 
					@add_metadata_with_filetype(FileType.DOCX)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_docx(
 | 
					def partition_docx(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -262,7 +262,7 @@ def parse_email(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.EML)
 | 
					@add_metadata_with_filetype(FileType.EML)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_email(
 | 
					def partition_email(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,
 | 
					    file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -11,7 +11,7 @@ DETECTION_ORIGIN: str = "epub"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.EPUB)
 | 
					@add_metadata_with_filetype(FileType.EPUB)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_epub(
 | 
					def partition_epub(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -26,7 +26,7 @@ if TYPE_CHECKING:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.HTML)
 | 
					@add_metadata_with_filetype(FileType.HTML)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_html(
 | 
					def partition_html(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -13,7 +13,7 @@ from unstructured.partition.utils.constants import PartitionStrategy
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata
 | 
					@add_metadata
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_image(
 | 
					def partition_image(
 | 
				
			||||||
    filename: str = "",
 | 
					    filename: str = "",
 | 
				
			||||||
    file: Optional[bytes] = None,
 | 
					    file: Optional[bytes] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -29,7 +29,7 @@ from unstructured.staging.base import dict_to_elements
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.JSON)
 | 
					@add_metadata_with_filetype(FileType.JSON)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_json(
 | 
					def partition_json(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -26,7 +26,7 @@ DETECTION_ORIGIN: str = "md"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.MD)
 | 
					@add_metadata_with_filetype(FileType.MD)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_md(
 | 
					def partition_md(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -17,7 +17,7 @@ from unstructured.partition.text import partition_text
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.MSG)
 | 
					@add_metadata_with_filetype(FileType.MSG)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_msg(
 | 
					def partition_msg(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -12,7 +12,7 @@ from unstructured.partition.docx import convert_and_partition_docx
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.ODT)
 | 
					@add_metadata_with_filetype(FileType.ODT)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_odt(
 | 
					def partition_odt(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[BinaryIO] = None,
 | 
					    file: Optional[BinaryIO] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -9,7 +9,7 @@ DETECTION_ORIGIN: str = "org"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@add_metadata_with_filetype(FileType.ORG)
 | 
					@add_metadata_with_filetype(FileType.ORG)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_org(
 | 
					def partition_org(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -132,7 +132,7 @@ def default_hi_res_model() -> str:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.PDF)
 | 
					@add_metadata_with_filetype(FileType.PDF)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_pdf(
 | 
					def partition_pdf(
 | 
				
			||||||
    filename: str = "",
 | 
					    filename: str = "",
 | 
				
			||||||
    file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
 | 
					    file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -16,7 +16,7 @@ from unstructured.partition.pptx import partition_pptx
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.PPT)
 | 
					@add_metadata_with_filetype(FileType.PPT)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_ppt(
 | 
					def partition_ppt(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -47,7 +47,7 @@ DETECTION_ORIGIN = "pptx"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.PPTX)
 | 
					@add_metadata_with_filetype(FileType.PPTX)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_pptx(
 | 
					def partition_pptx(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -10,7 +10,7 @@ DETECTION_ORIGIN: str = "rst"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.RST)
 | 
					@add_metadata_with_filetype(FileType.RST)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_rst(
 | 
					def partition_rst(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -10,7 +10,7 @@ DETECTION_ORIGIN: str = "rtf"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.RTF)
 | 
					@add_metadata_with_filetype(FileType.RTF)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_rtf(
 | 
					def partition_rtf(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -113,7 +113,7 @@ def partition_text(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.TXT)
 | 
					@add_metadata_with_filetype(FileType.TXT)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def _partition_text(
 | 
					def _partition_text(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -42,7 +42,7 @@ DETECTION_ORIGIN: str = "xlsx"
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.XLSX)
 | 
					@add_metadata_with_filetype(FileType.XLSX)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_xlsx(
 | 
					def partition_xlsx(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[IO[bytes]] = None,
 | 
					    file: Optional[IO[bytes]] = None,
 | 
				
			||||||
 | 
				
			|||||||
@ -81,7 +81,7 @@ def _get_leaf_elements(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@process_metadata()
 | 
					@process_metadata()
 | 
				
			||||||
@add_metadata_with_filetype(FileType.XML)
 | 
					@add_metadata_with_filetype(FileType.XML)
 | 
				
			||||||
@add_chunking_strategy()
 | 
					@add_chunking_strategy
 | 
				
			||||||
def partition_xml(
 | 
					def partition_xml(
 | 
				
			||||||
    filename: Optional[str] = None,
 | 
					    filename: Optional[str] = None,
 | 
				
			||||||
    file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,
 | 
					    file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user