diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6c4c65d6a..3571e248d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.12.6-dev1
-
+## 0.12.6-dev2
+
 ### Enhancements
 
+* **Refactor `add_chunking_strategy` decorator to dispatch by name.** Add `chunk()` function to be used by the `add_chunking_strategy` decorator to dispatch chunking call based on a chunking-strategy name (that can be dynamic at runtime). This decouples chunking dispatch from only those chunkers known at "compile" time and enables runtime registration of custom chunkers.
+
 ### Features
 
 ### Fixes
diff --git a/test_unstructured/chunking/test_dispatch.py b/test_unstructured/chunking/test_dispatch.py
new file mode 100644
index 000000000..224998dc6
--- /dev/null
+++ b/test_unstructured/chunking/test_dispatch.py
@@ -0,0 +1,97 @@
+# pyright: reportPrivateUsage=false
+
+"""Unit-test suite for the `unstructured.chunking.dispatch` module."""
+
+from __future__ import annotations
+
+from typing import Any, Iterable, Optional
+
+import pytest
+
+from unstructured.chunking import add_chunking_strategy, register_chunking_strategy
+from unstructured.chunking.dispatch import _ChunkerSpec, _chunker_registry, chunk
+from unstructured.documents.elements import CompositeElement, Element, Text
+
+
+class Describe_add_chunking_strategy:
+    """Unit-test suite for `unstructured.chunking.add_chunking_strategy()` decorator."""
+
+    def it_dispatches_the_partitioned_elements_to_the_indicated_chunker(self):
+        decorated_partitioner = add_chunking_strategy(partition_this)
+
+        chunks = decorated_partitioner(chunking_strategy="basic")
+
+        assert chunks == [CompositeElement("Lorem ipsum.\n\nSit amet.")]
+
+    def but_it_skips_dispatch_when_no_chunking_strategy_is_specified(self):
+        decorated_partitioner = add_chunking_strategy(partition_this)
+
+        elements = decorated_partitioner()
+
+        assert elements == [Text("Lorem ipsum."), Text("Sit amet.")]
+
+
+class Describe_chunk:
+    """Unit-test suite for `unstructured.chunking.dispatch.chunk()` function."""
+
+    def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
+        register_chunking_strategy("by_something_else", chunk_by_something_else)
+        kwargs = {
+            "max_characters": 750,
+            # -- unused kwargs shouldn't cause a problem; in general `kwargs` will contain all
+            # -- keyword arguments used in the partitioning call.
+            "foo": "bar",
+        }
+
+        try:
+            chunks = chunk([Text("Lorem"), Text("Ipsum")], "by_something_else", **kwargs)
+        finally:
+            # -- the registry is module-global state; un-register the fake chunker so this
+            # -- test cannot affect any test that runs after it.
+            _chunker_registry.pop("by_something_else", None)
+
+        assert chunks == [
+            CompositeElement("chunked 2 elements with `(max_characters=750, whizbang=None)`")
+        ]
+
+    def it_raises_when_the_requested_chunking_strategy_is_not_registered(self):
+        with pytest.raises(
+            ValueError,
+            match="unrecognized chunking strategy 'foobar'",
+        ):
+            chunk(elements=[], chunking_strategy="foobar")
+
+
+class Describe_ChunkerSpec:
+    """Unit-test suite for `unstructured.chunking.dispatch._ChunkerSpec` objects."""
+
+    def it_provides_access_to_the_chunking_function(self):
+        spec = _ChunkerSpec(chunk_by_something_else)
+        assert spec.chunker is chunk_by_something_else
+
+    def it_knows_which_keyword_args_the_chunking_function_can_accept(self):
+        spec = _ChunkerSpec(chunk_by_something_else)
+        assert spec.kw_arg_names == ("max_characters", "whizbang")
+
+
+# -- MODULE-LEVEL FIXTURES -----------------------------------------------------------------------
+
+
+def chunk_by_something_else(
+    elements: Iterable[Element],
+    max_characters: Optional[int] = None,
+    whizbang: Optional[float] = None,
+) -> list[Element]:
+    """A "fake" minimal chunker suitable for use in tests."""
+    els = list(elements)
+    return [
+        CompositeElement(
+            f"chunked {len(els)} elements with"
+            f" `(max_characters={max_characters}, whizbang={whizbang})`"
+        )
+    ]
+
+
+def partition_this(**kwargs: Any) -> list[Element]:
+    """A fake partitioner."""
+    return [Text("Lorem ipsum."), Text("Sit amet.")]
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index c97780dbf..29275b7c9 100644
--- 
a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.6-dev1" # pragma: no cover +__version__ = "0.12.6-dev2" # pragma: no cover diff --git a/unstructured/chunking/__init__.py b/unstructured/chunking/__init__.py index 04074135d..9bc12cdc8 100644 --- a/unstructured/chunking/__init__.py +++ b/unstructured/chunking/__init__.py @@ -1,100 +1,22 @@ """Chunking module initializer. -Provides the the `@add_chunking_strategy()` decorator. +Publishes the public aspects of the chunking sub-package interface. """ from __future__ import annotations -import functools -import inspect -from typing import Any, Callable - -from typing_extensions import ParamSpec - from unstructured.chunking.base import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT -from unstructured.chunking.basic import chunk_elements -from unstructured.chunking.title import chunk_by_title -from unstructured.documents.elements import Element +from unstructured.chunking.dispatch import ( + Chunker, + add_chunking_strategy, + register_chunking_strategy, +) -__all__ = ["CHUNK_MAX_CHARS_DEFAULT", "CHUNK_MULTI_PAGE_DEFAULT", "add_chunking_strategy"] - -_P = ParamSpec("_P") - - -def add_chunking_strategy() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: - """Decorator for chunking text. - - Chunks the element sequence produced by the partitioner it decorates when a `chunking_strategy` - argument is present in the partitioner call and it names an available chunking strategy. - """ - - def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: - # -- Patch the docstring of the decorated function to add chunking strategy and - # -- chunking-related argument documentation. This only applies when `chunking_strategy` - # -- is an explicit argument of the decorated function and "chunking_strategy" is not - # -- already mentioned in the docstring. 
- if func.__doc__ and ( - "chunking_strategy" in func.__code__.co_varnames - and "chunking_strategy" not in func.__doc__ - ): - func.__doc__ += ( - "\nchunking_strategy" - + "\n\tStrategy used for chunking text into larger or smaller elements." - + "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'." - + "\n\tAdditional Parameters:" - + "\n\t\tmultipage_sections" - + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True." - + "\n\t\tcombine_text_under_n_chars" - + "\n\t\t\tCombines elements (for example a series of titles) until a section" - + "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy." - + "\n\t\tnew_after_n_chars" - + "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max." - + "\n\t\tmax_characters" - + "\n\t\t\tChunks elements text and text_as_html (if present) into chunks" - + "\n\t\t\tof length n characters, a hard max." - ) - - @functools.wraps(func) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: - """The decorated function is replaced with this one.""" - - def get_call_args_applying_defaults() -> dict[str, Any]: - """Map both explicit and default arguments of decorated func call by param name.""" - sig = inspect.signature(func) - call_args: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs) - for param in sig.parameters.values(): - if param.name not in call_args and param.default is not param.empty: - call_args[param.name] = param.default - return call_args - - # -- call the partitioning function to get the elements -- - elements = func(*args, **kwargs) - - # -- look for a chunking-strategy argument and run the indicated chunker when present -- - call_args = get_call_args_applying_defaults() - - if call_args.get("chunking_strategy") == "by_title": - return chunk_by_title( - elements, - combine_text_under_n_chars=call_args.get("combine_text_under_n_chars"), - max_characters=call_args.get("max_characters"), - 
multipage_sections=call_args.get("multipage_sections"),
-                    new_after_n_chars=call_args.get("new_after_n_chars"),
-                    overlap=call_args.get("overlap"),
-                    overlap_all=call_args.get("overlap_all"),
-                )
-
-            if call_args.get("chunking_strategy") == "basic":
-                return chunk_elements(
-                    elements,
-                    max_characters=call_args.get("max_characters"),
-                    new_after_n_chars=call_args.get("new_after_n_chars"),
-                    overlap=call_args.get("overlap"),
-                    overlap_all=call_args.get("overlap_all"),
-                )
-
-            return elements
-
-        return wrapper
-
-    return decorator
+__all__ = [
+    "CHUNK_MAX_CHARS_DEFAULT",
+    "CHUNK_MULTI_PAGE_DEFAULT",
+    "add_chunking_strategy",
+    # -- these must be published to allow pluggable chunkers in other code-bases --
+    "Chunker",
+    "register_chunking_strategy",
+]
diff --git a/unstructured/chunking/basic.py b/unstructured/chunking/basic.py
index 6f4ae49de..4c0f1b266 100644
--- a/unstructured/chunking/basic.py
+++ b/unstructured/chunking/basic.py
@@ -25,6 +25,7 @@ from unstructured.documents.elements import Element
 
 def chunk_elements(
     elements: Iterable[Element],
+    *,
     max_characters: Optional[int] = None,
     new_after_n_chars: Optional[int] = None,
     overlap: Optional[int] = None,
diff --git a/unstructured/chunking/dispatch.py b/unstructured/chunking/dispatch.py
new file mode 100644
index 000000000..69a416bdf
--- /dev/null
+++ b/unstructured/chunking/dispatch.py
@@ -0,0 +1,169 @@
+"""Handles dispatch of elements to a chunking-strategy by name.
+
+Also provides the `@add_chunking_strategy` decorator which is the chief current user of "by-name"
+chunking dispatch.
+"""
+
+from __future__ import annotations
+
+import dataclasses as dc
+import functools
+import inspect
+from typing import Any, Callable, Iterable, Optional, Protocol
+
+from typing_extensions import ParamSpec
+
+from unstructured.chunking.basic import chunk_elements
+from unstructured.chunking.title import chunk_by_title
+from unstructured.documents.elements import Element
+from unstructured.utils import lazyproperty
+
+_P = ParamSpec("_P")
+
+
+class Chunker(Protocol):
+    """Abstract interface for chunking functions."""
+
+    def __call__(
+        self, elements: Iterable[Element], *, max_characters: Optional[int]
+    ) -> list[Element]:
+        """A chunking function must have this signature.
+
+        In particular it must minimally have an `elements` parameter and all chunkers will have a
+        `max_characters` parameter (doesn't need to follow `elements` directly). All others can
+        vary by chunker.
+        """
+        ...
+
+
+def add_chunking_strategy(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
+    """Decorator that adds optional chunking to a partitioner.
+
+    Chunks the element sequence produced by the partitioner it decorates when the call supplies,
+    explicitly or by parameter default, a `chunking_strategy` that is not `None`. `chunk()`
+    resolves that name and raises `ValueError` when no chunker is registered under it.
+    """
+    # -- the signature cannot change between calls, so inspect it once at decoration time --
+    sig = inspect.signature(func)
+
+    # -- Patch the docstring of the decorated function to add chunking strategy and
+    # -- chunking-related argument documentation. This only applies when `chunking_strategy`
+    # -- is an explicit argument of the decorated function and "chunking_strategy" is not
+    # -- already mentioned in the docstring. The signature is consulted rather than
+    # -- `__code__.co_varnames` because the latter also lists the function's local variables.
+    if (
+        func.__doc__
+        and "chunking_strategy" in sig.parameters
+        and "chunking_strategy" not in func.__doc__
+    ):
+        func.__doc__ += (
+            "\nchunking_strategy"
+            + "\n\tStrategy used for chunking text into larger or smaller elements."
+            + "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'."
+            + "\n\tAdditional Parameters:"
+            + "\n\t\tmultipage_sections"
+            + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
+            + "\n\t\tcombine_text_under_n_chars"
+            + "\n\t\t\tCombines elements (for example a series of titles) until a section"
+            + "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy."
+            + "\n\t\tnew_after_n_chars"
+            + "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max."
+            + "\n\t\tmax_characters"
+            + "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
+            + "\n\t\t\tof length n characters, a hard max."
+        )
+
+    @functools.wraps(func)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
+        """The decorated function is replaced with this one."""
+
+        def get_call_args_applying_defaults() -> dict[str, Any]:
+            """Map both explicit and default arguments of decorated func call by param name."""
+            # -- positional args are paired with parameter names in declaration order --
+            call_args: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
+            for param in sig.parameters.values():
+                if param.name not in call_args and param.default is not param.empty:
+                    call_args[param.name] = param.default
+            return call_args
+
+        # -- call the partitioning function to get the elements --
+        elements = func(*args, **kwargs)
+
+        # -- look for a chunking-strategy argument --
+        call_args = get_call_args_applying_defaults()
+        chunking_strategy = call_args.pop("chunking_strategy", None)
+
+        # -- no chunking-strategy means no chunking --
+        if chunking_strategy is None:
+            return elements
+
+        # -- otherwise, chunk away :) --
+        return chunk(elements, chunking_strategy, **call_args)
+
+    return wrapper
+
+
+def chunk(elements: Iterable[Element], chunking_strategy: str, **kwargs: Any) -> list[Element]:
+    """Dispatch chunking of `elements` to the chunking function for `chunking_strategy`.
+
+    `kwargs` may contain any keyword arguments; only those naming a parameter of the selected
+    chunker are forwarded to it, the rest are ignored.
+
+    Raises `ValueError` when no chunker is registered under `chunking_strategy`.
+    """
+    chunker_spec = _chunker_registry.get(chunking_strategy)
+
+    if chunker_spec is None:
+        raise ValueError(f"unrecognized chunking strategy {chunking_strategy!r}")
+
+    # -- `kwargs` will in general be an omnibus dict of all keyword arguments to the partitioner;
+    # -- pick out and use only those supported by this chunker.
+    chunking_kwargs = {k: v for k, v in kwargs.items() if k in chunker_spec.kw_arg_names}
+
+    return chunker_spec.chunker(elements, **chunking_kwargs)
+
+
+def register_chunking_strategy(name: str, chunker: Chunker) -> None:
+    """Make chunker available by using `name` as `chunking_strategy` arg in partitioner call.
+
+    Registering under a name that is already in use replaces the existing chunker.
+
+    Raises `TypeError` when `chunker` is not callable.
+    """
+    # -- fail at registration rather than on the first chunking call that selects `name` --
+    if not callable(chunker):
+        raise TypeError(f"chunker must be callable, got {type(chunker).__name__}")
+    _chunker_registry[name] = _ChunkerSpec(chunker)
+
+
+@dc.dataclass(frozen=True)
+class _ChunkerSpec:
+    """A registry entry for a chunker."""
+
+    # -- the "chunk_by_{x}()" function that implements this chunking strategy --
+    chunker: Chunker
+
+    @lazyproperty
+    def kw_arg_names(self) -> tuple[str, ...]:
+        """Names of the keyword arguments supported by this chunker.
+
+        These are the parameters that can be passed by name, other than the required `elements`
+        parameter. `*args`/`**kwargs`-style and positional-only parameters are excluded because
+        their names cannot be used as keywords in a call.
+        """
+        passable_by_name = (
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+            inspect.Parameter.KEYWORD_ONLY,
+        )
+        sig = inspect.signature(self.chunker)
+        return tuple(
+            name
+            for name, param in sig.parameters.items()
+            if name != "elements" and param.kind in passable_by_name
+        )
+
+
+_chunker_registry: dict[str, _ChunkerSpec] = {
+    "basic": _ChunkerSpec(chunk_elements),
+    "by_title": _ChunkerSpec(chunk_by_title),
+}
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index ecc8b7aba..6691198fe 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -26,7 +26,7 @@ DETECTION_ORIGIN: str = "csv"
 
 @process_metadata()
 @add_metadata_with_filetype(FileType.CSV)
-@add_chunking_strategy()
+@add_chunking_strategy
 def partition_csv(
     filename: Optional[str] = None,
     file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py
index 157312806..3491ad006 100644
--- a/unstructured/partition/doc.py
+++ b/unstructured/partition/doc.py
@@ -16,7 +16,7 @@ from unstructured.partition.docx import partition_docx
 
 @process_metadata()
 @add_metadata_with_filetype(FileType.DOC)
-@add_chunking_strategy()
+@add_chunking_strategy
 def partition_doc(
     filename: Optional[str] = None,
     file: Optional[IO[bytes]] = None,
diff --git 
a/unstructured/partition/docx.py b/unstructured/partition/docx.py index d24c6f311..c1b69fa39 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -168,7 +168,7 @@ def convert_and_partition_docx( @process_metadata() @add_metadata_with_filetype(FileType.DOCX) -@add_chunking_strategy() +@add_chunking_strategy def partition_docx( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index a80d5d665..f18ebea48 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -262,7 +262,7 @@ def parse_email( @process_metadata() @add_metadata_with_filetype(FileType.EML) -@add_chunking_strategy() +@add_chunking_strategy def partition_email( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None, diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py index 32891f3b1..0b3ab6295 100644 --- a/unstructured/partition/epub.py +++ b/unstructured/partition/epub.py @@ -11,7 +11,7 @@ DETECTION_ORIGIN: str = "epub" @process_metadata() @add_metadata_with_filetype(FileType.EPUB) -@add_chunking_strategy() +@add_chunking_strategy def partition_epub( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index c5d6c11c4..49ef8487d 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: @process_metadata() @add_metadata_with_filetype(FileType.HTML) -@add_chunking_strategy() +@add_chunking_strategy def partition_html( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index d94ead56c..051a88024 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -13,7 +13,7 @@ from unstructured.partition.utils.constants import 
PartitionStrategy @process_metadata() @add_metadata -@add_chunking_strategy() +@add_chunking_strategy def partition_image( filename: str = "", file: Optional[bytes] = None, diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index 2c2fd3102..602c4cd34 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -29,7 +29,7 @@ from unstructured.staging.base import dict_to_elements @process_metadata() @add_metadata_with_filetype(FileType.JSON) -@add_chunking_strategy() +@add_chunking_strategy def partition_json( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/md.py b/unstructured/partition/md.py index e22b61103..ebe57db51 100644 --- a/unstructured/partition/md.py +++ b/unstructured/partition/md.py @@ -26,7 +26,7 @@ DETECTION_ORIGIN: str = "md" @process_metadata() @add_metadata_with_filetype(FileType.MD) -@add_chunking_strategy() +@add_chunking_strategy def partition_md( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 965aedf77..e4514bc30 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -17,7 +17,7 @@ from unstructured.partition.text import partition_text @process_metadata() @add_metadata_with_filetype(FileType.MSG) -@add_chunking_strategy() +@add_chunking_strategy def partition_msg( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py index b48d462ff..eb2fa05d0 100644 --- a/unstructured/partition/odt.py +++ b/unstructured/partition/odt.py @@ -12,7 +12,7 @@ from unstructured.partition.docx import convert_and_partition_docx @process_metadata() @add_metadata_with_filetype(FileType.ODT) -@add_chunking_strategy() +@add_chunking_strategy def partition_odt( filename: Optional[str] = None, file: Optional[BinaryIO] = None, diff --git 
a/unstructured/partition/org.py b/unstructured/partition/org.py index ae0c630df..690ff691e 100644 --- a/unstructured/partition/org.py +++ b/unstructured/partition/org.py @@ -9,7 +9,7 @@ DETECTION_ORIGIN: str = "org" @add_metadata_with_filetype(FileType.ORG) -@add_chunking_strategy() +@add_chunking_strategy def partition_org( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 4a6e403ba..f1c7f9eff 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -132,7 +132,7 @@ def default_hi_res_model() -> str: @process_metadata() @add_metadata_with_filetype(FileType.PDF) -@add_chunking_strategy() +@add_chunking_strategy def partition_pdf( filename: str = "", file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None, diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py index f2eb10fb4..3fae94762 100644 --- a/unstructured/partition/ppt.py +++ b/unstructured/partition/ppt.py @@ -16,7 +16,7 @@ from unstructured.partition.pptx import partition_pptx @process_metadata() @add_metadata_with_filetype(FileType.PPT) -@add_chunking_strategy() +@add_chunking_strategy def partition_ppt( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 32ea0b2d0..6943e4194 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -47,7 +47,7 @@ DETECTION_ORIGIN = "pptx" @process_metadata() @add_metadata_with_filetype(FileType.PPTX) -@add_chunking_strategy() +@add_chunking_strategy def partition_pptx( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/rst.py b/unstructured/partition/rst.py index e7d062818..81f83c226 100644 --- a/unstructured/partition/rst.py +++ b/unstructured/partition/rst.py @@ -10,7 +10,7 @@ DETECTION_ORIGIN: str = "rst" @process_metadata() 
@add_metadata_with_filetype(FileType.RST) -@add_chunking_strategy() +@add_chunking_strategy def partition_rst( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/rtf.py b/unstructured/partition/rtf.py index a01fd3e23..bb6717562 100644 --- a/unstructured/partition/rtf.py +++ b/unstructured/partition/rtf.py @@ -10,7 +10,7 @@ DETECTION_ORIGIN: str = "rtf" @process_metadata() @add_metadata_with_filetype(FileType.RTF) -@add_chunking_strategy() +@add_chunking_strategy def partition_rtf( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index ddc187cc0..b75659944 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -113,7 +113,7 @@ def partition_text( @process_metadata() @add_metadata_with_filetype(FileType.TXT) -@add_chunking_strategy() +@add_chunking_strategy def _partition_text( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index f713b2833..4e1e217df 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -42,7 +42,7 @@ DETECTION_ORIGIN: str = "xlsx" @process_metadata() @add_metadata_with_filetype(FileType.XLSX) -@add_chunking_strategy() +@add_chunking_strategy def partition_xlsx( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index 144184bf6..886112584 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -81,7 +81,7 @@ def _get_leaf_elements( @process_metadata() @add_metadata_with_filetype(FileType.XML) -@add_chunking_strategy() +@add_chunking_strategy def partition_xml( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,