rfctr(chunking): extract chunking-strategy dispatch (#2545)

**Summary**
This is the final step in adding pluggable chunking-strategies. It
introduces the `chunk()` function to replace calls to strategy-specific
chunkers in the `@add_chunking_strategy` decorator. The `chunk()`
function then uses a mapping of chunking-strategy names (e.g.
"by_title", "basic") to chunking functions (chunkers) to dispatch the
chunking call. This allows other chunkers to be added at runtime rather
than requiring a code change, which is what makes chunkers "pluggable".

**Additional Information**
- Move the `@add_chunking_strategy` decorator to the new `chunking.dispatch`
module since it coheres strongly with that operation, but publish it
from `chunking(.__init__)` (as it was before) so users don't couple to
the way we organize the chunking sub-package. Also remove the third
level of nesting as it's not required in this case.
- Add unit tests for the `@add_chunking_strategy` decorator, which was
previously not covered by any direct test.
This commit is contained in:
Steve Canny 2024-03-05 15:19:29 -08:00 committed by GitHub
parent 3ff6de4f50
commit 4096a38371
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 271 additions and 115 deletions

View File

@ -1,7 +1,9 @@
## 0.12.6-dev1
## 0.12.6-dev2
### Enhancements
* **Refactor `add_chunking_strategy` decorator to dispatch by name.** Add `chunk()` function to be used by the `add_chunking_strategy` decorator to dispatch the chunking call based on a chunking-strategy name (which can be determined dynamically at runtime). This decouples chunking dispatch from only those chunkers known at "compile" time and enables runtime registration of custom chunkers.
### Features
### Fixes

View File

@ -0,0 +1,93 @@
# pyright: reportPrivateUsage=false
"""Unit-test suite for the `unstructured.chunking.dispatch` module."""
from __future__ import annotations
from typing import Any, Iterable, Optional
import pytest
from unstructured.chunking import add_chunking_strategy, register_chunking_strategy
from unstructured.chunking.dispatch import _ChunkerSpec, chunk
from unstructured.documents.elements import CompositeElement, Element, Text
class Describe_add_chunking_strategy:
    """Behavior of the `unstructured.chunking.add_chunking_strategy()` decorator."""

    def it_dispatches_the_partitioned_elements_to_the_indicated_chunker(self):
        # -- the "basic" strategy is expected to merge the two fake elements into one chunk --
        partition = add_chunking_strategy(partition_this)

        assert partition(chunking_strategy="basic") == [
            CompositeElement("Lorem ipsum.\n\nSit amet.")
        ]

    def but_it_skips_dispatch_when_no_chunking_strategy_is_specified(self):
        # -- without a `chunking_strategy` argument the partitioner output comes back as-is --
        partition = add_chunking_strategy(partition_this)

        assert partition() == [Text("Lorem ipsum."), Text("Sit amet.")]
class Describe_chunk:
    """Behavior of the `unstructured.chunking.dispatch.chunk()` function."""

    def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
        register_chunking_strategy("by_something_else", chunk_by_something_else)
        elements = [Text("Lorem"), Text("Ipsum")]
        # -- `foo` is not a parameter of the chunker. Unused kwargs shouldn't cause a problem
        # -- because in general `kwargs` will contain all keyword arguments used in the
        # -- partitioning call.
        partitioner_kwargs = {"max_characters": 750, "foo": "bar"}

        chunks = chunk(elements, "by_something_else", **partitioner_kwargs)

        expected_text = "chunked 2 elements with `(max_characters=750, whizbang=None)`"
        assert chunks == [CompositeElement(expected_text)]

    def it_raises_when_the_requested_chunking_strategy_is_not_registered(self):
        with pytest.raises(ValueError, match="unrecognized chunking strategy 'foobar'"):
            chunk(elements=[], chunking_strategy="foobar")
class Describe_ChunkerSpec:
    """Behavior of `unstructured.chunking.dispatch._ChunkerSpec` objects."""

    def it_provides_access_to_the_chunking_function(self):
        assert _ChunkerSpec(chunk_by_something_else).chunker is chunk_by_something_else

    def it_knows_which_keyword_args_the_chunking_function_can_accept(self):
        # -- every parameter of the fake chunker other than `elements` is reported --
        kw_arg_names = _ChunkerSpec(chunk_by_something_else).kw_arg_names

        assert kw_arg_names == ("max_characters", "whizbang")
# -- MODULE-LEVEL FIXTURES -----------------------------------------------------------------------
def chunk_by_something_else(
    elements: Iterable[Element],
    max_characters: Optional[int] = None,
    whizbang: Optional[float] = None,
) -> list[Element]:
    """Fake chunker for tests that reports what it was called with.

    Returns a single `CompositeElement` whose text records the number of elements received and
    the values of the `max_characters` and `whizbang` arguments.
    """
    # -- consume the iterable once, just to count its items --
    element_count = sum(1 for _ in elements)
    args_repr = f"(max_characters={max_characters}, whizbang={whizbang})"
    return [CompositeElement(f"chunked {element_count} elements with `{args_repr}`")]
def partition_this(**kwargs: Any) -> list[Element]:
    """Fake partitioner; ignores its keyword arguments and returns two fixed `Text` elements."""
    texts = ("Lorem ipsum.", "Sit amet.")
    return [Text(text) for text in texts]

View File

@ -1 +1 @@
__version__ = "0.12.6-dev1" # pragma: no cover
__version__ = "0.12.6-dev2" # pragma: no cover

View File

@ -1,100 +1,22 @@
"""Chunking module initializer.
Provides the `@add_chunking_strategy()` decorator.
Publishes the public aspects of the chunking sub-package interface.
"""
from __future__ import annotations
import functools
import inspect
from typing import Any, Callable
from typing_extensions import ParamSpec
from unstructured.chunking.base import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Element
from unstructured.chunking.dispatch import (
Chunker,
add_chunking_strategy,
register_chunking_strategy,
)
# -- names published as the public interface of the chunking sub-package --
__all__ = ["CHUNK_MAX_CHARS_DEFAULT", "CHUNK_MULTI_PAGE_DEFAULT", "add_chunking_strategy"]
# -- parameter specification of the decorated partitioner, used in the `Callable` annotations
# -- of the decorator so the wrapper is typed with the same parameters as the function it wraps
_P = ParamSpec("_P")
def add_chunking_strategy() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
    """Build a decorator that chunks a partitioner's output on request.

    The returned decorator wraps a partitioning function such that, when the `chunking_strategy`
    argument of the call (explicit or defaulted) is "by_title" or "basic", the partitioned
    elements are passed through the corresponding chunker. Any other value, including `None`,
    leaves the partitioned elements unchanged.
    """
    # -- names of the partitioner arguments forwarded to each chunker --
    by_title_arg_names = (
        "combine_text_under_n_chars",
        "max_characters",
        "multipage_sections",
        "new_after_n_chars",
        "overlap",
        "overlap_all",
    )
    basic_arg_names = ("max_characters", "new_after_n_chars", "overlap", "overlap_all")

    def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
        # -- Append chunking-related argument documentation to the docstring of the decorated
        # -- function, but only when it has a docstring, `chunking_strategy` appears among its
        # -- code-object variable names, and the docstring does not mention it already.
        doc = func.__doc__
        if (
            doc
            and "chunking_strategy" in func.__code__.co_varnames
            and "chunking_strategy" not in doc
        ):
            func.__doc__ = doc + (
                "\nchunking_strategy"
                "\n\tStrategy used for chunking text into larger or smaller elements."
                "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'."
                "\n\tAdditional Parameters:"
                "\n\t\tmultipage_sections"
                "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
                "\n\t\tcombine_text_under_n_chars"
                "\n\t\t\tCombines elements (for example a series of titles) until a section"
                "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy."
                "\n\t\tnew_after_n_chars"
                "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max."
                "\n\t\tmax_characters"
                "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
                "\n\t\t\tof length n characters, a hard max."
            )

        @functools.wraps(func)
        def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
            """The decorated function is replaced with this one."""
            # -- partition first; chunking (if any) operates on the resulting elements --
            elements = func(*args, **kwargs)

            # -- map positional and keyword arguments of this call by parameter name, then fill
            # -- in the declared default of every parameter the caller did not supply --
            params = inspect.signature(func).parameters
            call_args: dict[str, Any] = dict(**dict(zip(params, args)), **kwargs)
            for name, param in params.items():
                if name in call_args or param.default is param.empty:
                    continue
                call_args[name] = param.default

            # -- run the chunker named by the chunking-strategy argument, when there is one --
            strategy = call_args.get("chunking_strategy")
            if strategy == "by_title":
                return chunk_by_title(
                    elements, **{name: call_args.get(name) for name in by_title_arg_names}
                )
            if strategy == "basic":
                return chunk_elements(
                    elements, **{name: call_args.get(name) for name in basic_arg_names}
                )
            return elements

        return wrapper

    return decorator
# -- Public interface of the chunking sub-package. These names are published here so callers
# -- need not depend on which sub-module defines them.
__all__ = [
    "CHUNK_MAX_CHARS_DEFAULT",
    "CHUNK_MULTI_PAGE_DEFAULT",
    "add_chunking_strategy",
    # -- these must be published to allow pluggable chunkers in other code-bases --
    "Chunker",
    "register_chunking_strategy",
]

View File

@ -25,6 +25,7 @@ from unstructured.documents.elements import Element
def chunk_elements(
elements: Iterable[Element],
*,
max_characters: Optional[int] = None,
new_after_n_chars: Optional[int] = None,
overlap: Optional[int] = None,

View File

@ -0,0 +1,138 @@
"""Handles dispatch of elements to a chunking-strategy by name.
Also provides the `@add_chunking_strategy` decorator which is the chief current user of "by-name"
chunking dispatch.
"""
from __future__ import annotations
import dataclasses as dc
import functools
import inspect
from typing import Any, Callable, Iterable, Optional, Protocol
from typing_extensions import ParamSpec
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Element
from unstructured.utils import lazyproperty
_P = ParamSpec("_P")
class Chunker(Protocol):
    """Structural type describing a callable that can serve as a chunking function."""

    def __call__(
        self,
        elements: Iterable[Element],
        *,
        max_characters: Optional[int],
    ) -> list[Element]:
        """Chunk `elements`, returning the resulting chunks as a list of elements.

        This is the minimum signature of a chunking function: it must have an `elements`
        parameter, and every chunker has a `max_characters` parameter (which need not directly
        follow `elements`). Any further parameters are chunker-specific.
        """
        ...
def add_chunking_strategy(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
    """Decorator that chunks a partitioner's output when the call names a chunking strategy.

    Chunks the element sequence produced by the partitioner it decorates when a
    `chunking_strategy` argument is present in the partitioner call (explicitly or as a declared
    default) and is not `None`. All other arguments of the call, with defaults applied, are
    offered to `chunk()`, which selects those the chosen chunker supports.

    Args:
        func: The partitioning function to decorate.

    Returns:
        A wrapper with the same parameters as `func` that returns either the partitioned
        elements (no chunking strategy) or the chunks produced from them.

    Raises:
        ValueError: (from `chunk()`, at call time) when the chunking strategy is not registered.
    """
    # -- The signature of `func` does not change between calls, so inspect it once at decoration
    # -- time rather than on every call of the wrapper.
    sig = inspect.signature(func)

    # -- Patch the docstring of the decorated function to add chunking strategy and
    # -- chunking-related argument documentation. This only applies when `chunking_strategy`
    # -- is an explicit argument of the decorated function and "chunking_strategy" is not
    # -- already mentioned in the docstring. The signature is consulted (rather than
    # -- `func.__code__.co_varnames`) because `co_varnames` also lists local variables and is
    # -- not available on every kind of callable.
    if (
        func.__doc__
        and "chunking_strategy" in sig.parameters
        and "chunking_strategy" not in func.__doc__
    ):
        func.__doc__ += (
            "\nchunking_strategy"
            + "\n\tStrategy used for chunking text into larger or smaller elements."
            + "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'."
            + "\n\tAdditional Parameters:"
            + "\n\t\tmultipage_sections"
            + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
            + "\n\t\tcombine_text_under_n_chars"
            + "\n\t\t\tCombines elements (for example a series of titles) until a section"
            + "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy."
            + "\n\t\tnew_after_n_chars"
            + "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max."
            + "\n\t\tmax_characters"
            + "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
            + "\n\t\t\tof length n characters, a hard max."
        )

    @functools.wraps(func)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
        """The decorated function is replaced with this one."""

        def get_call_args_applying_defaults() -> dict[str, Any]:
            """Map both explicit and default arguments of decorated func call by param name."""
            # -- A flat mapping is built by hand (rather than with `sig.bind()`) so keyword
            # -- arguments absorbed by a `**kwargs` parameter of the partitioner stay at the top
            # -- level, where they remain visible to the chunker as individual arguments.
            call_args: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
            for param in sig.parameters.values():
                if param.name not in call_args and param.default is not param.empty:
                    call_args[param.name] = param.default
            return call_args

        # -- call the partitioning function to get the elements --
        elements = func(*args, **kwargs)

        # -- look for a chunking-strategy argument; it is removed from the call arguments
        # -- because `chunk()` takes it as a separate parameter --
        call_args = get_call_args_applying_defaults()
        chunking_strategy = call_args.pop("chunking_strategy", None)

        # -- no chunking-strategy means no chunking --
        if chunking_strategy is None:
            return elements

        # -- otherwise, chunk away :) --
        return chunk(elements, chunking_strategy, **call_args)

    return wrapper
def chunk(elements: Iterable[Element], chunking_strategy: str, **kwargs: Any) -> list[Element]:
    """Chunk `elements` using the chunker registered under the name `chunking_strategy`.

    `kwargs` may contain arbitrary keyword arguments; only those the selected chunker declares
    are passed on to it and the rest are ignored.

    Raises:
        ValueError: when no chunker is registered for `chunking_strategy`.
    """
    spec = _chunker_registry.get(chunking_strategy)
    if spec is None:
        raise ValueError(f"unrecognized chunking strategy {chunking_strategy!r}")

    # -- `kwargs` will in general be an omnibus dict of all keyword arguments to the partitioner;
    # -- keep only those this chunker supports.
    supported_names = spec.kw_arg_names
    accepted_kwargs = {name: kwargs[name] for name in kwargs if name in supported_names}

    return spec.chunker(elements, **accepted_kwargs)
def register_chunking_strategy(name: str, chunker: Chunker) -> None:
    """Register `chunker` so `name` can be used as the `chunking_strategy` arg of a partitioner.

    An existing registration under the same `name` is replaced.
    """
    spec = _ChunkerSpec(chunker)
    _chunker_registry[name] = spec
@dc.dataclass(frozen=True)
class _ChunkerSpec:
    """A registry entry for a chunker."""

    chunker: Chunker
    """The `chunk_by_{x}()` function that implements this chunking strategy."""

    @lazyproperty
    def kw_arg_names(self) -> tuple[str, ...]:
        """Names of the keyword arguments supported by this chunker.

        These are all parameters of the chunking function that can be passed by keyword, other
        than the required `elements` first parameter. Catch-all (`*args`, `**kwargs`) and
        positional-only parameters are excluded because their names cannot be used as keyword
        argument names in a call.
        """
        sig = inspect.signature(self.chunker)
        passable_by_keyword = (
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            inspect.Parameter.KEYWORD_ONLY,
        )
        return tuple(
            name
            for name, param in sig.parameters.items()
            if name != "elements" and param.kind in passable_by_keyword
        )
# -- maps chunking-strategy name to the spec of the chunker that implements it; starts out with
# -- the two chunkers defined in this package and is extended by `register_chunking_strategy()`
_chunker_registry: dict[str, _ChunkerSpec] = dict(
    basic=_ChunkerSpec(chunk_elements),
    by_title=_ChunkerSpec(chunk_by_title),
)

View File

@ -26,7 +26,7 @@ DETECTION_ORIGIN: str = "csv"
@process_metadata()
@add_metadata_with_filetype(FileType.CSV)
@add_chunking_strategy()
@add_chunking_strategy
def partition_csv(
filename: Optional[str] = None,
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,

View File

@ -16,7 +16,7 @@ from unstructured.partition.docx import partition_docx
@process_metadata()
@add_metadata_with_filetype(FileType.DOC)
@add_chunking_strategy()
@add_chunking_strategy
def partition_doc(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -168,7 +168,7 @@ def convert_and_partition_docx(
@process_metadata()
@add_metadata_with_filetype(FileType.DOCX)
@add_chunking_strategy()
@add_chunking_strategy
def partition_docx(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -262,7 +262,7 @@ def parse_email(
@process_metadata()
@add_metadata_with_filetype(FileType.EML)
@add_chunking_strategy()
@add_chunking_strategy
def partition_email(
filename: Optional[str] = None,
file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,

View File

@ -11,7 +11,7 @@ DETECTION_ORIGIN: str = "epub"
@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
@add_chunking_strategy()
@add_chunking_strategy
def partition_epub(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -26,7 +26,7 @@ if TYPE_CHECKING:
@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
@add_chunking_strategy()
@add_chunking_strategy
def partition_html(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -13,7 +13,7 @@ from unstructured.partition.utils.constants import PartitionStrategy
@process_metadata()
@add_metadata
@add_chunking_strategy()
@add_chunking_strategy
def partition_image(
filename: str = "",
file: Optional[bytes] = None,

View File

@ -29,7 +29,7 @@ from unstructured.staging.base import dict_to_elements
@process_metadata()
@add_metadata_with_filetype(FileType.JSON)
@add_chunking_strategy()
@add_chunking_strategy
def partition_json(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -26,7 +26,7 @@ DETECTION_ORIGIN: str = "md"
@process_metadata()
@add_metadata_with_filetype(FileType.MD)
@add_chunking_strategy()
@add_chunking_strategy
def partition_md(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -17,7 +17,7 @@ from unstructured.partition.text import partition_text
@process_metadata()
@add_metadata_with_filetype(FileType.MSG)
@add_chunking_strategy()
@add_chunking_strategy
def partition_msg(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -12,7 +12,7 @@ from unstructured.partition.docx import convert_and_partition_docx
@process_metadata()
@add_metadata_with_filetype(FileType.ODT)
@add_chunking_strategy()
@add_chunking_strategy
def partition_odt(
filename: Optional[str] = None,
file: Optional[BinaryIO] = None,

View File

@ -9,7 +9,7 @@ DETECTION_ORIGIN: str = "org"
@add_metadata_with_filetype(FileType.ORG)
@add_chunking_strategy()
@add_chunking_strategy
def partition_org(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -132,7 +132,7 @@ def default_hi_res_model() -> str:
@process_metadata()
@add_metadata_with_filetype(FileType.PDF)
@add_chunking_strategy()
@add_chunking_strategy
def partition_pdf(
filename: str = "",
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,

View File

@ -16,7 +16,7 @@ from unstructured.partition.pptx import partition_pptx
@process_metadata()
@add_metadata_with_filetype(FileType.PPT)
@add_chunking_strategy()
@add_chunking_strategy
def partition_ppt(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -47,7 +47,7 @@ DETECTION_ORIGIN = "pptx"
@process_metadata()
@add_metadata_with_filetype(FileType.PPTX)
@add_chunking_strategy()
@add_chunking_strategy
def partition_pptx(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -10,7 +10,7 @@ DETECTION_ORIGIN: str = "rst"
@process_metadata()
@add_metadata_with_filetype(FileType.RST)
@add_chunking_strategy()
@add_chunking_strategy
def partition_rst(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -10,7 +10,7 @@ DETECTION_ORIGIN: str = "rtf"
@process_metadata()
@add_metadata_with_filetype(FileType.RTF)
@add_chunking_strategy()
@add_chunking_strategy
def partition_rtf(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -113,7 +113,7 @@ def partition_text(
@process_metadata()
@add_metadata_with_filetype(FileType.TXT)
@add_chunking_strategy()
@add_chunking_strategy
def _partition_text(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -42,7 +42,7 @@ DETECTION_ORIGIN: str = "xlsx"
@process_metadata()
@add_metadata_with_filetype(FileType.XLSX)
@add_chunking_strategy()
@add_chunking_strategy
def partition_xlsx(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,

View File

@ -81,7 +81,7 @@ def _get_leaf_elements(
@process_metadata()
@add_metadata_with_filetype(FileType.XML)
@add_chunking_strategy()
@add_chunking_strategy
def partition_xml(
filename: Optional[str] = None,
file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,