mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 19:13:13 +00:00
rfctr(chunking): extract chunking-strategy dispatch (#2545)
**Summary** This is the final step in adding pluggable chunking-strategies. It introduces the `chunk()` function to replace calls to strategy-specific chunkers in the `@add_chunking_strategy` decorator. The `chunk()` function then uses a mapping of chunking-strategy names (e.g. "by_title", "basic") to chunking functions (chunkers) to dispatch the chunking call. This allows other chunkers to be added at runtime rather than requiring a code change, which is what makes chunkers "pluggable". **Additional Information** - Move the `@add_chunking_strategy` decorator to the new `chunking.dispatch` module since it coheres strongly with that operation, but publish it from `chunking(.__init__)` (as it was before) so users don't couple to the way we organize the chunking sub-package. Also remove the third level of nesting as it isn't required in this case. - Add unit tests for the `@add_chunking_strategy` decorator, which was previously not covered by any direct test.
This commit is contained in:
parent
3ff6de4f50
commit
4096a38371
@ -1,7 +1,9 @@
|
||||
## 0.12.6-dev1
|
||||
|
||||
## 0.12.6-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Refactor `add_chunking_strategy` decorator to dispatch by name.** Add `chunk()` function to be used by the `add_chunking_strategy` decorator to dispatch the chunking call based on a chunking-strategy name (that can be dynamic at runtime). This decouples chunking dispatch from only those chunkers known at "compile" time and enables runtime registration of custom chunkers.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
93
test_unstructured/chunking/test_dispatch.py
Normal file
93
test_unstructured/chunking/test_dispatch.py
Normal file
@ -0,0 +1,93 @@
|
||||
# pyright: reportPrivateUsage=false
|
||||
|
||||
"""Unit-test suite for the `unstructured.chunking.dispatch` module."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy, register_chunking_strategy
|
||||
from unstructured.chunking.dispatch import _ChunkerSpec, chunk
|
||||
from unstructured.documents.elements import CompositeElement, Element, Text
|
||||
|
||||
|
||||
class Describe_add_chunking_strategy:
    """Behaviors of the `unstructured.chunking.add_chunking_strategy()` decorator."""

    def it_dispatches_the_partitioned_elements_to_the_indicated_chunker(self):
        # -- naming a strategy in the call sends the partitioner output through that chunker --
        partition = add_chunking_strategy(partition_this)
        expected_chunks = [CompositeElement("Lorem ipsum.\n\nSit amet.")]

        assert partition(chunking_strategy="basic") == expected_chunks

    def but_it_skips_dispatch_when_no_chunking_strategy_is_specified(self):
        # -- without a strategy the partitioner's own elements come back untouched --
        partition = add_chunking_strategy(partition_this)
        expected_elements = [Text("Lorem ipsum."), Text("Sit amet.")]

        assert partition() == expected_elements
|
||||
|
||||
|
||||
class Describe_chunk:
    """Unit-test suite for `unstructured.chunking.dispatch.chunk()` function."""

    def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
        # -- imported here so the clean-up below is self-contained; private access is allowed by
        # -- the `reportPrivateUsage=false` pyright directive at the top of this module.
        from unstructured.chunking.dispatch import _chunker_registry

        kwargs = {
            "max_characters": 750,
            # -- unused kwargs shouldn't cause a problem; in general `kwargs` will contain all
            # -- keyword arguments used in the partitioning call.
            "foo": "bar",
        }

        register_chunking_strategy("by_something_else", chunk_by_something_else)
        try:
            chunks = chunk([Text("Lorem"), Text("Ipsum")], "by_something_else", **kwargs)
        finally:
            # -- the registry is module-global state; drop the test-only entry so it cannot leak
            # -- into other tests.
            _chunker_registry.pop("by_something_else", None)

        assert chunks == [
            CompositeElement("chunked 2 elements with `(max_characters=750, whizbang=None)`")
        ]

    def it_raises_when_the_requested_chunking_strategy_is_not_registered(self):
        with pytest.raises(
            ValueError,
            match="unrecognized chunking strategy 'foobar'",
        ):
            chunk(elements=[], chunking_strategy="foobar")
|
||||
|
||||
|
||||
class Describe_ChunkerSpec:
    """Behaviors of `unstructured.chunking.dispatch._ChunkerSpec` registry entries."""

    def it_provides_access_to_the_chunking_function(self):
        # -- the very same function object passed on construction is exposed as `.chunker` --
        chunker_spec = _ChunkerSpec(chunk_by_something_else)

        assert chunker_spec.chunker is chunk_by_something_else

    def it_knows_which_keyword_args_the_chunking_function_can_accept(self):
        # -- `elements` is excluded; the remaining parameter names appear in signature order --
        expected_names = ("max_characters", "whizbang")

        assert _ChunkerSpec(chunk_by_something_else).kw_arg_names == expected_names
|
||||
|
||||
|
||||
# -- MODULE-LEVEL FIXTURES -----------------------------------------------------------------------
|
||||
|
||||
|
||||
def chunk_by_something_else(
    elements: Iterable[Element],
    max_characters: Optional[int] = None,
    whizbang: Optional[float] = None,
) -> list[Element]:
    """Fake chunker for tests; its single chunk reports the arguments it was called with."""
    element_count = len(list(elements))
    summary = (
        f"chunked {element_count} elements with"
        f" `(max_characters={max_characters}, whizbang={whizbang})`"
    )
    return [CompositeElement(summary)]
|
||||
|
||||
|
||||
def partition_this(**kwargs: Any) -> list[Element]:
    """Fake partitioner for tests; ignores its arguments and returns two fixed `Text` elements."""
    return [Text(text) for text in ("Lorem ipsum.", "Sit amet.")]
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.12.6-dev1" # pragma: no cover
|
||||
__version__ = "0.12.6-dev2" # pragma: no cover
|
||||
|
||||
@ -1,100 +1,22 @@
|
||||
"""Chunking module initializer.
|
||||
|
||||
Provides the the `@add_chunking_strategy()` decorator.
|
||||
Publishes the public aspects of the chunking sub-package interface.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
import inspect
|
||||
from typing import Any, Callable
|
||||
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
from unstructured.chunking.base import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT
|
||||
from unstructured.chunking.basic import chunk_elements
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.chunking.dispatch import (
|
||||
Chunker,
|
||||
add_chunking_strategy,
|
||||
register_chunking_strategy,
|
||||
)
|
||||
|
||||
__all__ = ["CHUNK_MAX_CHARS_DEFAULT", "CHUNK_MULTI_PAGE_DEFAULT", "add_chunking_strategy"]
|
||||
|
||||
_P = ParamSpec("_P")
|
||||
|
||||
|
||||
def add_chunking_strategy() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
|
||||
"""Decorator for chunking text.
|
||||
|
||||
Chunks the element sequence produced by the partitioner it decorates when a `chunking_strategy`
|
||||
argument is present in the partitioner call and it names an available chunking strategy.
|
||||
"""
|
||||
|
||||
def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
|
||||
# -- Patch the docstring of the decorated function to add chunking strategy and
|
||||
# -- chunking-related argument documentation. This only applies when `chunking_strategy`
|
||||
# -- is an explicit argument of the decorated function and "chunking_strategy" is not
|
||||
# -- already mentioned in the docstring.
|
||||
if func.__doc__ and (
|
||||
"chunking_strategy" in func.__code__.co_varnames
|
||||
and "chunking_strategy" not in func.__doc__
|
||||
):
|
||||
func.__doc__ += (
|
||||
"\nchunking_strategy"
|
||||
+ "\n\tStrategy used for chunking text into larger or smaller elements."
|
||||
+ "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'."
|
||||
+ "\n\tAdditional Parameters:"
|
||||
+ "\n\t\tmultipage_sections"
|
||||
+ "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
|
||||
+ "\n\t\tcombine_text_under_n_chars"
|
||||
+ "\n\t\t\tCombines elements (for example a series of titles) until a section"
|
||||
+ "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy."
|
||||
+ "\n\t\tnew_after_n_chars"
|
||||
+ "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max."
|
||||
+ "\n\t\tmax_characters"
|
||||
+ "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
|
||||
+ "\n\t\t\tof length n characters, a hard max."
|
||||
)
|
||||
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
|
||||
"""The decorated function is replaced with this one."""
|
||||
|
||||
def get_call_args_applying_defaults() -> dict[str, Any]:
|
||||
"""Map both explicit and default arguments of decorated func call by param name."""
|
||||
sig = inspect.signature(func)
|
||||
call_args: dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
|
||||
for param in sig.parameters.values():
|
||||
if param.name not in call_args and param.default is not param.empty:
|
||||
call_args[param.name] = param.default
|
||||
return call_args
|
||||
|
||||
# -- call the partitioning function to get the elements --
|
||||
elements = func(*args, **kwargs)
|
||||
|
||||
# -- look for a chunking-strategy argument and run the indicated chunker when present --
|
||||
call_args = get_call_args_applying_defaults()
|
||||
|
||||
if call_args.get("chunking_strategy") == "by_title":
|
||||
return chunk_by_title(
|
||||
elements,
|
||||
combine_text_under_n_chars=call_args.get("combine_text_under_n_chars"),
|
||||
max_characters=call_args.get("max_characters"),
|
||||
multipage_sections=call_args.get("multipage_sections"),
|
||||
new_after_n_chars=call_args.get("new_after_n_chars"),
|
||||
overlap=call_args.get("overlap"),
|
||||
overlap_all=call_args.get("overlap_all"),
|
||||
)
|
||||
|
||||
if call_args.get("chunking_strategy") == "basic":
|
||||
return chunk_elements(
|
||||
elements,
|
||||
max_characters=call_args.get("max_characters"),
|
||||
new_after_n_chars=call_args.get("new_after_n_chars"),
|
||||
overlap=call_args.get("overlap"),
|
||||
overlap_all=call_args.get("overlap_all"),
|
||||
)
|
||||
|
||||
return elements
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
__all__ = [
|
||||
"CHUNK_MAX_CHARS_DEFAULT",
|
||||
"CHUNK_MULTI_PAGE_DEFAULT",
|
||||
"add_chunking_strategy",
|
||||
# -- these must be published to allow pluggable chunkers in other code-bases --
|
||||
"Chunker",
|
||||
"register_chunking_strategy",
|
||||
]
|
||||
|
||||
@ -25,6 +25,7 @@ from unstructured.documents.elements import Element
|
||||
|
||||
def chunk_elements(
|
||||
elements: Iterable[Element],
|
||||
*,
|
||||
max_characters: Optional[int] = None,
|
||||
new_after_n_chars: Optional[int] = None,
|
||||
overlap: Optional[int] = None,
|
||||
|
||||
138
unstructured/chunking/dispatch.py
Normal file
138
unstructured/chunking/dispatch.py
Normal file
@ -0,0 +1,138 @@
|
||||
"""Handles dispatch of elements to a chunking-strategy by name.
|
||||
|
||||
Also provides the `@add_chunking_strategy` decorator which is the chief current user of "by-name"
|
||||
chunking dispatch.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses as dc
|
||||
import functools
|
||||
import inspect
|
||||
from typing import Any, Callable, Iterable, Optional, Protocol
|
||||
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
from unstructured.chunking.basic import chunk_elements
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.utils import lazyproperty
|
||||
|
||||
_P = ParamSpec("_P")
|
||||
|
||||
|
||||
class Chunker(Protocol):
    """Structural type describing a chunking function."""

    def __call__(
        self,
        elements: Iterable[Element],
        *,
        max_characters: Optional[int],
    ) -> list[Element]:
        """The call signature every chunking function must be compatible with.

        A chunker takes the `elements` to be chunked and has a `max_characters` parameter, which
        need not come directly after `elements`. Any further parameters are specific to the
        individual chunker.
        """
        ...
|
||||
|
||||
|
||||
def add_chunking_strategy(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
    """Decorator that optionally chunks the elements produced by a partitioner.

    The wrapped partitioner is always called first. When the call arguments (explicit ones, or
    defaults taken from the partitioner's signature) include a `chunking_strategy` other than
    `None`, the partitioned elements are passed to `chunk()` along with the remaining call
    arguments and the resulting chunks are returned. Otherwise the partitioner's elements are
    returned unchanged.

    As a side effect of decoration, chunking-related parameter documentation is appended to the
    docstring of `func` when `chunking_strategy` is one of its parameters and is not already
    mentioned in that docstring.
    """
    # -- Inspect the signature once, at decoration time, rather than on every call. --
    sig = inspect.signature(func)

    # -- Patch the docstring of the decorated function to add chunking strategy and
    # -- chunking-related argument documentation. This only applies when `chunking_strategy`
    # -- is an explicit argument of the decorated function and "chunking_strategy" is not
    # -- already mentioned in the docstring. The signature is consulted (not `co_varnames`)
    # -- because `co_varnames` also lists local variables of the function body.
    if (
        func.__doc__
        and "chunking_strategy" in sig.parameters
        and "chunking_strategy" not in func.__doc__
    ):
        func.__doc__ += (
            "\nchunking_strategy"
            + "\n\tStrategy used for chunking text into larger or smaller elements."
            + "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'."
            + "\n\tAdditional Parameters:"
            + "\n\t\tmultipage_sections"
            + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
            + "\n\t\tcombine_text_under_n_chars"
            + "\n\t\t\tCombines elements (for example a series of titles) until a section"
            + "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy."
            + "\n\t\tnew_after_n_chars"
            + "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max."
            + "\n\t\tmax_characters"
            + "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
            + "\n\t\t\tof length n characters, a hard max."
        )

    @functools.wraps(func)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
        """The decorated function is replaced with this one."""

        def get_call_args_applying_defaults() -> dict[str, Any]:
            """Map both explicit and default arguments of decorated func call by param name."""
            # -- positional args are matched to parameter names in signature order; keyword
            # -- args are merged in as given, so extras accepted via `**kwargs` are kept too.
            call_args: dict[str, Any] = {**dict(zip(sig.parameters, args)), **kwargs}
            for param in sig.parameters.values():
                if param.name not in call_args and param.default is not param.empty:
                    call_args[param.name] = param.default
            return call_args

        # -- call the partitioning function to get the elements --
        elements = func(*args, **kwargs)

        # -- look for a chunking-strategy argument --
        call_args = get_call_args_applying_defaults()
        chunking_strategy = call_args.pop("chunking_strategy", None)

        # -- no chunking-strategy means no chunking --
        if chunking_strategy is None:
            return elements

        # -- otherwise, chunk away :) --
        return chunk(elements, chunking_strategy, **call_args)

    return wrapper
|
||||
|
||||
|
||||
def chunk(elements: Iterable[Element], chunking_strategy: str, **kwargs: Any) -> list[Element]:
    """Chunk `elements` using the chunker registered under the name `chunking_strategy`.

    `kwargs` may contain any keyword arguments; only those whose names the selected chunker
    supports are forwarded to it, the rest are ignored.

    Raises `ValueError` when no chunker is registered under `chunking_strategy`.
    """
    if chunking_strategy not in _chunker_registry:
        raise ValueError(f"unrecognized chunking strategy {repr(chunking_strategy)}")

    spec = _chunker_registry[chunking_strategy]

    # -- `kwargs` is in general the full set of keyword arguments given to the partitioner, so
    # -- narrow it down to the ones this particular chunker supports.
    supported_names = spec.kw_arg_names
    accepted_kwargs = {
        name: value for name, value in kwargs.items() if name in supported_names
    }

    return spec.chunker(elements, **accepted_kwargs)
|
||||
|
||||
|
||||
def register_chunking_strategy(name: str, chunker: Chunker) -> None:
    """Register `chunker` so `name` can be used as `chunking_strategy` in a partitioner call.

    Any chunker already registered under `name` is replaced.
    """
    spec = _ChunkerSpec(chunker)
    _chunker_registry[name] = spec
|
||||
|
||||
|
||||
@dc.dataclass(frozen=True)
class _ChunkerSpec:
    """A registry entry for a chunker."""

    chunker: Chunker
    """The "chunk_by_{x}()" function that implements this chunking strategy."""

    @lazyproperty
    def kw_arg_names(self) -> tuple[str, ...]:
        """Names of the parameters this chunker can be passed as keyword arguments.

        These are, in signature order, all parameters other than `elements` that can be supplied
        by keyword. Positional-only parameters and the `*args` / `**kwargs` catch-alls are
        excluded because their names cannot be used as keywords in a call.
        """
        keyword_capable_kinds = (
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            inspect.Parameter.KEYWORD_ONLY,
        )
        sig = inspect.signature(self.chunker)
        return tuple(
            name
            for name, param in sig.parameters.items()
            if name != "elements" and param.kind in keyword_capable_kinds
        )
|
||||
|
||||
|
||||
# -- Chunkers keyed by chunking-strategy name. These two are available from import time; more
# -- can be added at runtime with `register_chunking_strategy()`.
_chunker_registry: dict[str, _ChunkerSpec] = dict(
    basic=_ChunkerSpec(chunk_elements),
    by_title=_ChunkerSpec(chunk_by_title),
)
|
||||
@ -26,7 +26,7 @@ DETECTION_ORIGIN: str = "csv"
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.CSV)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_csv(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
|
||||
|
||||
@ -16,7 +16,7 @@ from unstructured.partition.docx import partition_docx
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.DOC)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_doc(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -168,7 +168,7 @@ def convert_and_partition_docx(
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.DOCX)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_docx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -262,7 +262,7 @@ def parse_email(
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.EML)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_email(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,
|
||||
|
||||
@ -11,7 +11,7 @@ DETECTION_ORIGIN: str = "epub"
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.EPUB)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_epub(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -26,7 +26,7 @@ if TYPE_CHECKING:
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.HTML)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_html(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -13,7 +13,7 @@ from unstructured.partition.utils.constants import PartitionStrategy
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_image(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
|
||||
@ -29,7 +29,7 @@ from unstructured.staging.base import dict_to_elements
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.JSON)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_json(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -26,7 +26,7 @@ DETECTION_ORIGIN: str = "md"
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.MD)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_md(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -17,7 +17,7 @@ from unstructured.partition.text import partition_text
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.MSG)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_msg(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -12,7 +12,7 @@ from unstructured.partition.docx import convert_and_partition_docx
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.ODT)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_odt(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[BinaryIO] = None,
|
||||
|
||||
@ -9,7 +9,7 @@ DETECTION_ORIGIN: str = "org"
|
||||
|
||||
|
||||
@add_metadata_with_filetype(FileType.ORG)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_org(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -132,7 +132,7 @@ def default_hi_res_model() -> str:
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PDF)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_pdf(
|
||||
filename: str = "",
|
||||
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
|
||||
|
||||
@ -16,7 +16,7 @@ from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PPT)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_ppt(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -47,7 +47,7 @@ DETECTION_ORIGIN = "pptx"
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PPTX)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_pptx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -10,7 +10,7 @@ DETECTION_ORIGIN: str = "rst"
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.RST)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_rst(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -10,7 +10,7 @@ DETECTION_ORIGIN: str = "rtf"
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.RTF)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_rtf(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -113,7 +113,7 @@ def partition_text(
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.TXT)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def _partition_text(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -42,7 +42,7 @@ DETECTION_ORIGIN: str = "xlsx"
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.XLSX)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_xlsx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
|
||||
@ -81,7 +81,7 @@ def _get_leaf_elements(
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.XML)
|
||||
@add_chunking_strategy()
|
||||
@add_chunking_strategy
|
||||
def partition_xml(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[Union[IO[bytes], SpooledTemporaryFile[bytes]]] = None,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user