mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 22:23:24 +00:00
rfctr(chunking): extract BasePreChunker (#2294)
The `_split_elements_by_title_and_table()` function fulfills the pre-chunker role for `chunk_by_title()`, but most of its operation is not strategy-specific and can be reused by other chunking strategies. Extract `BasePreChunker` and use it as the base class for `_ByTitlePreChunker` which now only needs to provide the boundary predicates specific to that strategy.
This commit is contained in:
parent
fd293b3e78
commit
82714cad98
@ -9,6 +9,7 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from unstructured.chunking.base import (
|
||||
BasePreChunker,
|
||||
ChunkingOptions,
|
||||
PreChunkBuilder,
|
||||
PreChunkCombiner,
|
||||
@ -20,6 +21,7 @@ from unstructured.chunking.base import (
|
||||
is_title,
|
||||
)
|
||||
from unstructured.documents.elements import (
|
||||
CheckBox,
|
||||
CompositeElement,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
@ -31,6 +33,10 @@ from unstructured.documents.elements import (
|
||||
Title,
|
||||
)
|
||||
|
||||
# ================================================================================================
|
||||
# CHUNKING OPTIONS
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class DescribeChunkingOptions:
|
||||
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
|
||||
@ -138,13 +144,64 @@ class DescribeChunkingOptions:
|
||||
assert ChunkingOptions.new().text_separator == "\n\n"
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# BASE PRE-CHUNKER
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class DescribeBasePreChunker:
    """Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects."""

    def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
        # -- the element-stream fed to the pre-chunker --
        elements = [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet, consectetur adipiscing elit."),
            Text("Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."),
            Title("Ut Enim"),
            Text("Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi."),
            Text("Ut aliquip ex ea commodo consequat."),
            CheckBox(),
        ]
        # -- the elements each successive pre-chunk is expected to contain; built from fresh
        # -- instances so the comparison below is by value, not by identity --
        expected_element_groups = [
            [
                Title("Lorem Ipsum"),
                Text("Lorem ipsum dolor sit amet, consectetur adipiscing elit."),
            ],
            [Text("Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.")],
            [
                Title("Ut Enim"),
                Text("Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi."),
            ],
            [Text("Ut aliquip ex ea commodo consequat."), CheckBox()],
        ]

        pre_chunks = BasePreChunker.iter_pre_chunks(
            elements, opts=ChunkingOptions.new(max_characters=150, new_after_n_chars=65)
        )

        # -- pre-chunks are pulled one at a time so a failure points at the offending one --
        for expected_elements in expected_element_groups:
            pre_chunk = next(pre_chunks)
            assert isinstance(pre_chunk, TextPreChunk)
            assert pre_chunk._elements == expected_elements

        # -- and no pre-chunks remain after the expected ones --
        with pytest.raises(StopIteration):
            next(pre_chunks)
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# PRE-CHUNK SUBTYPES
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class DescribeTablePreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.base.TablePreChunk objects."""
|
||||
"""Unit-test suite for `unstructured.chunking.base.TablePreChunk` objects."""
|
||||
|
||||
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
|
||||
html_table = (
|
||||
@ -260,7 +317,7 @@ class DescribeTablePreChunk:
|
||||
|
||||
|
||||
class DescribeTextPreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk objects."""
|
||||
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
|
||||
|
||||
def it_can_combine_itself_with_another_TextPreChunk_instance(self):
|
||||
""".combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
|
||||
|
||||
@ -5,7 +5,7 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
|
||||
from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title
|
||||
from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title
|
||||
from unstructured.documents.coordinates import CoordinateSystem
|
||||
from unstructured.documents.elements import (
|
||||
CheckBox,
|
||||
@ -55,7 +55,7 @@ def test_split_elements_by_title_and_table():
|
||||
CheckBox(),
|
||||
]
|
||||
|
||||
pre_chunks = _split_elements_by_title_and_table(elements, opts=ChunkingOptions.new())
|
||||
pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new())
|
||||
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
|
||||
@ -4,7 +4,19 @@ from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import copy
|
||||
from typing import Any, Callable, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
DefaultDict,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
cast,
|
||||
)
|
||||
|
||||
from typing_extensions import Self, TypeAlias
|
||||
|
||||
@ -27,6 +39,11 @@ PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
|
||||
"""The kind of object produced by a pre-chunker."""
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# CHUNKING OPTIONS
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class ChunkingOptions:
|
||||
"""Specifies parameters of optional chunking behaviors."""
|
||||
|
||||
@ -170,6 +187,81 @@ class ChunkingOptions:
|
||||
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# BASE PRE-CHUNKER
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class BasePreChunker:
    """Base-class for per-strategy pre-chunkers.

    A pre-chunker groups the elements of an element-stream into *pre-chunks*. Its
    responsibilities are:

    - **Segregate semantic units.** Identify semantic-unit boundaries and segregate elements on
      either side of those boundaries into different pre-chunks. Which elements start a new
      semantic unit is decided by the predicates in `._boundary_predicates`. This base class
      defines none, so on its own it breaks pre-chunks on size only. A strategy-specific
      subclass overrides `._boundary_predicates` to add its own boundaries; for example, the
      "by-title" strategy breaks on a `Title` element (a new "section") and, when
      `multipage_sections` is `False`, on a change of page.

    - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
      into pre-chunks as big as possible without exceeding the chunk window size.

    - **Minimize chunks that must be split mid-text.** Only produce a pre-chunk that exceeds the
      chunk window size when there is a single element with text longer than that window.

    This class contains no element-type specific logic and does not filter elements: every
    element in the stream is handed to a `PreChunkBuilder`, which decides whether the element
    fits in the pre-chunk being built and produces the pre-chunks when flushed.

    Args:
        elements: The document elements to be grouped, in document order.
        opts: The chunking options passed to the `PreChunkBuilder` and available to subclasses
            as `self._opts`.
    """

    def __init__(self, elements: Sequence[Element], opts: ChunkingOptions):
        self._elements = elements
        self._opts = opts

    @classmethod
    def iter_pre_chunks(
        cls, elements: Sequence[Element], opts: ChunkingOptions
    ) -> Iterator[PreChunk]:
        """Generate pre-chunks from the element-stream `elements`.

        This is the public entry point. It constructs a pre-chunker of the class it is called on
        (so a subclass's boundary predicates apply) and returns that instance's pre-chunk
        iterator.
        """
        return cls(elements, opts)._iter_pre_chunks()

    def _iter_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunks from the element-stream provided on construction.

        A *pre-chunk* is the largest sub-sequence of elements that will both fit within the
        chunking window and respects the semantic boundary rules of the chunking strategy. When a
        single element exceeds the chunking window size it is placed in a pre-chunk by itself and
        is subject to mid-text splitting in the second phase of the chunking process.
        """
        pre_chunk_builder = PreChunkBuilder(self._opts)

        for element in self._elements:
            # -- start new pre-chunk when necessary; the boundary check comes first so the
            # -- boundary predicates are evaluated for every element, whether it fits or not --
            if self._is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element):
                yield from pre_chunk_builder.flush()

            # -- add this element to the work-in-progress (WIP) pre-chunk --
            pre_chunk_builder.add_element(element)

        # -- flush "tail" pre-chunk, any partially-filled pre-chunk after last element is
        # -- processed
        yield from pre_chunk_builder.flush()

    @lazyproperty
    def _boundary_predicates(self) -> Tuple[BoundaryPredicate, ...]:
        """The semantic-boundary detectors to be applied to break pre-chunks.

        Empty in this base class; a subclass overrides this to supply the predicates specific to
        its chunking strategy.
        """
        return ()

    def _is_in_new_semantic_unit(self, element: Element) -> bool:
        """True when `element` begins a new semantic unit such as a section or page.

        Always False when there are no boundary predicates.
        """
        # -- all detectors need to be called to update state and avoid double counting
        # -- boundaries that happen to coincide, like Table and new section on same element.
        # -- Passing a generator to `any()` would short-circuit on first True, so the results
        # -- are collected into a list first and only then reduced with `any()`.
        semantic_boundaries = [pred(element) for pred in self._boundary_predicates]
        return any(semantic_boundaries)
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# PRE-CHUNK SUB-TYPES
|
||||
# ================================================================================================
|
||||
|
||||
@ -5,19 +5,19 @@ Main entry point is the `@add_chunking_strategy()` decorator.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterator, List, Optional
|
||||
from typing import Iterator, List, Optional, Tuple
|
||||
|
||||
from unstructured.chunking.base import (
|
||||
BasePreChunker,
|
||||
BoundaryPredicate,
|
||||
ChunkingOptions,
|
||||
PreChunk,
|
||||
PreChunkBuilder,
|
||||
PreChunkCombiner,
|
||||
is_in_next_section,
|
||||
is_on_next_page,
|
||||
is_title,
|
||||
)
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.utils import lazyproperty
|
||||
|
||||
|
||||
def chunk_by_title(
|
||||
@ -63,71 +63,27 @@ def chunk_by_title(
|
||||
)
|
||||
|
||||
pre_chunks = PreChunkCombiner(
|
||||
_split_elements_by_title_and_table(elements, opts), opts=opts
|
||||
_ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]
|
||||
|
||||
|
||||
def _split_elements_by_title_and_table(
|
||||
elements: List[Element], opts: ChunkingOptions
|
||||
) -> Iterator[PreChunk]:
|
||||
"""Implements "pre-chunker" responsibilities.
|
||||
class _ByTitlePreChunker(BasePreChunker):
|
||||
"""Pre-chunker for the "by_title" chunking strategy.
|
||||
|
||||
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
|
||||
chunk formed by the subsequent "chunker" process. The only exception occurs when a single
|
||||
element is too big to fit in the chunk window and the chunker splits it into two or more chunks
|
||||
divided mid-text. The pre-chunker never divides an element mid-text.
|
||||
|
||||
The pre-chunker's responsibilities are:
|
||||
|
||||
* **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
|
||||
either side of those boundaries into different pre-chunks. In this case, the primary
|
||||
indicator of a semantic boundary is a `Title` element. A page-break (change in
|
||||
page-number) is also a semantic boundary when `multipage_sections` is `False`.
|
||||
|
||||
* **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
|
||||
into pre-chunks as big as possible without exceeding the chunk window size.
|
||||
|
||||
* **Minimize chunks that must be split mid-text.** Precompute the text length of each
|
||||
pre-chunk and only produce a pre-chunk that exceeds the chunk window size when there is a
|
||||
single element with text longer than that window.
|
||||
|
||||
A Table or Checkbox element is placed into a pre-chunk by itself.
|
||||
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a
|
||||
new "section", hence the "by-title" designation.
|
||||
"""
|
||||
|
||||
# ========================================================================================
|
||||
@lazyproperty
|
||||
def _boundary_predicates(self) -> Tuple[BoundaryPredicate, ...]:
|
||||
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
||||
|
||||
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
|
||||
yield is_title
|
||||
yield is_in_next_section()
|
||||
if not opts.multipage_sections:
|
||||
yield is_on_next_page()
|
||||
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
|
||||
yield is_title
|
||||
yield is_in_next_section()
|
||||
if not self._opts.multipage_sections:
|
||||
yield is_on_next_page()
|
||||
|
||||
# -- the semantic-boundary detectors to be applied to break pre-chunks --
|
||||
boundary_predicates = tuple(iter_boundary_predicates())
|
||||
|
||||
def is_in_new_semantic_unit(element: Element) -> bool:
|
||||
"""True when `element` begins a new semantic unit such as a section or page."""
|
||||
# -- all detectors need to be called to update state and avoid double counting
|
||||
# -- boundaries that happen to coincide, like Table and new section on same element.
|
||||
# -- Using `any()` would short-circuit on first True.
|
||||
semantic_boundaries = [pred(element) for pred in boundary_predicates]
|
||||
return any(semantic_boundaries)
|
||||
|
||||
# ----------------------------------------------------------------------------------------
|
||||
# -- these bits ^^^ will get migrated to `BasePreChunker` helper methods in the next PR --
|
||||
# ========================================================================================
|
||||
|
||||
pre_chunk_builder = PreChunkBuilder(opts)
|
||||
|
||||
for element in elements:
|
||||
# -- start new pre_chunk when necessary --
|
||||
if is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element):
|
||||
yield from pre_chunk_builder.flush()
|
||||
|
||||
# -- add this element to the work-in-progress (WIP) pre-chunk --
|
||||
pre_chunk_builder.add_element(element)
|
||||
|
||||
# -- flush "tail" pre_chunk, any partially-filled pre_chunk after last element is processed --
|
||||
yield from pre_chunk_builder.flush()
|
||||
return tuple(iter_boundary_predicates())
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user