rfctr(chunking): extract BasePreChunker (#2294)

The `_split_elements_by_title_and_table()` function fulfills the
pre-chunker role for `chunk_by_title()`, but most of its operation is
not strategy-specific and can be reused by other chunking strategies.

Extract `BasePreChunker` and use it as the base class for
`_ByTitlePreChunker` which now only needs to provide the boundary
predicates specific to that strategy.
This commit is contained in:
Steve Canny 2023-12-19 22:30:21 -08:00 committed by GitHub
parent fd293b3e78
commit 82714cad98
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 171 additions and 66 deletions

View File

@ -9,6 +9,7 @@ from typing import List
import pytest
from unstructured.chunking.base import (
BasePreChunker,
ChunkingOptions,
PreChunkBuilder,
PreChunkCombiner,
@ -20,6 +21,7 @@ from unstructured.chunking.base import (
is_title,
)
from unstructured.documents.elements import (
CheckBox,
CompositeElement,
Element,
ElementMetadata,
@ -31,6 +33,10 @@ from unstructured.documents.elements import (
Title,
)
# ================================================================================================
# CHUNKING OPTIONS
# ================================================================================================
class DescribeChunkingOptions:
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions` objects."""
@ -138,13 +144,64 @@ class DescribeChunkingOptions:
assert ChunkingOptions.new().text_separator == "\n\n"
# ================================================================================================
# BASE PRE-CHUNKER
# ================================================================================================
class DescribeBasePreChunker:
    """Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects."""

    def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
        """Pre-chunks break at the soft/hard size limits when no boundary predicates apply."""
        lorem = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
        sed_do = "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."
        ut_enim = "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi."
        ut_aliquip = "Ut aliquip ex ea commodo consequat."

        pre_chunk_iter = BasePreChunker.iter_pre_chunks(
            [
                Title("Lorem Ipsum"),
                Text(lorem),
                Text(sed_do),
                Title("Ut Enim"),
                Text(ut_enim),
                Text(ut_aliquip),
                CheckBox(),
            ],
            opts=ChunkingOptions.new(max_characters=150, new_after_n_chars=65),
        )

        # -- one entry per expected pre-chunk, in the order they must be produced --
        expected_element_groups = [
            [Title("Lorem Ipsum"), Text(lorem)],
            [Text(sed_do)],
            [Title("Ut Enim"), Text(ut_enim)],
            [Text(ut_aliquip), CheckBox()],
        ]

        for expected_elements in expected_element_groups:
            pre_chunk = next(pre_chunk_iter)
            assert isinstance(pre_chunk, TextPreChunk)
            assert pre_chunk._elements == expected_elements

        # -- no further pre-chunks are produced after the last expected one --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
# ================================================================================================
# PRE-CHUNK SUBTYPES
# ================================================================================================
class DescribeTablePreChunk:
"""Unit-test suite for `unstructured.chunking.base.TablePreChunk objects."""
"""Unit-test suite for `unstructured.chunking.base.TablePreChunk` objects."""
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
html_table = (
@ -260,7 +317,7 @@ class DescribeTablePreChunk:
class DescribeTextPreChunk:
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk objects."""
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
def it_can_combine_itself_with_another_TextPreChunk_instance(self):
""".combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.

View File

@ -5,7 +5,7 @@ from typing import List
import pytest
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title
from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
CheckBox,
@ -55,7 +55,7 @@ def test_split_elements_by_title_and_table():
CheckBox(),
]
pre_chunks = _split_elements_by_title_and_table(elements, opts=ChunkingOptions.new())
pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new())
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk)

View File

@ -4,7 +4,19 @@ from __future__ import annotations
import collections
import copy
from typing import Any, Callable, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
from typing import (
Any,
Callable,
DefaultDict,
Dict,
Iterable,
Iterator,
List,
Optional,
Sequence,
Tuple,
cast,
)
from typing_extensions import Self, TypeAlias
@ -27,6 +39,11 @@ PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
"""The kind of object produced by a pre-chunker."""
# ================================================================================================
# CHUNKING OPTIONS
# ================================================================================================
class ChunkingOptions:
"""Specifies parameters of optional chunking behaviors."""
@ -170,6 +187,81 @@ class ChunkingOptions:
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
# ================================================================================================
# BASE PRE-CHUNKER
# ================================================================================================
class BasePreChunker:
    """Base-class for per-strategy pre-chunkers.

    A pre-chunker partitions a sequence of elements into *pre-chunks*. Its responsibilities are:

    - **Segregate semantic units.** Identify semantic-unit boundaries and place the elements on
      either side of a boundary into different pre-chunks. What counts as a boundary is
      strategy-specific; a subclass declares it by overriding `_boundary_predicates`. This base
      class declares no predicates, so used directly it breaks pre-chunks on size alone.

    - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
      into pre-chunks as big as possible without exceeding the chunk window size.

    - **Minimize chunks that must be split mid-text.** Only produce a pre-chunk that exceeds the
      chunk window size when there is a single element with text longer than that window.

    This class does not filter or special-case elements by type: every element in the input is
    handed to the `PreChunkBuilder`, which decides whether the element fits in the pre-chunk
    being built and what is emitted when that pre-chunk is flushed.
    """

    def __init__(self, elements: Sequence[Element], opts: ChunkingOptions):
        self._elements = elements
        self._opts = opts

    @classmethod
    def iter_pre_chunks(
        cls, elements: Sequence[Element], opts: ChunkingOptions
    ) -> Iterator[PreChunk]:
        """Generate pre-chunks from `elements` as governed by `opts`.

        Convenience entry point; constructs an instance of `cls` around the arguments and
        returns its pre-chunk iterator, so subclasses inherit this interface unchanged.
        """
        return cls(elements, opts)._iter_pre_chunks()

    def _iter_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunks from the element-stream provided on construction.

        A *pre-chunk* is the largest sub-sequence of elements that will both fit within the
        chunking window and respects the semantic boundary rules of the chunking strategy. When a
        single element exceeds the chunking window size it is placed in a pre-chunk by itself and
        is subject to mid-text splitting in the second phase of the chunking process.
        """
        pre_chunk_builder = PreChunkBuilder(self._opts)

        for element in self._elements:
            # -- start new pre-chunk when necessary; the boundary check is evaluated first so
            # -- the boundary predicates see every element, whether or not it would fit --
            if self._is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element):
                yield from pre_chunk_builder.flush()

            # -- add this element to the work-in-progress (WIP) pre-chunk --
            pre_chunk_builder.add_element(element)

        # -- flush "tail" pre-chunk, any partially-filled pre-chunk after last element is
        # -- processed
        yield from pre_chunk_builder.flush()

    @lazyproperty
    def _boundary_predicates(self) -> Tuple[BoundaryPredicate, ...]:
        """The semantic-boundary detectors to be applied to break pre-chunks.

        Empty in this base class; a strategy-specific subclass overrides this to supply the
        predicates that define its semantic boundaries.
        """
        return ()

    def _is_in_new_semantic_unit(self, element: Element) -> bool:
        """True when `element` begins a new semantic unit such as a section or page."""
        # -- all detectors need to be called to update state and avoid double counting
        # -- boundaries that happen to coincide, like Table and new section on same element.
        # -- Using `any()` directly on a generator would short-circuit on first True, so the
        # -- results are fully materialized into a list first.
        semantic_boundaries = [pred(element) for pred in self._boundary_predicates]
        return any(semantic_boundaries)
# ================================================================================================
# PRE-CHUNK SUB-TYPES
# ================================================================================================

View File

@ -5,19 +5,19 @@ Main entry point is the `@add_chunking_strategy()` decorator.
from __future__ import annotations
from typing import Iterator, List, Optional
from typing import Iterator, List, Optional, Tuple
from unstructured.chunking.base import (
BasePreChunker,
BoundaryPredicate,
ChunkingOptions,
PreChunk,
PreChunkBuilder,
PreChunkCombiner,
is_in_next_section,
is_on_next_page,
is_title,
)
from unstructured.documents.elements import Element
from unstructured.utils import lazyproperty
def chunk_by_title(
@ -63,71 +63,27 @@ def chunk_by_title(
)
pre_chunks = PreChunkCombiner(
_split_elements_by_title_and_table(elements, opts), opts=opts
_ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts
).iter_combined_pre_chunks()
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]
def _split_elements_by_title_and_table(
elements: List[Element], opts: ChunkingOptions
) -> Iterator[PreChunk]:
"""Implements "pre-chunker" responsibilities.
class _ByTitlePreChunker(BasePreChunker):
"""Pre-chunker for the "by_title" chunking strategy.
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
chunk formed by the subsequent "chunker" process. The only exception occurs when a single
element is too big to fit in the chunk window and the chunker splits it into two or more chunks
divided mid-text. The pre-chunker never divides an element mid-text.
The pre-chunker's responsibilities are:
* **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
either side of those boundaries into different pre-chunks. In this case, the primary
indicator of a semantic boundary is a `Title` element. A page-break (change in
page-number) is also a semantic boundary when `multipage_sections` is `False`.
* **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
into pre-chunks as big as possible without exceeding the chunk window size.
* **Minimize chunks that must be split mid-text.** Precompute the text length of each
pre-chunk and only produce a pre-chunk that exceeds the chunk window size when there is a
single element with text longer than that window.
A Table or Checkbox element is placed into a pre-chunk by itself.
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a
new "section", hence the "by-title" designation.
"""
# ========================================================================================
@lazyproperty
def _boundary_predicates(self) -> Tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks."""
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
yield is_title
yield is_in_next_section()
if not opts.multipage_sections:
yield is_on_next_page()
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
yield is_title
yield is_in_next_section()
if not self._opts.multipage_sections:
yield is_on_next_page()
# -- the semantic-boundary detectors to be applied to break pre-chunks --
boundary_predicates = tuple(iter_boundary_predicates())
def is_in_new_semantic_unit(element: Element) -> bool:
"""True when `element` begins a new semantic unit such as a section or page."""
# -- all detectors need to be called to update state and avoid double counting
# -- boundaries that happen to coincide, like Table and new section on same element.
# -- Using `any()` would short-circuit on first True.
semantic_boundaries = [pred(element) for pred in boundary_predicates]
return any(semantic_boundaries)
# ----------------------------------------------------------------------------------------
# -- these bits ^^^ will get migrated to `BasePreChunker` helper methods in the next PR --
# ========================================================================================
pre_chunk_builder = PreChunkBuilder(opts)
for element in elements:
# -- start new pre_chunk when necessary --
if is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element):
yield from pre_chunk_builder.flush()
# -- add this element to the work-in-progress (WIP) pre-chunk --
pre_chunk_builder.add_element(element)
# -- flush "tail" pre_chunk, any partially-filled pre_chunk after last element is processed --
yield from pre_chunk_builder.flush()
return tuple(iter_boundary_predicates())