rfctr(chunking): extract BasePreChunker (#2294)

The `_split_elements_by_title_and_table()` function fulfills the pre-chunker role for `chunk_by_title()`, but most of its operation is not strategy-specific and can be reused by other chunking strategies. Extract `BasePreChunker` and use it as the base class for `_ByTitlePreChunker`, which now only needs to provide the boundary predicates specific to that strategy.
2025-12-25 22:23:24 +00:00 · 2023-12-19 22:30:21 -08:00 · 2023-12-19 22:30:21 -08:00 · 82714cad98
commit 82714cad98
parent fd293b3e78
4 changed files with 171 additions and 66 deletions
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@ -9,6 +9,7 @@ from typing import List
 import pytest

 from unstructured.chunking.base import (
+    BasePreChunker,
    ChunkingOptions,
    PreChunkBuilder,
    PreChunkCombiner,
@ -20,6 +21,7 @@ from unstructured.chunking.base import (
    is_title,
 )
 from unstructured.documents.elements import (
+    CheckBox,
    CompositeElement,
    Element,
    ElementMetadata,
@ -31,6 +33,10 @@ from unstructured.documents.elements import (
    Title,
 )

+# ================================================================================================
+# CHUNKING OPTIONS
+# ================================================================================================
+

 class DescribeChunkingOptions:
    """Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
@ -138,13 +144,64 @@ class DescribeChunkingOptions:
        assert ChunkingOptions.new().text_separator == "\n\n"


+# ================================================================================================
+# BASE PRE-CHUNKER
+# ================================================================================================
+
+
+class DescribeBasePreChunker:
+    """Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects."""
+
+    def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
+        elements = [
+            Title("Lorem Ipsum"),
+            Text("Lorem ipsum dolor sit amet, consectetur adipiscing elit."),
+            Text("Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."),
+            Title("Ut Enim"),
+            Text("Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi."),
+            Text("Ut aliquip ex ea commodo consequat."),
+            CheckBox(),
+        ]
+
+        opts = ChunkingOptions.new(max_characters=150, new_after_n_chars=65)
+
+        pre_chunk_iter = BasePreChunker.iter_pre_chunks(elements, opts=opts)
+
+        pre_chunk = next(pre_chunk_iter)
+        assert isinstance(pre_chunk, TextPreChunk)
+        assert pre_chunk._elements == [
+            Title("Lorem Ipsum"),
+            Text("Lorem ipsum dolor sit amet, consectetur adipiscing elit."),
+        ]
+        # --
+        pre_chunk = next(pre_chunk_iter)
+        assert isinstance(pre_chunk, TextPreChunk)
+        assert pre_chunk._elements == [
+            Text("Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.")
+        ]
+        # --
+        pre_chunk = next(pre_chunk_iter)
+        assert isinstance(pre_chunk, TextPreChunk)
+        assert pre_chunk._elements == [
+            Title("Ut Enim"),
+            Text("Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi."),
+        ]
+        # --
+        pre_chunk = next(pre_chunk_iter)
+        assert isinstance(pre_chunk, TextPreChunk)
+        assert pre_chunk._elements == [Text("Ut aliquip ex ea commodo consequat."), CheckBox()]
+        # --
+        with pytest.raises(StopIteration):
+            next(pre_chunk_iter)
+
+
 # ================================================================================================
 # PRE-CHUNK SUBTYPES
 # ================================================================================================


 class DescribeTablePreChunk:
-    """Unit-test suite for `unstructured.chunking.base.TablePreChunk objects."""
+    """Unit-test suite for `unstructured.chunking.base.TablePreChunk` objects."""

    def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
        html_table = (
@ -260,7 +317,7 @@ class DescribeTablePreChunk:


 class DescribeTextPreChunk:
-    """Unit-test suite for `unstructured.chunking.base.TextPreChunk objects."""
+    """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""

    def it_can_combine_itself_with_another_TextPreChunk_instance(self):
        """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@ -5,7 +5,7 @@ from typing import List
 import pytest

 from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
-from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title
+from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title
 from unstructured.documents.coordinates import CoordinateSystem
 from unstructured.documents.elements import (
    CheckBox,
@ -55,7 +55,7 @@ def test_split_elements_by_title_and_table():
        CheckBox(),
    ]

-    pre_chunks = _split_elements_by_title_and_table(elements, opts=ChunkingOptions.new())
+    pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new())

    pre_chunk = next(pre_chunks)
    assert isinstance(pre_chunk, TextPreChunk)
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@ -4,7 +4,19 @@ from __future__ import annotations

 import collections
 import copy
-from typing import Any, Callable, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
+from typing import (
+    Any,
+    Callable,
+    DefaultDict,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    cast,
+)

 from typing_extensions import Self, TypeAlias

@ -27,6 +39,11 @@ PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
 """The kind of object produced by a pre-chunker."""


+# ================================================================================================
+# CHUNKING OPTIONS
+# ================================================================================================
+
+
 class ChunkingOptions:
    """Specifies parameters of optional chunking behaviors."""

@ -170,6 +187,81 @@ class ChunkingOptions:
            raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")


+# ================================================================================================
+# BASE PRE-CHUNKER
+# ================================================================================================
+
+
+class BasePreChunker:
+    """Base-class for per-strategy pre-chunkers.
+
+    The pre-chunker's responsibilities are:
+
+    - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
+      either side of those boundaries into different sections. In this case, the primary indicator
+      of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
+      semantic boundary when `multipage_sections` is `False`.
+
+    - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
+      into sections as big as possible without exceeding the chunk window size.
+
+    - **Minimize chunks that must be split mid-text.** Precompute the text length of each section
+      and only produce a section that exceeds the chunk window size when there is a single element
+      with text longer than that window.
+
+    A Table element is placed into a section by itself. CheckBox elements are dropped.
+
+    The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
+    a new "section", hence the "by-title" designation.
+    """
+
+    def __init__(self, elements: Sequence[Element], opts: ChunkingOptions):
+        self._elements = elements
+        self._opts = opts
+
+    @classmethod
+    def iter_pre_chunks(
+        cls, elements: Sequence[Element], opts: ChunkingOptions
+    ) -> Iterator[PreChunk]:
+        """Generate pre-chunks from the `elements` stream using the options in `opts`."""
+        return cls(elements, opts)._iter_pre_chunks()
+
+    def _iter_pre_chunks(self) -> Iterator[PreChunk]:
+        """Generate pre-chunks from the element-stream provided on construction.
+
+        A *pre-chunk* is the largest sub-sequence of elements that will both fit within the
+        chunking window and respect the semantic boundary rules of the chunking strategy. When a
+        single element exceeds the chunking window size it is placed in a pre-chunk by itself and
+        is subject to mid-text splitting in the second phase of the chunking process.
+        """
+        pre_chunk_builder = PreChunkBuilder(self._opts)
+
+        for element in self._elements:
+            # -- start new pre-chunk when necessary --
+            if self._is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element):
+                yield from pre_chunk_builder.flush()
+
+            # -- add this element to the work-in-progress (WIP) pre-chunk --
+            pre_chunk_builder.add_element(element)
+
+        # -- flush "tail" pre-chunk, any partially-filled pre-chunk after last element is
+        # -- processed
+        yield from pre_chunk_builder.flush()
+
+    @lazyproperty
+    def _boundary_predicates(self) -> Tuple[BoundaryPredicate, ...]:
+        """The semantic-boundary detectors to be applied to break pre-chunks."""
+        return ()
+
+    def _is_in_new_semantic_unit(self, element: Element) -> bool:
+        """True when `element` begins a new semantic unit such as a section or page."""
+        # -- all detectors need to be called to update state and avoid double counting
+        # -- boundaries that happen to coincide, like Table and new section on same element.
+        # -- Using `any()` would short-circuit on first True.
+        semantic_boundaries = [pred(element) for pred in self._boundary_predicates]
+        return any(semantic_boundaries)
+
+
 # ================================================================================================
 # PRE-CHUNK SUB-TYPES
 # ================================================================================================
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@ -5,19 +5,19 @@ Main entry point is the `@add_chunking_strategy()` decorator.

 from __future__ import annotations

-from typing import Iterator, List, Optional
+from typing import Iterator, List, Optional, Tuple

 from unstructured.chunking.base import (
+    BasePreChunker,
    BoundaryPredicate,
    ChunkingOptions,
-    PreChunk,
-    PreChunkBuilder,
    PreChunkCombiner,
    is_in_next_section,
    is_on_next_page,
    is_title,
 )
 from unstructured.documents.elements import Element
+from unstructured.utils import lazyproperty


 def chunk_by_title(
@ -63,71 +63,27 @@ def chunk_by_title(
    )

    pre_chunks = PreChunkCombiner(
-        _split_elements_by_title_and_table(elements, opts), opts=opts
+        _ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts
    ).iter_combined_pre_chunks()

    return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]


-def _split_elements_by_title_and_table(
-    elements: List[Element], opts: ChunkingOptions
-) -> Iterator[PreChunk]:
-    """Implements "pre-chunker" responsibilities.
+class _ByTitlePreChunker(BasePreChunker):
+    """Pre-chunker for the "by_title" chunking strategy.

-    A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
-    chunk formed by the subsequent "chunker" process. The only exception occurs when a single
-    element is too big to fit in the chunk window and the chunker splits it into two or more chunks
-    divided mid-text. The pre-chunker never divides an element mid-text.
-
-    The pre-chunker's responsibilities are:
-
-        * **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
-          either side of those boundaries into different pre-chunks. In this case, the primary
-          indicator of a semantic boundary is a `Title` element. A page-break (change in
-          page-number) is also a semantic boundary when `multipage_sections` is `False`.
-
-        * **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
-          into pre-chunks as big as possible without exceeding the chunk window size.
-
-        * **Minimize chunks that must be split mid-text.** Precompute the text length of each
-          pre-chunk and only produce a pre-chunk that exceeds the chunk window size when there is a
-          single element with text longer than that window.
-
-    A Table or Checkbox element is placed into a pre-chunk by itself.
+    The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a
+    new "section", hence the "by-title" designation.
    """

-    # ========================================================================================
+    @lazyproperty
+    def _boundary_predicates(self) -> Tuple[BoundaryPredicate, ...]:
+        """The semantic-boundary detectors to be applied to break pre-chunks."""

-    def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
-        yield is_title
-        yield is_in_next_section()
-        if not opts.multipage_sections:
-            yield is_on_next_page()
+        def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
+            yield is_title
+            yield is_in_next_section()
+            if not self._opts.multipage_sections:
+                yield is_on_next_page()

-    # -- the semantic-boundary detectors to be applied to break pre-chunks --
-    boundary_predicates = tuple(iter_boundary_predicates())
-
-    def is_in_new_semantic_unit(element: Element) -> bool:
-        """True when `element` begins a new semantic unit such as a section or page."""
-        # -- all detectors need to be called to update state and avoid double counting
-        # -- boundaries that happen to coincide, like Table and new section on same element.
-        # -- Using `any()` would short-circuit on first True.
-        semantic_boundaries = [pred(element) for pred in boundary_predicates]
-        return any(semantic_boundaries)
-
-    # ----------------------------------------------------------------------------------------
-    # -- these bits ^^^ will get migrated to `BasePreChunker` helper methods in the next PR --
-    # ========================================================================================
-
-    pre_chunk_builder = PreChunkBuilder(opts)
-
-    for element in elements:
-        # -- start new pre_chunk when necessary --
-        if is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element):
-            yield from pre_chunk_builder.flush()
-
-        # -- add this element to the work-in-progress (WIP) pre-chunk --
-        pre_chunk_builder.add_element(element)
-
-    # -- flush "tail" pre_chunk, any partially-filled pre_chunk after last element is processed --
-    yield from pre_chunk_builder.flush()
+        return tuple(iter_boundary_predicates())