rfctr(chunking): generalize PreChunkBuilder (#2283)

To implement inter-pre-chunk overlap, we need a context that sees every pre-chunk both before and after it is accumulated (from elements). - We need access to the pre-chunk when it is completed so we can extract the "tail" overlap to be applied to the next chunk. - We need access to the as-yet-unpopulated pre-chunk so we can add the prior tail to it as a prefix. This "visibility" is split between `PreChunkBuilder` and the pre-chunker itself, which handles `TablePreChunk`s without the builder. Move `Table` element and `TablePreChunk` formation into `PreChunkBuilder` such that _all_ element types (adding `Table` elements in particular) pass through it. Then `PreChunkBuilder` becomes the context we require. The actual overlap harvesting and application will come in a subsequent commit.
2025-12-25 06:04:53 +00:00 · 2023-12-18 14:21:34 -08:00 · 2023-12-18 14:21:34 -08:00 · 0c7f64ecaa
commit 0c7f64ecaa
parent 9efc22c0fc
5 changed files with 165 additions and 48 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,11 @@
+## 0.11.6-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.11.5

 ### Enhancements
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@ -18,6 +18,7 @@ from unstructured.chunking.base import (
 )
 from unstructured.documents.elements import (
    CompositeElement,
+    Element,
    ElementMetadata,
    PageBreak,
    RegexMetadata,
@ -572,15 +573,15 @@ class DescribePreChunkBuilder:
    def it_is_empty_on_construction(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))

-        assert builder.text_length == 0
-        assert builder.remaining_space == 50
+        assert builder._text_length == 0
+        assert builder._remaining_space == 50

    def it_accumulates_elements_added_to_it(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        builder.add_element(Title("Introduction"))
-        assert builder.text_length == 12
-        assert builder.remaining_space == 136
+        assert builder._text_length == 12
+        assert builder._remaining_space == 136

        builder.add_element(
            Text(
@ -588,8 +589,67 @@ class DescribePreChunkBuilder:
                "lectus porta volutpat.",
            ),
        )
-        assert builder.text_length == 112
-        assert builder.remaining_space == 36
+        assert builder._text_length == 112
+        assert builder._remaining_space == 36
+
+    @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
+    def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        assert builder.will_fit(element)
+
+    @pytest.mark.parametrize(
+        ("existing_element", "next_element"),
+        [
+            (Text("abcd"), Table("Fruits\nMango")),
+            (Text("abcd"), Text("abcd " * 200)),
+            (Table("Heading\nCell text"), Table("Fruits\nMango")),
+            (Table("Heading\nCell text"), Text("abcd " * 200)),
+        ],
+    )
+    def but_not_when_it_already_contains_an_element_of_any_kind(
+        self, existing_element: Element, next_element: Element
+    ):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        builder.add_element(existing_element)
+
+        assert not builder.will_fit(next_element)
+
+    @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
+    def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        builder.add_element(Table("Heading\nCell text"))
+
+        assert not builder.will_fit(element)
+
+    def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
+        builder = PreChunkBuilder(
+            opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
+        )
+        builder.add_element(
+            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
+        )
+
+        assert not builder.will_fit(Text("In rhoncus ipsum."))
+
+    def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
+        builder.add_element(
+            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
+        )
+
+        # -- 55 + 2 (separator) + 44 == 101 --
+        assert not builder.will_fit(
+            Text("In rhoncus ipsum sed lectus portos volutpat.")  # 44-chars
+        )
+
+    def but_it_will_fit_an_element_that_fits(self):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
+        builder.add_element(
+            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
+        )
+
+        # -- 55 + 2 (separator) + 43 == 100 --
+        assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat."))  # 43-chars

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
@ -611,8 +671,24 @@ class DescribePreChunkBuilder:
                "lectus porta volutpat.",
            ),
        ]
-        assert builder.text_length == 0
-        assert builder.remaining_space == 150
+        assert builder._text_length == 0
+        assert builder._remaining_space == 150
+
+    def but_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
+        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+        builder.add_element(Table("Heading\nCell text"))
+
+        pre_chunk = next(builder.flush())
+
+        # -- pre-chunk builder was reset before the yield, such that the iterator does not need to
+        # -- be exhausted before clearing out the old elements and a new pre-chunk can be
+        # -- accumulated immediately (first `next()` call is required however, to advance to the
+        # -- yield statement).
+        assert builder._text_length == 0
+        assert builder._remaining_space == 150
+        # -- pre-chunk is a `TablePreChunk` --
+        assert isinstance(pre_chunk, TablePreChunk)
+        assert pre_chunk._table == Table("Heading\nCell text")

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
@ -620,21 +696,21 @@ class DescribePreChunkBuilder:
        pre_chunks = list(builder.flush())

        assert pre_chunks == []
-        assert builder.text_length == 0
-        assert builder.remaining_space == 150
+        assert builder._text_length == 0
+        assert builder._remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
        builder.add_element(Text("abcde"))
        builder.add_element(Text("fghij"))

-        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
+        # -- ._text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
-        assert builder.text_length == 12
-        # -- .remaining_space is reduced by the length (2) of the trailing separator which would go
-        # -- between the current text and that of the next element if one was added.
+        assert builder._text_length == 12
+        # -- ._remaining_space is reduced by the length (2) of the trailing separator which would
+        # -- go between the current text and that of the next element if one was added.
        # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
-        assert builder.remaining_space == 36
+        assert builder._remaining_space == 36


 class DescribePreChunkCombiner:
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.11.5"  # pragma: no cover
+__version__ = "0.11.6-dev0"  # pragma: no cover
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@ -396,8 +396,8 @@ class TextPreChunk:
 class PreChunkBuilder:
    """An element accumulator suitable for incrementally forming a pre-chunk.

-    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
-    to determine whether it should add the next element in the element stream.
+    Provides the trial method `.will_fit()` a pre-chunker can use to determine whether it should add
+    the next element in the element stream.

    `.flush()` is used to build a PreChunk object from the accumulated elements. This method
    returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
@ -426,7 +426,7 @@ class PreChunkBuilder:
            self._text_segments.append(element.text)
            self._text_len += len(element.text)

-    def flush(self) -> Iterator[TextPreChunk]:
+    def flush(self) -> Iterator[PreChunk]:
        """Generate zero-or-one `PreChunk` object and clear the accumulator.

        Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
@ -435,23 +435,62 @@ class PreChunkBuilder:
        """
        if not self._elements:
            return
+
+        pre_chunk = (
+            TablePreChunk(self._elements[0], self._opts)
+            if isinstance(self._elements[0], Table)
+            # -- copy list, don't use original or it may change contents as builder proceeds --
+            else TextPreChunk(list(self._elements), self._opts)
+        )
        # -- clear builder before yield so we're not sensitive to the timing of how/when this
-        # -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately.
-        elements = self._elements[:]
-        self._elements.clear()
-        self._text_segments.clear()
-        self._text_len = 0
-        yield TextPreChunk(elements, self._opts)
+        # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
+        self._reset_state()
+        yield pre_chunk
+
+    def will_fit(self, element: Element) -> bool:
+        """True when `element` can be added to this prechunk without violating its limits.
+
+        There are several limits:
+        - A `Table` element will never fit with any other element. It will only fit in an empty
+          pre-chunk.
+        - No element will fit in a pre-chunk that already contains a `Table` element.
+        - A text-element will not fit in a pre-chunk that already exceeds the soft-max
+          (aka. new_after_n_chars).
+        - A text-element will not fit when together with the elements already present it would
+          exceed the hard-max (aka. max_characters).
+        """
+        # -- an empty pre-chunk will accept any element (including an oversized-element) --
+        if len(self._elements) == 0:
+            return True
+        # -- a `Table` will not fit in a non-empty pre-chunk --
+        if isinstance(element, Table):
+            return False
+        # -- no element will fit in a pre-chunk that already contains a `Table` element --
+        if self._elements and isinstance(self._elements[0], Table):
+            return False
+        # -- a pre-chunk that already exceeds the soft-max is considered "full" --
+        if self._text_length > self._opts.soft_max:
+            return False
+        # -- don't add an element if it would increase total size beyond the hard-max --
+        if self._remaining_space < len(element.text):
+            return False
+        return True

    @property
-    def remaining_space(self) -> int:
+    def _remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- include length of trailing separator that will go before next element text --
        separators_len = self._separator_len * len(self._text_segments)
        return self._opts.hard_max - self._text_len - separators_len

+    def _reset_state(self) -> None:
+        """Set working-state values back to "empty", ready to accumulate next pre-chunk."""
+        self._elements.clear()
+        self._text_segments.clear()
+        self._text_len = 0
+
    @property
-    def text_length(self) -> int:
+    def _text_length(self) -> int:
        """Length of the text in this pre-chunk.

        This value represents the chunk-size that would result if this pre-chunk was flushed in its
@ -502,10 +541,16 @@ class PreChunkCombiner:


 class TextPreChunkAccumulator:
-    """Accumulates, measures, and combines pre-chunk objects.
+    """Accumulates, measures, and combines text pre-chunks.

-    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
-    whether to add another pre-chunk.
+    Used for combining pre-chunks for chunking strategies like "by-title" that can potentially
+    produce undersized chunks and offer the `combine_text_under_n_chars` option. Note that only
+    sequential `TextPreChunk` objects can be combined. A `TablePreChunk` is never combined with
+    another pre-chunk.
+
+    Provides `.add_pre_chunk()` allowing a pre-chunk to be added to the chunk and provides
+    monitoring properties `.remaining_space` and `.text_length` suitable for deciding whether to add
+    another pre-chunk.

    `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
    This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@ -12,13 +12,8 @@ from unstructured.chunking.base import (
    PreChunk,
    PreChunkBuilder,
    PreChunkCombiner,
-    TablePreChunk,
-)
-from unstructured.documents.elements import (
-    Element,
-    Table,
-    Title,
 )
+from unstructured.documents.elements import Element, Title


 def chunk_by_title(
@ -109,24 +104,17 @@ def _split_elements_by_title_and_table(

        # -- start new pre_chunk when necessary --
        if (
-            # -- Title and Table both start a new pre_chunk --
-            isinstance(element, (Title, Table))
-            # -- adding this element would exceed hard-maxlen for pre_chunk --
-            or pre_chunk_builder.remaining_space < len(str(element))
-            # -- pre_chunk already meets or exceeds soft-maxlen --
-            or pre_chunk_builder.text_length >= opts.soft_max
+            # -- Title starts a new "section" and so a new pre_chunk --
+            isinstance(element, Title)
+            # -- start a new pre-chunk when the WIP pre-chunk is already full --
+            or not pre_chunk_builder.will_fit(element)
            # -- a semantic boundary is indicated by metadata change since prior element --
            or metadata_differs
        ):
            # -- complete any work-in-progress pre_chunk --
            yield from pre_chunk_builder.flush()

-        # -- emit table and checkbox immediately since they are always isolated --
-        if isinstance(element, Table):
-            yield TablePreChunk(table=element, opts=opts)
-        # -- but accumulate text elements for consolidation into a composite chunk --
-        else:
-            pre_chunk_builder.add_element(element)
+        pre_chunk_builder.add_element(element)

        prior_element = element