rfctr(chunking): extract boundary predicates (#2284)

`chunk_by_title()` respects certain semantic boundaries while chunking. Those are sections introduced by a `Title` element, sections introduced by a `metadata.section` value change, and optionally page-breaks. "Respecting" in this context means that elements on opposite sides of a semantic boundary never appear in the same chunk. The `_metadata_differs()` function used for this purpose is clumsy to use, requiring the caller to maintain state (the prior element). It also combines what are independent predicates such that they cannot be individually reused. Introduce the `BoundaryPredicate` type, a callable which takes an element and returns bool, indicating whether the element introduces a new semantic boundary. These predicates can be reused by any chunking strategy that needs them and allow the pre-chunking operation to be generalized for use by any chunking strategy, which it will be in the following PR.
2026-01-06 12:21:30 +00:00 · 2023-12-19 10:20:05 -08:00 · 2023-12-19 10:20:05 -08:00 · 4e2ba2c9b2
commit 4e2ba2c9b2
parent 4b8352e0f5
3 changed files with 289 additions and 39 deletions
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@ -15,6 +15,9 @@ from unstructured.chunking.base import (
    TablePreChunk,
    TextPreChunk,
    TextPreChunkAccumulator,
+    is_in_next_section,
+    is_on_next_page,
+    is_title,
 )
 from unstructured.documents.elements import (
    CompositeElement,
@ -1052,3 +1055,137 @@ class DescribeTextPreChunkAccumulator:
        # -- go between the current text and that of the next pre-chunk if one was added.
        # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
        assert accum.remaining_space == 86
+
+
+# ================================================================================================
+# (SEMANTIC) BOUNDARY PREDICATES
+# ================================================================================================
+
+
+class Describe_is_in_next_section:
+    """Unit-test suite for `unstructured.chunking.base.is_in_next_section()` function.
+
+    `is_in_next_section()` is not itself a predicate, rather it returns a predicate on Element
+    (`Callable[[Element], bool]`) that can be called repeatedly to detect section changes in an
+    element stream.
+    """
+
+    def it_is_false_for_the_first_element_when_it_has_a_non_None_section(self):
+        """This is an explicit first-section; first-section does not represent a section break."""
+        pred = is_in_next_section()
+        assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
+
+    def and_it_is_false_for_the_first_element_when_it_has_a_None_section(self):
+        """This is an anonymous first-section; still doesn't represent a section break."""
+        pred = is_in_next_section()
+        assert not pred(Text("abcd"))
+
+    def it_is_false_for_None_section_elements_that_follow_an_explicit_first_section(self):
+        """A `None` section element is considered to continue the prior section."""
+        pred = is_in_next_section()
+        assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
+        assert not pred(Text("efgh"))
+        assert not pred(Text("ijkl"))
+
+    def and_it_is_false_for_None_section_elements_that_follow_an_anonymous_first_section(self):
+        """A `None` section element is considered to continue the prior section."""
+        pred = is_in_next_section()
+        assert not pred(Text("abcd"))
+        assert not pred(Text("efgh"))
+        assert not pred(Text("ijkl"))
+
+    def it_is_false_for_matching_section_elements_that_follow_an_explicit_first_section(self):
+        pred = is_in_next_section()
+        assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
+        assert not pred(Text("efgh", metadata=ElementMetadata(section="Introduction")))
+        assert not pred(Text("ijkl", metadata=ElementMetadata(section="Introduction")))
+
+    def it_is_true_for_an_explicit_section_element_that_follows_an_anonymous_first_section(self):
+        pred = is_in_next_section()
+        assert not pred(Text("abcd"))
+        assert not pred(Text("efgh"))
+        assert pred(Text("ijkl", metadata=ElementMetadata(section="Introduction")))
+
+    def and_it_is_true_for_a_different_explicit_section_that_follows_an_explicit_section(self):
+        pred = is_in_next_section()
+        assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
+        assert pred(Text("efgh", metadata=ElementMetadata(section="Summary")))
+
+    def it_is_true_whenever_the_section_explicitly_changes_except_at_the_start(self):
+        pred = is_in_next_section()
+        assert not pred(Text("abcd"))
+        assert pred(Text("efgh", metadata=ElementMetadata(section="Introduction")))
+        assert not pred(Text("ijkl"))
+        assert not pred(Text("mnop", metadata=ElementMetadata(section="Introduction")))
+        assert not pred(Text("qrst"))
+        assert pred(Text("uvwx", metadata=ElementMetadata(section="Summary")))
+        assert not pred(Text("yzab", metadata=ElementMetadata(section="Summary")))
+        assert not pred(Text("cdef"))
+        assert pred(Text("ghij", metadata=ElementMetadata(section="Appendix")))
+
+
+class Describe_is_on_next_page:
+    """Unit-test suite for `unstructured.chunking.base.is_on_next_page()` function.
+
+    `is_on_next_page()` is not itself a predicate, rather it returns a predicate on Element
+    (`Callable[[Element], bool]`) that can be called repeatedly to detect page changes in an
+    element stream.
+    """
+
+    @pytest.mark.parametrize(
+        "element", [Text("abcd"), Text("efgh", metadata=ElementMetadata(page_number=4))]
+    )
+    def it_is_unconditionally_false_for_the_first_element(self, element: Element):
+        """The first page never represents a page-break."""
+        pred = is_on_next_page()
+        assert not pred(element)
+
+    def it_is_false_for_an_element_that_has_no_page_number(self):
+        """An element with a `None` page-number is assumed to continue the current page."""
+        pred = is_on_next_page()
+        assert not pred(Text("abcd", metadata=ElementMetadata(page_number=1)))
+        assert not pred(Text("efgh"))
+        assert not pred(Text("ijkl"))
+
+    def it_is_false_for_an_element_with_the_current_page_number(self):
+        pred = is_on_next_page()
+        assert not pred(Text("abcd", metadata=ElementMetadata(page_number=1)))
+        assert not pred(Text("efgh"))
+        assert not pred(Text("ijkl", metadata=ElementMetadata(page_number=1)))
+        assert not pred(Text("mnop"))
+
+    def it_assigns_page_number_1_to_a_first_element_that_has_no_page_number(self):
+        pred = is_on_next_page()
+        assert not pred(Text("abcd"))
+        assert not pred(Text("efgh", metadata=ElementMetadata(page_number=1)))
+
+    def it_is_true_for_an_element_with_an_explicit_different_page_number(self):
+        pred = is_on_next_page()
+        assert not pred(Text("abcd", metadata=ElementMetadata(page_number=1)))
+        assert pred(Text("efgh", metadata=ElementMetadata(page_number=2)))
+
+    def and_it_is_true_even_when_that_page_number_is_lower(self):
+        pred = is_on_next_page()
+        assert not pred(Text("abcd", metadata=ElementMetadata(page_number=4)))
+        assert pred(Text("efgh", metadata=ElementMetadata(page_number=2)))
+        assert not pred(Text("ijkl", metadata=ElementMetadata(page_number=2)))
+        assert not pred(Text("mnop"))
+        assert pred(Text("qrst", metadata=ElementMetadata(page_number=3)))
+
+
+class Describe_is_title:
+    """Unit-test suite for `unstructured.chunking.base.is_title()` predicate."""
+
+    def it_is_true_for_a_Title_element(self):
+        assert is_title(Title("abcd"))
+
+    @pytest.mark.parametrize(
+        "element",
+        [
+            PageBreak(""),
+            Table("Header Col 1  Header Col 2\n" "Lorem ipsum   adipiscing"),
+            Text("abcd"),
+        ],
+    )
+    def and_it_is_false_for_any_other_element_subtype(self, element: Element):
+        assert not is_title(element)
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@ -4,7 +4,7 @@ from __future__ import annotations

 import collections
 import copy
-from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
+from typing import Any, Callable, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast

 from typing_extensions import Self, TypeAlias

@ -16,10 +16,15 @@ from unstructured.documents.elements import (
    RegexMetadata,
    Table,
    TableChunk,
+    Title,
 )
 from unstructured.utils import lazyproperty

+BoundaryPredicate: TypeAlias = Callable[[Element], bool]
+"""Detects when element represents crossing a semantic boundary like section or page."""
+
 PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
+"""The kind of object produced by a pre-chunker."""


 class ChunkingOptions:
@ -609,3 +614,118 @@ class TextPreChunkAccumulator:
        total_text_length = sum(s.text_length for s in self._pre_chunks)
        total_separator_length = len(self._opts.text_separator) * (n - 1)
        return total_text_length + total_separator_length
+
+
+# ================================================================================================
+# CHUNK BOUNDARY PREDICATES
+# ------------------------------------------------------------------------------------------------
+# A *boundary predicate* is a function that takes an element and returns True when the element
+# represents the start of a new semantic boundary (such as section or page) to be respected in
+# chunking.
+#
+# Some of the functions below *are* a boundary predicate and others *construct* a boundary
+# predicate.
+#
+# These can be mixed and matched to produce different chunking behaviors like "by_title" or left
+# out altogether to produce "by_element" behavior.
+#
+# The effective lifetime of a predicate returned by one of the constructing functions is limited
+# to a single element-stream because such a predicate retains state (e.g. current page number) to
+# determine when a semantic boundary has been crossed.
+# ================================================================================================
+
+
+def is_in_next_section() -> BoundaryPredicate:
+    """Not a predicate itself, calling this returns a predicate that triggers on each new section.
+
+    The lifetime of the returned callable cannot extend beyond a single element-stream because it
+    stores current state (current section) that is particular to that element stream.
+
+    A "section" of this type is particular to the EPUB format (so far) and not to be confused with
+    a "section" composed of a section-heading (`Title` element) followed by content elements.
+
+    The returned predicate tracks the current section, taken from the first element (which never
+    triggers). A later element with a different, non-None `metadata.section` returns True,
+    indicating it starts a new section, and updates the enclosed section name for the next call.
+    """
+    current_section: Optional[str] = None
+    is_first: bool = True
+
+    def section_changed(element: Element) -> bool:
+        nonlocal current_section, is_first
+
+        section = element.metadata.section
+
+        # -- The first element never reports a section break, it starts the first section of the
+        # -- document. That section could be named (section is non-None) or anonymous (section is
+        # -- None). We don't really have to care.
+        if is_first:
+            current_section = section
+            is_first = False
+            return False
+
+        # -- An element with a `None` section is assumed to continue the current section. It never
+        # -- updates the current-section because once set, the current-section is "sticky" until
+        # -- replaced by another explicit section.
+        if section is None:
+            return False
+
+        # -- another element with the same section continues that section --
+        if section == current_section:
+            return False
+
+        current_section = section
+        return True
+
+    return section_changed
+
+
+def is_on_next_page() -> BoundaryPredicate:
+    """Not a predicate itself, calling this returns a predicate that triggers on each new page.
+
+    The lifetime of the returned callable cannot extend beyond a single element-stream because it
+    stores current state (current page-number) that is particular to that element stream.
+
+    The returned predicate tracks the "current" page-number, which is taken from the first element
+    (or 1 when that element has no page-number). An element with a different explicit page-number,
+    greater or lower, returns True, indicating the element starts a new page boundary, and updates
+    the enclosed page-number ready for the next transition.
+
+    An element whose page-number is `None` continues the current page and returns False.
+    """
+    current_page_number: int = 1
+    is_first: bool = True
+
+    def page_number_incremented(element: Element) -> bool:
+        nonlocal current_page_number, is_first
+
+        page_number = element.metadata.page_number
+
+        # -- The first element never reports a page break, it starts the first page of the
+        # -- document. That page could be numbered (page_number is non-None) or not. If it is not
+        # -- numbered we assign it page-number 1.
+        if is_first:
+            current_page_number = page_number or 1
+            is_first = False
+            return False
+
+        # -- An element with a `None` page-number is assumed to continue the current page. It never
+        # -- updates the current-page-number because once set, the current-page-number is "sticky"
+        # -- until replaced by a different explicit page-number.
+        if page_number is None:
+            return False
+
+        if page_number == current_page_number:
+            return False
+
+        # -- it's possible for a page-number to decrease. We don't expect that, but if it happens
+        # -- we consider it a page-break.
+        current_page_number = page_number
+        return True
+
+    return page_number_incremented
+
+
+def is_title(element: Element) -> bool:
+    """True when `element` is a `Title` element, False otherwise."""
+    return isinstance(element, Title)
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@ -8,12 +8,16 @@ from __future__ import annotations
 from typing import Iterator, List, Optional

 from unstructured.chunking.base import (
+    BoundaryPredicate,
    ChunkingOptions,
    PreChunk,
    PreChunkBuilder,
    PreChunkCombiner,
+    is_in_next_section,
+    is_on_next_page,
+    is_title,
 )
-from unstructured.documents.elements import Element, Title
+from unstructured.documents.elements import Element


 def chunk_by_title(
@ -91,50 +95,39 @@ def _split_elements_by_title_and_table(

    A Table or Checkbox element is placed into a pre-chunk by itself.
    """
+
+    # ========================================================================================
+
+    def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
+        yield is_title
+        yield is_in_next_section()
+        if not opts.multipage_sections:
+            yield is_on_next_page()
+
+    # -- the semantic-boundary detectors to be applied to break pre-chunks --
+    boundary_predicates = tuple(iter_boundary_predicates())
+
+    def is_in_new_semantic_unit(element: Element) -> bool:
+        """True when `element` begins a new semantic unit such as a section or page."""
+        # -- all detectors need to be called to update state and avoid double counting
+        # -- boundaries that happen to coincide, like Title and new section on same element.
+        # -- Using `any()` directly on a generator would short-circuit on first True.
+        semantic_boundaries = [pred(element) for pred in boundary_predicates]
+        return any(semantic_boundaries)
+
+    # ----------------------------------------------------------------------------------------
+    # -- these bits ^^^ will get migrated to `BasePreChunker` helper methods in the next PR --
+    # ========================================================================================
+
    pre_chunk_builder = PreChunkBuilder(opts)

-    prior_element = None
-
    for element in elements:
-        metadata_differs = (
-            _metadata_differs(element, prior_element, ignore_page_numbers=opts.multipage_sections)
-            if prior_element
-            else False
-        )
-
        # -- start new pre_chunk when necessary --
-        if (
-            # -- Title starts a new "section" and so a new pre_chunk --
-            isinstance(element, Title)
-            # -- start a new pre-chunk when the WIP pre-chunk is already full --
-            or not pre_chunk_builder.will_fit(element)
-            # -- a semantic boundary is indicated by metadata change since prior element --
-            or metadata_differs
-        ):
-            # -- complete any work-in-progress pre_chunk --
+        if is_in_new_semantic_unit(element) or not pre_chunk_builder.will_fit(element):
            yield from pre_chunk_builder.flush()

+        # -- add this element to the work-in-progress (WIP) pre-chunk --
        pre_chunk_builder.add_element(element)

-        prior_element = element
-
    # -- flush "tail" pre_chunk, any partially-filled pre_chunk after last element is processed --
    yield from pre_chunk_builder.flush()
-
-
-def _metadata_differs(
-    element: Element,
-    preceding_element: Element,
-    ignore_page_numbers: bool,
-) -> bool:
-    """True when metadata differences between two elements indicate a semantic boundary.
-
-    Currently this is only a section change and optionally a page-number change.
-    """
-    metadata1 = preceding_element.metadata
-    metadata2 = element.metadata
-    if metadata1.section != metadata2.section:
-        return True
-    if ignore_page_numbers:
-        return False
-    return metadata1.page_number != metadata2.page_number