rfctr(chunking): split oversized chunks on word boundary (#2297)

The text of an oversized chunk is split on an arbitrary character
boundary (mid-word). The `chunk_by_character()` strategy introduces the
idea of allowing the user to specify a separator to use for
chunk-splitting. For `langchain` this is typically "\n\n", "\n", or " ";
blank-line, newline, or word boundaries respectively.

Even if the user is allowed to specify a separator, we must provide a
fall-back for when a chunk contains no such character. This can be done
incrementally: a blank-line split is preferable to a newline split, a
newline split is preferable to a word split, and a word split is
preferable to an arbitrary-character split.

Further, there is nothing particular to `chunk_by_character()` in
providing such a fall-back text-splitting strategy. It would be
preferable, for example, for all chunking strategies to split oversized
chunks on word boundaries rather than mid-word.

Note that while a "blank-line" ("\n\n") may be common in plain text, it
is unlikely to appear in the text of an element because it would have
been interpreted as an element boundary during partitioning.

Add _TextSplitter with basic separator preferences and fall-back and
apply it to chunk-splitting for all strategies. The `by_character`
chunking strategy may enhance this behavior by adding the option for a
user to specify a particular separator suited to their use case.
This commit is contained in:
Steve Canny 2023-12-20 21:45:36 -08:00 committed by GitHub
parent 4533bdac98
commit 093a11d058
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 276 additions and 34 deletions

View File

@ -1,3 +1,11 @@
## 0.11.7-dev0
### Enhancements
### Features
### Fixes
## 0.11.6
### Enhancements

View File

@ -4,7 +4,7 @@
from __future__ import annotations
from typing import List
from typing import List, Sequence
import pytest
@ -16,6 +16,7 @@ from unstructured.chunking.base import (
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
_TextSplitter,
is_in_next_section,
is_on_next_page,
is_title,
@ -39,7 +40,7 @@ from unstructured.documents.elements import (
class DescribeChunkingOptions:
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions` objects."""
@pytest.mark.parametrize("max_characters", [0, -1, -42])
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
@ -144,6 +145,97 @@ class DescribeChunkingOptions:
assert ChunkingOptions.new().text_separator == "\n\n"
class Describe_TextSplitter:
    """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""

    def it_splits_on_a_preferred_separator_when_it_can(self):
        opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=("\n", " "))
        split = _TextSplitter(opts)
        text = (
            "Lorem ipsum dolor amet consectetur adipiscing.\n"
            "In rhoncus ipsum sed lectus porta volutpat."
        )

        fragment, remainder = split(text)
        assert (fragment, remainder) == (
            "Lorem ipsum dolor amet consectetur adipiscing.",
            "In rhoncus ipsum sed lectus porta volutpat.",
        )
        # -- and the remainder is consumed whole on the next call --
        fragment, remainder = split(remainder)
        assert (fragment, remainder) == ("In rhoncus ipsum sed lectus porta volutpat.", "")

    def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
        opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
        split = _TextSplitter(opts)
        text = (
            "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
            " volutpat."
        )

        fragment, remainder = split(text)
        assert (fragment, remainder) == (
            "Lorem ipsum dolor amet consectetur",
            "adipiscing. In rhoncus ipsum sed lectus porta volutpat.",
        )
        fragment, remainder = split(remainder)
        assert (fragment, remainder) == (
            "adipiscing. In rhoncus ipsum sed lectus",
            "porta volutpat.",
        )
        fragment, remainder = split(remainder)
        assert (fragment, remainder) == ("porta volutpat.", "")

    def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
        opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
        split = _TextSplitter(opts)
        text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."

        fragment, remainder = split(text)
        assert (fragment, remainder) == (
            "Loremipsumdolorametconsecteturadipiscing",
            "elit. In rhoncus ipsum sed lectus porta.",
        )
        fragment, remainder = split(remainder)
        assert (fragment, remainder) == ("elit. In rhoncus ipsum sed lectus porta.", "")

    @pytest.mark.parametrize(
        "text",
        [
            "Lorem ipsum dolor amet consectetur adipiscing.",  # 46-chars
            "Lorem ipsum dolor.",  # 18-chars
        ],
    )
    def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
        opts = ChunkingOptions.new(max_characters=46)
        split = _TextSplitter(opts)

        fragment, remainder = split(text)

        assert fragment == text
        assert remainder == ""

    def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
        opts = ChunkingOptions.new(max_characters=38)
        split = _TextSplitter(opts)
        text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."

        fragment, _ = split(text)

        assert fragment == "Loremipsumdolorametconsecteturadipisci"
        assert len(fragment) == 38

    @pytest.mark.parametrize("separators", [("\n", " "), ()])
    def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
        opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=separators)
        split = _TextSplitter(opts)
        text = "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus."
        # |------------------------------------------------^ 50-chars

        fragment, remainder = split(text)

        assert fragment == "Lorem ipsum dolor amet consectetur adipiscing."
        assert remainder == "In rhoncus ipsum sed lectus."
# ================================================================================================
# BASE PRE-CHUNKER
# ================================================================================================
@ -263,7 +355,7 @@ class DescribeTablePreChunk:
)
pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
opts=ChunkingOptions.new(max_characters=100),
opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")),
)
chunk_iter = pre_chunk.iter_chunks()
@ -273,8 +365,7 @@ class DescribeTablePreChunk:
assert chunk.text == (
"Header Col 1 Header Col 2\n"
"Lorem ipsum dolor sit amet\n"
"Consectetur adipiscing elit\n"
"Nunc aliqua"
"Consectetur adipiscing elit"
)
assert chunk.metadata.text_as_html == (
"<table>\n"
@ -287,8 +378,8 @@ class DescribeTablePreChunk:
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert (
chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
assert chunk.text == (
"Nunc aliquam id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
)
assert chunk.metadata.text_as_html == (
"rem ipsum </td><td>A Link example</td></tr>\n"
@ -399,7 +490,7 @@ class DescribeTextPreChunk:
" commodo consequat."
),
],
opts=ChunkingOptions.new(max_characters=200),
opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")),
)
chunk_iter = pre_chunk.iter_chunks()
@ -408,12 +499,12 @@ class DescribeTextPreChunk:
assert chunk == CompositeElement(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
" veniam, quis nostrud exercitation ullamco laboris nisi ut a"
" veniam, quis nostrud exercitation ullamco laboris nisi ut"
)
assert chunk.metadata is pre_chunk._consolidated_metadata
# --
chunk = next(chunk_iter)
assert chunk == CompositeElement("liquip ex ea commodo consequat.")
assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
assert chunk.metadata is pre_chunk._consolidated_metadata
# --
with pytest.raises(StopIteration):

View File

@ -35,7 +35,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
assert chunks == [
CompositeElement("Introduction"),
CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing "),
CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing"),
CompositeElement("elit. In rhoncus ipsum sed lectus porta volutpat."),
]

View File

@ -1100,7 +1100,10 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
assert len(partitioned_table_elements_5_chars) != len(table_elements)
assert len(partitioned_table_elements_200_chars) != len(table_elements)
assert len(partitioned_table_elements_5_chars[0].text) == 5
# trailing whitespace is stripped from the first chunk, leaving only a checkbox character
assert len(partitioned_table_elements_5_chars[0].text) == 1
# but the second chunk is the full 5 characters
assert len(partitioned_table_elements_5_chars[1].text) == 5
assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5
# the first table element is under 200 chars so doesn't get chunked!

View File

@ -1 +1 @@
__version__ = "0.11.6" # pragma: no cover
__version__ = "0.11.7-dev0" # pragma: no cover

View File

@ -18,6 +18,7 @@ from typing import (
cast,
)
import regex
from typing_extensions import Self, TypeAlias
from unstructured.documents.elements import (
@ -45,7 +46,46 @@ PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
class ChunkingOptions:
"""Specifies parameters of optional chunking behaviors."""
"""Specifies parameters of optional chunking behaviors.
Parameters
----------
max_characters
Hard-maximum text-length of chunk. A chunk longer than this will be split mid-text and be
emitted as two or more chunks.
new_after_n_chars
Preferred approximate chunk size. A chunk composed of elements totalling this size or
greater is considered "full" and will not be enlarged by adding another element, even if it
will fit within the remaining `max_characters` for that chunk. Defaults to `max_characters`
when not specified, which effectively disables this behavior. Specifying 0 for this
argument causes each element to appear in a chunk by itself (although an element with text
longer than `max_characters` will be still be split into two or more chunks).
multipage_sections
Indicates that page-boundaries should not be respected while chunking, i.e. elements
appearing on two different pages can appear in the same chunk.
combine_text_under_n_chars
Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
relevant for a chunking strategy that specifies higher-level semantic boundaries to be
respected, like "section" or "page". Recursively combines two adjacent pre-chunks when the
first pre-chunk is smaller than this threshold. "Recursively" here means the resulting
pre-chunk can be combined with the next pre-chunk if it is still under the length threshold.
Defaults to `max_characters` which combines chunks whenever space allows. Specifying 0 for
this argument suppresses combining of small chunks. Note this value is "capped" at the
`new_after_n_chars` value since a value higher than that would not change this parameter's
effect.
overlap
Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the
next chunk as a context-preserving mechanism. By default, this only applies to split-chunks
where an oversized element is divided into multiple chunks by text-splitting.
text_splitting_separators
A sequence of strings like `("\n", " ")` to be used as target separators during
text-splitting. Text-splitting only applies to splitting an oversized element into two or
more chunks. These separators are tried in the specified order until one is found in the
string to be split. The default separator is `""` which matches between any two characters.
This separator should not be specified in this sequence because it is always the separator
of last-resort. Note that because the separator is removed during text-splitting, only
whitespace character sequences are suitable.
"""
def __init__(
self,
@ -54,12 +94,14 @@ class ChunkingOptions:
multipage_sections: bool = True,
new_after_n_chars: Optional[int] = None,
overlap: int = 0,
text_splitting_separators: Sequence[str] = (),
):
self._combine_text_under_n_chars_arg = combine_text_under_n_chars
self._max_characters = max_characters
self._multipage_sections = multipage_sections
self._new_after_n_chars_arg = new_after_n_chars
self._overlap = overlap
self._text_splitting_separators = text_splitting_separators
@classmethod
def new(
@ -69,6 +111,7 @@ class ChunkingOptions:
multipage_sections: bool = True,
new_after_n_chars: Optional[int] = None,
overlap: int = 0,
text_splitting_separators: Sequence[str] = (),
) -> Self:
"""Construct validated instance.
@ -80,6 +123,7 @@ class ChunkingOptions:
multipage_sections,
new_after_n_chars,
overlap,
text_splitting_separators,
)
self._validate()
return self
@ -144,6 +188,15 @@ class ChunkingOptions:
else new_after_n_chars
)
@lazyproperty
def split(self) -> Callable[[str], Tuple[str, str]]:
"""A text-splitting function suitable for splitting the text of an oversized pre-chunk.
The function is pre-configured with the chosen chunking window size and any other applicable
options specified by the caller as part of this chunking-options instance.
"""
return _TextSplitter(self)
@lazyproperty
def text_separator(self) -> str:
"""The string to insert between elements when concatenating their text for a chunk.
@ -154,6 +207,11 @@ class ChunkingOptions:
"""
return "\n\n"
@lazyproperty
def text_splitting_separators(self) -> Tuple[str, ...]:
"""Sequence of text-splitting target strings to be used in order of preference."""
return tuple(self._text_splitting_separators)
def _validate(self) -> None:
"""Raise ValueError if requestion option-set is invalid."""
max_characters = self._max_characters
@ -187,6 +245,90 @@ class ChunkingOptions:
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
class _TextSplitter:
"""Provides a text-splitting function configured on construction.
Text is split on the best-available separator, falling-back from the preferred separator
through a sequence of alternate separators.
- The separator is removed by splitting so only whitespace strings are suitable separators.
- A "blank-line" ("\n\n") is unlikely to occur in an element as it would have been used as an
element boundary during partitioning.
This is a *callable* object. Constructing it essentially produces a function:
split = _TextSplitter(opts)
fragment, remainder = split(s)
This allows it to be configured with length-options etc. on construction and used throughout a
chunking operation on a given element-stream.
"""
def __init__(self, opts: ChunkingOptions):
self._opts = opts
def __call__(self, s: str) -> Tuple[str, str]:
"""Return pair of strings split from `s` on the best match of configured patterns.
The first string is the split, the second is the remainder of the string. The split string
will never be longer than `maxlen`. The separators are tried in order until a match is
found. The last separator is "" which matches between any two characters so there will
always be a split.
The separator is removed and does not appear in the split or remainder.
An `s` that is already less than the maximum length is returned unchanged with no remainder.
This allows this function to be called repeatedly with the remainder until it is consumed
and returns a remainder of "".
"""
maxlen = self._opts.hard_max
if len(s) <= maxlen:
return s, ""
for p, length in self._patterns:
# -- length of separator must be added to include that separator when it happens to be
# -- located exactly at maxlen. Otherwise the search-from-end regex won't find it.
fragment, remainder = self._split_from_maxlen(p, maxlen + length, s)
if not fragment:
continue
return fragment.rstrip(), remainder.lstrip()
# -- the terminal "" pattern is not actually executed via regex since its implementation is
# -- trivial and provides a hard back-stop here in this method.
return s[:maxlen].rstrip(), s[maxlen:].lstrip()
@lazyproperty
def _patterns(self) -> Tuple[Tuple[regex.Pattern[str], int], ...]:
"""Sequence of (pattern, len) pairs to match against.
Patterns appear in order of preference, those following are "fall-back" patterns to be used
if no match of a prior pattern is found.
NOTE these regexes search *from the end of the string*, which is what the "(?r)" bit
specifies. This is much more efficient than starting at the beginning of the string which
could result in hundreds of matches before the desired one.
"""
separators = self._opts.text_splitting_separators
return tuple((regex.compile(f"(?r){sep}"), len(sep)) for sep in separators)
@staticmethod
def _split_from_maxlen(pattern: regex.Pattern[str], maxlen: int, s: str) -> Tuple[str, str]:
"""Return (split, remainder) pair split from `s` on the right-most match before `maxlen`.
Returns `"", s` if no suitable match was found. The first string in the pair will never be
longer than `maxlen` and there is no longer split available using `pattern`.
The separator is removed and does not appear in either the split or remainder.
"""
match = pattern.search(s[:maxlen])
if match is None:
return "", s
start: int = match.start()
end: int = match.end()
return s[:start], s[end:]
# ================================================================================================
# BASE PRE-CHUNKER
# ================================================================================================
@ -276,27 +418,28 @@ class TablePreChunk:
def iter_chunks(self) -> Iterator[Table | TableChunk]:
"""Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
text = self._table.text
html = self._table.metadata.text_as_html or ""
split = self._opts.split
text_remainder = self._table.text
html_remainder = self._table.metadata.text_as_html or ""
maxlen = self._opts.hard_max
# -- only chunk a table when it's too big to swallow whole --
if len(text) <= maxlen and len(html) <= maxlen:
if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
yield self._table
return
is_continuation = False
while text or html:
# -- split off the next maxchars into the next TableChunk --
text_chunk, text = text[:maxlen], text[maxlen:]
table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata))
while text_remainder or html_remainder:
# -- split off the next chunk-worth of characters into a TableChunk --
chunk_text, text_remainder = split(text_remainder)
table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
# -- HTML elements that *correspond* to the TextChunk.text fragment.
if html:
html_chunk, html = html[:maxlen], html[maxlen:]
table_chunk.metadata.text_as_html = html_chunk
if html_remainder:
chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
table_chunk.metadata.text_as_html = chunk_html
# -- mark second and later chunks as a continuation --
if is_continuation:
@ -332,17 +475,14 @@ class TextPreChunk:
def iter_chunks(self) -> Iterator[CompositeElement]:
    """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
    # -- all length decisions are delegated to the configured text-splitting function; once the
    # -- remaining text fits within the window it is returned whole with an empty remainder.
    split = self._opts.split
    metadata = self._consolidated_metadata

    remainder = self._text
    while remainder:
        fragment, remainder = split(remainder)
        yield CompositeElement(text=fragment, metadata=metadata)
@lazyproperty
def text_length(self) -> int: