rfctr(chunking): split oversized chunks on word boundary (#2297)

The text of an oversized chunk is split on an arbitrary character
boundary (mid-word). The `chunk_by_character()` strategy introduces the
idea of allowing the user to specify a separator to use for
chunk-splitting. For `langchain` this is typically "\n\n", "\n", or " ";
blank-line, newline, or word boundaries respectively.

Even if the user is allowed to specify a separator, we must provide a
fall-back for when a chunk contains no such character. This can be done
incrementally: a blank-line split is preferable to a newline split, a
newline split is preferable to a word split, and a word split is
preferable to an arbitrary-character split.

Further, there is nothing particular to `chunk_by_character()` in
providing such a fall-back text-splitting strategy. It would be
preferable, for example, for all chunking strategies to split oversized
chunks on word boundaries rather than mid-word.

Note that while a "blank-line" ("\n\n") may be common in plain text, it
is unlikely to appear in the text of an element because it would have
been interpreted as an element boundary during partitioning.

Add _TextSplitter with basic separator preferences and fall-back and
apply it to chunk-splitting for all strategies. The `by_character`
chunking strategy may enhance this behavior by adding the option for a
user to specify a particular separator suited to their use case.
This commit is contained in:
Steve Canny 2023-12-20 21:45:36 -08:00 committed by GitHub
parent 4533bdac98
commit 093a11d058
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 276 additions and 34 deletions

View File

@ -1,3 +1,11 @@
## 0.11.7-dev0
### Enhancements
### Features
### Fixes
## 0.11.6
### Enhancements

View File

@ -4,7 +4,7 @@
from __future__ import annotations
from typing import List
from typing import List, Sequence
import pytest
@ -16,6 +16,7 @@ from unstructured.chunking.base import (
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
_TextSplitter,
is_in_next_section,
is_on_next_page,
is_title,
@ -39,7 +40,7 @@ from unstructured.documents.elements import (
class DescribeChunkingOptions:
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions` objects."""
@pytest.mark.parametrize("max_characters", [0, -1, -42])
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
@ -144,6 +145,97 @@ class DescribeChunkingOptions:
assert ChunkingOptions.new().text_separator == "\n\n"
class Describe_TextSplitter:
    """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""

    def it_splits_on_a_preferred_separator_when_it_can(self):
        opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=("\n", " "))
        split = _TextSplitter(opts)
        text = (
            "Lorem ipsum dolor amet consectetur adipiscing.\n"
            "In rhoncus ipsum sed lectus porta volutpat."
        )

        fragment, remainder = split(text)
        assert (fragment, remainder) == (
            "Lorem ipsum dolor amet consectetur adipiscing.",
            "In rhoncus ipsum sed lectus porta volutpat.",
        )
        # -- and the remainder is consumed whole on the next call --
        fragment, remainder = split(remainder)
        assert (fragment, remainder) == ("In rhoncus ipsum sed lectus porta volutpat.", "")

    def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
        opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
        split = _TextSplitter(opts)
        text = (
            "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
            " volutpat."
        )

        fragment, remainder = split(text)
        assert (fragment, remainder) == (
            "Lorem ipsum dolor amet consectetur",
            "adipiscing. In rhoncus ipsum sed lectus porta volutpat.",
        )
        fragment, remainder = split(remainder)
        assert (fragment, remainder) == (
            "adipiscing. In rhoncus ipsum sed lectus",
            "porta volutpat.",
        )
        fragment, remainder = split(remainder)
        assert (fragment, remainder) == ("porta volutpat.", "")

    def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
        opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
        split = _TextSplitter(opts)
        text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."

        fragment, remainder = split(text)
        assert (fragment, remainder) == (
            "Loremipsumdolorametconsecteturadipiscing",
            "elit. In rhoncus ipsum sed lectus porta.",
        )
        fragment, remainder = split(remainder)
        assert (fragment, remainder) == ("elit. In rhoncus ipsum sed lectus porta.", "")

    @pytest.mark.parametrize(
        "text",
        [
            "Lorem ipsum dolor amet consectetur adipiscing.",  # 46-chars
            "Lorem ipsum dolor.",  # 18-chars
        ],
    )
    def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
        opts = ChunkingOptions.new(max_characters=46)
        split = _TextSplitter(opts)

        fragment, remainder = split(text)

        assert fragment == text
        assert remainder == ""

    def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
        opts = ChunkingOptions.new(max_characters=38)
        split = _TextSplitter(opts)
        text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."

        fragment, _ = split(text)

        assert fragment == "Loremipsumdolorametconsecteturadipisci"
        assert len(fragment) == 38

    @pytest.mark.parametrize("separators", [("\n", " "), ()])
    def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
        opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=separators)
        split = _TextSplitter(opts)
        text = "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus."
        # |------------------------------------------------^ 50-chars

        fragment, remainder = split(text)

        assert fragment == "Lorem ipsum dolor amet consectetur adipiscing."
        assert remainder == "In rhoncus ipsum sed lectus."
# ================================================================================================
# BASE PRE-CHUNKER
# ================================================================================================
@ -263,7 +355,7 @@ class DescribeTablePreChunk:
)
pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
opts=ChunkingOptions.new(max_characters=100),
opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")),
)
chunk_iter = pre_chunk.iter_chunks()
@ -273,8 +365,7 @@ class DescribeTablePreChunk:
assert chunk.text == (
"Header Col 1 Header Col 2\n"
"Lorem ipsum dolor sit amet\n"
"Consectetur adipiscing elit\n"
"Nunc aliqua"
"Consectetur adipiscing elit"
)
assert chunk.metadata.text_as_html == (
"<table>\n"
@ -287,8 +378,8 @@ class DescribeTablePreChunk:
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert (
chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
assert chunk.text == (
"Nunc aliquam id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
)
assert chunk.metadata.text_as_html == (
"rem ipsum </td><td>A Link example</td></tr>\n"
@ -399,7 +490,7 @@ class DescribeTextPreChunk:
" commodo consequat."
),
],
opts=ChunkingOptions.new(max_characters=200),
opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")),
)
chunk_iter = pre_chunk.iter_chunks()
@ -408,12 +499,12 @@ class DescribeTextPreChunk:
assert chunk == CompositeElement(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
" veniam, quis nostrud exercitation ullamco laboris nisi ut a"
" veniam, quis nostrud exercitation ullamco laboris nisi ut"
)
assert chunk.metadata is pre_chunk._consolidated_metadata
# --
chunk = next(chunk_iter)
assert chunk == CompositeElement("liquip ex ea commodo consequat.")
assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
assert chunk.metadata is pre_chunk._consolidated_metadata
# --
with pytest.raises(StopIteration):

View File

@ -35,7 +35,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
assert chunks == [
CompositeElement("Introduction"),
CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing "),
CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing"),
CompositeElement("elit. In rhoncus ipsum sed lectus porta volutpat."),
]

View File

@ -1100,7 +1100,10 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
assert len(partitioned_table_elements_5_chars) != len(table_elements)
assert len(partitioned_table_elements_200_chars) != len(table_elements)
assert len(partitioned_table_elements_5_chars[0].text) == 5
# trailing whitespace is stripped from the first chunk, leaving only a checkbox character
assert len(partitioned_table_elements_5_chars[0].text) == 1
# but the second chunk is the full 5 characters
assert len(partitioned_table_elements_5_chars[1].text) == 5
assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5
# the first table element is under 200 chars so doesn't get chunked!

View File

@ -1 +1 @@
__version__ = "0.11.6" # pragma: no cover
__version__ = "0.11.7-dev0" # pragma: no cover

View File

@ -18,6 +18,7 @@ from typing import (
cast,
)
import regex
from typing_extensions import Self, TypeAlias
from unstructured.documents.elements import (
@ -45,7 +46,46 @@ PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
class ChunkingOptions:
"""Specifies parameters of optional chunking behaviors."""
"""Specifies parameters of optional chunking behaviors.
Parameters
----------
max_characters
Hard-maximum text-length of chunk. A chunk longer than this will be split mid-text and be
emitted as two or more chunks.
new_after_n_chars
Preferred approximate chunk size. A chunk composed of elements totalling this size or
greater is considered "full" and will not be enlarged by adding another element, even if it
will fit within the remaining `max_characters` for that chunk. Defaults to `max_characters`
when not specified, which effectively disables this behavior. Specifying 0 for this
argument causes each element to appear in a chunk by itself (although an element with text
longer than `max_characters` will be still be split into two or more chunks).
multipage_sections
Indicates that page-boundaries should not be respected while chunking, i.e. elements
appearing on two different pages can appear in the same chunk.
combine_text_under_n_chars
Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
relevant for a chunking strategy that specifies higher-level semantic boundaries to be
respected, like "section" or "page". Recursively combines two adjacent pre-chunks when the
first pre-chunk is smaller than this threshold. "Recursively" here means the resulting
pre-chunk can be combined with the next pre-chunk if it is still under the length threshold.
Defaults to `max_characters` which combines chunks whenever space allows. Specifying 0 for
this argument suppresses combining of small chunks. Note this value is "capped" at the
`new_after_n_chars` value since a value higher than that would not change this parameter's
effect.
overlap
Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the
next chunk as a context-preserving mechanism. By default, this only applies to split-chunks
where an oversized element is divided into multiple chunks by text-splitting.
text_splitting_separators
A sequence of strings like `("\n", " ")` to be used as target separators during
text-splitting. Text-splitting only applies to splitting an oversized element into two or
more chunks. These separators are tried in the specified order until one is found in the
string to be split. The default separator is `""` which matches between any two characters.
This separator should not be specified in this sequence because it is always the separator
of last-resort. Note that because the separator is removed during text-splitting, only
whitespace character sequences are suitable.
"""
def __init__(
self,
@ -54,12 +94,14 @@ class ChunkingOptions:
multipage_sections: bool = True,
new_after_n_chars: Optional[int] = None,
overlap: int = 0,
text_splitting_separators: Sequence[str] = (),
):
self._combine_text_under_n_chars_arg = combine_text_under_n_chars
self._max_characters = max_characters
self._multipage_sections = multipage_sections
self._new_after_n_chars_arg = new_after_n_chars
self._overlap = overlap
self._text_splitting_separators = text_splitting_separators
@classmethod
def new(
@ -69,6 +111,7 @@ class ChunkingOptions:
multipage_sections: bool = True,
new_after_n_chars: Optional[int] = None,
overlap: int = 0,
text_splitting_separators: Sequence[str] = (),
) -> Self:
"""Construct validated instance.
@ -80,6 +123,7 @@ class ChunkingOptions:
multipage_sections,
new_after_n_chars,
overlap,
text_splitting_separators,
)
self._validate()
return self
@ -144,6 +188,15 @@ class ChunkingOptions:
else new_after_n_chars
)
@lazyproperty
def split(self) -> Callable[[str], Tuple[str, str]]:
"""A text-splitting function suitable for splitting the text of an oversized pre-chunk.
The function is pre-configured with the chosen chunking window size and any other applicable
options specified by the caller as part of this chunking-options instance.
"""
return _TextSplitter(self)
@lazyproperty
def text_separator(self) -> str:
"""The string to insert between elements when concatenating their text for a chunk.
@ -154,6 +207,11 @@ class ChunkingOptions:
"""
return "\n\n"
@lazyproperty
def text_splitting_separators(self) -> Tuple[str, ...]:
"""Sequence of text-splitting target strings to be used in order of preference."""
return tuple(self._text_splitting_separators)
def _validate(self) -> None:
"""Raise ValueError if requestion option-set is invalid."""
max_characters = self._max_characters
@ -187,6 +245,90 @@ class ChunkingOptions:
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
class _TextSplitter:
"""Provides a text-splitting function configured on construction.
Text is split on the best-available separator, falling-back from the preferred separator
through a sequence of alternate separators.
- The separator is removed by splitting so only whitespace strings are suitable separators.
- A "blank-line" ("\n\n") is unlikely to occur in an element as it would have been used as an
element boundary during partitioning.
This is a *callable* object. Constructing it essentially produces a function:
split = _TextSplitter(opts)
fragment, remainder = split(s)
This allows it to be configured with length-options etc. on construction and used throughout a
chunking operation on a given element-stream.
"""
def __init__(self, opts: ChunkingOptions):
self._opts = opts
def __call__(self, s: str) -> Tuple[str, str]:
"""Return pair of strings split from `s` on the best match of configured patterns.
The first string is the split, the second is the remainder of the string. The split string
will never be longer than `maxlen`. The separators are tried in order until a match is
found. The last separator is "" which matches between any two characters so there will
always be a split.
The separator is removed and does not appear in the split or remainder.
An `s` that is already less than the maximum length is returned unchanged with no remainder.
This allows this function to be called repeatedly with the remainder until it is consumed
and returns a remainder of "".
"""
maxlen = self._opts.hard_max
if len(s) <= maxlen:
return s, ""
for p, length in self._patterns:
# -- length of separator must be added to include that separator when it happens to be
# -- located exactly at maxlen. Otherwise the search-from-end regex won't find it.
fragment, remainder = self._split_from_maxlen(p, maxlen + length, s)
if not fragment:
continue
return fragment.rstrip(), remainder.lstrip()
# -- the terminal "" pattern is not actually executed via regex since its implementation is
# -- trivial and provides a hard back-stop here in this method.
return s[:maxlen].rstrip(), s[maxlen:].lstrip()
@lazyproperty
def _patterns(self) -> Tuple[Tuple[regex.Pattern[str], int], ...]:
"""Sequence of (pattern, len) pairs to match against.
Patterns appear in order of preference, those following are "fall-back" patterns to be used
if no match of a prior pattern is found.
NOTE these regexes search *from the end of the string*, which is what the "(?r)" bit
specifies. This is much more efficient than starting at the beginning of the string which
could result in hundreds of matches before the desired one.
"""
separators = self._opts.text_splitting_separators
return tuple((regex.compile(f"(?r){sep}"), len(sep)) for sep in separators)
@staticmethod
def _split_from_maxlen(pattern: regex.Pattern[str], maxlen: int, s: str) -> Tuple[str, str]:
"""Return (split, remainder) pair split from `s` on the right-most match before `maxlen`.
Returns `"", s` if no suitable match was found. The first string in the pair will never be
longer than `maxlen` and there is no longer split available using `pattern`.
The separator is removed and does not appear in either the split or remainder.
"""
match = pattern.search(s[:maxlen])
if match is None:
return "", s
start: int = match.start()
end: int = match.end()
return s[:start], s[end:]
# ================================================================================================
# BASE PRE-CHUNKER
# ================================================================================================
@ -276,27 +418,28 @@ class TablePreChunk:
def iter_chunks(self) -> Iterator[Table | TableChunk]:
"""Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
text = self._table.text
html = self._table.metadata.text_as_html or ""
split = self._opts.split
text_remainder = self._table.text
html_remainder = self._table.metadata.text_as_html or ""
maxlen = self._opts.hard_max
# -- only chunk a table when it's too big to swallow whole --
if len(text) <= maxlen and len(html) <= maxlen:
if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
yield self._table
return
is_continuation = False
while text or html:
# -- split off the next maxchars into the next TableChunk --
text_chunk, text = text[:maxlen], text[maxlen:]
table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata))
while text_remainder or html_remainder:
# -- split off the next chunk-worth of characters into a TableChunk --
chunk_text, text_remainder = split(text_remainder)
table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
# -- HTML elements that *correspond* to the TextChunk.text fragment.
if html:
html_chunk, html = html[:maxlen], html[maxlen:]
table_chunk.metadata.text_as_html = html_chunk
if html_remainder:
chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
table_chunk.metadata.text_as_html = chunk_html
# -- mark second and later chunks as a continuation --
if is_continuation:
@ -332,17 +475,14 @@ class TextPreChunk:
def iter_chunks(self) -> Iterator[CompositeElement]:
    """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
    # -- all length decisions are delegated to the configured text-splitting function; once the
    # -- remaining text fits within the window it is returned whole with an empty remainder.
    split = self._opts.split
    metadata = self._consolidated_metadata

    remainder = self._text
    while remainder:
        fragment, remainder = split(remainder)
        yield CompositeElement(text=fragment, metadata=metadata)
@lazyproperty
def text_length(self) -> int: