rfctr(chunking): generalize PreChunkBuilder (#2283)

To implement inter-pre-chunk overlap, we need a context that sees every
pre-chunk both before and after it is accumulated (from elements).

- We need access to the pre-chunk when it is completed so we can extract
the "tail" overlap to be applied to the next chunk.
- We need access to the as-yet-unpopulated pre-chunk so we can add the
prior tail to it as a prefix.

This "visibility" is split between `PreChunkBuilder` and the pre-chunker
itself, which handles `TablePreChunk`s without the builder.

Move `Table` element and `TablePreChunk` formation into `PreChunkBuilder`
such that _all_ element types (adding `Table` elements in particular)
pass through it. Then `PreChunkBuilder` becomes the context we require.

The actual overlap harvesting and application will come in a subsequent
commit.
This commit is contained in:
Steve Canny 2023-12-18 14:21:34 -08:00 committed by GitHub
parent 9efc22c0fc
commit 0c7f64ecaa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 165 additions and 48 deletions

View File

@ -1,3 +1,11 @@
## 0.11.6-dev0
### Enhancements
### Features
### Fixes
## 0.11.5
### Enhancements

View File

@ -18,6 +18,7 @@ from unstructured.chunking.base import (
)
from unstructured.documents.elements import (
CompositeElement,
Element,
ElementMetadata,
PageBreak,
RegexMetadata,
@ -572,15 +573,15 @@ class DescribePreChunkBuilder:
def it_is_empty_on_construction(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
assert builder.text_length == 0
assert builder.remaining_space == 50
assert builder._text_length == 0
assert builder._remaining_space == 50
def it_accumulates_elements_added_to_it(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
builder.add_element(Title("Introduction"))
assert builder.text_length == 12
assert builder.remaining_space == 136
assert builder._text_length == 12
assert builder._remaining_space == 136
builder.add_element(
Text(
@ -588,8 +589,67 @@ class DescribePreChunkBuilder:
"lectus porta volutpat.",
),
)
assert builder.text_length == 112
assert builder.remaining_space == 36
assert builder._text_length == 112
assert builder._remaining_space == 36
@pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
assert builder.will_fit(element)
@pytest.mark.parametrize(
("existing_element", "next_element"),
[
(Text("abcd"), Table("Fruits\nMango")),
(Text("abcd"), Text("abcd " * 200)),
(Table("Heading\nCell text"), Table("Fruits\nMango")),
(Table("Heading\nCell text"), Text("abcd " * 200)),
],
)
def but_not_when_it_already_contains_an_element_of_any_kind(
self, existing_element: Element, next_element: Element
):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
builder.add_element(existing_element)
assert not builder.will_fit(next_element)
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
builder.add_element(Table("Heading\nCell text"))
assert not builder.will_fit(element)
def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
builder = PreChunkBuilder(
opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
)
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)
assert not builder.will_fit(Text("In rhoncus ipsum."))
def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)
# -- 55 + 2 (separator) + 44 == 101 --
assert not builder.will_fit(
Text("In rhoncus ipsum sed lectus portos volutpat.") # 44-chars
)
def but_it_will_fit_an_element_that_fits(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)
# -- 55 + 2 (separator) + 43 == 100 --
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
@ -611,8 +671,24 @@ class DescribePreChunkBuilder:
"lectus porta volutpat.",
),
]
assert builder.text_length == 0
assert builder.remaining_space == 150
assert builder._text_length == 0
assert builder._remaining_space == 150
def but_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
builder.add_element(Table("Heading\nCell text"))
pre_chunk = next(builder.flush())
# -- pre-chunk builder was reset before the yield, such that the iterator does not need to
# -- be exhausted before clearing out the old elements and a new pre-chunk can be
# -- accumulated immediately (first `next()` call is required however, to advance to the
# -- yield statement).
assert builder._text_length == 0
assert builder._remaining_space == 150
# -- pre-chunk is a `TablePreChunk` --
assert isinstance(pre_chunk, TablePreChunk)
assert pre_chunk._table == Table("Heading\nCell text")
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
@ -620,21 +696,21 @@ class DescribePreChunkBuilder:
pre_chunks = list(builder.flush())
assert pre_chunks == []
assert builder.text_length == 0
assert builder.remaining_space == 150
assert builder._text_length == 0
assert builder._remaining_space == 150
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
builder.add_element(Text("abcde"))
builder.add_element(Text("fghij"))
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
# -- ._text_length includes a separator ("\n\n", len==2) between each text-segment,
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
assert builder.text_length == 12
# -- .remaining_space is reduced by the length (2) of the trailing separator which would go
# -- between the current text and that of the next element if one was added.
assert builder._text_length == 12
# -- ._remaining_space is reduced by the length (2) of the trailing separator which would
# -- go between the current text and that of the next element if one was added.
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
assert builder.remaining_space == 36
assert builder._remaining_space == 36
class DescribePreChunkCombiner:

View File

@ -1 +1 @@
__version__ = "0.11.5" # pragma: no cover
__version__ = "0.11.6-dev0" # pragma: no cover

View File

@ -396,8 +396,8 @@ class TextPreChunk:
class PreChunkBuilder:
"""An element accumulator suitable for incrementally forming a pre-chunk.
Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
to determine whether it should add the next element in the element stream.
Provides the trial method `.will_fit()` a pre-chunker can use to determine whether it should add
the next element in the element stream.
`.flush()` is used to build a PreChunk object from the accumulated elements. This method
returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
@ -426,7 +426,7 @@ class PreChunkBuilder:
self._text_segments.append(element.text)
self._text_len += len(element.text)
def flush(self) -> Iterator[TextPreChunk]:
def flush(self) -> Iterator[PreChunk]:
"""Generate zero-or-one `PreChunk` object and clear the accumulator.
Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
@ -435,23 +435,62 @@ class PreChunkBuilder:
"""
if not self._elements:
return
pre_chunk = (
TablePreChunk(self._elements[0], self._opts)
if isinstance(self._elements[0], Table)
# -- copy list, don't use original or it may change contents as builder proceeds --
else TextPreChunk(list(self._elements), self._opts)
)
# -- clear builder before yield so we're not sensitive to the timing of how/when this
# -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately.
elements = self._elements[:]
self._elements.clear()
self._text_segments.clear()
self._text_len = 0
yield TextPreChunk(elements, self._opts)
# -- iterator is exhausted and can add elements for the next pre-chunk immediately.
self._reset_state()
yield pre_chunk
def will_fit(self, element: Element) -> bool:
"""True when `element` can be added to this prechunk without violating its limits.
There are several limits:
- A `Table` element will never fit with any other element. It will only fit in an empty
pre-chunk.
- No element will fit in a pre-chunk that already contains a `Table` element.
- A text-element will not fit in a pre-chunk that already exceeds the soft-max
(aka. new_after_n_chars).
- A text-element will not fit when together with the elements already present it would
exceed the hard-max (aka. max_characters).
"""
# -- an empty pre-chunk will accept any element (including an oversized-element) --
if len(self._elements) == 0:
return True
# -- a `Table` will not fit in a non-empty pre-chunk --
if isinstance(element, Table):
return False
# -- no element will fit in a pre-chunk that already contains a `Table` element --
if self._elements and isinstance(self._elements[0], Table):
return False
# -- a pre-chunk that already exceeds the soft-max is considered "full" --
if self._text_length > self._opts.soft_max:
return False
# -- don't add an element if it would increase total size beyond the hard-max --
if self._remaining_space < len(element.text):
return False
return True
@property
def remaining_space(self) -> int:
def _remaining_space(self) -> int:
"""Maximum text-length of an element that can be added without exceeding maxlen."""
# -- include length of trailing separator that will go before next element text --
separators_len = self._separator_len * len(self._text_segments)
return self._opts.hard_max - self._text_len - separators_len
def _reset_state(self) -> None:
"""Set working-state values back to "empty", ready to accumulate next pre-chunk."""
self._elements.clear()
self._text_segments.clear()
self._text_len = 0
@property
def text_length(self) -> int:
def _text_length(self) -> int:
"""Length of the text in this pre-chunk.
This value represents the chunk-size that would result if this pre-chunk was flushed in its
@ -502,10 +541,16 @@ class PreChunkCombiner:
class TextPreChunkAccumulator:
"""Accumulates, measures, and combines pre-chunk objects.
"""Accumulates, measures, and combines text pre-chunks.
Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
whether to add another pre-chunk.
Used for combining pre-chunks for chunking strategies like "by-title" that can potentially
produce undersized chunks and offer the `combine_text_under_n_chars` option. Note that only
sequential `TextPreChunk` objects can be combined. A `TablePreChunk` is never combined with
another pre-chunk.
Provides `.add_pre_chunk()` allowing a pre-chunk to be added to the chunk and provides
monitoring properties `.remaining_space` and `.text_length` suitable for deciding whether to add
another pre-chunk.
`.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used

View File

@ -12,13 +12,8 @@ from unstructured.chunking.base import (
PreChunk,
PreChunkBuilder,
PreChunkCombiner,
TablePreChunk,
)
from unstructured.documents.elements import (
Element,
Table,
Title,
)
from unstructured.documents.elements import Element, Title
def chunk_by_title(
@ -109,24 +104,17 @@ def _split_elements_by_title_and_table(
# -- start new pre_chunk when necessary --
if (
# -- Title and Table both start a new pre_chunk --
isinstance(element, (Title, Table))
# -- adding this element would exceed hard-maxlen for pre_chunk --
or pre_chunk_builder.remaining_space < len(str(element))
# -- pre_chunk already meets or exceeds soft-maxlen --
or pre_chunk_builder.text_length >= opts.soft_max
# -- Title starts a new "section" and so a new pre_chunk --
isinstance(element, Title)
# -- start a new pre-chunk when the WIP pre-chunk is already full --
or not pre_chunk_builder.will_fit(element)
# -- a semantic boundary is indicated by metadata change since prior element --
or metadata_differs
):
# -- complete any work-in-progress pre_chunk --
yield from pre_chunk_builder.flush()
# -- emit table and checkbox immediately since they are always isolated --
if isinstance(element, Table):
yield TablePreChunk(table=element, opts=opts)
# -- but accumulate text elements for consolidation into a composite chunk --
else:
pre_chunk_builder.add_element(element)
pre_chunk_builder.add_element(element)
prior_element = element