mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
rfctr(chunking): generalize PreChunkBuilder (#2283)
To implement inter-pre-chunk overlap, we need a context that sees every pre-chunk both before and after it is accumulated (from elements). - We need access to the pre-chunk when it is completed so we can extract the "tail" overlap to be applied to the next chunk. - We need access to the as-yet-unpopulated pre-chunk so we can add the prior tail to it as a prefix. This "visibility" is split between `PreChunkBuilder` and the pre-chunker itself, which handles `TablePreChunk`s without the builder. Move `Table` element and `TablePreChunk` formation into `PreChunkBuilder` such that _all_ element types (adding `Table` elements in particular) pass through it. Then `PreChunkBuilder` becomes the context we require. The actual overlap harvesting and application will come in a subsequent commit.
This commit is contained in:
parent
9efc22c0fc
commit
0c7f64ecaa
@ -1,3 +1,11 @@
|
||||
## 0.11.6-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.11.5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -18,6 +18,7 @@ from unstructured.chunking.base import (
|
||||
)
|
||||
from unstructured.documents.elements import (
|
||||
CompositeElement,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
PageBreak,
|
||||
RegexMetadata,
|
||||
@ -572,15 +573,15 @@ class DescribePreChunkBuilder:
|
||||
def it_is_empty_on_construction(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 50
|
||||
assert builder._text_length == 0
|
||||
assert builder._remaining_space == 50
|
||||
|
||||
def it_accumulates_elements_added_to_it(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
|
||||
builder.add_element(Title("Introduction"))
|
||||
assert builder.text_length == 12
|
||||
assert builder.remaining_space == 136
|
||||
assert builder._text_length == 12
|
||||
assert builder._remaining_space == 136
|
||||
|
||||
builder.add_element(
|
||||
Text(
|
||||
@ -588,8 +589,67 @@ class DescribePreChunkBuilder:
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
)
|
||||
assert builder.text_length == 112
|
||||
assert builder.remaining_space == 36
|
||||
assert builder._text_length == 112
|
||||
assert builder._remaining_space == 36
|
||||
|
||||
@pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
|
||||
def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
||||
assert builder.will_fit(element)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("existing_element", "next_element"),
|
||||
[
|
||||
(Text("abcd"), Table("Fruits\nMango")),
|
||||
(Text("abcd"), Text("abcd " * 200)),
|
||||
(Table("Heading\nCell text"), Table("Fruits\nMango")),
|
||||
(Table("Heading\nCell text"), Text("abcd " * 200)),
|
||||
],
|
||||
)
|
||||
def but_not_when_it_already_contains_an_element_of_any_kind(
|
||||
self, existing_element: Element, next_element: Element
|
||||
):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
||||
builder.add_element(existing_element)
|
||||
|
||||
assert not builder.will_fit(next_element)
|
||||
|
||||
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
|
||||
def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
||||
builder.add_element(Table("Heading\nCell text"))
|
||||
|
||||
assert not builder.will_fit(element)
|
||||
|
||||
def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
|
||||
builder = PreChunkBuilder(
|
||||
opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
|
||||
)
|
||||
builder.add_element(
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||
)
|
||||
|
||||
assert not builder.will_fit(Text("In rhoncus ipsum."))
|
||||
|
||||
def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
|
||||
builder.add_element(
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||
)
|
||||
|
||||
# -- 55 + 2 (separator) + 44 == 101 --
|
||||
assert not builder.will_fit(
|
||||
Text("In rhoncus ipsum sed lectus portos volutpat.") # 44-chars
|
||||
)
|
||||
|
||||
def but_it_will_fit_an_element_that_fits(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
|
||||
builder.add_element(
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||
)
|
||||
|
||||
# -- 55 + 2 (separator) + 43 == 100 --
|
||||
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
|
||||
|
||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
@ -611,8 +671,24 @@ class DescribePreChunkBuilder:
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
]
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 150
|
||||
assert builder._text_length == 0
|
||||
assert builder._remaining_space == 150
|
||||
|
||||
def but_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
builder.add_element(Table("Heading\nCell text"))
|
||||
|
||||
pre_chunk = next(builder.flush())
|
||||
|
||||
# -- pre-chunk builder was reset before the yield, such that the iterator does not need to
|
||||
# -- be exhausted before clearing out the old elements and a new pre-chunk can be
|
||||
# -- accumulated immediately (first `next()` call is required however, to advance to the
|
||||
# -- yield statement).
|
||||
assert builder._text_length == 0
|
||||
assert builder._remaining_space == 150
|
||||
# -- pre-chunk is a `TablePreChunk` --
|
||||
assert isinstance(pre_chunk, TablePreChunk)
|
||||
assert pre_chunk._table == Table("Heading\nCell text")
|
||||
|
||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
@ -620,21 +696,21 @@ class DescribePreChunkBuilder:
|
||||
pre_chunks = list(builder.flush())
|
||||
|
||||
assert pre_chunks == []
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 150
|
||||
assert builder._text_length == 0
|
||||
assert builder._remaining_space == 150
|
||||
|
||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||
builder.add_element(Text("abcde"))
|
||||
builder.add_element(Text("fghij"))
|
||||
|
||||
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||
# -- ._text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
||||
assert builder.text_length == 12
|
||||
# -- .remaining_space is reduced by the length (2) of the trailing separator which would go
|
||||
# -- between the current text and that of the next element if one was added.
|
||||
assert builder._text_length == 12
|
||||
# -- ._remaining_space is reduced by the length (2) of the trailing separator which would
|
||||
# -- go between the current text and that of the next element if one was added.
|
||||
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
|
||||
assert builder.remaining_space == 36
|
||||
assert builder._remaining_space == 36
|
||||
|
||||
|
||||
class DescribePreChunkCombiner:
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.11.5" # pragma: no cover
|
||||
__version__ = "0.11.6-dev0" # pragma: no cover
|
||||
|
||||
@ -396,8 +396,8 @@ class TextPreChunk:
|
||||
class PreChunkBuilder:
|
||||
"""An element accumulator suitable for incrementally forming a pre-chunk.
|
||||
|
||||
Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
|
||||
to determine whether it should add the next element in the element stream.
|
||||
Provides the trial method `.will_fit()` a pre-chunker can use to determine whether it should add
|
||||
the next element in the element stream.
|
||||
|
||||
`.flush()` is used to build a PreChunk object from the accumulated elements. This method
|
||||
returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
|
||||
@ -426,7 +426,7 @@ class PreChunkBuilder:
|
||||
self._text_segments.append(element.text)
|
||||
self._text_len += len(element.text)
|
||||
|
||||
def flush(self) -> Iterator[TextPreChunk]:
|
||||
def flush(self) -> Iterator[PreChunk]:
|
||||
"""Generate zero-or-one `PreChunk` object and clear the accumulator.
|
||||
|
||||
Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
|
||||
@ -435,23 +435,62 @@ class PreChunkBuilder:
|
||||
"""
|
||||
if not self._elements:
|
||||
return
|
||||
|
||||
pre_chunk = (
|
||||
TablePreChunk(self._elements[0], self._opts)
|
||||
if isinstance(self._elements[0], Table)
|
||||
# -- copy list, don't use original or it may change contents as builder proceeds --
|
||||
else TextPreChunk(list(self._elements), self._opts)
|
||||
)
|
||||
# -- clear builder before yield so we're not sensitive to the timing of how/when this
|
||||
# -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately.
|
||||
elements = self._elements[:]
|
||||
self._elements.clear()
|
||||
self._text_segments.clear()
|
||||
self._text_len = 0
|
||||
yield TextPreChunk(elements, self._opts)
|
||||
# -- iterator is exhausted and can add elements for the next pre-chunk immediately.
|
||||
self._reset_state()
|
||||
yield pre_chunk
|
||||
|
||||
def will_fit(self, element: Element) -> bool:
|
||||
"""True when `element` can be added to this prechunk without violating its limits.
|
||||
|
||||
There are several limits:
|
||||
- A `Table` element will never fit with any other element. It will only fit in an empty
|
||||
pre-chunk.
|
||||
- No element will fit in a pre-chunk that already contains a `Table` element.
|
||||
- A text-element will not fit in a pre-chunk that already exceeds the soft-max
|
||||
(aka. new_after_n_chars).
|
||||
- A text-element will not fit when together with the elements already present it would
|
||||
exceed the hard-max (aka. max_characters).
|
||||
"""
|
||||
# -- an empty pre-chunk will accept any element (including an oversized-element) --
|
||||
if len(self._elements) == 0:
|
||||
return True
|
||||
# -- a `Table` will not fit in a non-empty pre-chunk --
|
||||
if isinstance(element, Table):
|
||||
return False
|
||||
# -- no element will fit in a pre-chunk that already contains a `Table` element --
|
||||
if self._elements and isinstance(self._elements[0], Table):
|
||||
return False
|
||||
# -- a pre-chunk that already exceeds the soft-max is considered "full" --
|
||||
if self._text_length > self._opts.soft_max:
|
||||
return False
|
||||
# -- don't add an element if it would increase total size beyond the hard-max --
|
||||
if self._remaining_space < len(element.text):
|
||||
return False
|
||||
return True
|
||||
|
||||
@property
|
||||
def remaining_space(self) -> int:
|
||||
def _remaining_space(self) -> int:
|
||||
"""Maximum text-length of an element that can be added without exceeding maxlen."""
|
||||
# -- include length of trailing separator that will go before next element text --
|
||||
separators_len = self._separator_len * len(self._text_segments)
|
||||
return self._opts.hard_max - self._text_len - separators_len
|
||||
|
||||
def _reset_state(self) -> None:
|
||||
"""Set working-state values back to "empty", ready to accumulate next pre-chunk."""
|
||||
self._elements.clear()
|
||||
self._text_segments.clear()
|
||||
self._text_len = 0
|
||||
|
||||
@property
|
||||
def text_length(self) -> int:
|
||||
def _text_length(self) -> int:
|
||||
"""Length of the text in this pre-chunk.
|
||||
|
||||
This value represents the chunk-size that would result if this pre-chunk was flushed in its
|
||||
@ -502,10 +541,16 @@ class PreChunkCombiner:
|
||||
|
||||
|
||||
class TextPreChunkAccumulator:
|
||||
"""Accumulates, measures, and combines pre-chunk objects.
|
||||
"""Accumulates, measures, and combines text pre-chunks.
|
||||
|
||||
Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
|
||||
whether to add another pre-chunk.
|
||||
Used for combining pre-chunks for chunking strategies like "by-title" that can potentially
|
||||
produce undersized chunks and offer the `combine_text_under_n_chars` option. Note that only
|
||||
sequential `TextPreChunk` objects can be combined. A `TablePreChunk` is never combined with
|
||||
another pre-chunk.
|
||||
|
||||
Provides `.add_pre_chunk()` allowing a pre-chunk to be added to the chunk and provides
|
||||
monitoring properties `.remaining_space` and `.text_length` suitable for deciding whether to add
|
||||
another pre-chunk.
|
||||
|
||||
`.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
|
||||
This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used
|
||||
|
||||
@ -12,13 +12,8 @@ from unstructured.chunking.base import (
|
||||
PreChunk,
|
||||
PreChunkBuilder,
|
||||
PreChunkCombiner,
|
||||
TablePreChunk,
|
||||
)
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
Table,
|
||||
Title,
|
||||
)
|
||||
from unstructured.documents.elements import Element, Title
|
||||
|
||||
|
||||
def chunk_by_title(
|
||||
@ -109,24 +104,17 @@ def _split_elements_by_title_and_table(
|
||||
|
||||
# -- start new pre_chunk when necessary --
|
||||
if (
|
||||
# -- Title and Table both start a new pre_chunk --
|
||||
isinstance(element, (Title, Table))
|
||||
# -- adding this element would exceed hard-maxlen for pre_chunk --
|
||||
or pre_chunk_builder.remaining_space < len(str(element))
|
||||
# -- pre_chunk already meets or exceeds soft-maxlen --
|
||||
or pre_chunk_builder.text_length >= opts.soft_max
|
||||
# -- Title starts a new "section" and so a new pre_chunk --
|
||||
isinstance(element, Title)
|
||||
# -- start a new pre-chunk when the WIP pre-chunk is already full --
|
||||
or not pre_chunk_builder.will_fit(element)
|
||||
# -- a semantic boundary is indicated by metadata change since prior element --
|
||||
or metadata_differs
|
||||
):
|
||||
# -- complete any work-in-progress pre_chunk --
|
||||
yield from pre_chunk_builder.flush()
|
||||
|
||||
# -- emit table and checkbox immediately since they are always isolated --
|
||||
if isinstance(element, Table):
|
||||
yield TablePreChunk(table=element, opts=opts)
|
||||
# -- but accumulate text elements for consolidation into a composite chunk --
|
||||
else:
|
||||
pre_chunk_builder.add_element(element)
|
||||
pre_chunk_builder.add_element(element)
|
||||
|
||||
prior_element = element
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user