rfctr(chunking): extract general-purpose objects to base (#2281)

Many of the classes defined in `unstructured.chunking.title` are
applicable to any chunking strategy and will shortly be used for the
"by-character" chunking strategy as well.

Move these and their tests to `unstructured.chunking.base`.

Along the way, rename `TextPreChunkBuilder` to `PreChunkBuilder` because
it will be generalized in a subsequent PR to also take `Table` elements
such that inter-pre-chunk overlap can be implemented.

Otherwise, no logic changes, just moves.
Authored by Steve Canny on 2023-12-16 09:28:15 -08:00; committed via GitHub.
parent a7c3f5f570
commit 36e81c3367
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 1297 additions and 1267 deletions

View File

@ -1,4 +1,4 @@
## 0.11.5-dev1
## 0.11.5-dev2
### Enhancements

View File

@ -1,14 +1,35 @@
# pyright: reportPrivateUsage=false
"""Unit-test suite for the `unstructured.chunking.base` module."""
from __future__ import annotations
from typing import List
import pytest
from unstructured.chunking.base import ChunkingOptions
from unstructured.chunking.base import (
ChunkingOptions,
PreChunkBuilder,
PreChunkCombiner,
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
)
from unstructured.documents.elements import (
CompositeElement,
ElementMetadata,
PageBreak,
RegexMetadata,
Table,
TableChunk,
Text,
Title,
)
class DescribeChunkingOptions:
"""Unit-test suite for `unstructured.chunking.model.ChunkingOptions objects."""
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
@pytest.mark.parametrize("max_characters", [0, -1, -42])
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
@ -111,3 +132,847 @@ class DescribeChunkingOptions:
def it_knows_the_text_separator_string(self):
assert ChunkingOptions.new().text_separator == "\n\n"
# ================================================================================================
# PRE-CHUNK SUBTYPES
# ================================================================================================
class DescribeTablePreChunk:
    """Unit-test suite for `unstructured.chunking.base.TablePreChunk objects."""

    def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
        html_table = (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
            "</tbody>\n"
            "</table>"
        )
        text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            opts=ChunkingOptions.new(max_characters=175),
        )

        chunk_iter = pre_chunk.iter_chunks()

        # -- the whole table fits in one window, so a single `Table` chunk results --
        chunk = next(chunk_iter)
        assert isinstance(chunk, Table)
        assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
        assert chunk.metadata.text_as_html == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
            "</tbody>\n"
            "</table>"
        )
        with pytest.raises(StopIteration):
            next(chunk_iter)

    def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
        # fixed-overhead = 8+8+9+8+9+8 = 50
        # per-row overhead = 27
        html_table = (
            "<table>\n"  # 8
            "<thead>\n"  # 8
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"  # 9
            "<tbody>\n"  # 8
            "<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
            "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
            "</tbody>\n"  # 9
            "</table>"  # 8
        )
        text_table = (
            "Header Col 1 Header Col 2\n"
            "Lorem ipsum dolor sit amet\n"
            "Consectetur adipiscing elit\n"
            "Nunc aliquam id enim nec molestie\n"
            "Vivamus quis nunc ipsum donec ac fermentum"
        )
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            opts=ChunkingOptions.new(max_characters=100),
        )

        chunk_iter = pre_chunk.iter_chunks()

        # -- both text and HTML are split at the 100-char window boundary --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == (
            "Header Col 1 Header Col 2\n"
            "Lorem ipsum dolor sit amet\n"
            "Consectetur adipiscing elit\n"
            "Nunc aliqua"
        )
        assert chunk.metadata.text_as_html == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lo"
        )
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert (
            chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
        )
        assert chunk.metadata.text_as_html == (
            "rem ipsum </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur </td><td>adipiscing elit</td><"
        )
        # -- note that text runs out but HTML continues because it's significantly longer. So two
        # -- of these chunks have HTML but no text.
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == ""
        assert chunk.metadata.text_as_html == (
            "/tr>\n"
            "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis </td><td>"
        )
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == ""
        assert chunk.metadata.text_as_html == (
            "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
        )
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
class DescribeTextPreChunk:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunk objects."""

    def it_can_combine_itself_with_another_TextPreChunk_instance(self):
        """.combine() produces a new pre-chunk by appending the elements of `other_pre_chunk`.

        Note that neither the original nor the other pre_chunk is mutated.
        """
        opts = ChunkingOptions.new()
        pre_chunk = TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
            ],
            opts=opts,
        )
        other_pre_chunk = TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )

        new_pre_chunk = pre_chunk.combine(other_pre_chunk)

        # -- the result holds the elements of both operands, in order --
        assert new_pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )
        # -- and neither operand was changed --
        assert pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
            ],
            opts=opts,
        )
        assert other_pre_chunk == TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )

    def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
        pre_chunk = TextPreChunk(
            [
                Title("Introduction"),
                Text(
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                    "lectus porta volutpat.",
                ),
            ],
            opts=ChunkingOptions.new(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
            " In rhoncus ipsum sedlectus porta volutpat.",
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata

    def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
        # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
        # -- The pre-chunker will isolate that element in a pre_chunk of its own.
        pre_chunk = TextPreChunk(
            [
                Text(
                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
                    " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
                    " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
                    " commodo consequat."
                ),
            ],
            opts=ChunkingOptions.new(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
            " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
            " veniam, quis nostrud exercitation ullamco laboris nisi ut a"
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata
        # --
        chunk = next(chunk_iter)
        assert chunk == CompositeElement("liquip ex ea commodo consequat.")
        assert chunk.metadata is pre_chunk._consolidated_metadata
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)

    def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
        """.text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
        pre_chunk = TextPreChunk(
            [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
        )
        # -- "foo" + "\n\n" separator + "bar" == 8; the PageBreak contributes nothing --
        assert pre_chunk.text_length == 8

    def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
        pre_chunk = TextPreChunk(
            [
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        category_depth=0,
                        filename="foo.docx",
                        languages=["lat"],
                        parent_id="f87731e0",
                    ),
                ),
                Text(
                    "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
                    metadata=ElementMetadata(
                        category_depth=1,
                        filename="foo.docx",
                        image_path="sprite.png",
                        languages=["lat", "eng"],
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        assert pre_chunk._all_metadata_values == {
            # -- scalar values are accumulated in a list in element order --
            "category_depth": [0, 1],
            # -- all values are accumulated, not only unique ones --
            "filename": ["foo.docx", "foo.docx"],
            # -- list-type fields produce a list of lists --
            "languages": [["lat"], ["lat", "eng"]],
            # -- fields that only appear in some elements are captured --
            "image_path": ["sprite.png"],
            "parent_id": ["f87731e0"],
            # -- A `None` value never appears, neither does a field-name with an empty list --
        }

    def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
        metadata = ElementMetadata(
            category_depth=0,
            filename="foo.docx",
            languages=["lat"],
            parent_id="f87731e0",
        )
        metadata.coefficient = 0.62
        metadata_2 = ElementMetadata(
            category_depth=1,
            filename="foo.docx",
            image_path="sprite.png",
            languages=["lat", "eng"],
        )
        metadata_2.quotient = 1.74
        pre_chunk = TextPreChunk(
            [
                Title("Lorem Ipsum", metadata=metadata),
                Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
            ],
            opts=ChunkingOptions.new(),
        )

        # -- ad-hoc fields "coefficient" and "quotient" do not appear --
        assert pre_chunk._all_metadata_values == {
            "category_depth": [0, 1],
            "filename": ["foo.docx", "foo.docx"],
            "image_path": ["sprite.png"],
            "languages": [["lat"], ["lat", "eng"]],
            "parent_id": ["f87731e0"],
        }

    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
        """regex_metadata of chunk is combined regex_metadatas of its elements.

        Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
        position in the chunk after element text has been concatenated.
        """
        pre_chunk = TextPreChunk(
            [
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                    ),
                ),
                Text(
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
                    metadata=ElementMetadata(
                        regex_metadata={
                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
                        },
                    ),
                ),
                Text(
                    "In rhoncus ipsum sed lectus porta volutpat.",
                    metadata=ElementMetadata(
                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        regex_metadata = pre_chunk._consolidated_regex_meta

        assert regex_metadata == {
            "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
            "ipsum": [
                RegexMetadata(text="Ipsum", start=6, end=11),
                RegexMetadata(text="ipsum", start=19, end=24),
                RegexMetadata(text="ipsum", start=81, end=86),
            ],
        }

    def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
        """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.

        Only non-None fields should appear in the dict and each field value should be the
        consolidation of the values across the pre_chunk elements.
        """
        pre_chunk = TextPreChunk(
            [
                PageBreak(""),
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        filename="foo.docx",
                        # -- category_depth has DROP strategy so doesn't appear in result --
                        category_depth=0,
                        emphasized_text_contents=["Lorem", "Ipsum"],
                        emphasized_text_tags=["b", "i"],
                        languages=["lat"],
                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                    ),
                ),
                Text(
                    "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
                    metadata=ElementMetadata(
                        # -- filename change doesn't happen IRL but demonstrates FIRST strategy --
                        filename="bar.docx",
                        # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
                        # -- appears twice in consolidated-meta (as it should) and length matches
                        # -- that of emphasized_text_tags both before and after consolidation.
                        emphasized_text_contents=["Lorem", "ipsum"],
                        emphasized_text_tags=["i", "b"],
                        # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
                        languages=["eng", "lat"],
                        # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
                        regex_metadata={
                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
                        },
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        meta_kwargs = pre_chunk._meta_kwargs

        assert meta_kwargs == {
            "filename": "foo.docx",
            "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
            "emphasized_text_tags": ["b", "i", "i", "b"],
            "languages": ["lat", "eng"],
            "regex_metadata": {
                "ipsum": [
                    RegexMetadata(text="Ipsum", start=6, end=11),
                    RegexMetadata(text="ipsum", start=19, end=24),
                ],
                "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
            },
        }

    @pytest.mark.parametrize(
        ("elements", "expected_value"),
        [
            ([Text("foo"), Text("bar")], "foo\n\nbar"),
            ([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
            ([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
            ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
        ],
    )
    def it_knows_the_concatenated_text_of_the_pre_chunk(
        self, elements: List[Text], expected_value: str
    ):
        """._text is the "joined" text of the pre-chunk elements.

        The text-segment contributed by each element is separated from the next by a blank line
        ("\n\n"). An element that contributes no text does not give rise to a separator.
        """
        pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
        assert pre_chunk._text == expected_value
# ================================================================================================
# PRE-CHUNKING ACCUMULATORS
# ================================================================================================
class DescribePreChunkBuilder:
    """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""

    def it_is_empty_on_construction(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))

        assert builder.text_length == 0
        assert builder.remaining_space == 50

    def it_accumulates_elements_added_to_it(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        builder.add_element(Title("Introduction"))
        assert builder.text_length == 12
        assert builder.remaining_space == 136

        builder.add_element(
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        )
        assert builder.text_length == 112
        assert builder.remaining_space == 36

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
        builder.add_element(Title("Introduction"))
        builder.add_element(
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        )

        pre_chunk = next(builder.flush())

        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Introduction"),
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        ]
        # -- flushing resets the builder to its empty state --
        assert builder.text_length == 0
        assert builder.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(builder.flush())

        assert pre_chunks == []
        assert builder.text_length == 0
        assert builder.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
        builder.add_element(Text("abcde"))
        builder.add_element(Text("fghij"))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
        assert builder.text_length == 12
        # -- .remaining_space is reduced by the length (2) of the trailing separator which would go
        # -- between the current text and that of the next element if one was added.
        # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
        assert builder.remaining_space == 36
class DescribePreChunkCombiner:
    """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""

    def it_combines_sequential_small_text_pre_chunks(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- all three pre-chunks are under the threshold so they merge into one --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def but_it_does_not_combine_table_pre_chunks(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            ),
            TablePreChunk(Table("Heading\nCell text"), opts=opts),
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            ),
        ]

        # -- reuse `opts` rather than constructing a duplicate ChunkingOptions, consistent
        # -- with the other tests in this class (the option values were identical)
        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- the table pre-chunk interrupts combination: text before, table, text after --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TablePreChunk)
        assert pre_chunk._table == Table("Heading\nCell text")
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_respects_the_specified_combination_threshold(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- the first two combine (each under 80 chars); the third starts a new pre-chunk
        # -- because the accumulated 139 chars exceeds combine_text_under_n_chars --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_respects_the_hard_maximum_window_length(self):
        opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
            # -- len == 214
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- adding the third pre-chunk would exceed max_characters (214 > 200), so it is
        # -- emitted separately even though it is under combine_text_under_n_chars --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
        """Such as occurs when a single element exceeds the window size."""
        opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
        pre_chunks = [
            TextPreChunk([Title("Lorem Ipsum")], opts=opts),
            TextPreChunk(  # 179
                [
                    Text(
                        "Lorem ipsum dolor sit amet consectetur adipiscing elit."  # 55
                        " Mauris nec urna non augue vulputate consequat eget et nisi."  # 60
                        " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."  # 64
                    )
                ],
                opts=opts,
            ),
            TextPreChunk([Title("Vulputate Consequat")], opts=opts),
        ]

        # -- reuse `opts` rather than constructing a duplicate ChunkingOptions, consistent
        # -- with the other tests in this class (the option values were identical)
        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- the oversized middle pre-chunk is emitted on its own, never combined --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [Title("Lorem Ipsum")]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit."
                " Mauris nec urna non augue vulputate consequat eget et nisi."
                " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
            )
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [Title("Vulputate Consequat")]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
class DescribeTextPreChunkAccumulator:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""

    def it_is_empty_on_construction(self):
        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))

        assert accum.text_length == 0
        assert accum.remaining_space == 100

    def it_accumulates_pre_chunks_added_to_it(self):
        opts = ChunkingOptions.new(max_characters=500)
        accum = TextPreChunkAccumulator(opts=opts)

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            )
        )
        assert accum.text_length == 68
        assert accum.remaining_space == 430

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            )
        )
        assert accum.text_length == 141
        assert accum.remaining_space == 357

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        opts = ChunkingOptions.new(max_characters=150)
        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            )
        )
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            )
        )
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Sed Orci"),
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
                ],
                opts=opts,
            )
        )

        pre_chunk_iter = accum.flush()

        # -- iterator generates exactly one pre_chunk --
        pre_chunk = next(pre_chunk_iter)
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
        # -- and it is a TextPreChunk containing all the elements --
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
        ]
        # -- flushing resets the accumulator to its empty state --
        assert accum.text_length == 0
        assert accum.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(accum.flush())

        assert pre_chunks == []
        assert accum.text_length == 0
        assert accum.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        opts = ChunkingOptions.new(max_characters=100)
        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
        accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
        assert accum.text_length == 12
        # -- .remaining_space is reduced by the length (2) of the trailing separator which would
        # -- go between the current text and that of the next pre-chunk if one was added.
        # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
        assert accum.remaining_space == 86

View File

@ -4,16 +4,8 @@ from typing import List
import pytest
from unstructured.chunking.base import ChunkingOptions
from unstructured.chunking.title import (
PreChunkCombiner,
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
TextPreChunkBuilder,
_split_elements_by_title_and_table,
chunk_by_title,
)
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
CheckBox,
@ -22,10 +14,8 @@ from unstructured.documents.elements import (
Element,
ElementMetadata,
ListItem,
PageBreak,
RegexMetadata,
Table,
TableChunk,
Text,
Title,
)
@ -552,843 +542,3 @@ def test_it_considers_separator_length_when_pre_chunking():
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
# == PreChunks ===================================================================================
class DescribeTablePreChunk:
    """Unit-test suite for `unstructured.chunking.title.TablePreChunk objects."""

    def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
        html_table = (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
            "</tbody>\n"
            "</table>"
        )
        text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            opts=ChunkingOptions.new(max_characters=175),
        )

        chunk_iter = pre_chunk.iter_chunks()

        # -- the whole table fits in one window, so a single `Table` chunk results --
        chunk = next(chunk_iter)
        assert isinstance(chunk, Table)
        assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
        assert chunk.metadata.text_as_html == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
            "</tbody>\n"
            "</table>"
        )
        with pytest.raises(StopIteration):
            next(chunk_iter)

    def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
        # fixed-overhead = 8+8+9+8+9+8 = 50
        # per-row overhead = 27
        html_table = (
            "<table>\n"  # 8
            "<thead>\n"  # 8
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"  # 9
            "<tbody>\n"  # 8
            "<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
            "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
            "</tbody>\n"  # 9
            "</table>"  # 8
        )
        text_table = (
            "Header Col 1 Header Col 2\n"
            "Lorem ipsum dolor sit amet\n"
            "Consectetur adipiscing elit\n"
            "Nunc aliquam id enim nec molestie\n"
            "Vivamus quis nunc ipsum donec ac fermentum"
        )
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            opts=ChunkingOptions.new(max_characters=100),
        )

        chunk_iter = pre_chunk.iter_chunks()

        # -- both text and HTML are split at the 100-char window boundary --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == (
            "Header Col 1 Header Col 2\n"
            "Lorem ipsum dolor sit amet\n"
            "Consectetur adipiscing elit\n"
            "Nunc aliqua"
        )
        assert chunk.metadata.text_as_html == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lo"
        )
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert (
            chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
        )
        assert chunk.metadata.text_as_html == (
            "rem ipsum </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur </td><td>adipiscing elit</td><"
        )
        # -- note that text runs out but HTML continues because it's significantly longer. So two
        # -- of these chunks have HTML but no text.
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == ""
        assert chunk.metadata.text_as_html == (
            "/tr>\n"
            "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis </td><td>"
        )
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == ""
        assert chunk.metadata.text_as_html == (
            "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
        )
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
class DescribeTextPreChunk:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""

    def it_can_combine_itself_with_another_TextPreChunk_instance(self):
        """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.

        Note that neither the original or other pre_chunk are mutated.
        """
        opts = ChunkingOptions.new()
        pre_chunk = TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
            ],
            opts=opts,
        )
        other_pre_chunk = TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )

        new_pre_chunk = pre_chunk.combine(other_pre_chunk)

        # -- the new pre-chunk contains the elements of both operands, in order --
        assert new_pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )
        # -- neither operand pre-chunk was mutated by the combine --
        assert pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
            ],
            opts=opts,
        )
        assert other_pre_chunk == TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )

    def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
        pre_chunk = TextPreChunk(
            [
                Title("Introduction"),
                Text(
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                    "lectus porta volutpat.",
                ),
            ],
            opts=ChunkingOptions.new(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
            " In rhoncus ipsum sedlectus porta volutpat.",
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata

    def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
        # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
        # -- The pre-chunker will isolate that element in a pre_chunk of its own.
        pre_chunk = TextPreChunk(
            [
                Text(
                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
                    " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
                    " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
                    " commodo consequat."
                ),
            ],
            opts=ChunkingOptions.new(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
            " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
            " veniam, quis nostrud exercitation ullamco laboris nisi ut a"
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata
        # --
        chunk = next(chunk_iter)
        assert chunk == CompositeElement("liquip ex ea commodo consequat.")
        assert chunk.metadata is pre_chunk._consolidated_metadata
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)

    def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
        """.text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
        pre_chunk = TextPreChunk(
            [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
        )
        # -- "foo" (3) + separator (2) + "bar" (3); the empty PageBreak contributes nothing --
        assert pre_chunk.text_length == 8

    def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
        pre_chunk = TextPreChunk(
            [
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        category_depth=0,
                        filename="foo.docx",
                        languages=["lat"],
                        parent_id="f87731e0",
                    ),
                ),
                Text(
                    "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
                    metadata=ElementMetadata(
                        category_depth=1,
                        filename="foo.docx",
                        image_path="sprite.png",
                        languages=["lat", "eng"],
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        assert pre_chunk._all_metadata_values == {
            # -- scalar values are accumulated in a list in element order --
            "category_depth": [0, 1],
            # -- all values are accumulated, not only unique ones --
            "filename": ["foo.docx", "foo.docx"],
            # -- list-type fields produce a list of lists --
            "languages": [["lat"], ["lat", "eng"]],
            # -- fields that only appear in some elements are captured --
            "image_path": ["sprite.png"],
            "parent_id": ["f87731e0"],
            # -- A `None` value never appears, neither does a field-name with an empty list --
        }

    def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
        metadata = ElementMetadata(
            category_depth=0,
            filename="foo.docx",
            languages=["lat"],
            parent_id="f87731e0",
        )
        metadata.coefficient = 0.62
        metadata_2 = ElementMetadata(
            category_depth=1,
            filename="foo.docx",
            image_path="sprite.png",
            languages=["lat", "eng"],
        )
        metadata_2.quotient = 1.74

        pre_chunk = TextPreChunk(
            [
                Title("Lorem Ipsum", metadata=metadata),
                Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
            ],
            opts=ChunkingOptions.new(),
        )

        # -- ad-hoc fields "coefficient" and "quotient" do not appear --
        assert pre_chunk._all_metadata_values == {
            "category_depth": [0, 1],
            "filename": ["foo.docx", "foo.docx"],
            "image_path": ["sprite.png"],
            "languages": [["lat"], ["lat", "eng"]],
            "parent_id": ["f87731e0"],
        }

    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
        """regex_metadata of chunk is combined regex_metadatas of its elements.

        Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
        position in the chunk after element text has been concatenated.
        """
        pre_chunk = TextPreChunk(
            [
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                    ),
                ),
                Text(
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
                    metadata=ElementMetadata(
                        regex_metadata={
                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
                        },
                    ),
                ),
                Text(
                    "In rhoncus ipsum sed lectus porta volutpat.",
                    metadata=ElementMetadata(
                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        regex_metadata = pre_chunk._consolidated_regex_meta

        # -- offsets are shifted by the length of preceding element text plus separators --
        assert regex_metadata == {
            "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
            "ipsum": [
                RegexMetadata(text="Ipsum", start=6, end=11),
                RegexMetadata(text="ipsum", start=19, end=24),
                RegexMetadata(text="ipsum", start=81, end=86),
            ],
        }

    def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
        """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.

        Only non-None fields should appear in the dict and each field value should be the
        consolidation of the values across the pre_chunk elements.
        """
        pre_chunk = TextPreChunk(
            [
                PageBreak(""),
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        filename="foo.docx",
                        # -- category_depth has DROP strategy so doesn't appear in result --
                        category_depth=0,
                        emphasized_text_contents=["Lorem", "Ipsum"],
                        emphasized_text_tags=["b", "i"],
                        languages=["lat"],
                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                    ),
                ),
                Text(
                    "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
                    metadata=ElementMetadata(
                        # -- filename change doesn't happen IRL but demonstrates FIRST strategy --
                        filename="bar.docx",
                        # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
                        # -- appears twice in consolidated-meta (as it should) and length matches
                        # -- that of emphasized_text_tags both before and after consolidation.
                        emphasized_text_contents=["Lorem", "ipsum"],
                        emphasized_text_tags=["i", "b"],
                        # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
                        languages=["eng", "lat"],
                        # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
                        regex_metadata={
                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
                        },
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        meta_kwargs = pre_chunk._meta_kwargs

        assert meta_kwargs == {
            "filename": "foo.docx",
            "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
            "emphasized_text_tags": ["b", "i", "i", "b"],
            "languages": ["lat", "eng"],
            "regex_metadata": {
                "ipsum": [
                    RegexMetadata(text="Ipsum", start=6, end=11),
                    RegexMetadata(text="ipsum", start=19, end=24),
                ],
                "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
            },
        }

    @pytest.mark.parametrize(
        ("elements", "expected_value"),
        [
            ([Text("foo"), Text("bar")], "foo\n\nbar"),
            ([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
            ([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
            ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
        ],
    )
    def it_knows_the_concatenated_text_of_the_pre_chunk(
        self, elements: List[Text], expected_value: str
    ):
        """._text is the "joined" text of the pre-chunk elements.

        The text-segment contributed by each element is separated from the next by a blank line
        ("\n\n"). An element that contributes no text does not give rise to a separator.
        """
        pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
        assert pre_chunk._text == expected_value
class DescribePreChunkBuilder:
    """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`.

    The class under test was renamed from `TextPreChunkBuilder` to `PreChunkBuilder` when it
    moved to the `base` module; this module imports only `PreChunkBuilder`, so the tests must
    reference it by that name.
    """

    def it_is_empty_on_construction(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))

        assert builder.text_length == 0
        assert builder.remaining_space == 50

    def it_accumulates_elements_added_to_it(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        builder.add_element(Title("Introduction"))
        assert builder.text_length == 12
        assert builder.remaining_space == 136

        builder.add_element(
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        )
        assert builder.text_length == 112
        assert builder.remaining_space == 36

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
        builder.add_element(Title("Introduction"))
        builder.add_element(
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        )

        pre_chunk = next(builder.flush())

        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Introduction"),
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        ]
        # -- flushing resets the builder to empty so it can start the next pre-chunk --
        assert builder.text_length == 0
        assert builder.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(builder.flush())

        assert pre_chunks == []
        assert builder.text_length == 0
        assert builder.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
        builder.add_element(Text("abcde"))
        builder.add_element(Text("fghij"))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
        assert builder.text_length == 12
        # -- .remaining_space is reduced by the length (2) of the trailing separator which would go
        # -- between the current text and that of the next element if one was added.
        # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
        assert builder.remaining_space == 36
# == PreChunkCombiner =============================================================================


class DescribePreChunkCombiner:
    """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""

    def it_combines_sequential_small_text_pre_chunks(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- all three small pre-chunks fit under the combination threshold; one combined
        # -- pre-chunk results --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def but_it_does_not_combine_table_pre_chunks(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            ),
            TablePreChunk(Table("Heading\nCell text"), opts=opts),
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(
            pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        ).iter_combined_pre_chunks()

        # -- the table pre-chunk interrupts combination; three pre-chunks pass through --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TablePreChunk)
        assert pre_chunk._table == Table("Heading\nCell text")
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_respects_the_specified_combination_threshold(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- the first two pre-chunks combine (the second is added while the accumulator is
        # -- still under the 80-char threshold); the third starts a new pre-chunk --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_respects_the_hard_maximum_window_length(self):
        opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
            # -- len == 214
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- adding the third pre-chunk would exceed the 200-char hard-max, so combination
        # -- stops after the first two even though the soft threshold was not reached --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
        """Such as occurs when a single element exceeds the window size."""
        opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
        pre_chunks = [
            TextPreChunk([Title("Lorem Ipsum")], opts=opts),
            TextPreChunk(  # 179
                [
                    Text(
                        "Lorem ipsum dolor sit amet consectetur adipiscing elit."  # 55
                        " Mauris nec urna non augue vulputate consequat eget et nisi."  # 60
                        " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."  # 64
                    )
                ],
                opts=opts,
            ),
            TextPreChunk([Title("Vulputate Consequat")], opts=opts),
        ]

        pre_chunk_iter = PreChunkCombiner(
            pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
        ).iter_combined_pre_chunks()

        # -- the oversized pre-chunk is isolated; its neighbors are not combined with it --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [Title("Lorem Ipsum")]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit."
                " Mauris nec urna non augue vulputate consequat eget et nisi."
                " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
            )
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [Title("Vulputate Consequat")]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
class DescribeTextPreChunkAccumulator:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""

    def it_is_empty_on_construction(self):
        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))

        assert accum.text_length == 0
        assert accum.remaining_space == 100

    def it_accumulates_pre_chunks_added_to_it(self):
        opts = ChunkingOptions.new(max_characters=500)
        accum = TextPreChunkAccumulator(opts=opts)

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            )
        )
        assert accum.text_length == 68
        assert accum.remaining_space == 430

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            )
        )
        assert accum.text_length == 141
        assert accum.remaining_space == 357

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        opts = ChunkingOptions.new(max_characters=150)
        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            )
        )
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            )
        )
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Sed Orci"),
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
                ],
                opts=opts,
            )
        )

        pre_chunk_iter = accum.flush()

        # -- iterator generates exactly one pre_chunk --
        pre_chunk = next(pre_chunk_iter)
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
        # -- and it is a TextPreChunk containing all the elements --
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
        ]
        # -- flushing resets the accumulator to empty --
        assert accum.text_length == 0
        assert accum.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(accum.flush())

        assert pre_chunks == []
        assert accum.text_length == 0
        assert accum.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        opts = ChunkingOptions.new(max_characters=100)
        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
        accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
        assert accum.text_length == 12
        # -- .remaining_space is reduced by the length (2) of the trailing separator which would
        # -- go between the current text and that of the next pre-chunk if one was added.
        # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
        assert accum.remaining_space == 86

View File

@ -1 +1 @@
__version__ = "0.11.5-dev1" # pragma: no cover
__version__ = "0.11.5-dev2" # pragma: no cover

View File

@ -2,12 +2,25 @@
from __future__ import annotations
from typing import Optional
import collections
import copy
from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
from typing_extensions import Self
from typing_extensions import Self, TypeAlias
from unstructured.documents.elements import (
CompositeElement,
ConsolidationStrategy,
Element,
ElementMetadata,
RegexMetadata,
Table,
TableChunk,
)
from unstructured.utils import lazyproperty
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
class ChunkingOptions:
"""Specifies parameters of optional chunking behaviors."""
@ -150,3 +163,404 @@ class ChunkingOptions:
# loop (I think).
if self._overlap >= max_characters:
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
# ================================================================================================
# PRE-CHUNK SUB-TYPES
# ================================================================================================
class TablePreChunk:
    """A pre-chunk composed of a single Table element."""

    def __init__(self, table: Table, opts: ChunkingOptions) -> None:
        self._table = table
        self._opts = opts

    def iter_chunks(self) -> Iterator[Table | TableChunk]:
        """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
        window = self._opts.hard_max
        remaining_text = self._table.text
        remaining_html = self._table.metadata.text_as_html or ""

        # -- a table small enough to fit the window (both text and HTML) is emitted whole --
        if len(remaining_text) <= window and len(remaining_html) <= window:
            yield self._table
            return

        chunk_count = 0
        while remaining_text or remaining_html:
            # -- peel off the next window-length of text into its own TableChunk --
            chunk = TableChunk(
                text=remaining_text[:window], metadata=copy.deepcopy(self._table.metadata)
            )
            remaining_text = remaining_text[window:]
            # -- Attach up to a window-length of the HTML to the chunk. Note no attempt is made
            # -- to add only the HTML fragment that *corresponds* to the chunk's text fragment.
            if remaining_html:
                chunk.metadata.text_as_html = remaining_html[:window]
                remaining_html = remaining_html[window:]
            # -- every chunk after the first is marked as a continuation --
            if chunk_count:
                chunk.metadata.is_continuation = True
            yield chunk
            chunk_count += 1
class TextPreChunk:
    """A sequence of elements that belong to the same semantic unit within a document.

    The name "section" derives from the idea of a document-section, a heading followed by the
    paragraphs "under" that heading. That structure is not found in all documents and actual section
    content can vary, but that's the concept.

    This object is purposely immutable.
    """

    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
        self._elements = list(elements)
        self._opts = opts

    def __eq__(self, other: Any) -> bool:
        # -- NOTE(review): equality compares elements only; `opts` is deliberately ignored, so
        # -- two pre-chunks with the same elements but different options compare equal.
        if not isinstance(other, TextPreChunk):
            return False
        return self._elements == other._elements

    def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
        """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
        return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)

    def iter_chunks(self) -> Iterator[CompositeElement]:
        """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
        text = self._text
        text_len = len(text)
        maxlen = self._opts.hard_max
        start = 0
        remaining = text_len

        # -- each chunk gets the same consolidated metadata object (shared, not copied) --
        while remaining > 0:
            end = min(start + maxlen, text_len)
            yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
            start = end
            remaining = text_len - end

    @lazyproperty
    def text_length(self) -> int:
        """Length of concatenated text of this pre-chunk, including separators."""
        # -- used by pre-chunk-combiner to identify combination candidates --
        return len(self._text)

    @lazyproperty
    def _all_metadata_values(self) -> Dict[str, List[Any]]:
        """Collection of all populated metadata values across elements.

        The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
        at least one of the elements in this pre-chunk. The value of that key is a list of all those
        populated values, in element order, for example:

            {
                "filename": ["sample.docx", "sample.docx"],
                "languages": [["lat"], ["lat", "eng"]]
                ...
            }

        This preprocessing step provides the input for a specified consolidation strategy that will
        resolve the list of values for each field to a single consolidated value.
        """

        def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
            """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
            return (
                (field_name, value)
                for field_name, value in metadata.known_fields.items()
                if value is not None
            )

        field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)

        # -- collect all non-None field values in a list for each field, in element-order --
        for e in self._elements:
            for field_name, value in iter_populated_fields(e.metadata):
                field_values[field_name].append(value)

        return dict(field_values)

    @lazyproperty
    def _consolidated_metadata(self) -> ElementMetadata:
        """Metadata applicable to this pre-chunk as a single chunk.

        Formed by applying consolidation rules to all metadata fields across the elements of this
        pre-chunk.

        For the sake of consistency, the same rules are applied (for example, for dropping values)
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
        return ElementMetadata(**self._meta_kwargs)

    @lazyproperty
    def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.

        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
        offsets of each regex match are also adjusted for their new positions.
        """
        chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
        separator_len = len(self._opts.text_separator)
        running_text_len = 0
        start_offset = 0

        for element in self._elements:
            text_len = len(element.text)
            # -- skip empty elements like `PageBreak("")` --
            if not text_len:
                continue
            # -- account for blank line between "squashed" elements, but not before first element --
            running_text_len += separator_len if running_text_len else 0
            start_offset = running_text_len
            running_text_len += text_len

            if not element.metadata.regex_metadata:
                continue

            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
            # -- deep-copy so the source element's metadata is not mutated --
            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
            for regex_name, matches in element_regex_metadata.items():
                for m in matches:
                    m["start"] += start_offset
                    m["end"] += start_offset
                chunk_matches = chunk_regex_metadata.get(regex_name, [])
                chunk_matches.extend(matches)
                chunk_regex_metadata[regex_name] = chunk_matches

        return chunk_regex_metadata

    @lazyproperty
    def _meta_kwargs(self) -> Dict[str, Any]:
        """The consolidated metadata values as a dict suitable for constructing ElementMetadata.

        This is where consolidation strategies are actually applied. The output is suitable for use
        in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
        """
        CS = ConsolidationStrategy
        field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
            """Generate (field-name, value) pairs for each field in consolidated metadata."""
            for field_name, values in self._all_metadata_values.items():
                strategy = field_consolidation_strategies.get(field_name)
                if strategy is CS.FIRST:
                    yield field_name, values[0]
                # -- concatenate lists from each element that had one, in order --
                elif strategy is CS.LIST_CONCATENATE:
                    yield field_name, sum(values, cast(List[Any], []))
                # -- union lists from each element, preserving order of appearance --
                elif strategy is CS.LIST_UNIQUE:
                    # -- Python 3.7+ maintains dict insertion order --
                    ordered_unique_keys = {key: None for val_list in values for key in val_list}
                    yield field_name, list(ordered_unique_keys.keys())
                elif strategy is CS.REGEX:
                    yield field_name, self._consolidated_regex_meta
                elif strategy is CS.DROP:
                    continue
                else:
                    # -- not likely to hit this since we have a test in `text_elements.py` that
                    # -- ensures every ElementMetadata fields has an assigned strategy.
                    raise NotImplementedError(
                        f"metadata field {repr(field_name)} has no defined consolidation strategy"
                    )

        return dict(iter_kwarg_pairs())

    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.

        Each element-text is separated from the next by a blank line ("\n\n").
        """
        text_separator = self._opts.text_separator
        return text_separator.join(e.text for e in self._elements if e.text)
# ================================================================================================
# PRE-CHUNKING ACCUMULATORS
# ------------------------------------------------------------------------------------------------
# Accumulators encapsulate the work of grouping elements and later pre-chunks to form the larger
# pre-chunk and combined-pre-chunk items central to unstructured chunking.
# ================================================================================================
class PreChunkBuilder:
    """An element accumulator suitable for incrementally forming a pre-chunk.

    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
    to determine whether it should add the next element in the element stream.

    `.flush()` is used to build a PreChunk object from the accumulated elements. This method
    returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
    used like so:

        yield from builder.flush()

    If no elements have been accumulated, no `PreChunk` instance is generated. Flushing the builder
    clears the elements it contains so it is ready to build the next pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._separator_len = len(opts.text_separator)
        self._elements: List[Element] = []
        # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
        self._text_segments: List[str] = []
        # -- combined length of text-segments, not including separators --
        self._text_len: int = 0

    def add_element(self, element: Element) -> None:
        """Add `element` to this section."""
        self._elements.append(element)
        if element.text:
            self._text_segments.append(element.text)
            self._text_len += len(element.text)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate zero-or-one `PreChunk` object and clear the accumulator.

        Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
        boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
        stream.
        """
        if not self._elements:
            return
        # -- clear builder before yield so we're not sensitive to the timing of how/when this
        # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
        elements = self._elements[:]
        self._elements.clear()
        self._text_segments.clear()
        self._text_len = 0
        yield TextPreChunk(elements, self._opts)

    @property
    def remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- include length of trailing separator that will go before next element text --
        separators_len = self._separator_len * len(self._text_segments)
        return self._opts.hard_max - self._text_len - separators_len

    @property
    def text_length(self) -> int:
        """Length of the text in this pre-chunk.

        This value represents the chunk-size that would result if this pre-chunk was flushed in its
        current state. In particular, it does not include the length of a trailing separator (since
        that would only appear if an additional element was added).

        Not suitable for judging remaining space, use `.remaining_space` for that value.
        """
        # -- number of text separators present in joined text of elements. This includes only
        # -- separators *between* text segments, not one at the end. Note there are zero separators
        # -- for both 0 and 1 text-segments.
        n = len(self._text_segments)
        separator_count = n - 1 if n else 0
        return self._text_len + (separator_count * self._separator_len)
class PreChunkCombiner:
    """Filters pre-chunk stream to combine small pre-chunks where possible."""

    def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
        self._pre_chunks = pre_chunks
        self._opts = opts

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
        soft_max = self._opts.combine_text_under_n_chars
        accum = TextPreChunkAccumulator(self._opts)

        for pre_chunk in self._pre_chunks:
            # -- a table pre-chunk is never combined; emit anything accumulated so far and pass
            # -- the table pre-chunk through unchanged --
            if isinstance(pre_chunk, TablePreChunk):
                yield from accum.flush()
                yield pre_chunk
                continue

            # -- emit the accumulated pre-chunk when it has already reached the combination
            # -- soft-max or when adding this pre-chunk would exceed the hard-max window --
            if accum.text_length >= soft_max or accum.remaining_space < pre_chunk.text_length:
                yield from accum.flush()

            accum.add_pre_chunk(pre_chunk)

        # -- emit any pre-chunk remaining in the accumulator at end of stream --
        yield from accum.flush()
class TextPreChunkAccumulator:
    """Accumulates, measures, and combines pre-chunk objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another pre-chunk.

    `.flush()` combines the accumulated pre-chunks into a single `TextPreChunk` object. It
    returns an iterator that generates zero-or-one `TextPreChunk` objects and is used like so:

        yield from accum.flush()

    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the
    accumulator clears the pre-chunks it contains so it is ready to accept the next
    text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
        """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
        self._pre_chunks.append(pre_chunk)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate all accumulated pre-chunks as a single combined pre-chunk."""
        accumulated = self._pre_chunks
        # -- nothing to generate when the accumulator is empty --
        if not accumulated:
            return
        # -- fold the accumulated pre-chunks left-to-right into a single pre-chunk --
        combined = accumulated[0]
        for successor in accumulated[1:]:
            combined = combined.combine(successor)
        yield combined
        # -- reset the accumulator (to empty) --
        accumulated.clear()

    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
        maxlen = self._opts.hard_max
        if not self._pre_chunks:
            return maxlen
        # -- an additional pre-chunk will also incur an additional separator --
        return maxlen - self.text_length - len(self._opts.text_separator)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator."""
        count = len(self._pre_chunks)
        if count == 0:
            return 0
        separators_len = len(self._opts.text_separator) * (count - 1)
        return sum(pc.text_length for pc in self._pre_chunks) + separators_len

View File

@ -5,26 +5,20 @@ Main entry point is the `@add_chunking_strategy()` decorator.
from __future__ import annotations
import collections
import copy
from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
from typing import Iterator, List, Optional
from typing_extensions import TypeAlias
from unstructured.chunking.base import ChunkingOptions
from unstructured.chunking.base import (
ChunkingOptions,
PreChunk,
PreChunkBuilder,
PreChunkCombiner,
TablePreChunk,
)
from unstructured.documents.elements import (
CompositeElement,
ConsolidationStrategy,
Element,
ElementMetadata,
RegexMetadata,
Table,
TableChunk,
Title,
)
from unstructured.utils import lazyproperty
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
def chunk_by_title(
@ -78,7 +72,7 @@ def chunk_by_title(
def _split_elements_by_title_and_table(
elements: List[Element], opts: ChunkingOptions
) -> Iterator[TextPreChunk | TablePreChunk]:
) -> Iterator[PreChunk]:
"""Implements "pre-chunker" responsibilities.
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
@ -102,7 +96,7 @@ def _split_elements_by_title_and_table(
A Table or Checkbox element is placed into a pre-chunk by itself.
"""
pre_chunk_builder = TextPreChunkBuilder(opts)
pre_chunk_builder = PreChunkBuilder(opts)
prior_element = None
@ -156,396 +150,3 @@ def _metadata_differs(
if ignore_page_numbers:
return False
return metadata1.page_number != metadata2.page_number
# == PreChunks ===================================================================================
class TablePreChunk:
    """A pre-chunk composed of a single Table element."""

    def __init__(self, table: Table, opts: ChunkingOptions) -> None:
        self._table = table
        self._opts = opts

    def iter_chunks(self) -> Iterator[Table | TableChunk]:
        """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
        remaining_text = self._table.text
        remaining_html = self._table.metadata.text_as_html or ""
        maxlen = self._opts.hard_max

        # -- a table that fits the window in both text and HTML form is emitted whole --
        if len(remaining_text) <= maxlen and len(remaining_html) <= maxlen:
            yield self._table
            return

        emitted_first = False
        while remaining_text or remaining_html:
            # -- peel off up to maxchars of text for the next TableChunk --
            chunk = TableChunk(
                text=remaining_text[:maxlen], metadata=copy.deepcopy(self._table.metadata)
            )
            remaining_text = remaining_text[maxlen:]
            # -- attach maxchars of the HTML to the chunk. Note no attempt is made to add only
            # -- the HTML elements that *correspond* to the TableChunk.text fragment --
            if remaining_html:
                chunk.metadata.text_as_html = remaining_html[:maxlen]
                remaining_html = remaining_html[maxlen:]
            # -- every chunk after the first is flagged as a continuation --
            if emitted_first:
                chunk.metadata.is_continuation = True
            yield chunk
            emitted_first = True
class TextPreChunk:
    """A sequence of elements that belong to the same semantic unit within a document.

    The name "section" derives from the idea of a document-section, a heading followed by the
    paragraphs "under" that heading. That structure is not found in all documents and actual
    section content can vary, but that's the concept.

    This object is purposely immutable.
    """

    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
        # -- take a snapshot copy so later mutation of the caller's sequence cannot affect us --
        self._elements = list(elements)
        self._opts = opts

    def __eq__(self, other: Any) -> bool:
        # -- equality considers only the member elements; chunking options are not compared --
        if not isinstance(other, TextPreChunk):
            return False
        return self._elements == other._elements

    def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
        """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
        return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)

    def iter_chunks(self) -> Iterator[CompositeElement]:
        """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
        text = self._text
        text_len = len(text)
        maxlen = self._opts.hard_max
        start = 0
        remaining = text_len

        # -- each pass emits one chunk of at most `maxlen` characters; every chunk carries the
        # -- same consolidated metadata --
        while remaining > 0:
            end = min(start + maxlen, text_len)
            yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
            start = end
            remaining = text_len - end

    @lazyproperty
    def text_length(self) -> int:
        """Length of concatenated text of this pre-chunk, including separators."""
        # -- used by pre-chunk-combiner to identify combination candidates --
        return len(self._text)

    @lazyproperty
    def _all_metadata_values(self) -> Dict[str, List[Any]]:
        """Collection of all populated metadata values across elements.

        The resulting dict has one key for each `ElementMetadata` field that had a non-None value
        in at least one of the elements in this pre-chunk. The value of that key is a list of all
        those populated values, in element order, for example:

            {
                "filename": ["sample.docx", "sample.docx"],
                "languages": [["lat"], ["lat", "eng"]]
                ...
            }

        This preprocessing step provides the input for a specified consolidation strategy that
        will resolve the list of values for each field to a single consolidated value.
        """

        def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
            """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
            return (
                (field_name, value)
                for field_name, value in metadata.known_fields.items()
                if value is not None
            )

        field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)

        # -- collect all non-None field values in a list for each field, in element-order --
        for e in self._elements:
            for field_name, value in iter_populated_fields(e.metadata):
                field_values[field_name].append(value)

        return dict(field_values)

    @lazyproperty
    def _consolidated_metadata(self) -> ElementMetadata:
        """Metadata applicable to this pre-chunk as a single chunk.

        Formed by applying consolidation rules to all metadata fields across the elements of this
        pre-chunk.

        For the sake of consistency, the same rules are applied (for example, for dropping values)
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
        return ElementMetadata(**self._meta_kwargs)

    @lazyproperty
    def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.

        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
        offsets of each regex match are also adjusted for their new positions.
        """
        chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
        separator_len = len(self._opts.text_separator)
        # -- running_text_len tracks the offset of the current element's text within the joined
        # -- pre-chunk text, which is what the match offsets must be rebased against --
        running_text_len = 0
        start_offset = 0

        for element in self._elements:
            text_len = len(element.text)
            # -- skip empty elements like `PageBreak("")` --
            if not text_len:
                continue
            # -- account for blank line between "squashed" elements, but not before first element --
            running_text_len += separator_len if running_text_len else 0
            start_offset = running_text_len
            running_text_len += text_len

            if not element.metadata.regex_metadata:
                continue

            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
            # -- deep-copy so rebasing offsets does not mutate the source element's metadata --
            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
            for regex_name, matches in element_regex_metadata.items():
                for m in matches:
                    m["start"] += start_offset
                    m["end"] += start_offset
                chunk_matches = chunk_regex_metadata.get(regex_name, [])
                chunk_matches.extend(matches)
                chunk_regex_metadata[regex_name] = chunk_matches

        return chunk_regex_metadata

    @lazyproperty
    def _meta_kwargs(self) -> Dict[str, Any]:
        """The consolidated metadata values as a dict suitable for constructing ElementMetadata.

        This is where consolidation strategies are actually applied. The output is suitable for
        use in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
        """
        CS = ConsolidationStrategy
        field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
            """Generate (field-name, value) pairs for each field in consolidated metadata."""
            for field_name, values in self._all_metadata_values.items():
                strategy = field_consolidation_strategies.get(field_name)
                if strategy is CS.FIRST:
                    yield field_name, values[0]
                # -- concatenate lists from each element that had one, in order --
                elif strategy is CS.LIST_CONCATENATE:
                    yield field_name, sum(values, cast(List[Any], []))
                # -- union lists from each element, preserving order of appearance --
                elif strategy is CS.LIST_UNIQUE:
                    # -- Python 3.7+ maintains dict insertion order --
                    ordered_unique_keys = {key: None for val_list in values for key in val_list}
                    yield field_name, list(ordered_unique_keys.keys())
                elif strategy is CS.REGEX:
                    yield field_name, self._consolidated_regex_meta
                elif strategy is CS.DROP:
                    continue
                else:
                    # -- not likely to hit this since we have a test in `text_elements.py` that
                    # -- ensures every ElementMetadata fields has an assigned strategy.
                    raise NotImplementedError(
                        f"metadata field {repr(field_name)} has no defined consolidation strategy"
                    )

        return dict(iter_kwarg_pairs())

    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.

        Each element-text is separated from the next by a blank line ("\n\n").
        """
        text_separator = self._opts.text_separator
        return text_separator.join(e.text for e in self._elements if e.text)
class TextPreChunkBuilder:
    """An element accumulator suitable for incrementally forming a pre-chunk.

    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can
    use to determine whether it should add the next element in the element stream.

    `.flush()` builds a `TextPreChunk` object from the accumulated elements. It returns an
    iterator that generates zero-or-one `TextPreChunk` object and is used like so:

        yield from builder.flush()

    If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder
    clears the elements it contains so it is ready to build the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._separator_len = len(opts.text_separator)
        self._elements: List[Element] = []

        # -- only non-empty element text is tracked, e.g. PageBreak.text=="" is not included --
        self._text_segments: List[str] = []
        # -- running total of segment lengths, separators not included --
        self._text_len: int = 0

    def add_element(self, element: Element) -> None:
        """Add `element` to this section."""
        self._elements.append(element)
        element_text = element.text
        if element_text:
            self._text_segments.append(element_text)
            self._text_len += len(element_text)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate zero-or-one `TextPreChunk` object and clear the accumulator.

        Suitable for use to emit a pre-chunk when the maximum size has been reached or a semantic
        boundary has been reached. Also to clear out a terminal pre-chunk at the end of an
        element stream.
        """
        if not self._elements:
            return
        # -- snapshot and reset state before yielding so we're not sensitive to the timing of
        # -- how/when this iterator is exhausted and can accept elements for the next pre-chunk
        # -- immediately --
        flushed_elements = list(self._elements)
        self._elements.clear()
        self._text_segments.clear()
        self._text_len = 0
        yield TextPreChunk(flushed_elements, self._opts)

    @property
    def remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- reserve one separator per existing segment; each will precede the next element text --
        reserved = self._text_len + self._separator_len * len(self._text_segments)
        return self._opts.hard_max - reserved

    @property
    def text_length(self) -> int:
        """Length of the text in this pre-chunk.

        This value represents the chunk-size that would result if this pre-chunk was flushed in
        its current state. In particular, it does not include the length of a trailing separator
        (since that would only appear if an additional element was added).

        Not suitable for judging remaining space, use `.remaining_space` for that value.
        """
        # -- separators appear only *between* segments, so zero separators are counted for both
        # -- the empty and the single-segment case --
        segment_count = len(self._text_segments)
        separators = max(segment_count - 1, 0)
        return self._text_len + separators * self._separator_len
# == PreChunkCombiner ============================================================================
class PreChunkCombiner:
    """Filters pre-chunk stream to combine small pre-chunks where possible."""

    def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
        self._pre_chunks = pre_chunks
        self._opts = opts

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
        accum = TextPreChunkAccumulator(self._opts)
        combination_soft_max = self._opts.combine_text_under_n_chars

        for pre_chunk in self._pre_chunks:
            # -- a table pre-chunk never participates in combination; flush anything pending
            # -- and pass the table pre-chunk straight through --
            if isinstance(pre_chunk, TablePreChunk):
                yield from accum.flush()
                yield pre_chunk
                continue

            # -- flush when the accumulator has reached the combination soft-max, or when this
            # -- pre-chunk would push its contents past the hard window size --
            if (
                accum.text_length >= combination_soft_max
                or accum.remaining_space < pre_chunk.text_length
            ):
                yield from accum.flush()

            accum.add_pre_chunk(pre_chunk)

        # -- emit any text pre-chunk(s) still accumulated at end of stream --
        yield from accum.flush()
class TextPreChunkAccumulator:
    """Accumulates, measures, and combines pre-chunk objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another pre-chunk.

    `.flush()` combines the accumulated pre-chunks into a single `TextPreChunk` object and
    returns an iterator generating zero-or-one `TextPreChunk`, used like so:

        yield from accum.flush()

    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing clears the
    contained pre-chunks so the accumulator is ready to accept the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
        """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
        self._pre_chunks.append(pre_chunk)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate all accumulated pre-chunks as a single combined pre-chunk."""
        pending = self._pre_chunks
        # -- nothing to do when no pre-chunks have been accumulated --
        if not pending:
            return
        # -- merge all accumulated pre-chunks, left to right, into a single pre-chunk --
        merged = pending[0]
        for next_pre_chunk in pending[1:]:
            merged = merged.combine(next_pre_chunk)
        yield merged
        # -- and reset the accumulator (to empty) --
        pending.clear()

    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
        maxlen = self._opts.hard_max
        if not self._pre_chunks:
            return maxlen
        # -- an additional pre-chunk will also incur an additional separator --
        return maxlen - self.text_length - len(self._opts.text_separator)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator."""
        pre_chunk_count = len(self._pre_chunks)
        if not pre_chunk_count:
            return 0
        separators_total = len(self._opts.text_separator) * (pre_chunk_count - 1)
        return sum(pc.text_length for pc in self._pre_chunks) + separators_total