mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-03 03:23:25 +00:00
rfctr(chunking): extract general-purpose objects to base (#2281)
Many of the classes defined in `unstructured.chunking.title` are applicable to any chunking strategy and will shortly be used for the "by-character" chunking strategy as well. Move these and their tests to `unstructured.chunking.base`. Along the way, rename `TextPreChunkBuilder` to `PreChunkBuilder` because it will be generalized in a subsequent PR to also take `Table` elements such that inter-pre-chunk overlap can be implemented. Otherwise, no logic changes, just moves.
This commit is contained in:
parent
a7c3f5f570
commit
36e81c3367
@ -1,4 +1,4 @@
|
||||
## 0.11.5-dev1
|
||||
## 0.11.5-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
@ -1,14 +1,35 @@
|
||||
# pyright: reportPrivateUsage=false
|
||||
|
||||
"""Unit-test suite for the `unstructured.chunking.base` module."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.chunking.base import ChunkingOptions
|
||||
from unstructured.chunking.base import (
|
||||
ChunkingOptions,
|
||||
PreChunkBuilder,
|
||||
PreChunkCombiner,
|
||||
TablePreChunk,
|
||||
TextPreChunk,
|
||||
TextPreChunkAccumulator,
|
||||
)
|
||||
from unstructured.documents.elements import (
|
||||
CompositeElement,
|
||||
ElementMetadata,
|
||||
PageBreak,
|
||||
RegexMetadata,
|
||||
Table,
|
||||
TableChunk,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
|
||||
|
||||
class DescribeChunkingOptions:
|
||||
"""Unit-test suite for `unstructured.chunking.model.ChunkingOptions objects."""
|
||||
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
|
||||
|
||||
@pytest.mark.parametrize("max_characters", [0, -1, -42])
|
||||
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
|
||||
@ -111,3 +132,847 @@ class DescribeChunkingOptions:
|
||||
|
||||
def it_knows_the_text_separator_string(self):
|
||||
assert ChunkingOptions.new().text_separator == "\n\n"
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# PRE-CHUNK SUBTYPES
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class DescribeTablePreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.base.TablePreChunk objects."""
|
||||
|
||||
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
|
||||
html_table = (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"</table>"
|
||||
)
|
||||
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||
opts=ChunkingOptions.new(max_characters=175),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, Table)
|
||||
assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"</table>"
|
||||
)
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
|
||||
# fixed-overhead = 8+8+9+8+9+8 = 50
|
||||
# per-row overhead = 27
|
||||
html_table = (
|
||||
"<table>\n" # 8
|
||||
"<thead>\n" # 8
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n" # 9
|
||||
"<tbody>\n" # 8
|
||||
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
|
||||
"<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
|
||||
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
||||
"<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
|
||||
"</tbody>\n" # 9
|
||||
"</table>" # 8
|
||||
)
|
||||
text_table = (
|
||||
"Header Col 1 Header Col 2\n"
|
||||
"Lorem ipsum dolor sit amet\n"
|
||||
"Consectetur adipiscing elit\n"
|
||||
"Nunc aliquam id enim nec molestie\n"
|
||||
"Vivamus quis nunc ipsum donec ac fermentum"
|
||||
)
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||
opts=ChunkingOptions.new(max_characters=100),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
assert chunk.text == (
|
||||
"Header Col 1 Header Col 2\n"
|
||||
"Lorem ipsum dolor sit amet\n"
|
||||
"Consectetur adipiscing elit\n"
|
||||
"Nunc aliqua"
|
||||
)
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lo"
|
||||
)
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
assert (
|
||||
chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
|
||||
)
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"rem ipsum </td><td>A Link example</td></tr>\n"
|
||||
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
|
||||
)
|
||||
# -- note that text runs out but HTML continues because it's significantly longer. So two
|
||||
# -- of these chunks have HTML but no text.
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
assert chunk.text == ""
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"/tr>\n"
|
||||
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
||||
"<tr><td>Vivamus quis </td><td>"
|
||||
)
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
assert chunk.text == ""
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
|
||||
)
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
|
||||
class DescribeTextPreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk objects."""
|
||||
|
||||
def it_can_combine_itself_with_another_TextPreChunk_instance(self):
|
||||
""".combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
|
||||
|
||||
Note that neither the original or other pre_chunk are mutated.
|
||||
"""
|
||||
opts = ChunkingOptions.new()
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
other_pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text("Donec semper facilisis metus finibus malesuada."),
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
|
||||
new_pre_chunk = pre_chunk.combine(other_pre_chunk)
|
||||
|
||||
assert new_pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
Text("Donec semper facilisis metus finibus malesuada."),
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
assert pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
assert other_pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Donec semper facilisis metus finibus malesuada."),
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
|
||||
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(max_characters=200),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement(
|
||||
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
|
||||
" In rhoncus ipsum sedlectus porta volutpat.",
|
||||
)
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
|
||||
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
|
||||
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
|
||||
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
|
||||
" commodo consequat."
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(max_characters=200),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement(
|
||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut a"
|
||||
)
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement("liquip ex ea commodo consequat.")
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
|
||||
""".text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
|
||||
pre_chunk = TextPreChunk(
|
||||
[PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
|
||||
)
|
||||
assert pre_chunk.text_length == 8
|
||||
|
||||
def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title(
|
||||
"Lorem Ipsum",
|
||||
metadata=ElementMetadata(
|
||||
category_depth=0,
|
||||
filename="foo.docx",
|
||||
languages=["lat"],
|
||||
parent_id="f87731e0",
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
|
||||
metadata=ElementMetadata(
|
||||
category_depth=1,
|
||||
filename="foo.docx",
|
||||
image_path="sprite.png",
|
||||
languages=["lat", "eng"],
|
||||
),
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(),
|
||||
)
|
||||
|
||||
assert pre_chunk._all_metadata_values == {
|
||||
# -- scalar values are accumulated in a list in element order --
|
||||
"category_depth": [0, 1],
|
||||
# -- all values are accumulated, not only unique ones --
|
||||
"filename": ["foo.docx", "foo.docx"],
|
||||
# -- list-type fields produce a list of lists --
|
||||
"languages": [["lat"], ["lat", "eng"]],
|
||||
# -- fields that only appear in some elements are captured --
|
||||
"image_path": ["sprite.png"],
|
||||
"parent_id": ["f87731e0"],
|
||||
# -- A `None` value never appears, neither does a field-name with an empty list --
|
||||
}
|
||||
|
||||
def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
|
||||
metadata = ElementMetadata(
|
||||
category_depth=0,
|
||||
filename="foo.docx",
|
||||
languages=["lat"],
|
||||
parent_id="f87731e0",
|
||||
)
|
||||
metadata.coefficient = 0.62
|
||||
metadata_2 = ElementMetadata(
|
||||
category_depth=1,
|
||||
filename="foo.docx",
|
||||
image_path="sprite.png",
|
||||
languages=["lat", "eng"],
|
||||
)
|
||||
metadata_2.quotient = 1.74
|
||||
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum", metadata=metadata),
|
||||
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
|
||||
],
|
||||
opts=ChunkingOptions.new(),
|
||||
)
|
||||
|
||||
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
|
||||
assert pre_chunk._all_metadata_values == {
|
||||
"category_depth": [0, 1],
|
||||
"filename": ["foo.docx", "foo.docx"],
|
||||
"image_path": ["sprite.png"],
|
||||
"languages": [["lat"], ["lat", "eng"]],
|
||||
"parent_id": ["f87731e0"],
|
||||
}
|
||||
|
||||
def it_consolidates_regex_metadata_in_a_field_specific_way(self):
|
||||
"""regex_metadata of chunk is combined regex_metadatas of its elements.
|
||||
|
||||
Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
|
||||
position in the chunk after element text has been concatenated.
|
||||
"""
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title(
|
||||
"Lorem Ipsum",
|
||||
metadata=ElementMetadata(
|
||||
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
|
||||
metadata=ElementMetadata(
|
||||
regex_metadata={
|
||||
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
|
||||
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
|
||||
},
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"In rhoncus ipsum sed lectus porta volutpat.",
|
||||
metadata=ElementMetadata(
|
||||
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
|
||||
),
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(),
|
||||
)
|
||||
|
||||
regex_metadata = pre_chunk._consolidated_regex_meta
|
||||
|
||||
assert regex_metadata == {
|
||||
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
||||
"ipsum": [
|
||||
RegexMetadata(text="Ipsum", start=6, end=11),
|
||||
RegexMetadata(text="ipsum", start=19, end=24),
|
||||
RegexMetadata(text="ipsum", start=81, end=86),
|
||||
],
|
||||
}
|
||||
|
||||
def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
|
||||
"""._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
|
||||
|
||||
Only non-None fields should appear in the dict and each field value should be the
|
||||
consolidation of the values across the pre_chunk elements.
|
||||
"""
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
PageBreak(""),
|
||||
Title(
|
||||
"Lorem Ipsum",
|
||||
metadata=ElementMetadata(
|
||||
filename="foo.docx",
|
||||
# -- category_depth has DROP strategy so doesn't appear in result --
|
||||
category_depth=0,
|
||||
emphasized_text_contents=["Lorem", "Ipsum"],
|
||||
emphasized_text_tags=["b", "i"],
|
||||
languages=["lat"],
|
||||
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
|
||||
metadata=ElementMetadata(
|
||||
# -- filename change doesn't happen IRL but demonstrates FIRST strategy --
|
||||
filename="bar.docx",
|
||||
# -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
|
||||
# -- appears twice in consolidated-meta (as it should) and length matches
|
||||
# -- that of emphasized_text_tags both before and after consolidation.
|
||||
emphasized_text_contents=["Lorem", "ipsum"],
|
||||
emphasized_text_tags=["i", "b"],
|
||||
# -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
|
||||
languages=["eng", "lat"],
|
||||
# -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
|
||||
regex_metadata={
|
||||
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
|
||||
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
|
||||
},
|
||||
),
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(),
|
||||
)
|
||||
|
||||
meta_kwargs = pre_chunk._meta_kwargs
|
||||
|
||||
assert meta_kwargs == {
|
||||
"filename": "foo.docx",
|
||||
"emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
|
||||
"emphasized_text_tags": ["b", "i", "i", "b"],
|
||||
"languages": ["lat", "eng"],
|
||||
"regex_metadata": {
|
||||
"ipsum": [
|
||||
RegexMetadata(text="Ipsum", start=6, end=11),
|
||||
RegexMetadata(text="ipsum", start=19, end=24),
|
||||
],
|
||||
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
||||
},
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("elements", "expected_value"),
|
||||
[
|
||||
([Text("foo"), Text("bar")], "foo\n\nbar"),
|
||||
([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
|
||||
([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
|
||||
([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
|
||||
],
|
||||
)
|
||||
def it_knows_the_concatenated_text_of_the_pre_chunk(
|
||||
self, elements: List[Text], expected_value: str
|
||||
):
|
||||
"""._text is the "joined" text of the pre-chunk elements.
|
||||
|
||||
The text-segment contributed by each element is separated from the next by a blank line
|
||||
("\n\n"). An element that contributes no text does not give rise to a separator.
|
||||
"""
|
||||
pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
|
||||
assert pre_chunk._text == expected_value
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# PRE-CHUNKING ACCUMULATORS
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class DescribePreChunkBuilder:
|
||||
"""Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
|
||||
|
||||
def it_is_empty_on_construction(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 50
|
||||
|
||||
def it_accumulates_elements_added_to_it(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
|
||||
builder.add_element(Title("Introduction"))
|
||||
assert builder.text_length == 12
|
||||
assert builder.remaining_space == 136
|
||||
|
||||
builder.add_element(
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
)
|
||||
assert builder.text_length == 112
|
||||
assert builder.remaining_space == 36
|
||||
|
||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
builder.add_element(Title("Introduction"))
|
||||
builder.add_element(
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
)
|
||||
|
||||
pre_chunk = next(builder.flush())
|
||||
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
]
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 150
|
||||
|
||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
|
||||
pre_chunks = list(builder.flush())
|
||||
|
||||
assert pre_chunks == []
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 150
|
||||
|
||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||
builder.add_element(Text("abcde"))
|
||||
builder.add_element(Text("fghij"))
|
||||
|
||||
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
||||
assert builder.text_length == 12
|
||||
# -- .remaining_space is reduced by the length (2) of the trailing separator which would go
|
||||
# -- between the current text and that of the next element if one was added.
|
||||
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
|
||||
assert builder.remaining_space == 36
|
||||
|
||||
|
||||
class DescribePreChunkCombiner:
|
||||
"""Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
|
||||
|
||||
def it_combines_sequential_small_text_pre_chunks(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
pre_chunks = [
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def but_it_does_not_combine_table_pre_chunks(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
pre_chunks = [
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TablePreChunk(Table("Heading\nCell text"), opts=opts),
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TablePreChunk)
|
||||
assert pre_chunk._table == Table("Heading\nCell text")
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_respects_the_specified_combination_threshold(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
|
||||
pre_chunks = [
|
||||
TextPreChunk( # 68
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk( # 71
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
# -- len == 139
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_respects_the_hard_maximum_window_length(self):
|
||||
opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
|
||||
pre_chunks = [
|
||||
TextPreChunk( # 68
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk( # 71
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
# -- len == 139
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
# -- len == 214
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
|
||||
"""Such as occurs when a single element exceeds the window size."""
|
||||
opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
||||
pre_chunks = [
|
||||
TextPreChunk([Title("Lorem Ipsum")], opts=opts),
|
||||
TextPreChunk( # 179
|
||||
[
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
|
||||
" Mauris nec urna non augue vulputate consequat eget et nisi." # 60
|
||||
" Sed orci quam, eleifend sit amet vehicula, elementum ultricies." # 64
|
||||
)
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk([Title("Vulputate Consequat")], opts=opts),
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [Title("Lorem Ipsum")]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit."
|
||||
" Mauris nec urna non augue vulputate consequat eget et nisi."
|
||||
" Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
|
||||
)
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [Title("Vulputate Consequat")]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
|
||||
class DescribeTextPreChunkAccumulator:
|
||||
"""Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
|
||||
|
||||
def it_is_empty_on_construction(self):
|
||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))
|
||||
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 100
|
||||
|
||||
def it_accumulates_pre_chunks_added_to_it(self):
|
||||
opts = ChunkingOptions.new(max_characters=500)
|
||||
accum = TextPreChunkAccumulator(opts=opts)
|
||||
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
assert accum.text_length == 68
|
||||
assert accum.remaining_space == 430
|
||||
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
assert accum.text_length == 141
|
||||
assert accum.remaining_space == 357
|
||||
|
||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
opts = ChunkingOptions.new(max_characters=150)
|
||||
accum = TextPreChunkAccumulator(opts=opts)
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
|
||||
pre_chunk_iter = accum.flush()
|
||||
|
||||
# -- iterator generates exactly one pre_chunk --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
# -- and it is a _TextPreChunk containing all the elements --
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
||||
]
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 150
|
||||
|
||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
|
||||
|
||||
pre_chunks = list(accum.flush())
|
||||
|
||||
assert pre_chunks == []
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 150
|
||||
|
||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||
opts = ChunkingOptions.new(max_characters=100)
|
||||
accum = TextPreChunkAccumulator(opts=opts)
|
||||
accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
|
||||
accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))
|
||||
|
||||
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
||||
assert accum.text_length == 12
|
||||
# -- .remaining_space is reduced by the length (2) of the trailing separator which would
|
||||
# -- go between the current text and that of the next pre-chunk if one was added.
|
||||
# -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
|
||||
assert accum.remaining_space == 86
|
||||
|
||||
@ -4,16 +4,8 @@ from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.chunking.base import ChunkingOptions
|
||||
from unstructured.chunking.title import (
|
||||
PreChunkCombiner,
|
||||
TablePreChunk,
|
||||
TextPreChunk,
|
||||
TextPreChunkAccumulator,
|
||||
TextPreChunkBuilder,
|
||||
_split_elements_by_title_and_table,
|
||||
chunk_by_title,
|
||||
)
|
||||
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
|
||||
from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title
|
||||
from unstructured.documents.coordinates import CoordinateSystem
|
||||
from unstructured.documents.elements import (
|
||||
CheckBox,
|
||||
@ -22,10 +14,8 @@ from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
ListItem,
|
||||
PageBreak,
|
||||
RegexMetadata,
|
||||
Table,
|
||||
TableChunk,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
@ -552,843 +542,3 @@ def test_it_considers_separator_length_when_pre_chunking():
|
||||
),
|
||||
CompositeElement("Minimize mid-text chunk-splitting"),
|
||||
]
|
||||
|
||||
|
||||
# == PreChunks ===================================================================================
|
||||
|
||||
|
||||
class DescribeTablePreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.title.TablePreChunk objects."""
|
||||
|
||||
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
|
||||
html_table = (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"</table>"
|
||||
)
|
||||
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||
opts=ChunkingOptions.new(max_characters=175),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, Table)
|
||||
assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
|
||||
"</tbody>\n"
|
||||
"</table>"
|
||||
)
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
|
||||
# fixed-overhead = 8+8+9+8+9+8 = 50
|
||||
# per-row overhead = 27
|
||||
html_table = (
|
||||
"<table>\n" # 8
|
||||
"<thead>\n" # 8
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n" # 9
|
||||
"<tbody>\n" # 8
|
||||
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
|
||||
"<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
|
||||
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
||||
"<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
|
||||
"</tbody>\n" # 9
|
||||
"</table>" # 8
|
||||
)
|
||||
text_table = (
|
||||
"Header Col 1 Header Col 2\n"
|
||||
"Lorem ipsum dolor sit amet\n"
|
||||
"Consectetur adipiscing elit\n"
|
||||
"Nunc aliquam id enim nec molestie\n"
|
||||
"Vivamus quis nunc ipsum donec ac fermentum"
|
||||
)
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||
opts=ChunkingOptions.new(max_characters=100),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
assert chunk.text == (
|
||||
"Header Col 1 Header Col 2\n"
|
||||
"Lorem ipsum dolor sit amet\n"
|
||||
"Consectetur adipiscing elit\n"
|
||||
"Nunc aliqua"
|
||||
)
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"<table>\n"
|
||||
"<thead>\n"
|
||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||
"</thead>\n"
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lo"
|
||||
)
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
assert (
|
||||
chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
|
||||
)
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"rem ipsum </td><td>A Link example</td></tr>\n"
|
||||
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
|
||||
)
|
||||
# -- note that text runs out but HTML continues because it's significantly longer. So two
|
||||
# -- of these chunks have HTML but no text.
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
assert chunk.text == ""
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"/tr>\n"
|
||||
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
||||
"<tr><td>Vivamus quis </td><td>"
|
||||
)
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
assert chunk.text == ""
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
|
||||
)
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
|
||||
class DescribeTextPreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunk objects."""
|
||||
|
||||
def it_can_combine_itself_with_another_TextPreChunk_instance(self):
|
||||
""".combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
|
||||
|
||||
Note that neither the original or other pre_chunk are mutated.
|
||||
"""
|
||||
opts = ChunkingOptions.new()
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
other_pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text("Donec semper facilisis metus finibus malesuada."),
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
|
||||
new_pre_chunk = pre_chunk.combine(other_pre_chunk)
|
||||
|
||||
assert new_pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
Text("Donec semper facilisis metus finibus malesuada."),
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
assert pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
assert other_pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Donec semper facilisis metus finibus malesuada."),
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
|
||||
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(max_characters=200),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement(
|
||||
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
|
||||
" In rhoncus ipsum sedlectus porta volutpat.",
|
||||
)
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
|
||||
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
|
||||
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
|
||||
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
|
||||
" commodo consequat."
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(max_characters=200),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement(
|
||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut a"
|
||||
)
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement("liquip ex ea commodo consequat.")
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
|
||||
""".text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
|
||||
pre_chunk = TextPreChunk(
|
||||
[PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
|
||||
)
|
||||
assert pre_chunk.text_length == 8
|
||||
|
||||
def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title(
|
||||
"Lorem Ipsum",
|
||||
metadata=ElementMetadata(
|
||||
category_depth=0,
|
||||
filename="foo.docx",
|
||||
languages=["lat"],
|
||||
parent_id="f87731e0",
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
|
||||
metadata=ElementMetadata(
|
||||
category_depth=1,
|
||||
filename="foo.docx",
|
||||
image_path="sprite.png",
|
||||
languages=["lat", "eng"],
|
||||
),
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(),
|
||||
)
|
||||
|
||||
assert pre_chunk._all_metadata_values == {
|
||||
# -- scalar values are accumulated in a list in element order --
|
||||
"category_depth": [0, 1],
|
||||
# -- all values are accumulated, not only unique ones --
|
||||
"filename": ["foo.docx", "foo.docx"],
|
||||
# -- list-type fields produce a list of lists --
|
||||
"languages": [["lat"], ["lat", "eng"]],
|
||||
# -- fields that only appear in some elements are captured --
|
||||
"image_path": ["sprite.png"],
|
||||
"parent_id": ["f87731e0"],
|
||||
# -- A `None` value never appears, neither does a field-name with an empty list --
|
||||
}
|
||||
|
||||
def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
|
||||
metadata = ElementMetadata(
|
||||
category_depth=0,
|
||||
filename="foo.docx",
|
||||
languages=["lat"],
|
||||
parent_id="f87731e0",
|
||||
)
|
||||
metadata.coefficient = 0.62
|
||||
metadata_2 = ElementMetadata(
|
||||
category_depth=1,
|
||||
filename="foo.docx",
|
||||
image_path="sprite.png",
|
||||
languages=["lat", "eng"],
|
||||
)
|
||||
metadata_2.quotient = 1.74
|
||||
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum", metadata=metadata),
|
||||
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
|
||||
],
|
||||
opts=ChunkingOptions.new(),
|
||||
)
|
||||
|
||||
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
|
||||
assert pre_chunk._all_metadata_values == {
|
||||
"category_depth": [0, 1],
|
||||
"filename": ["foo.docx", "foo.docx"],
|
||||
"image_path": ["sprite.png"],
|
||||
"languages": [["lat"], ["lat", "eng"]],
|
||||
"parent_id": ["f87731e0"],
|
||||
}
|
||||
|
||||
def it_consolidates_regex_metadata_in_a_field_specific_way(self):
|
||||
"""regex_metadata of chunk is combined regex_metadatas of its elements.
|
||||
|
||||
Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
|
||||
position in the chunk after element text has been concatenated.
|
||||
"""
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title(
|
||||
"Lorem Ipsum",
|
||||
metadata=ElementMetadata(
|
||||
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
|
||||
metadata=ElementMetadata(
|
||||
regex_metadata={
|
||||
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
|
||||
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
|
||||
},
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"In rhoncus ipsum sed lectus porta volutpat.",
|
||||
metadata=ElementMetadata(
|
||||
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
|
||||
),
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(),
|
||||
)
|
||||
|
||||
regex_metadata = pre_chunk._consolidated_regex_meta
|
||||
|
||||
assert regex_metadata == {
|
||||
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
||||
"ipsum": [
|
||||
RegexMetadata(text="Ipsum", start=6, end=11),
|
||||
RegexMetadata(text="ipsum", start=19, end=24),
|
||||
RegexMetadata(text="ipsum", start=81, end=86),
|
||||
],
|
||||
}
|
||||
|
||||
def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
|
||||
"""._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
|
||||
|
||||
Only non-None fields should appear in the dict and each field value should be the
|
||||
consolidation of the values across the pre_chunk elements.
|
||||
"""
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
PageBreak(""),
|
||||
Title(
|
||||
"Lorem Ipsum",
|
||||
metadata=ElementMetadata(
|
||||
filename="foo.docx",
|
||||
# -- category_depth has DROP strategy so doesn't appear in result --
|
||||
category_depth=0,
|
||||
emphasized_text_contents=["Lorem", "Ipsum"],
|
||||
emphasized_text_tags=["b", "i"],
|
||||
languages=["lat"],
|
||||
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
|
||||
metadata=ElementMetadata(
|
||||
# -- filename change doesn't happen IRL but demonstrates FIRST strategy --
|
||||
filename="bar.docx",
|
||||
# -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
|
||||
# -- appears twice in consolidated-meta (as it should) and length matches
|
||||
# -- that of emphasized_text_tags both before and after consolidation.
|
||||
emphasized_text_contents=["Lorem", "ipsum"],
|
||||
emphasized_text_tags=["i", "b"],
|
||||
# -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
|
||||
languages=["eng", "lat"],
|
||||
# -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
|
||||
regex_metadata={
|
||||
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
|
||||
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
|
||||
},
|
||||
),
|
||||
),
|
||||
],
|
||||
opts=ChunkingOptions.new(),
|
||||
)
|
||||
|
||||
meta_kwargs = pre_chunk._meta_kwargs
|
||||
|
||||
assert meta_kwargs == {
|
||||
"filename": "foo.docx",
|
||||
"emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
|
||||
"emphasized_text_tags": ["b", "i", "i", "b"],
|
||||
"languages": ["lat", "eng"],
|
||||
"regex_metadata": {
|
||||
"ipsum": [
|
||||
RegexMetadata(text="Ipsum", start=6, end=11),
|
||||
RegexMetadata(text="ipsum", start=19, end=24),
|
||||
],
|
||||
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
||||
},
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("elements", "expected_value"),
|
||||
[
|
||||
([Text("foo"), Text("bar")], "foo\n\nbar"),
|
||||
([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
|
||||
([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
|
||||
([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
|
||||
],
|
||||
)
|
||||
def it_knows_the_concatenated_text_of_the_pre_chunk(
|
||||
self, elements: List[Text], expected_value: str
|
||||
):
|
||||
"""._text is the "joined" text of the pre-chunk elements.
|
||||
|
||||
The text-segment contributed by each element is separated from the next by a blank line
|
||||
("\n\n"). An element that contributes no text does not give rise to a separator.
|
||||
"""
|
||||
pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
|
||||
assert pre_chunk._text == expected_value
|
||||
|
||||
|
||||
class DescribeTextPreChunkBuilder:
|
||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`."""
|
||||
|
||||
def it_is_empty_on_construction(self):
|
||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 50
|
||||
|
||||
def it_accumulates_elements_added_to_it(self):
|
||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
|
||||
builder.add_element(Title("Introduction"))
|
||||
assert builder.text_length == 12
|
||||
assert builder.remaining_space == 136
|
||||
|
||||
builder.add_element(
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
)
|
||||
assert builder.text_length == 112
|
||||
assert builder.remaining_space == 36
|
||||
|
||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
builder.add_element(Title("Introduction"))
|
||||
builder.add_element(
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
)
|
||||
|
||||
pre_chunk = next(builder.flush())
|
||||
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
"lectus porta volutpat.",
|
||||
),
|
||||
]
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 150
|
||||
|
||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
|
||||
pre_chunks = list(builder.flush())
|
||||
|
||||
assert pre_chunks == []
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 150
|
||||
|
||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||
builder.add_element(Text("abcde"))
|
||||
builder.add_element(Text("fghij"))
|
||||
|
||||
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
||||
assert builder.text_length == 12
|
||||
# -- .remaining_space is reduced by the length (2) of the trailing separator which would go
|
||||
# -- between the current text and that of the next element if one was added.
|
||||
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
|
||||
assert builder.remaining_space == 36
|
||||
|
||||
|
||||
# == PreChunkCombiner =============================================================================
|
||||
|
||||
|
||||
class DescribePreChunkCombiner:
|
||||
"""Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`."""
|
||||
|
||||
def it_combines_sequential_small_text_pre_chunks(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
pre_chunks = [
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def but_it_does_not_combine_table_pre_chunks(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
pre_chunks = [
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TablePreChunk(Table("Heading\nCell text"), opts=opts),
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TablePreChunk)
|
||||
assert pre_chunk._table == Table("Heading\nCell text")
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_respects_the_specified_combination_threshold(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
|
||||
pre_chunks = [
|
||||
TextPreChunk( # 68
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk( # 71
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
# -- len == 139
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_respects_the_hard_maximum_window_length(self):
|
||||
opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
|
||||
pre_chunks = [
|
||||
TextPreChunk( # 68
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk( # 71
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
# -- len == 139
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
# -- len == 214
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
|
||||
"""Such as occurs when a single element exceeds the window size."""
|
||||
opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
||||
pre_chunks = [
|
||||
TextPreChunk([Title("Lorem Ipsum")], opts=opts),
|
||||
TextPreChunk( # 179
|
||||
[
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
|
||||
" Mauris nec urna non augue vulputate consequat eget et nisi." # 60
|
||||
" Sed orci quam, eleifend sit amet vehicula, elementum ultricies." # 64
|
||||
)
|
||||
],
|
||||
opts=opts,
|
||||
),
|
||||
TextPreChunk([Title("Vulputate Consequat")], opts=opts),
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [Title("Lorem Ipsum")]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit."
|
||||
" Mauris nec urna non augue vulputate consequat eget et nisi."
|
||||
" Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
|
||||
)
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [Title("Vulputate Consequat")]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
|
||||
|
||||
class DescribeTextPreChunkAccumulator:
|
||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`."""
|
||||
|
||||
def it_is_empty_on_construction(self):
|
||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))
|
||||
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 100
|
||||
|
||||
def it_accumulates_pre_chunks_added_to_it(self):
|
||||
opts = ChunkingOptions.new(max_characters=500)
|
||||
accum = TextPreChunkAccumulator(opts=opts)
|
||||
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
assert accum.text_length == 68
|
||||
assert accum.remaining_space == 430
|
||||
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
assert accum.text_length == 141
|
||||
assert accum.remaining_space == 357
|
||||
|
||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
opts = ChunkingOptions.new(max_characters=150)
|
||||
accum = TextPreChunkAccumulator(opts=opts)
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
||||
],
|
||||
opts=opts,
|
||||
)
|
||||
)
|
||||
|
||||
pre_chunk_iter = accum.flush()
|
||||
|
||||
# -- iterator generates exactly one pre_chunk --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunk_iter)
|
||||
# -- and it is a _TextPreChunk containing all the elements --
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
||||
]
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 150
|
||||
|
||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
|
||||
|
||||
pre_chunks = list(accum.flush())
|
||||
|
||||
assert pre_chunks == []
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 150
|
||||
|
||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||
opts = ChunkingOptions.new(max_characters=100)
|
||||
accum = TextPreChunkAccumulator(opts=opts)
|
||||
accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
|
||||
accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))
|
||||
|
||||
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
||||
assert accum.text_length == 12
|
||||
# -- .remaining_space is reduced by the length (2) of the trailing separator which would
|
||||
# -- go between the current text and that of the next pre-chunk if one was added.
|
||||
# -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
|
||||
assert accum.remaining_space == 86
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.11.5-dev1" # pragma: no cover
|
||||
__version__ = "0.11.5-dev2" # pragma: no cover
|
||||
|
||||
@ -2,12 +2,25 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
import collections
|
||||
import copy
|
||||
from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
|
||||
|
||||
from typing_extensions import Self
|
||||
from typing_extensions import Self, TypeAlias
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
CompositeElement,
|
||||
ConsolidationStrategy,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
RegexMetadata,
|
||||
Table,
|
||||
TableChunk,
|
||||
)
|
||||
from unstructured.utils import lazyproperty
|
||||
|
||||
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
|
||||
|
||||
|
||||
class ChunkingOptions:
|
||||
"""Specifies parameters of optional chunking behaviors."""
|
||||
@ -150,3 +163,404 @@ class ChunkingOptions:
|
||||
# loop (I think).
|
||||
if self._overlap >= max_characters:
|
||||
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# PRE-CHUNK SUB-TYPES
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class TablePreChunk:
    """A pre-chunk composed of a single Table element."""

    def __init__(self, table: Table, opts: ChunkingOptions) -> None:
        self._table = table
        self._opts = opts

    def iter_chunks(self) -> Iterator[Table | TableChunk]:
        """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
        maxlen = self._opts.hard_max
        remaining_text = self._table.text
        remaining_html = self._table.metadata.text_as_html or ""

        # -- a table small enough to fit in a single chunk is emitted whole --
        if len(remaining_text) <= maxlen and len(remaining_html) <= maxlen:
            yield self._table
            return

        first_chunk = True
        while remaining_text or remaining_html:
            # -- carve off the next maxlen characters of text into a TableChunk; each chunk
            # -- gets its own deep-copied metadata so later mutations don't leak across chunks --
            table_chunk = TableChunk(
                text=remaining_text[:maxlen],
                metadata=copy.deepcopy(self._table.metadata),
            )
            remaining_text = remaining_text[maxlen:]

            # -- attach up to maxlen characters of the HTML. Note no attempt is made to pick
            # -- only the HTML elements that *correspond* to the text fragment of this chunk. --
            if remaining_html:
                table_chunk.metadata.text_as_html = remaining_html[:maxlen]
                remaining_html = remaining_html[maxlen:]

            # -- every chunk after the first is flagged as a continuation --
            if not first_chunk:
                table_chunk.metadata.is_continuation = True

            yield table_chunk
            first_chunk = False
|
||||
|
||||
|
||||
class TextPreChunk:
    """A sequence of elements that belong to the same semantic unit within a document.

    The name "section" derives from the idea of a document-section, a heading followed by the
    paragraphs "under" that heading. That structure is not found in all documents and actual section
    content can vary, but that's the concept.

    This object is purposely immutable.
    """

    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
        self._elements = list(elements)
        self._opts = opts

    def __eq__(self, other: Any) -> bool:
        # -- equality is judged on the element sequence alone; `opts` does not participate --
        if not isinstance(other, TextPreChunk):
            return False
        return self._elements == other._elements

    def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
        """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
        # -- a new instance is formed rather than mutating either operand (immutability) --
        return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)

    def iter_chunks(self) -> Iterator[CompositeElement]:
        """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
        text = self._text
        text_len = len(text)
        maxlen = self._opts.hard_max
        start = 0
        remaining = text_len

        # -- each pass emits a window of at most `maxlen` characters; every chunk carries the
        # -- same consolidated metadata --
        while remaining > 0:
            end = min(start + maxlen, text_len)
            yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
            start = end
            remaining = text_len - end

    @lazyproperty
    def text_length(self) -> int:
        """Length of concatenated text of this pre-chunk, including separators."""
        # -- used by pre-chunk-combiner to identify combination candidates --
        return len(self._text)

    @lazyproperty
    def _all_metadata_values(self) -> Dict[str, List[Any]]:
        """Collection of all populated metadata values across elements.

        The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
        at least one of the elements in this pre-chunk. The value of that key is a list of all those
        populated values, in element order, for example:

            {
                "filename": ["sample.docx", "sample.docx"],
                "languages": [["lat"], ["lat", "eng"]]
                ...
            }

        This preprocessing step provides the input for a specified consolidation strategy that will
        resolve the list of values for each field to a single consolidated value.
        """

        def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
            """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
            return (
                (field_name, value)
                for field_name, value in metadata.known_fields.items()
                if value is not None
            )

        field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)

        # -- collect all non-None field values in a list for each field, in element-order --
        for e in self._elements:
            for field_name, value in iter_populated_fields(e.metadata):
                field_values[field_name].append(value)

        return dict(field_values)

    @lazyproperty
    def _consolidated_metadata(self) -> ElementMetadata:
        """Metadata applicable to this pre-chunk as a single chunk.

        Formed by applying consolidation rules to all metadata fields across the elements of this
        pre-chunk.

        For the sake of consistency, the same rules are applied (for example, for dropping values)
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
        return ElementMetadata(**self._meta_kwargs)

    @lazyproperty
    def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.

        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
        offsets of each regex match are also adjusted for their new positions.
        """
        chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
        separator_len = len(self._opts.text_separator)
        # -- running length of the joined text processed so far; used to shift match offsets --
        running_text_len = 0
        start_offset = 0

        for element in self._elements:
            text_len = len(element.text)
            # -- skip empty elements like `PageBreak("")` --
            if not text_len:
                continue
            # -- account for blank line between "squashed" elements, but not before first element --
            running_text_len += separator_len if running_text_len else 0
            start_offset = running_text_len
            running_text_len += text_len

            if not element.metadata.regex_metadata:
                continue

            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
            # -- deep-copy first so the source element's metadata is not mutated --
            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
            for regex_name, matches in element_regex_metadata.items():
                for m in matches:
                    m["start"] += start_offset
                    m["end"] += start_offset
                chunk_matches = chunk_regex_metadata.get(regex_name, [])
                chunk_matches.extend(matches)
                chunk_regex_metadata[regex_name] = chunk_matches

        return chunk_regex_metadata

    @lazyproperty
    def _meta_kwargs(self) -> Dict[str, Any]:
        """The consolidated metadata values as a dict suitable for constructing ElementMetadata.

        This is where consolidation strategies are actually applied. The output is suitable for use
        in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
        """
        CS = ConsolidationStrategy
        field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
            """Generate (field-name, value) pairs for each field in consolidated metadata."""
            for field_name, values in self._all_metadata_values.items():
                strategy = field_consolidation_strategies.get(field_name)
                if strategy is CS.FIRST:
                    yield field_name, values[0]
                # -- concatenate lists from each element that had one, in order --
                elif strategy is CS.LIST_CONCATENATE:
                    yield field_name, sum(values, cast(List[Any], []))
                # -- union lists from each element, preserving order of appearance --
                elif strategy is CS.LIST_UNIQUE:
                    # -- Python 3.7+ maintains dict insertion order --
                    ordered_unique_keys = {key: None for val_list in values for key in val_list}
                    yield field_name, list(ordered_unique_keys.keys())
                elif strategy is CS.REGEX:
                    yield field_name, self._consolidated_regex_meta
                elif strategy is CS.DROP:
                    continue
                else:
                    # -- not likely to hit this since we have a test in `text_elements.py` that
                    # -- ensures every ElementMetadata fields has an assigned strategy.
                    raise NotImplementedError(
                        f"metadata field {repr(field_name)} has no defined consolidation strategy"
                    )

        return dict(iter_kwarg_pairs())

    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.

        Each element-text is separated from the next by a blank line ("\n\n").
        """
        text_separator = self._opts.text_separator
        return text_separator.join(e.text for e in self._elements if e.text)
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# PRE-CHUNKING ACCUMULATORS
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
# Accumulators encapsulate the work of grouping elements and later pre-chunks to form the larger
|
||||
# pre-chunk and combined-pre-chunk items central to unstructured chunking.
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class PreChunkBuilder:
    """An element accumulator suitable for incrementally forming a pre-chunk.

    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
    to determine whether it should add the next element in the element stream.

    `.flush()` is used to build a PreChunk object from the accumulated elements. This method
    returns an iterator that generates zero-or-one pre-chunk object and is used like so:

        yield from builder.flush()

    If no elements have been accumulated, no `PreChunk` instance is generated. Flushing the builder
    clears the elements it contains so it is ready to build the next pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._separator_len = len(opts.text_separator)
        self._elements: List[Element] = []

        # -- texts of accumulated elements that have any; an empty text like
        # -- `PageBreak.text == ""` is not recorded here --
        self._text_segments: List[str] = []
        # -- running total of segment lengths; separators are not included in this count --
        self._text_len: int = 0

    def add_element(self, element: Element) -> None:
        """Add `element` to this section."""
        self._elements.append(element)
        text = element.text
        if text:
            self._text_segments.append(text)
            self._text_len += len(text)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate zero-or-one `PreChunk` object and clear the accumulator.

        Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
        boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
        stream.
        """
        if not self._elements:
            return
        # -- snapshot and reset all state *before* yielding, so the builder can accept elements
        # -- for the next pre-chunk no matter when the caller exhausts this iterator --
        flushed_elements = list(self._elements)
        self._elements = []
        self._text_segments = []
        self._text_len = 0
        yield TextPreChunk(flushed_elements, self._opts)

    @property
    def remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- every segment is charged one separator, including a trailing one that would
        # -- precede the text of the next element if one were added --
        occupied = self._text_len + self._separator_len * len(self._text_segments)
        return self._opts.hard_max - occupied

    @property
    def text_length(self) -> int:
        """Length of the text in this pre-chunk.

        This value represents the chunk-size that would result if this pre-chunk was flushed in its
        current state. In particular, it does not include the length of a trailing separator (since
        that would only appear if an additional element was added).

        Not suitable for judging remaining space, use `.remaining_space` for that value.
        """
        # -- separators appear only *between* text segments, never at the end, so n segments
        # -- incur n - 1 separators (and zero for both 0 and 1 segments) --
        separator_count = max(len(self._text_segments) - 1, 0)
        return self._text_len + separator_count * self._separator_len
|
||||
|
||||
|
||||
class PreChunkCombiner:
    """Filters pre-chunk stream to combine small pre-chunks where possible."""

    def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
        self._pre_chunks = pre_chunks
        self._opts = opts

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
        accum = TextPreChunkAccumulator(self._opts)
        soft_max = self._opts.combine_text_under_n_chars

        for pre_chunk in self._pre_chunks:
            is_table = isinstance(pre_chunk, TablePreChunk)

            # -- flush accumulated text when this pre-chunk cannot join it: a table pre-chunk
            # -- never combines, the accumulated text already reached the combination soft-max,
            # -- or adding this pre-chunk would exceed the hard-max window --
            if (
                is_table
                or accum.text_length >= soft_max
                or accum.remaining_space < pre_chunk.text_length
            ):
                yield from accum.flush()

            if is_table:
                # -- emit a table pre-chunk directly; it never enters the accumulator --
                yield pre_chunk
            else:
                accum.add_pre_chunk(pre_chunk)

        # -- flush any residual accumulated text at end of stream --
        yield from accum.flush()
|
||||
|
||||
|
||||
class TextPreChunkAccumulator:
    """Accumulates, measures, and combines pre-chunk objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another pre-chunk.

    `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
    This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used
    like so:

        yield from accum.flush()

    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder
    clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
        """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
        self._pre_chunks.append(pre_chunk)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate all accumulated pre-chunks as a single combined pre-chunk."""
        accumulated = self._pre_chunks

        # -- nothing to generate when the accumulator is empty --
        if not accumulated:
            return

        # -- fold every accumulated pre-chunk into the first one --
        combined, *rest = accumulated
        for next_pre_chunk in rest:
            combined = combined.combine(next_pre_chunk)
        yield combined

        # -- leave the accumulator empty, ready for the next round --
        accumulated.clear()

    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
        maxlen = self._opts.hard_max
        if not self._pre_chunks:
            return maxlen
        # -- an additional pre-chunk would also incur one more separator --
        return maxlen - self.text_length - len(self._opts.text_separator)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator."""
        pre_chunks = self._pre_chunks

        if not pre_chunks:
            return 0

        # -- one separator joins each adjacent pair of pre-chunks --
        separators_len = len(self._opts.text_separator) * (len(pre_chunks) - 1)
        return sum(p.text_length for p in pre_chunks) + separators_len
|
||||
|
||||
@ -5,26 +5,20 @@ Main entry point is the `@add_chunking_strategy()` decorator.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import copy
|
||||
from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from unstructured.chunking.base import ChunkingOptions
|
||||
from unstructured.chunking.base import (
|
||||
ChunkingOptions,
|
||||
PreChunk,
|
||||
PreChunkBuilder,
|
||||
PreChunkCombiner,
|
||||
TablePreChunk,
|
||||
)
|
||||
from unstructured.documents.elements import (
|
||||
CompositeElement,
|
||||
ConsolidationStrategy,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
RegexMetadata,
|
||||
Table,
|
||||
TableChunk,
|
||||
Title,
|
||||
)
|
||||
from unstructured.utils import lazyproperty
|
||||
|
||||
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
|
||||
|
||||
|
||||
def chunk_by_title(
|
||||
@ -78,7 +72,7 @@ def chunk_by_title(
|
||||
|
||||
def _split_elements_by_title_and_table(
|
||||
elements: List[Element], opts: ChunkingOptions
|
||||
) -> Iterator[TextPreChunk | TablePreChunk]:
|
||||
) -> Iterator[PreChunk]:
|
||||
"""Implements "pre-chunker" responsibilities.
|
||||
|
||||
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
|
||||
@ -102,7 +96,7 @@ def _split_elements_by_title_and_table(
|
||||
|
||||
A Table or Checkbox element is placed into a pre-chunk by itself.
|
||||
"""
|
||||
pre_chunk_builder = TextPreChunkBuilder(opts)
|
||||
pre_chunk_builder = PreChunkBuilder(opts)
|
||||
|
||||
prior_element = None
|
||||
|
||||
@ -156,396 +150,3 @@ def _metadata_differs(
|
||||
if ignore_page_numbers:
|
||||
return False
|
||||
return metadata1.page_number != metadata2.page_number
|
||||
|
||||
|
||||
# == PreChunks ===================================================================================
|
||||
|
||||
|
||||
class TablePreChunk:
    """A pre-chunk composed of a single Table element."""

    def __init__(self, table: Table, opts: ChunkingOptions) -> None:
        self._table = table
        self._opts = opts

    def iter_chunks(self) -> Iterator[Table | TableChunk]:
        """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
        text = self._table.text
        html = self._table.metadata.text_as_html or ""
        maxlen = self._opts.hard_max

        # -- only chunk a table when it's too big to swallow whole --
        if len(text) <= maxlen and len(html) <= maxlen:
            yield self._table
            return

        is_continuation = False

        while text or html:
            # -- split off the next maxchars into the next TableChunk; metadata is deep-copied so
            # -- each chunk owns its own copy and later mutations do not leak across chunks --
            text_chunk, text = text[:maxlen], text[maxlen:]
            table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata))

            # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
            # -- HTML elements that *correspond* to the TextChunk.text fragment.
            if html:
                html_chunk, html = html[:maxlen], html[maxlen:]
                table_chunk.metadata.text_as_html = html_chunk

            # -- mark second and later chunks as a continuation --
            if is_continuation:
                table_chunk.metadata.is_continuation = True

            yield table_chunk

            is_continuation = True
|
||||
|
||||
|
||||
class TextPreChunk:
    """A sequence of elements that belong to the same semantic unit within a document.

    The name "section" derives from the idea of a document-section, a heading followed by the
    paragraphs "under" that heading. That structure is not found in all documents and actual section
    content can vary, but that's the concept.

    This object is purposely immutable.
    """

    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
        self._elements = list(elements)
        self._opts = opts

    def __eq__(self, other: Any) -> bool:
        # -- equality is judged on the element sequence alone; `opts` does not participate --
        if not isinstance(other, TextPreChunk):
            return False
        return self._elements == other._elements

    def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
        """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
        # -- a new instance is formed rather than mutating either operand (immutability) --
        return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)

    def iter_chunks(self) -> Iterator[CompositeElement]:
        """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
        text = self._text
        text_len = len(text)
        maxlen = self._opts.hard_max
        start = 0
        remaining = text_len

        # -- each pass emits a window of at most `maxlen` characters; every chunk carries the
        # -- same consolidated metadata --
        while remaining > 0:
            end = min(start + maxlen, text_len)
            yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
            start = end
            remaining = text_len - end

    @lazyproperty
    def text_length(self) -> int:
        """Length of concatenated text of this pre-chunk, including separators."""
        # -- used by pre-chunk-combiner to identify combination candidates --
        return len(self._text)

    @lazyproperty
    def _all_metadata_values(self) -> Dict[str, List[Any]]:
        """Collection of all populated metadata values across elements.

        The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
        at least one of the elements in this pre-chunk. The value of that key is a list of all those
        populated values, in element order, for example:

            {
                "filename": ["sample.docx", "sample.docx"],
                "languages": [["lat"], ["lat", "eng"]]
                ...
            }

        This preprocessing step provides the input for a specified consolidation strategy that will
        resolve the list of values for each field to a single consolidated value.
        """

        def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
            """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
            return (
                (field_name, value)
                for field_name, value in metadata.known_fields.items()
                if value is not None
            )

        field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)

        # -- collect all non-None field values in a list for each field, in element-order --
        for e in self._elements:
            for field_name, value in iter_populated_fields(e.metadata):
                field_values[field_name].append(value)

        return dict(field_values)

    @lazyproperty
    def _consolidated_metadata(self) -> ElementMetadata:
        """Metadata applicable to this pre-chunk as a single chunk.

        Formed by applying consolidation rules to all metadata fields across the elements of this
        pre-chunk.

        For the sake of consistency, the same rules are applied (for example, for dropping values)
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
        return ElementMetadata(**self._meta_kwargs)

    @lazyproperty
    def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.

        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
        offsets of each regex match are also adjusted for their new positions.
        """
        chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
        separator_len = len(self._opts.text_separator)
        # -- running length of the joined text processed so far; used to shift match offsets --
        running_text_len = 0
        start_offset = 0

        for element in self._elements:
            text_len = len(element.text)
            # -- skip empty elements like `PageBreak("")` --
            if not text_len:
                continue
            # -- account for blank line between "squashed" elements, but not before first element --
            running_text_len += separator_len if running_text_len else 0
            start_offset = running_text_len
            running_text_len += text_len

            if not element.metadata.regex_metadata:
                continue

            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
            # -- deep-copy first so the source element's metadata is not mutated --
            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
            for regex_name, matches in element_regex_metadata.items():
                for m in matches:
                    m["start"] += start_offset
                    m["end"] += start_offset
                chunk_matches = chunk_regex_metadata.get(regex_name, [])
                chunk_matches.extend(matches)
                chunk_regex_metadata[regex_name] = chunk_matches

        return chunk_regex_metadata

    @lazyproperty
    def _meta_kwargs(self) -> Dict[str, Any]:
        """The consolidated metadata values as a dict suitable for constructing ElementMetadata.

        This is where consolidation strategies are actually applied. The output is suitable for use
        in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
        """
        CS = ConsolidationStrategy
        field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
            """Generate (field-name, value) pairs for each field in consolidated metadata."""
            for field_name, values in self._all_metadata_values.items():
                strategy = field_consolidation_strategies.get(field_name)
                if strategy is CS.FIRST:
                    yield field_name, values[0]
                # -- concatenate lists from each element that had one, in order --
                elif strategy is CS.LIST_CONCATENATE:
                    yield field_name, sum(values, cast(List[Any], []))
                # -- union lists from each element, preserving order of appearance --
                elif strategy is CS.LIST_UNIQUE:
                    # -- Python 3.7+ maintains dict insertion order --
                    ordered_unique_keys = {key: None for val_list in values for key in val_list}
                    yield field_name, list(ordered_unique_keys.keys())
                elif strategy is CS.REGEX:
                    yield field_name, self._consolidated_regex_meta
                elif strategy is CS.DROP:
                    continue
                else:
                    # -- not likely to hit this since we have a test in `text_elements.py` that
                    # -- ensures every ElementMetadata fields has an assigned strategy.
                    raise NotImplementedError(
                        f"metadata field {repr(field_name)} has no defined consolidation strategy"
                    )

        return dict(iter_kwarg_pairs())

    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.

        Each element-text is separated from the next by a blank line ("\n\n").
        """
        text_separator = self._opts.text_separator
        return text_separator.join(e.text for e in self._elements if e.text)
|
||||
|
||||
|
||||
class TextPreChunkBuilder:
    """An element accumulator suitable for incrementally forming a pre-chunk.

    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
    to determine whether it should add the next element in the element stream.

    `.flush()` is used to build a `TextPreChunk` object from the accumulated elements. This method
    returns an iterator that generates zero-or-one `TextPreChunk` object and is used like so:

        yield from builder.flush()

    If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder
    clears the elements it contains so it is ready to build the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._separator_len = len(opts.text_separator)
        self._elements: List[Element] = []

        # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
        self._text_segments: List[str] = []
        # -- combined length of text-segments, not including separators --
        self._text_len: int = 0

    def add_element(self, element: Element) -> None:
        """Add `element` to this section."""
        self._elements.append(element)
        if element.text:
            self._text_segments.append(element.text)
            self._text_len += len(element.text)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate zero-or-one `TextPreChunk` object and clear the accumulator.

        Suitable for use to emit a pre-chunk when the maximum size has been reached or a semantic
        boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
        stream.
        """
        if not self._elements:
            return
        # -- clear builder before yield so we're not sensitive to the timing of how/when this
        # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
        elements = self._elements[:]
        self._elements.clear()
        self._text_segments.clear()
        self._text_len = 0
        yield TextPreChunk(elements, self._opts)

    @property
    def remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- include length of trailing separator that will go before next element text --
        separators_len = self._separator_len * len(self._text_segments)
        return self._opts.hard_max - self._text_len - separators_len

    @property
    def text_length(self) -> int:
        """Length of the text in this pre-chunk.

        This value represents the chunk-size that would result if this pre-chunk was flushed in its
        current state. In particular, it does not include the length of a trailing separator (since
        that would only appear if an additional element was added).

        Not suitable for judging remaining space, use `.remaining_space` for that value.
        """
        # -- number of text separators present in joined text of elements. This includes only
        # -- separators *between* text segments, not one at the end. Note there are zero separators
        # -- for both 0 and 1 text-segments.
        n = len(self._text_segments)
        separator_count = n - 1 if n else 0
        return self._text_len + (separator_count * self._separator_len)
|
||||
|
||||
|
||||
# == PreChunkCombiner ============================================================================
|
||||
|
||||
|
||||
class PreChunkCombiner:
    """Filters pre-chunk stream to combine small pre-chunks where possible."""

    def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
        self._pre_chunks = pre_chunks
        self._opts = opts

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
        accumulator = TextPreChunkAccumulator(self._opts)
        soft_max = self._opts.combine_text_under_n_chars

        for pre_chunk in self._pre_chunks:
            is_table = isinstance(pre_chunk, TablePreChunk)

            # -- flush accumulated text when: a table arrives (tables are never combined), the
            # -- combination soft-max has been reached, or this pre-chunk won't fit in the
            # -- remaining window (hard-max) --
            if (
                is_table
                or accumulator.text_length >= soft_max
                or accumulator.remaining_space < pre_chunk.text_length
            ):
                yield from accumulator.flush()

            # -- a table pre-chunk passes straight through; text pre-chunks accumulate --
            if is_table:
                yield pre_chunk
            else:
                accumulator.add_pre_chunk(pre_chunk)

        yield from accumulator.flush()
|
||||
|
||||
|
||||
class TextPreChunkAccumulator:
    """Accumulates, measures, and combines pre-chunk objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another pre-chunk.

    `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
    This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used
    like so:

        yield from accum.flush()

    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder
    clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
        """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
        self._pre_chunks.append(pre_chunk)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate all accumulated pre-chunks as a single combined pre-chunk."""
        # -- nothing to emit when the accumulator is empty --
        if not self._pre_chunks:
            return

        # -- fold all accumulated pre-chunks into the first one --
        combined, *rest = self._pre_chunks
        for next_pre_chunk in rest:
            combined = combined.combine(next_pre_chunk)
        yield combined

        # -- reset to empty, ready for the next accumulation cycle --
        self._pre_chunks.clear()

    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
        maxlen = self._opts.hard_max
        if not self._pre_chunks:
            return maxlen
        # -- an additional pre-chunk will also incur an additional separator --
        return maxlen - self.text_length - len(self._opts.text_separator)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator."""
        pre_chunk_count = len(self._pre_chunks)
        if pre_chunk_count == 0:
            return 0
        # -- separators appear only *between* pre-chunks --
        separators_len = len(self._opts.text_separator) * (pre_chunk_count - 1)
        return sum(p.text_length for p in self._pre_chunks) + separators_len
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user