mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-08 14:39:27 +00:00
rfctr(chunking): extract general-purpose objects to base (#2281)
Many of the classes defined in `unstructured.chunking.title` are applicable to any chunking strategy and will shortly be used for the "by-character" chunking strategy as well. Move these and their tests to `unstructured.chunking.base`. Along the way, rename `TextPreChunkBuilder` to `PreChunkBuilder` because it will be generalized in a subsequent PR to also take `Table` elements such that inter-pre-chunk overlap can be implemented. Otherwise, no logic changes, just moves.
This commit is contained in:
parent
a7c3f5f570
commit
36e81c3367
@ -1,4 +1,4 @@
|
|||||||
## 0.11.5-dev1
|
## 0.11.5-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
|||||||
@ -1,14 +1,35 @@
|
|||||||
|
# pyright: reportPrivateUsage=false
|
||||||
|
|
||||||
"""Unit-test suite for the `unstructured.chunking.base` module."""
|
"""Unit-test suite for the `unstructured.chunking.base` module."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from unstructured.chunking.base import ChunkingOptions
|
from unstructured.chunking.base import (
|
||||||
|
ChunkingOptions,
|
||||||
|
PreChunkBuilder,
|
||||||
|
PreChunkCombiner,
|
||||||
|
TablePreChunk,
|
||||||
|
TextPreChunk,
|
||||||
|
TextPreChunkAccumulator,
|
||||||
|
)
|
||||||
|
from unstructured.documents.elements import (
|
||||||
|
CompositeElement,
|
||||||
|
ElementMetadata,
|
||||||
|
PageBreak,
|
||||||
|
RegexMetadata,
|
||||||
|
Table,
|
||||||
|
TableChunk,
|
||||||
|
Text,
|
||||||
|
Title,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class DescribeChunkingOptions:
|
class DescribeChunkingOptions:
|
||||||
"""Unit-test suite for `unstructured.chunking.model.ChunkingOptions objects."""
|
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
|
||||||
|
|
||||||
@pytest.mark.parametrize("max_characters", [0, -1, -42])
|
@pytest.mark.parametrize("max_characters", [0, -1, -42])
|
||||||
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
|
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
|
||||||
@ -111,3 +132,847 @@ class DescribeChunkingOptions:
|
|||||||
|
|
||||||
def it_knows_the_text_separator_string(self):
|
def it_knows_the_text_separator_string(self):
|
||||||
assert ChunkingOptions.new().text_separator == "\n\n"
|
assert ChunkingOptions.new().text_separator == "\n\n"
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# PRE-CHUNK SUBTYPES
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class DescribeTablePreChunk:
|
||||||
|
"""Unit-test suite for `unstructured.chunking.base.TablePreChunk objects."""
|
||||||
|
|
||||||
|
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
|
||||||
|
html_table = (
|
||||||
|
"<table>\n"
|
||||||
|
"<thead>\n"
|
||||||
|
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||||
|
"</thead>\n"
|
||||||
|
"<tbody>\n"
|
||||||
|
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
|
||||||
|
"</tbody>\n"
|
||||||
|
"</table>"
|
||||||
|
)
|
||||||
|
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
|
||||||
|
pre_chunk = TablePreChunk(
|
||||||
|
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||||
|
opts=ChunkingOptions.new(max_characters=175),
|
||||||
|
)
|
||||||
|
|
||||||
|
chunk_iter = pre_chunk.iter_chunks()
|
||||||
|
|
||||||
|
chunk = next(chunk_iter)
|
||||||
|
assert isinstance(chunk, Table)
|
||||||
|
assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
|
||||||
|
assert chunk.metadata.text_as_html == (
|
||||||
|
"<table>\n"
|
||||||
|
"<thead>\n"
|
||||||
|
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||||
|
"</thead>\n"
|
||||||
|
"<tbody>\n"
|
||||||
|
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
|
||||||
|
"</tbody>\n"
|
||||||
|
"</table>"
|
||||||
|
)
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(chunk_iter)
|
||||||
|
|
||||||
|
def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
|
||||||
|
# fixed-overhead = 8+8+9+8+9+8 = 50
|
||||||
|
# per-row overhead = 27
|
||||||
|
html_table = (
|
||||||
|
"<table>\n" # 8
|
||||||
|
"<thead>\n" # 8
|
||||||
|
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||||
|
"</thead>\n" # 9
|
||||||
|
"<tbody>\n" # 8
|
||||||
|
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
|
||||||
|
"<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
|
||||||
|
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
||||||
|
"<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
|
||||||
|
"</tbody>\n" # 9
|
||||||
|
"</table>" # 8
|
||||||
|
)
|
||||||
|
text_table = (
|
||||||
|
"Header Col 1 Header Col 2\n"
|
||||||
|
"Lorem ipsum dolor sit amet\n"
|
||||||
|
"Consectetur adipiscing elit\n"
|
||||||
|
"Nunc aliquam id enim nec molestie\n"
|
||||||
|
"Vivamus quis nunc ipsum donec ac fermentum"
|
||||||
|
)
|
||||||
|
pre_chunk = TablePreChunk(
|
||||||
|
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||||
|
opts=ChunkingOptions.new(max_characters=100),
|
||||||
|
)
|
||||||
|
|
||||||
|
chunk_iter = pre_chunk.iter_chunks()
|
||||||
|
|
||||||
|
chunk = next(chunk_iter)
|
||||||
|
assert isinstance(chunk, TableChunk)
|
||||||
|
assert chunk.text == (
|
||||||
|
"Header Col 1 Header Col 2\n"
|
||||||
|
"Lorem ipsum dolor sit amet\n"
|
||||||
|
"Consectetur adipiscing elit\n"
|
||||||
|
"Nunc aliqua"
|
||||||
|
)
|
||||||
|
assert chunk.metadata.text_as_html == (
|
||||||
|
"<table>\n"
|
||||||
|
"<thead>\n"
|
||||||
|
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
||||||
|
"</thead>\n"
|
||||||
|
"<tbody>\n"
|
||||||
|
"<tr><td>Lo"
|
||||||
|
)
|
||||||
|
# --
|
||||||
|
chunk = next(chunk_iter)
|
||||||
|
assert isinstance(chunk, TableChunk)
|
||||||
|
assert (
|
||||||
|
chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
|
||||||
|
)
|
||||||
|
assert chunk.metadata.text_as_html == (
|
||||||
|
"rem ipsum </td><td>A Link example</td></tr>\n"
|
||||||
|
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
|
||||||
|
)
|
||||||
|
# -- note that text runs out but HTML continues because it's significantly longer. So two
|
||||||
|
# -- of these chunks have HTML but no text.
|
||||||
|
chunk = next(chunk_iter)
|
||||||
|
assert isinstance(chunk, TableChunk)
|
||||||
|
assert chunk.text == ""
|
||||||
|
assert chunk.metadata.text_as_html == (
|
||||||
|
"/tr>\n"
|
||||||
|
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
||||||
|
"<tr><td>Vivamus quis </td><td>"
|
||||||
|
)
|
||||||
|
# --
|
||||||
|
chunk = next(chunk_iter)
|
||||||
|
assert isinstance(chunk, TableChunk)
|
||||||
|
assert chunk.text == ""
|
||||||
|
assert chunk.metadata.text_as_html == (
|
||||||
|
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
|
||||||
|
)
|
||||||
|
# --
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(chunk_iter)
|
||||||
|
|
||||||
|
|
||||||
|
class DescribeTextPreChunk:
|
||||||
|
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk objects."""
|
||||||
|
|
||||||
|
def it_can_combine_itself_with_another_TextPreChunk_instance(self):
|
||||||
|
""".combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
|
||||||
|
|
||||||
|
Note that neither the original or other pre_chunk are mutated.
|
||||||
|
"""
|
||||||
|
opts = ChunkingOptions.new()
|
||||||
|
pre_chunk = TextPreChunk(
|
||||||
|
[
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
other_pre_chunk = TextPreChunk(
|
||||||
|
[
|
||||||
|
Text("Donec semper facilisis metus finibus malesuada."),
|
||||||
|
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
|
||||||
|
new_pre_chunk = pre_chunk.combine(other_pre_chunk)
|
||||||
|
|
||||||
|
assert new_pre_chunk == TextPreChunk(
|
||||||
|
[
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||||
|
Text("Donec semper facilisis metus finibus malesuada."),
|
||||||
|
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
assert pre_chunk == TextPreChunk(
|
||||||
|
[
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
assert other_pre_chunk == TextPreChunk(
|
||||||
|
[
|
||||||
|
Text("Donec semper facilisis metus finibus malesuada."),
|
||||||
|
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
|
||||||
|
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
|
||||||
|
pre_chunk = TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Introduction"),
|
||||||
|
Text(
|
||||||
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||||
|
"lectus porta volutpat.",
|
||||||
|
),
|
||||||
|
],
|
||||||
|
opts=ChunkingOptions.new(max_characters=200),
|
||||||
|
)
|
||||||
|
|
||||||
|
chunk_iter = pre_chunk.iter_chunks()
|
||||||
|
|
||||||
|
chunk = next(chunk_iter)
|
||||||
|
assert chunk == CompositeElement(
|
||||||
|
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
|
||||||
|
" In rhoncus ipsum sedlectus porta volutpat.",
|
||||||
|
)
|
||||||
|
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||||
|
|
||||||
|
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
|
||||||
|
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
|
||||||
|
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
|
||||||
|
pre_chunk = TextPreChunk(
|
||||||
|
[
|
||||||
|
Text(
|
||||||
|
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||||
|
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||||
|
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
|
||||||
|
" commodo consequat."
|
||||||
|
),
|
||||||
|
],
|
||||||
|
opts=ChunkingOptions.new(max_characters=200),
|
||||||
|
)
|
||||||
|
|
||||||
|
chunk_iter = pre_chunk.iter_chunks()
|
||||||
|
|
||||||
|
chunk = next(chunk_iter)
|
||||||
|
assert chunk == CompositeElement(
|
||||||
|
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||||
|
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||||
|
" veniam, quis nostrud exercitation ullamco laboris nisi ut a"
|
||||||
|
)
|
||||||
|
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||||
|
# --
|
||||||
|
chunk = next(chunk_iter)
|
||||||
|
assert chunk == CompositeElement("liquip ex ea commodo consequat.")
|
||||||
|
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||||
|
# --
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(chunk_iter)
|
||||||
|
|
||||||
|
def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
|
||||||
|
""".text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
|
||||||
|
pre_chunk = TextPreChunk(
|
||||||
|
[PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
|
||||||
|
)
|
||||||
|
assert pre_chunk.text_length == 8
|
||||||
|
|
||||||
|
def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
|
||||||
|
pre_chunk = TextPreChunk(
|
||||||
|
[
|
||||||
|
Title(
|
||||||
|
"Lorem Ipsum",
|
||||||
|
metadata=ElementMetadata(
|
||||||
|
category_depth=0,
|
||||||
|
filename="foo.docx",
|
||||||
|
languages=["lat"],
|
||||||
|
parent_id="f87731e0",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
Text(
|
||||||
|
"'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
|
||||||
|
metadata=ElementMetadata(
|
||||||
|
category_depth=1,
|
||||||
|
filename="foo.docx",
|
||||||
|
image_path="sprite.png",
|
||||||
|
languages=["lat", "eng"],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
],
|
||||||
|
opts=ChunkingOptions.new(),
|
||||||
|
)
|
||||||
|
|
||||||
|
assert pre_chunk._all_metadata_values == {
|
||||||
|
# -- scalar values are accumulated in a list in element order --
|
||||||
|
"category_depth": [0, 1],
|
||||||
|
# -- all values are accumulated, not only unique ones --
|
||||||
|
"filename": ["foo.docx", "foo.docx"],
|
||||||
|
# -- list-type fields produce a list of lists --
|
||||||
|
"languages": [["lat"], ["lat", "eng"]],
|
||||||
|
# -- fields that only appear in some elements are captured --
|
||||||
|
"image_path": ["sprite.png"],
|
||||||
|
"parent_id": ["f87731e0"],
|
||||||
|
# -- A `None` value never appears, neither does a field-name with an empty list --
|
||||||
|
}
|
||||||
|
|
||||||
|
def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
|
||||||
|
metadata = ElementMetadata(
|
||||||
|
category_depth=0,
|
||||||
|
filename="foo.docx",
|
||||||
|
languages=["lat"],
|
||||||
|
parent_id="f87731e0",
|
||||||
|
)
|
||||||
|
metadata.coefficient = 0.62
|
||||||
|
metadata_2 = ElementMetadata(
|
||||||
|
category_depth=1,
|
||||||
|
filename="foo.docx",
|
||||||
|
image_path="sprite.png",
|
||||||
|
languages=["lat", "eng"],
|
||||||
|
)
|
||||||
|
metadata_2.quotient = 1.74
|
||||||
|
|
||||||
|
pre_chunk = TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Lorem Ipsum", metadata=metadata),
|
||||||
|
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
|
||||||
|
],
|
||||||
|
opts=ChunkingOptions.new(),
|
||||||
|
)
|
||||||
|
|
||||||
|
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
|
||||||
|
assert pre_chunk._all_metadata_values == {
|
||||||
|
"category_depth": [0, 1],
|
||||||
|
"filename": ["foo.docx", "foo.docx"],
|
||||||
|
"image_path": ["sprite.png"],
|
||||||
|
"languages": [["lat"], ["lat", "eng"]],
|
||||||
|
"parent_id": ["f87731e0"],
|
||||||
|
}
|
||||||
|
|
||||||
|
def it_consolidates_regex_metadata_in_a_field_specific_way(self):
|
||||||
|
"""regex_metadata of chunk is combined regex_metadatas of its elements.
|
||||||
|
|
||||||
|
Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
|
||||||
|
position in the chunk after element text has been concatenated.
|
||||||
|
"""
|
||||||
|
pre_chunk = TextPreChunk(
|
||||||
|
[
|
||||||
|
Title(
|
||||||
|
"Lorem Ipsum",
|
||||||
|
metadata=ElementMetadata(
|
||||||
|
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
|
||||||
|
),
|
||||||
|
),
|
||||||
|
Text(
|
||||||
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
|
||||||
|
metadata=ElementMetadata(
|
||||||
|
regex_metadata={
|
||||||
|
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
|
||||||
|
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
),
|
||||||
|
Text(
|
||||||
|
"In rhoncus ipsum sed lectus porta volutpat.",
|
||||||
|
metadata=ElementMetadata(
|
||||||
|
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
|
||||||
|
),
|
||||||
|
),
|
||||||
|
],
|
||||||
|
opts=ChunkingOptions.new(),
|
||||||
|
)
|
||||||
|
|
||||||
|
regex_metadata = pre_chunk._consolidated_regex_meta
|
||||||
|
|
||||||
|
assert regex_metadata == {
|
||||||
|
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
||||||
|
"ipsum": [
|
||||||
|
RegexMetadata(text="Ipsum", start=6, end=11),
|
||||||
|
RegexMetadata(text="ipsum", start=19, end=24),
|
||||||
|
RegexMetadata(text="ipsum", start=81, end=86),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
|
||||||
|
"""._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
|
||||||
|
|
||||||
|
Only non-None fields should appear in the dict and each field value should be the
|
||||||
|
consolidation of the values across the pre_chunk elements.
|
||||||
|
"""
|
||||||
|
pre_chunk = TextPreChunk(
|
||||||
|
[
|
||||||
|
PageBreak(""),
|
||||||
|
Title(
|
||||||
|
"Lorem Ipsum",
|
||||||
|
metadata=ElementMetadata(
|
||||||
|
filename="foo.docx",
|
||||||
|
# -- category_depth has DROP strategy so doesn't appear in result --
|
||||||
|
category_depth=0,
|
||||||
|
emphasized_text_contents=["Lorem", "Ipsum"],
|
||||||
|
emphasized_text_tags=["b", "i"],
|
||||||
|
languages=["lat"],
|
||||||
|
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
|
||||||
|
),
|
||||||
|
),
|
||||||
|
Text(
|
||||||
|
"'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
|
||||||
|
metadata=ElementMetadata(
|
||||||
|
# -- filename change doesn't happen IRL but demonstrates FIRST strategy --
|
||||||
|
filename="bar.docx",
|
||||||
|
# -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
|
||||||
|
# -- appears twice in consolidated-meta (as it should) and length matches
|
||||||
|
# -- that of emphasized_text_tags both before and after consolidation.
|
||||||
|
emphasized_text_contents=["Lorem", "ipsum"],
|
||||||
|
emphasized_text_tags=["i", "b"],
|
||||||
|
# -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
|
||||||
|
languages=["eng", "lat"],
|
||||||
|
# -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
|
||||||
|
regex_metadata={
|
||||||
|
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
|
||||||
|
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
),
|
||||||
|
],
|
||||||
|
opts=ChunkingOptions.new(),
|
||||||
|
)
|
||||||
|
|
||||||
|
meta_kwargs = pre_chunk._meta_kwargs
|
||||||
|
|
||||||
|
assert meta_kwargs == {
|
||||||
|
"filename": "foo.docx",
|
||||||
|
"emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
|
||||||
|
"emphasized_text_tags": ["b", "i", "i", "b"],
|
||||||
|
"languages": ["lat", "eng"],
|
||||||
|
"regex_metadata": {
|
||||||
|
"ipsum": [
|
||||||
|
RegexMetadata(text="Ipsum", start=6, end=11),
|
||||||
|
RegexMetadata(text="ipsum", start=19, end=24),
|
||||||
|
],
|
||||||
|
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("elements", "expected_value"),
|
||||||
|
[
|
||||||
|
([Text("foo"), Text("bar")], "foo\n\nbar"),
|
||||||
|
([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
|
||||||
|
([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
|
||||||
|
([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def it_knows_the_concatenated_text_of_the_pre_chunk(
|
||||||
|
self, elements: List[Text], expected_value: str
|
||||||
|
):
|
||||||
|
"""._text is the "joined" text of the pre-chunk elements.
|
||||||
|
|
||||||
|
The text-segment contributed by each element is separated from the next by a blank line
|
||||||
|
("\n\n"). An element that contributes no text does not give rise to a separator.
|
||||||
|
"""
|
||||||
|
pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
|
||||||
|
assert pre_chunk._text == expected_value
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# PRE-CHUNKING ACCUMULATORS
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class DescribePreChunkBuilder:
|
||||||
|
"""Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
|
||||||
|
|
||||||
|
def it_is_empty_on_construction(self):
|
||||||
|
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||||
|
|
||||||
|
assert builder.text_length == 0
|
||||||
|
assert builder.remaining_space == 50
|
||||||
|
|
||||||
|
def it_accumulates_elements_added_to_it(self):
|
||||||
|
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||||
|
|
||||||
|
builder.add_element(Title("Introduction"))
|
||||||
|
assert builder.text_length == 12
|
||||||
|
assert builder.remaining_space == 136
|
||||||
|
|
||||||
|
builder.add_element(
|
||||||
|
Text(
|
||||||
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||||
|
"lectus porta volutpat.",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
assert builder.text_length == 112
|
||||||
|
assert builder.remaining_space == 36
|
||||||
|
|
||||||
|
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||||
|
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||||
|
builder.add_element(Title("Introduction"))
|
||||||
|
builder.add_element(
|
||||||
|
Text(
|
||||||
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||||
|
"lectus porta volutpat.",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
pre_chunk = next(builder.flush())
|
||||||
|
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Introduction"),
|
||||||
|
Text(
|
||||||
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||||
|
"lectus porta volutpat.",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
assert builder.text_length == 0
|
||||||
|
assert builder.remaining_space == 150
|
||||||
|
|
||||||
|
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||||
|
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||||
|
|
||||||
|
pre_chunks = list(builder.flush())
|
||||||
|
|
||||||
|
assert pre_chunks == []
|
||||||
|
assert builder.text_length == 0
|
||||||
|
assert builder.remaining_space == 150
|
||||||
|
|
||||||
|
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||||
|
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||||
|
builder.add_element(Text("abcde"))
|
||||||
|
builder.add_element(Text("fghij"))
|
||||||
|
|
||||||
|
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||||
|
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
||||||
|
assert builder.text_length == 12
|
||||||
|
# -- .remaining_space is reduced by the length (2) of the trailing separator which would go
|
||||||
|
# -- between the current text and that of the next element if one was added.
|
||||||
|
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
|
||||||
|
assert builder.remaining_space == 36
|
||||||
|
|
||||||
|
|
||||||
|
class DescribePreChunkCombiner:
|
||||||
|
"""Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
|
||||||
|
|
||||||
|
def it_combines_sequential_small_text_pre_chunks(self):
|
||||||
|
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||||
|
pre_chunks = [
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Lorem Ipsum"), # 11
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Mauris Nec"), # 10
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Sed Orci"), # 8
|
||||||
|
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||||
|
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Lorem Ipsum"),
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
Title("Mauris Nec"),
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||||
|
Title("Sed Orci"),
|
||||||
|
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||||
|
]
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(pre_chunk_iter)
|
||||||
|
|
||||||
|
def but_it_does_not_combine_table_pre_chunks(self):
|
||||||
|
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||||
|
pre_chunks = [
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Lorem Ipsum"),
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
TablePreChunk(Table("Heading\nCell text"), opts=opts),
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Mauris Nec"),
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
pre_chunk_iter = PreChunkCombiner(
|
||||||
|
pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||||
|
).iter_combined_pre_chunks()
|
||||||
|
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Lorem Ipsum"),
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
]
|
||||||
|
# --
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TablePreChunk)
|
||||||
|
assert pre_chunk._table == Table("Heading\nCell text")
|
||||||
|
# --
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Mauris Nec"),
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||||
|
]
|
||||||
|
# --
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(pre_chunk_iter)
|
||||||
|
|
||||||
|
def it_respects_the_specified_combination_threshold(self):
|
||||||
|
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
|
||||||
|
pre_chunks = [
|
||||||
|
TextPreChunk( # 68
|
||||||
|
[
|
||||||
|
Title("Lorem Ipsum"), # 11
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
TextPreChunk( # 71
|
||||||
|
[
|
||||||
|
Title("Mauris Nec"), # 10
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
# -- len == 139
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Sed Orci"), # 8
|
||||||
|
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||||
|
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Lorem Ipsum"),
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
Title("Mauris Nec"),
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||||
|
]
|
||||||
|
# --
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Sed Orci"),
|
||||||
|
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||||
|
]
|
||||||
|
# --
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(pre_chunk_iter)
|
||||||
|
|
||||||
|
def it_respects_the_hard_maximum_window_length(self):
|
||||||
|
opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
|
||||||
|
pre_chunks = [
|
||||||
|
TextPreChunk( # 68
|
||||||
|
[
|
||||||
|
Title("Lorem Ipsum"), # 11
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
TextPreChunk( # 71
|
||||||
|
[
|
||||||
|
Title("Mauris Nec"), # 10
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
# -- len == 139
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Sed Orci"), # 8
|
||||||
|
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
# -- len == 214
|
||||||
|
]
|
||||||
|
|
||||||
|
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
||||||
|
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Lorem Ipsum"),
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
Title("Mauris Nec"),
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||||
|
]
|
||||||
|
# --
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Sed Orci"),
|
||||||
|
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||||
|
]
|
||||||
|
# --
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(pre_chunk_iter)
|
||||||
|
|
||||||
|
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
|
||||||
|
"""Such as occurs when a single element exceeds the window size."""
|
||||||
|
opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
||||||
|
pre_chunks = [
|
||||||
|
TextPreChunk([Title("Lorem Ipsum")], opts=opts),
|
||||||
|
TextPreChunk( # 179
|
||||||
|
[
|
||||||
|
Text(
|
||||||
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
|
||||||
|
" Mauris nec urna non augue vulputate consequat eget et nisi." # 60
|
||||||
|
" Sed orci quam, eleifend sit amet vehicula, elementum ultricies." # 64
|
||||||
|
)
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
),
|
||||||
|
TextPreChunk([Title("Vulputate Consequat")], opts=opts),
|
||||||
|
]
|
||||||
|
|
||||||
|
pre_chunk_iter = PreChunkCombiner(
|
||||||
|
pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
||||||
|
).iter_combined_pre_chunks()
|
||||||
|
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [Title("Lorem Ipsum")]
|
||||||
|
# --
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Text(
|
||||||
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit."
|
||||||
|
" Mauris nec urna non augue vulputate consequat eget et nisi."
|
||||||
|
" Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
|
||||||
|
)
|
||||||
|
]
|
||||||
|
# --
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [Title("Vulputate Consequat")]
|
||||||
|
# --
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(pre_chunk_iter)
|
||||||
|
|
||||||
|
|
||||||
|
class DescribeTextPreChunkAccumulator:
|
||||||
|
"""Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
|
||||||
|
|
||||||
|
def it_is_empty_on_construction(self):
|
||||||
|
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))
|
||||||
|
|
||||||
|
assert accum.text_length == 0
|
||||||
|
assert accum.remaining_space == 100
|
||||||
|
|
||||||
|
def it_accumulates_pre_chunks_added_to_it(self):
|
||||||
|
opts = ChunkingOptions.new(max_characters=500)
|
||||||
|
accum = TextPreChunkAccumulator(opts=opts)
|
||||||
|
|
||||||
|
accum.add_pre_chunk(
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Lorem Ipsum"),
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assert accum.text_length == 68
|
||||||
|
assert accum.remaining_space == 430
|
||||||
|
|
||||||
|
accum.add_pre_chunk(
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Mauris Nec"),
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assert accum.text_length == 141
|
||||||
|
assert accum.remaining_space == 357
|
||||||
|
|
||||||
|
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||||
|
opts = ChunkingOptions.new(max_characters=150)
|
||||||
|
accum = TextPreChunkAccumulator(opts=opts)
|
||||||
|
accum.add_pre_chunk(
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Lorem Ipsum"),
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
accum.add_pre_chunk(
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Mauris Nec"),
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
accum.add_pre_chunk(
|
||||||
|
TextPreChunk(
|
||||||
|
[
|
||||||
|
Title("Sed Orci"),
|
||||||
|
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
||||||
|
],
|
||||||
|
opts=opts,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
pre_chunk_iter = accum.flush()
|
||||||
|
|
||||||
|
# -- iterator generates exactly one pre_chunk --
|
||||||
|
pre_chunk = next(pre_chunk_iter)
|
||||||
|
with pytest.raises(StopIteration):
|
||||||
|
next(pre_chunk_iter)
|
||||||
|
# -- and it is a _TextPreChunk containing all the elements --
|
||||||
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
|
assert pre_chunk._elements == [
|
||||||
|
Title("Lorem Ipsum"),
|
||||||
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
|
Title("Mauris Nec"),
|
||||||
|
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||||
|
Title("Sed Orci"),
|
||||||
|
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
||||||
|
]
|
||||||
|
assert accum.text_length == 0
|
||||||
|
assert accum.remaining_space == 150
|
||||||
|
|
||||||
|
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||||
|
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
|
||||||
|
|
||||||
|
pre_chunks = list(accum.flush())
|
||||||
|
|
||||||
|
assert pre_chunks == []
|
||||||
|
assert accum.text_length == 0
|
||||||
|
assert accum.remaining_space == 150
|
||||||
|
|
||||||
|
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||||
|
opts = ChunkingOptions.new(max_characters=100)
|
||||||
|
accum = TextPreChunkAccumulator(opts=opts)
|
||||||
|
accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
|
||||||
|
accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))
|
||||||
|
|
||||||
|
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||||
|
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
||||||
|
assert accum.text_length == 12
|
||||||
|
# -- .remaining_space is reduced by the length (2) of the trailing separator which would
|
||||||
|
# -- go between the current text and that of the next pre-chunk if one was added.
|
||||||
|
# -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
|
||||||
|
assert accum.remaining_space == 86
|
||||||
|
|||||||
@ -4,16 +4,8 @@ from typing import List
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from unstructured.chunking.base import ChunkingOptions
|
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
|
||||||
from unstructured.chunking.title import (
|
from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title
|
||||||
PreChunkCombiner,
|
|
||||||
TablePreChunk,
|
|
||||||
TextPreChunk,
|
|
||||||
TextPreChunkAccumulator,
|
|
||||||
TextPreChunkBuilder,
|
|
||||||
_split_elements_by_title_and_table,
|
|
||||||
chunk_by_title,
|
|
||||||
)
|
|
||||||
from unstructured.documents.coordinates import CoordinateSystem
|
from unstructured.documents.coordinates import CoordinateSystem
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
CheckBox,
|
CheckBox,
|
||||||
@ -22,10 +14,8 @@ from unstructured.documents.elements import (
|
|||||||
Element,
|
Element,
|
||||||
ElementMetadata,
|
ElementMetadata,
|
||||||
ListItem,
|
ListItem,
|
||||||
PageBreak,
|
|
||||||
RegexMetadata,
|
RegexMetadata,
|
||||||
Table,
|
Table,
|
||||||
TableChunk,
|
|
||||||
Text,
|
Text,
|
||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
@ -552,843 +542,3 @@ def test_it_considers_separator_length_when_pre_chunking():
|
|||||||
),
|
),
|
||||||
CompositeElement("Minimize mid-text chunk-splitting"),
|
CompositeElement("Minimize mid-text chunk-splitting"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
# == PreChunks ===================================================================================
|
|
||||||
|
|
||||||
|
|
||||||
class DescribeTablePreChunk:
|
|
||||||
"""Unit-test suite for `unstructured.chunking.title.TablePreChunk objects."""
|
|
||||||
|
|
||||||
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
|
|
||||||
html_table = (
|
|
||||||
"<table>\n"
|
|
||||||
"<thead>\n"
|
|
||||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
|
||||||
"</thead>\n"
|
|
||||||
"<tbody>\n"
|
|
||||||
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
|
|
||||||
"</tbody>\n"
|
|
||||||
"</table>"
|
|
||||||
)
|
|
||||||
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
|
|
||||||
pre_chunk = TablePreChunk(
|
|
||||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
|
||||||
opts=ChunkingOptions.new(max_characters=175),
|
|
||||||
)
|
|
||||||
|
|
||||||
chunk_iter = pre_chunk.iter_chunks()
|
|
||||||
|
|
||||||
chunk = next(chunk_iter)
|
|
||||||
assert isinstance(chunk, Table)
|
|
||||||
assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
|
|
||||||
assert chunk.metadata.text_as_html == (
|
|
||||||
"<table>\n"
|
|
||||||
"<thead>\n"
|
|
||||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
|
||||||
"</thead>\n"
|
|
||||||
"<tbody>\n"
|
|
||||||
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
|
|
||||||
"</tbody>\n"
|
|
||||||
"</table>"
|
|
||||||
)
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(chunk_iter)
|
|
||||||
|
|
||||||
def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
|
|
||||||
# fixed-overhead = 8+8+9+8+9+8 = 50
|
|
||||||
# per-row overhead = 27
|
|
||||||
html_table = (
|
|
||||||
"<table>\n" # 8
|
|
||||||
"<thead>\n" # 8
|
|
||||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
|
||||||
"</thead>\n" # 9
|
|
||||||
"<tbody>\n" # 8
|
|
||||||
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
|
|
||||||
"<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
|
|
||||||
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
|
||||||
"<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
|
|
||||||
"</tbody>\n" # 9
|
|
||||||
"</table>" # 8
|
|
||||||
)
|
|
||||||
text_table = (
|
|
||||||
"Header Col 1 Header Col 2\n"
|
|
||||||
"Lorem ipsum dolor sit amet\n"
|
|
||||||
"Consectetur adipiscing elit\n"
|
|
||||||
"Nunc aliquam id enim nec molestie\n"
|
|
||||||
"Vivamus quis nunc ipsum donec ac fermentum"
|
|
||||||
)
|
|
||||||
pre_chunk = TablePreChunk(
|
|
||||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
|
||||||
opts=ChunkingOptions.new(max_characters=100),
|
|
||||||
)
|
|
||||||
|
|
||||||
chunk_iter = pre_chunk.iter_chunks()
|
|
||||||
|
|
||||||
chunk = next(chunk_iter)
|
|
||||||
assert isinstance(chunk, TableChunk)
|
|
||||||
assert chunk.text == (
|
|
||||||
"Header Col 1 Header Col 2\n"
|
|
||||||
"Lorem ipsum dolor sit amet\n"
|
|
||||||
"Consectetur adipiscing elit\n"
|
|
||||||
"Nunc aliqua"
|
|
||||||
)
|
|
||||||
assert chunk.metadata.text_as_html == (
|
|
||||||
"<table>\n"
|
|
||||||
"<thead>\n"
|
|
||||||
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
|
|
||||||
"</thead>\n"
|
|
||||||
"<tbody>\n"
|
|
||||||
"<tr><td>Lo"
|
|
||||||
)
|
|
||||||
# --
|
|
||||||
chunk = next(chunk_iter)
|
|
||||||
assert isinstance(chunk, TableChunk)
|
|
||||||
assert (
|
|
||||||
chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
|
|
||||||
)
|
|
||||||
assert chunk.metadata.text_as_html == (
|
|
||||||
"rem ipsum </td><td>A Link example</td></tr>\n"
|
|
||||||
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
|
|
||||||
)
|
|
||||||
# -- note that text runs out but HTML continues because it's significantly longer. So two
|
|
||||||
# -- of these chunks have HTML but no text.
|
|
||||||
chunk = next(chunk_iter)
|
|
||||||
assert isinstance(chunk, TableChunk)
|
|
||||||
assert chunk.text == ""
|
|
||||||
assert chunk.metadata.text_as_html == (
|
|
||||||
"/tr>\n"
|
|
||||||
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
|
||||||
"<tr><td>Vivamus quis </td><td>"
|
|
||||||
)
|
|
||||||
# --
|
|
||||||
chunk = next(chunk_iter)
|
|
||||||
assert isinstance(chunk, TableChunk)
|
|
||||||
assert chunk.text == ""
|
|
||||||
assert chunk.metadata.text_as_html == (
|
|
||||||
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
|
|
||||||
)
|
|
||||||
# --
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(chunk_iter)
|
|
||||||
|
|
||||||
|
|
||||||
class DescribeTextPreChunk:
|
|
||||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunk objects."""
|
|
||||||
|
|
||||||
def it_can_combine_itself_with_another_TextPreChunk_instance(self):
|
|
||||||
""".combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
|
|
||||||
|
|
||||||
Note that neither the original or other pre_chunk are mutated.
|
|
||||||
"""
|
|
||||||
opts = ChunkingOptions.new()
|
|
||||||
pre_chunk = TextPreChunk(
|
|
||||||
[
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
other_pre_chunk = TextPreChunk(
|
|
||||||
[
|
|
||||||
Text("Donec semper facilisis metus finibus malesuada."),
|
|
||||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
|
|
||||||
new_pre_chunk = pre_chunk.combine(other_pre_chunk)
|
|
||||||
|
|
||||||
assert new_pre_chunk == TextPreChunk(
|
|
||||||
[
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
|
||||||
Text("Donec semper facilisis metus finibus malesuada."),
|
|
||||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
assert pre_chunk == TextPreChunk(
|
|
||||||
[
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
assert other_pre_chunk == TextPreChunk(
|
|
||||||
[
|
|
||||||
Text("Donec semper facilisis metus finibus malesuada."),
|
|
||||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
|
|
||||||
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
|
|
||||||
pre_chunk = TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Introduction"),
|
|
||||||
Text(
|
|
||||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
|
||||||
"lectus porta volutpat.",
|
|
||||||
),
|
|
||||||
],
|
|
||||||
opts=ChunkingOptions.new(max_characters=200),
|
|
||||||
)
|
|
||||||
|
|
||||||
chunk_iter = pre_chunk.iter_chunks()
|
|
||||||
|
|
||||||
chunk = next(chunk_iter)
|
|
||||||
assert chunk == CompositeElement(
|
|
||||||
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
|
|
||||||
" In rhoncus ipsum sedlectus porta volutpat.",
|
|
||||||
)
|
|
||||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
|
||||||
|
|
||||||
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
|
|
||||||
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
|
|
||||||
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
|
|
||||||
pre_chunk = TextPreChunk(
|
|
||||||
[
|
|
||||||
Text(
|
|
||||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
|
||||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
|
||||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
|
|
||||||
" commodo consequat."
|
|
||||||
),
|
|
||||||
],
|
|
||||||
opts=ChunkingOptions.new(max_characters=200),
|
|
||||||
)
|
|
||||||
|
|
||||||
chunk_iter = pre_chunk.iter_chunks()
|
|
||||||
|
|
||||||
chunk = next(chunk_iter)
|
|
||||||
assert chunk == CompositeElement(
|
|
||||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
|
||||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
|
||||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut a"
|
|
||||||
)
|
|
||||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
|
||||||
# --
|
|
||||||
chunk = next(chunk_iter)
|
|
||||||
assert chunk == CompositeElement("liquip ex ea commodo consequat.")
|
|
||||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
|
||||||
# --
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(chunk_iter)
|
|
||||||
|
|
||||||
def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
|
|
||||||
""".text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
|
|
||||||
pre_chunk = TextPreChunk(
|
|
||||||
[PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
|
|
||||||
)
|
|
||||||
assert pre_chunk.text_length == 8
|
|
||||||
|
|
||||||
def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
|
|
||||||
pre_chunk = TextPreChunk(
|
|
||||||
[
|
|
||||||
Title(
|
|
||||||
"Lorem Ipsum",
|
|
||||||
metadata=ElementMetadata(
|
|
||||||
category_depth=0,
|
|
||||||
filename="foo.docx",
|
|
||||||
languages=["lat"],
|
|
||||||
parent_id="f87731e0",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
Text(
|
|
||||||
"'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
|
|
||||||
metadata=ElementMetadata(
|
|
||||||
category_depth=1,
|
|
||||||
filename="foo.docx",
|
|
||||||
image_path="sprite.png",
|
|
||||||
languages=["lat", "eng"],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
],
|
|
||||||
opts=ChunkingOptions.new(),
|
|
||||||
)
|
|
||||||
|
|
||||||
assert pre_chunk._all_metadata_values == {
|
|
||||||
# -- scalar values are accumulated in a list in element order --
|
|
||||||
"category_depth": [0, 1],
|
|
||||||
# -- all values are accumulated, not only unique ones --
|
|
||||||
"filename": ["foo.docx", "foo.docx"],
|
|
||||||
# -- list-type fields produce a list of lists --
|
|
||||||
"languages": [["lat"], ["lat", "eng"]],
|
|
||||||
# -- fields that only appear in some elements are captured --
|
|
||||||
"image_path": ["sprite.png"],
|
|
||||||
"parent_id": ["f87731e0"],
|
|
||||||
# -- A `None` value never appears, neither does a field-name with an empty list --
|
|
||||||
}
|
|
||||||
|
|
||||||
def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
|
|
||||||
metadata = ElementMetadata(
|
|
||||||
category_depth=0,
|
|
||||||
filename="foo.docx",
|
|
||||||
languages=["lat"],
|
|
||||||
parent_id="f87731e0",
|
|
||||||
)
|
|
||||||
metadata.coefficient = 0.62
|
|
||||||
metadata_2 = ElementMetadata(
|
|
||||||
category_depth=1,
|
|
||||||
filename="foo.docx",
|
|
||||||
image_path="sprite.png",
|
|
||||||
languages=["lat", "eng"],
|
|
||||||
)
|
|
||||||
metadata_2.quotient = 1.74
|
|
||||||
|
|
||||||
pre_chunk = TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Lorem Ipsum", metadata=metadata),
|
|
||||||
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
|
|
||||||
],
|
|
||||||
opts=ChunkingOptions.new(),
|
|
||||||
)
|
|
||||||
|
|
||||||
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
|
|
||||||
assert pre_chunk._all_metadata_values == {
|
|
||||||
"category_depth": [0, 1],
|
|
||||||
"filename": ["foo.docx", "foo.docx"],
|
|
||||||
"image_path": ["sprite.png"],
|
|
||||||
"languages": [["lat"], ["lat", "eng"]],
|
|
||||||
"parent_id": ["f87731e0"],
|
|
||||||
}
|
|
||||||
|
|
||||||
def it_consolidates_regex_metadata_in_a_field_specific_way(self):
|
|
||||||
"""regex_metadata of chunk is combined regex_metadatas of its elements.
|
|
||||||
|
|
||||||
Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
|
|
||||||
position in the chunk after element text has been concatenated.
|
|
||||||
"""
|
|
||||||
pre_chunk = TextPreChunk(
|
|
||||||
[
|
|
||||||
Title(
|
|
||||||
"Lorem Ipsum",
|
|
||||||
metadata=ElementMetadata(
|
|
||||||
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
|
|
||||||
),
|
|
||||||
),
|
|
||||||
Text(
|
|
||||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
|
|
||||||
metadata=ElementMetadata(
|
|
||||||
regex_metadata={
|
|
||||||
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
|
|
||||||
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
|
|
||||||
},
|
|
||||||
),
|
|
||||||
),
|
|
||||||
Text(
|
|
||||||
"In rhoncus ipsum sed lectus porta volutpat.",
|
|
||||||
metadata=ElementMetadata(
|
|
||||||
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
|
|
||||||
),
|
|
||||||
),
|
|
||||||
],
|
|
||||||
opts=ChunkingOptions.new(),
|
|
||||||
)
|
|
||||||
|
|
||||||
regex_metadata = pre_chunk._consolidated_regex_meta
|
|
||||||
|
|
||||||
assert regex_metadata == {
|
|
||||||
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
|
||||||
"ipsum": [
|
|
||||||
RegexMetadata(text="Ipsum", start=6, end=11),
|
|
||||||
RegexMetadata(text="ipsum", start=19, end=24),
|
|
||||||
RegexMetadata(text="ipsum", start=81, end=86),
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
|
|
||||||
"""._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
|
|
||||||
|
|
||||||
Only non-None fields should appear in the dict and each field value should be the
|
|
||||||
consolidation of the values across the pre_chunk elements.
|
|
||||||
"""
|
|
||||||
pre_chunk = TextPreChunk(
|
|
||||||
[
|
|
||||||
PageBreak(""),
|
|
||||||
Title(
|
|
||||||
"Lorem Ipsum",
|
|
||||||
metadata=ElementMetadata(
|
|
||||||
filename="foo.docx",
|
|
||||||
# -- category_depth has DROP strategy so doesn't appear in result --
|
|
||||||
category_depth=0,
|
|
||||||
emphasized_text_contents=["Lorem", "Ipsum"],
|
|
||||||
emphasized_text_tags=["b", "i"],
|
|
||||||
languages=["lat"],
|
|
||||||
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
|
|
||||||
),
|
|
||||||
),
|
|
||||||
Text(
|
|
||||||
"'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
|
|
||||||
metadata=ElementMetadata(
|
|
||||||
# -- filename change doesn't happen IRL but demonstrates FIRST strategy --
|
|
||||||
filename="bar.docx",
|
|
||||||
# -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
|
|
||||||
# -- appears twice in consolidated-meta (as it should) and length matches
|
|
||||||
# -- that of emphasized_text_tags both before and after consolidation.
|
|
||||||
emphasized_text_contents=["Lorem", "ipsum"],
|
|
||||||
emphasized_text_tags=["i", "b"],
|
|
||||||
# -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
|
|
||||||
languages=["eng", "lat"],
|
|
||||||
# -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
|
|
||||||
regex_metadata={
|
|
||||||
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
|
|
||||||
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
|
|
||||||
},
|
|
||||||
),
|
|
||||||
),
|
|
||||||
],
|
|
||||||
opts=ChunkingOptions.new(),
|
|
||||||
)
|
|
||||||
|
|
||||||
meta_kwargs = pre_chunk._meta_kwargs
|
|
||||||
|
|
||||||
assert meta_kwargs == {
|
|
||||||
"filename": "foo.docx",
|
|
||||||
"emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
|
|
||||||
"emphasized_text_tags": ["b", "i", "i", "b"],
|
|
||||||
"languages": ["lat", "eng"],
|
|
||||||
"regex_metadata": {
|
|
||||||
"ipsum": [
|
|
||||||
RegexMetadata(text="Ipsum", start=6, end=11),
|
|
||||||
RegexMetadata(text="ipsum", start=19, end=24),
|
|
||||||
],
|
|
||||||
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("elements", "expected_value"),
|
|
||||||
[
|
|
||||||
([Text("foo"), Text("bar")], "foo\n\nbar"),
|
|
||||||
([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
|
|
||||||
([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
|
|
||||||
([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def it_knows_the_concatenated_text_of_the_pre_chunk(
|
|
||||||
self, elements: List[Text], expected_value: str
|
|
||||||
):
|
|
||||||
"""._text is the "joined" text of the pre-chunk elements.
|
|
||||||
|
|
||||||
The text-segment contributed by each element is separated from the next by a blank line
|
|
||||||
("\n\n"). An element that contributes no text does not give rise to a separator.
|
|
||||||
"""
|
|
||||||
pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
|
|
||||||
assert pre_chunk._text == expected_value
|
|
||||||
|
|
||||||
|
|
||||||
class DescribeTextPreChunkBuilder:
|
|
||||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`."""
|
|
||||||
|
|
||||||
def it_is_empty_on_construction(self):
|
|
||||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
|
||||||
|
|
||||||
assert builder.text_length == 0
|
|
||||||
assert builder.remaining_space == 50
|
|
||||||
|
|
||||||
def it_accumulates_elements_added_to_it(self):
|
|
||||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
|
||||||
|
|
||||||
builder.add_element(Title("Introduction"))
|
|
||||||
assert builder.text_length == 12
|
|
||||||
assert builder.remaining_space == 136
|
|
||||||
|
|
||||||
builder.add_element(
|
|
||||||
Text(
|
|
||||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
|
||||||
"lectus porta volutpat.",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
assert builder.text_length == 112
|
|
||||||
assert builder.remaining_space == 36
|
|
||||||
|
|
||||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
|
||||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
|
||||||
builder.add_element(Title("Introduction"))
|
|
||||||
builder.add_element(
|
|
||||||
Text(
|
|
||||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
|
||||||
"lectus porta volutpat.",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
pre_chunk = next(builder.flush())
|
|
||||||
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Introduction"),
|
|
||||||
Text(
|
|
||||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
|
||||||
"lectus porta volutpat.",
|
|
||||||
),
|
|
||||||
]
|
|
||||||
assert builder.text_length == 0
|
|
||||||
assert builder.remaining_space == 150
|
|
||||||
|
|
||||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
|
||||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
|
||||||
|
|
||||||
pre_chunks = list(builder.flush())
|
|
||||||
|
|
||||||
assert pre_chunks == []
|
|
||||||
assert builder.text_length == 0
|
|
||||||
assert builder.remaining_space == 150
|
|
||||||
|
|
||||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
|
||||||
builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
|
||||||
builder.add_element(Text("abcde"))
|
|
||||||
builder.add_element(Text("fghij"))
|
|
||||||
|
|
||||||
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
|
||||||
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
|
||||||
assert builder.text_length == 12
|
|
||||||
# -- .remaining_space is reduced by the length (2) of the trailing separator which would go
|
|
||||||
# -- between the current text and that of the next element if one was added.
|
|
||||||
# -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
|
|
||||||
assert builder.remaining_space == 36
|
|
||||||
|
|
||||||
|
|
||||||
# == PreChunkCombiner =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
class DescribePreChunkCombiner:
|
|
||||||
"""Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`."""
|
|
||||||
|
|
||||||
def it_combines_sequential_small_text_pre_chunks(self):
|
|
||||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
|
||||||
pre_chunks = [
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Lorem Ipsum"), # 11
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Mauris Nec"), # 10
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Sed Orci"), # 8
|
|
||||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
|
||||||
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Lorem Ipsum"),
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
Title("Mauris Nec"),
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
|
||||||
Title("Sed Orci"),
|
|
||||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
|
||||||
]
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(pre_chunk_iter)
|
|
||||||
|
|
||||||
def but_it_does_not_combine_table_pre_chunks(self):
|
|
||||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
|
||||||
pre_chunks = [
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Lorem Ipsum"),
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
TablePreChunk(Table("Heading\nCell text"), opts=opts),
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Mauris Nec"),
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
pre_chunk_iter = PreChunkCombiner(
|
|
||||||
pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
|
||||||
).iter_combined_pre_chunks()
|
|
||||||
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Lorem Ipsum"),
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
]
|
|
||||||
# --
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TablePreChunk)
|
|
||||||
assert pre_chunk._table == Table("Heading\nCell text")
|
|
||||||
# --
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Mauris Nec"),
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
|
||||||
]
|
|
||||||
# --
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(pre_chunk_iter)
|
|
||||||
|
|
||||||
def it_respects_the_specified_combination_threshold(self):
|
|
||||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
|
|
||||||
pre_chunks = [
|
|
||||||
TextPreChunk( # 68
|
|
||||||
[
|
|
||||||
Title("Lorem Ipsum"), # 11
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
TextPreChunk( # 71
|
|
||||||
[
|
|
||||||
Title("Mauris Nec"), # 10
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
# -- len == 139
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Sed Orci"), # 8
|
|
||||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
|
||||||
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Lorem Ipsum"),
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
Title("Mauris Nec"),
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
|
||||||
]
|
|
||||||
# --
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Sed Orci"),
|
|
||||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
|
||||||
]
|
|
||||||
# --
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(pre_chunk_iter)
|
|
||||||
|
|
||||||
def it_respects_the_hard_maximum_window_length(self):
|
|
||||||
opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
|
|
||||||
pre_chunks = [
|
|
||||||
TextPreChunk( # 68
|
|
||||||
[
|
|
||||||
Title("Lorem Ipsum"), # 11
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
TextPreChunk( # 71
|
|
||||||
[
|
|
||||||
Title("Mauris Nec"), # 10
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
# -- len == 139
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Sed Orci"), # 8
|
|
||||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
# -- len == 214
|
|
||||||
]
|
|
||||||
|
|
||||||
pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()
|
|
||||||
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Lorem Ipsum"),
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
Title("Mauris Nec"),
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
|
||||||
]
|
|
||||||
# --
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Sed Orci"),
|
|
||||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
|
||||||
]
|
|
||||||
# --
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(pre_chunk_iter)
|
|
||||||
|
|
||||||
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
|
|
||||||
"""Such as occurs when a single element exceeds the window size."""
|
|
||||||
opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
|
||||||
pre_chunks = [
|
|
||||||
TextPreChunk([Title("Lorem Ipsum")], opts=opts),
|
|
||||||
TextPreChunk( # 179
|
|
||||||
[
|
|
||||||
Text(
|
|
||||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
|
|
||||||
" Mauris nec urna non augue vulputate consequat eget et nisi." # 60
|
|
||||||
" Sed orci quam, eleifend sit amet vehicula, elementum ultricies." # 64
|
|
||||||
)
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
),
|
|
||||||
TextPreChunk([Title("Vulputate Consequat")], opts=opts),
|
|
||||||
]
|
|
||||||
|
|
||||||
pre_chunk_iter = PreChunkCombiner(
|
|
||||||
pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
|
||||||
).iter_combined_pre_chunks()
|
|
||||||
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [Title("Lorem Ipsum")]
|
|
||||||
# --
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Text(
|
|
||||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit."
|
|
||||||
" Mauris nec urna non augue vulputate consequat eget et nisi."
|
|
||||||
" Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
|
|
||||||
)
|
|
||||||
]
|
|
||||||
# --
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [Title("Vulputate Consequat")]
|
|
||||||
# --
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(pre_chunk_iter)
|
|
||||||
|
|
||||||
|
|
||||||
class DescribeTextPreChunkAccumulator:
|
|
||||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`."""
|
|
||||||
|
|
||||||
def it_is_empty_on_construction(self):
|
|
||||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))
|
|
||||||
|
|
||||||
assert accum.text_length == 0
|
|
||||||
assert accum.remaining_space == 100
|
|
||||||
|
|
||||||
def it_accumulates_pre_chunks_added_to_it(self):
|
|
||||||
opts = ChunkingOptions.new(max_characters=500)
|
|
||||||
accum = TextPreChunkAccumulator(opts=opts)
|
|
||||||
|
|
||||||
accum.add_pre_chunk(
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Lorem Ipsum"),
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
assert accum.text_length == 68
|
|
||||||
assert accum.remaining_space == 430
|
|
||||||
|
|
||||||
accum.add_pre_chunk(
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Mauris Nec"),
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
assert accum.text_length == 141
|
|
||||||
assert accum.remaining_space == 357
|
|
||||||
|
|
||||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
|
||||||
opts = ChunkingOptions.new(max_characters=150)
|
|
||||||
accum = TextPreChunkAccumulator(opts=opts)
|
|
||||||
accum.add_pre_chunk(
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Lorem Ipsum"),
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
accum.add_pre_chunk(
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Mauris Nec"),
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
accum.add_pre_chunk(
|
|
||||||
TextPreChunk(
|
|
||||||
[
|
|
||||||
Title("Sed Orci"),
|
|
||||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
|
||||||
],
|
|
||||||
opts=opts,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
pre_chunk_iter = accum.flush()
|
|
||||||
|
|
||||||
# -- iterator generates exactly one pre_chunk --
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(pre_chunk_iter)
|
|
||||||
# -- and it is a _TextPreChunk containing all the elements --
|
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
|
||||||
assert pre_chunk._elements == [
|
|
||||||
Title("Lorem Ipsum"),
|
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
|
||||||
Title("Mauris Nec"),
|
|
||||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
|
||||||
Title("Sed Orci"),
|
|
||||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
|
||||||
]
|
|
||||||
assert accum.text_length == 0
|
|
||||||
assert accum.remaining_space == 150
|
|
||||||
|
|
||||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
|
||||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
|
|
||||||
|
|
||||||
pre_chunks = list(accum.flush())
|
|
||||||
|
|
||||||
assert pre_chunks == []
|
|
||||||
assert accum.text_length == 0
|
|
||||||
assert accum.remaining_space == 150
|
|
||||||
|
|
||||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
|
||||||
opts = ChunkingOptions.new(max_characters=100)
|
|
||||||
accum = TextPreChunkAccumulator(opts=opts)
|
|
||||||
accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
|
|
||||||
accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))
|
|
||||||
|
|
||||||
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
|
||||||
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
|
||||||
assert accum.text_length == 12
|
|
||||||
# -- .remaining_space is reduced by the length (2) of the trailing separator which would
|
|
||||||
# -- go between the current text and that of the next pre-chunk if one was added.
|
|
||||||
# -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
|
|
||||||
assert accum.remaining_space == 86
|
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.11.5-dev1" # pragma: no cover
|
__version__ = "0.11.5-dev2" # pragma: no cover
|
||||||
|
|||||||
@ -2,12 +2,25 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Optional
|
import collections
|
||||||
|
import copy
|
||||||
|
from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
|
||||||
|
|
||||||
from typing_extensions import Self
|
from typing_extensions import Self, TypeAlias
|
||||||
|
|
||||||
|
from unstructured.documents.elements import (
|
||||||
|
CompositeElement,
|
||||||
|
ConsolidationStrategy,
|
||||||
|
Element,
|
||||||
|
ElementMetadata,
|
||||||
|
RegexMetadata,
|
||||||
|
Table,
|
||||||
|
TableChunk,
|
||||||
|
)
|
||||||
from unstructured.utils import lazyproperty
|
from unstructured.utils import lazyproperty
|
||||||
|
|
||||||
|
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
|
||||||
|
|
||||||
|
|
||||||
class ChunkingOptions:
|
class ChunkingOptions:
|
||||||
"""Specifies parameters of optional chunking behaviors."""
|
"""Specifies parameters of optional chunking behaviors."""
|
||||||
@ -150,3 +163,404 @@ class ChunkingOptions:
|
|||||||
# loop (I think).
|
# loop (I think).
|
||||||
if self._overlap >= max_characters:
|
if self._overlap >= max_characters:
|
||||||
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
|
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# PRE-CHUNK SUB-TYPES
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TablePreChunk:
|
||||||
|
"""A pre-chunk composed of a single Table element."""
|
||||||
|
|
||||||
|
def __init__(self, table: Table, opts: ChunkingOptions) -> None:
|
||||||
|
self._table = table
|
||||||
|
self._opts = opts
|
||||||
|
|
||||||
|
def iter_chunks(self) -> Iterator[Table | TableChunk]:
|
||||||
|
"""Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
|
||||||
|
text = self._table.text
|
||||||
|
html = self._table.metadata.text_as_html or ""
|
||||||
|
maxlen = self._opts.hard_max
|
||||||
|
|
||||||
|
# -- only chunk a table when it's too big to swallow whole --
|
||||||
|
if len(text) <= maxlen and len(html) <= maxlen:
|
||||||
|
yield self._table
|
||||||
|
return
|
||||||
|
|
||||||
|
is_continuation = False
|
||||||
|
|
||||||
|
while text or html:
|
||||||
|
# -- split off the next maxchars into the next TableChunk --
|
||||||
|
text_chunk, text = text[:maxlen], text[maxlen:]
|
||||||
|
table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata))
|
||||||
|
|
||||||
|
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
|
||||||
|
# -- HTML elements that *correspond* to the TextChunk.text fragment.
|
||||||
|
if html:
|
||||||
|
html_chunk, html = html[:maxlen], html[maxlen:]
|
||||||
|
table_chunk.metadata.text_as_html = html_chunk
|
||||||
|
|
||||||
|
# -- mark second and later chunks as a continuation --
|
||||||
|
if is_continuation:
|
||||||
|
table_chunk.metadata.is_continuation = True
|
||||||
|
|
||||||
|
yield table_chunk
|
||||||
|
|
||||||
|
is_continuation = True
|
||||||
|
|
||||||
|
|
||||||
|
class TextPreChunk:
|
||||||
|
"""A sequence of elements that belong to the same semantic unit within a document.
|
||||||
|
|
||||||
|
The name "section" derives from the idea of a document-section, a heading followed by the
|
||||||
|
paragraphs "under" that heading. That structure is not found in all documents and actual section
|
||||||
|
content can vary, but that's the concept.
|
||||||
|
|
||||||
|
This object is purposely immutable.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
|
||||||
|
self._elements = list(elements)
|
||||||
|
self._opts = opts
|
||||||
|
|
||||||
|
def __eq__(self, other: Any) -> bool:
|
||||||
|
if not isinstance(other, TextPreChunk):
|
||||||
|
return False
|
||||||
|
return self._elements == other._elements
|
||||||
|
|
||||||
|
def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
|
||||||
|
"""Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
|
||||||
|
return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)
|
||||||
|
|
||||||
|
def iter_chunks(self) -> Iterator[CompositeElement]:
|
||||||
|
"""Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
|
||||||
|
text = self._text
|
||||||
|
text_len = len(text)
|
||||||
|
maxlen = self._opts.hard_max
|
||||||
|
start = 0
|
||||||
|
remaining = text_len
|
||||||
|
|
||||||
|
while remaining > 0:
|
||||||
|
end = min(start + maxlen, text_len)
|
||||||
|
yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
|
||||||
|
start = end
|
||||||
|
remaining = text_len - end
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def text_length(self) -> int:
|
||||||
|
"""Length of concatenated text of this pre-chunk, including separators."""
|
||||||
|
# -- used by pre-chunk-combiner to identify combination candidates --
|
||||||
|
return len(self._text)
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def _all_metadata_values(self) -> Dict[str, List[Any]]:
|
||||||
|
"""Collection of all populated metadata values across elements.
|
||||||
|
|
||||||
|
The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
|
||||||
|
at least one of the elements in this pre-chunk. The value of that key is a list of all those
|
||||||
|
populated values, in element order, for example:
|
||||||
|
|
||||||
|
{
|
||||||
|
"filename": ["sample.docx", "sample.docx"],
|
||||||
|
"languages": [["lat"], ["lat", "eng"]]
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
This preprocessing step provides the input for a specified consolidation strategy that will
|
||||||
|
resolve the list of values for each field to a single consolidated value.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
|
||||||
|
"""(field_name, value) pair for each non-None field in single `ElementMetadata`."""
|
||||||
|
return (
|
||||||
|
(field_name, value)
|
||||||
|
for field_name, value in metadata.known_fields.items()
|
||||||
|
if value is not None
|
||||||
|
)
|
||||||
|
|
||||||
|
field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)
|
||||||
|
|
||||||
|
# -- collect all non-None field values in a list for each field, in element-order --
|
||||||
|
for e in self._elements:
|
||||||
|
for field_name, value in iter_populated_fields(e.metadata):
|
||||||
|
field_values[field_name].append(value)
|
||||||
|
|
||||||
|
return dict(field_values)
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def _consolidated_metadata(self) -> ElementMetadata:
|
||||||
|
"""Metadata applicable to this pre-chunk as a single chunk.
|
||||||
|
|
||||||
|
Formed by applying consolidation rules to all metadata fields across the elements of this
|
||||||
|
pre-chunk.
|
||||||
|
|
||||||
|
For the sake of consistency, the same rules are applied (for example, for dropping values)
|
||||||
|
to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
|
||||||
|
"consolidated".
|
||||||
|
"""
|
||||||
|
return ElementMetadata(**self._meta_kwargs)
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
|
||||||
|
"""Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.
|
||||||
|
|
||||||
|
This consolidated value is suitable for use in the chunk metadata. `start` and `end`
|
||||||
|
offsets of each regex match are also adjusted for their new positions.
|
||||||
|
"""
|
||||||
|
chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
|
||||||
|
separator_len = len(self._opts.text_separator)
|
||||||
|
running_text_len = 0
|
||||||
|
start_offset = 0
|
||||||
|
|
||||||
|
for element in self._elements:
|
||||||
|
text_len = len(element.text)
|
||||||
|
# -- skip empty elements like `PageBreak("")` --
|
||||||
|
if not text_len:
|
||||||
|
continue
|
||||||
|
# -- account for blank line between "squashed" elements, but not before first element --
|
||||||
|
running_text_len += separator_len if running_text_len else 0
|
||||||
|
start_offset = running_text_len
|
||||||
|
running_text_len += text_len
|
||||||
|
|
||||||
|
if not element.metadata.regex_metadata:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
|
||||||
|
element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
|
||||||
|
for regex_name, matches in element_regex_metadata.items():
|
||||||
|
for m in matches:
|
||||||
|
m["start"] += start_offset
|
||||||
|
m["end"] += start_offset
|
||||||
|
chunk_matches = chunk_regex_metadata.get(regex_name, [])
|
||||||
|
chunk_matches.extend(matches)
|
||||||
|
chunk_regex_metadata[regex_name] = chunk_matches
|
||||||
|
|
||||||
|
return chunk_regex_metadata
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def _meta_kwargs(self) -> Dict[str, Any]:
|
||||||
|
"""The consolidated metadata values as a dict suitable for constructing ElementMetadata.
|
||||||
|
|
||||||
|
This is where consolidation strategies are actually applied. The output is suitable for use
|
||||||
|
in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
|
||||||
|
"""
|
||||||
|
CS = ConsolidationStrategy
|
||||||
|
field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()
|
||||||
|
|
||||||
|
def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
|
||||||
|
"""Generate (field-name, value) pairs for each field in consolidated metadata."""
|
||||||
|
for field_name, values in self._all_metadata_values.items():
|
||||||
|
strategy = field_consolidation_strategies.get(field_name)
|
||||||
|
if strategy is CS.FIRST:
|
||||||
|
yield field_name, values[0]
|
||||||
|
# -- concatenate lists from each element that had one, in order --
|
||||||
|
elif strategy is CS.LIST_CONCATENATE:
|
||||||
|
yield field_name, sum(values, cast(List[Any], []))
|
||||||
|
# -- union lists from each element, preserving order of appearance --
|
||||||
|
elif strategy is CS.LIST_UNIQUE:
|
||||||
|
# -- Python 3.7+ maintains dict insertion order --
|
||||||
|
ordered_unique_keys = {key: None for val_list in values for key in val_list}
|
||||||
|
yield field_name, list(ordered_unique_keys.keys())
|
||||||
|
elif strategy is CS.REGEX:
|
||||||
|
yield field_name, self._consolidated_regex_meta
|
||||||
|
elif strategy is CS.DROP:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# -- not likely to hit this since we have a test in `text_elements.py` that
|
||||||
|
# -- ensures every ElementMetadata fields has an assigned strategy.
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"metadata field {repr(field_name)} has no defined consolidation strategy"
|
||||||
|
)
|
||||||
|
|
||||||
|
return dict(iter_kwarg_pairs())
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def _text(self) -> str:
|
||||||
|
"""The concatenated text of all elements in this pre-chunk.
|
||||||
|
|
||||||
|
Each element-text is separated from the next by a blank line ("\n\n").
|
||||||
|
"""
|
||||||
|
text_separator = self._opts.text_separator
|
||||||
|
return text_separator.join(e.text for e in self._elements if e.text)
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# PRE-CHUNKING ACCUMULATORS
|
||||||
|
# ------------------------------------------------------------------------------------------------
|
||||||
|
# Accumulators encapsulate the work of grouping elements and later pre-chunks to form the larger
|
||||||
|
# pre-chunk and combined-pre-chunk items central to unstructured chunking.
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class PreChunkBuilder:
|
||||||
|
"""An element accumulator suitable for incrementally forming a pre-chunk.
|
||||||
|
|
||||||
|
Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
|
||||||
|
to determine whether it should add the next element in the element stream.
|
||||||
|
|
||||||
|
`.flush()` is used to build a PreChunk object from the accumulated elements. This method
|
||||||
|
returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
|
||||||
|
used like so:
|
||||||
|
|
||||||
|
yield from builder.flush()
|
||||||
|
|
||||||
|
If no elements have been accumulated, no `PreChunk` instance is generated. Flushing the builder
|
||||||
|
clears the elements it contains so it is ready to build the next pre-chunk.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, opts: ChunkingOptions) -> None:
|
||||||
|
self._opts = opts
|
||||||
|
self._separator_len = len(opts.text_separator)
|
||||||
|
self._elements: List[Element] = []
|
||||||
|
|
||||||
|
# -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
|
||||||
|
self._text_segments: List[str] = []
|
||||||
|
# -- combined length of text-segments, not including separators --
|
||||||
|
self._text_len: int = 0
|
||||||
|
|
||||||
|
def add_element(self, element: Element) -> None:
|
||||||
|
"""Add `element` to this section."""
|
||||||
|
self._elements.append(element)
|
||||||
|
if element.text:
|
||||||
|
self._text_segments.append(element.text)
|
||||||
|
self._text_len += len(element.text)
|
||||||
|
|
||||||
|
def flush(self) -> Iterator[TextPreChunk]:
|
||||||
|
"""Generate zero-or-one `PreChunk` object and clear the accumulator.
|
||||||
|
|
||||||
|
Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
|
||||||
|
boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
|
||||||
|
stream.
|
||||||
|
"""
|
||||||
|
if not self._elements:
|
||||||
|
return
|
||||||
|
# -- clear builder before yield so we're not sensitive to the timing of how/when this
|
||||||
|
# -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately.
|
||||||
|
elements = self._elements[:]
|
||||||
|
self._elements.clear()
|
||||||
|
self._text_segments.clear()
|
||||||
|
self._text_len = 0
|
||||||
|
yield TextPreChunk(elements, self._opts)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def remaining_space(self) -> int:
|
||||||
|
"""Maximum text-length of an element that can be added without exceeding maxlen."""
|
||||||
|
# -- include length of trailing separator that will go before next element text --
|
||||||
|
separators_len = self._separator_len * len(self._text_segments)
|
||||||
|
return self._opts.hard_max - self._text_len - separators_len
|
||||||
|
|
||||||
|
@property
|
||||||
|
def text_length(self) -> int:
|
||||||
|
"""Length of the text in this pre-chunk.
|
||||||
|
|
||||||
|
This value represents the chunk-size that would result if this pre-chunk was flushed in its
|
||||||
|
current state. In particular, it does not include the length of a trailing separator (since
|
||||||
|
that would only appear if an additional element was added).
|
||||||
|
|
||||||
|
Not suitable for judging remaining space, use `.remaining_space` for that value.
|
||||||
|
"""
|
||||||
|
# -- number of text separators present in joined text of elements. This includes only
|
||||||
|
# -- separators *between* text segments, not one at the end. Note there are zero separators
|
||||||
|
# -- for both 0 and 1 text-segments.
|
||||||
|
n = len(self._text_segments)
|
||||||
|
separator_count = n - 1 if n else 0
|
||||||
|
return self._text_len + (separator_count * self._separator_len)
|
||||||
|
|
||||||
|
|
||||||
|
class PreChunkCombiner:
|
||||||
|
"""Filters pre-chunk stream to combine small pre-chunks where possible."""
|
||||||
|
|
||||||
|
def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
|
||||||
|
self._pre_chunks = pre_chunks
|
||||||
|
self._opts = opts
|
||||||
|
|
||||||
|
def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
|
||||||
|
"""Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
|
||||||
|
accum = TextPreChunkAccumulator(self._opts)
|
||||||
|
combine_text_under_n_chars = self._opts.combine_text_under_n_chars
|
||||||
|
|
||||||
|
for pre_chunk in self._pre_chunks:
|
||||||
|
# -- start new pre-chunk under these conditions --
|
||||||
|
if (
|
||||||
|
# -- a table pre-chunk is never combined --
|
||||||
|
isinstance(pre_chunk, TablePreChunk)
|
||||||
|
# -- don't add another pre-chunk once length has reached combination soft-max --
|
||||||
|
or accum.text_length >= combine_text_under_n_chars
|
||||||
|
# -- combining would exceed hard-max --
|
||||||
|
or accum.remaining_space < pre_chunk.text_length
|
||||||
|
):
|
||||||
|
yield from accum.flush()
|
||||||
|
|
||||||
|
# -- a table pre-chunk is never combined so don't accumulate --
|
||||||
|
if isinstance(pre_chunk, TablePreChunk):
|
||||||
|
yield pre_chunk
|
||||||
|
else:
|
||||||
|
accum.add_pre_chunk(pre_chunk)
|
||||||
|
|
||||||
|
yield from accum.flush()
|
||||||
|
|
||||||
|
|
||||||
|
class TextPreChunkAccumulator:
|
||||||
|
"""Accumulates, measures, and combines pre-chunk objects.
|
||||||
|
|
||||||
|
Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
|
||||||
|
whether to add another pre-chunk.
|
||||||
|
|
||||||
|
`.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
|
||||||
|
This method returns an interator that generates zero-or-one `TextPreChunk` objects and is used
|
||||||
|
like so:
|
||||||
|
|
||||||
|
yield from accum.flush()
|
||||||
|
|
||||||
|
If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder
|
||||||
|
clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, opts: ChunkingOptions) -> None:
|
||||||
|
self._opts = opts
|
||||||
|
self._pre_chunks: List[TextPreChunk] = []
|
||||||
|
|
||||||
|
def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
|
||||||
|
"""Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
|
||||||
|
self._pre_chunks.append(pre_chunk)
|
||||||
|
|
||||||
|
def flush(self) -> Iterator[TextPreChunk]:
|
||||||
|
"""Generate all accumulated pre-chunks as a single combined pre-chunk."""
|
||||||
|
pre_chunks = self._pre_chunks
|
||||||
|
|
||||||
|
# -- nothing to do if no pre-chunks have been accumulated --
|
||||||
|
if not pre_chunks:
|
||||||
|
return
|
||||||
|
|
||||||
|
# -- otherwise combine all accumulated pre-chunk into one --
|
||||||
|
pre_chunk = pre_chunks[0]
|
||||||
|
for other_pre_chunk in pre_chunks[1:]:
|
||||||
|
pre_chunk = pre_chunk.combine(other_pre_chunk)
|
||||||
|
yield pre_chunk
|
||||||
|
|
||||||
|
# -- and reset the accumulator (to empty) --
|
||||||
|
pre_chunks.clear()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def remaining_space(self) -> int:
|
||||||
|
"""Maximum size of pre-chunk that can be added without exceeding maxlen."""
|
||||||
|
maxlen = self._opts.hard_max
|
||||||
|
return (
|
||||||
|
maxlen
|
||||||
|
if not self._pre_chunks
|
||||||
|
# -- an additional pre-chunk will also incur an additional separator --
|
||||||
|
else maxlen - self.text_length - len(self._opts.text_separator)
|
||||||
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def text_length(self) -> int:
|
||||||
|
"""Size of concatenated text in all pre-chunks in accumulator."""
|
||||||
|
n = len(self._pre_chunks)
|
||||||
|
|
||||||
|
if n == 0:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
total_text_length = sum(s.text_length for s in self._pre_chunks)
|
||||||
|
total_separator_length = len(self._opts.text_separator) * (n - 1)
|
||||||
|
return total_text_length + total_separator_length
|
||||||
|
|||||||
@ -5,26 +5,20 @@ Main entry point is the `@add_chunking_strategy()` decorator.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import collections
|
from typing import Iterator, List, Optional
|
||||||
import copy
|
|
||||||
from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
|
|
||||||
|
|
||||||
from typing_extensions import TypeAlias
|
from unstructured.chunking.base import (
|
||||||
|
ChunkingOptions,
|
||||||
from unstructured.chunking.base import ChunkingOptions
|
PreChunk,
|
||||||
|
PreChunkBuilder,
|
||||||
|
PreChunkCombiner,
|
||||||
|
TablePreChunk,
|
||||||
|
)
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
CompositeElement,
|
|
||||||
ConsolidationStrategy,
|
|
||||||
Element,
|
Element,
|
||||||
ElementMetadata,
|
|
||||||
RegexMetadata,
|
|
||||||
Table,
|
Table,
|
||||||
TableChunk,
|
|
||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.utils import lazyproperty
|
|
||||||
|
|
||||||
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
|
|
||||||
|
|
||||||
|
|
||||||
def chunk_by_title(
|
def chunk_by_title(
|
||||||
@ -78,7 +72,7 @@ def chunk_by_title(
|
|||||||
|
|
||||||
def _split_elements_by_title_and_table(
|
def _split_elements_by_title_and_table(
|
||||||
elements: List[Element], opts: ChunkingOptions
|
elements: List[Element], opts: ChunkingOptions
|
||||||
) -> Iterator[TextPreChunk | TablePreChunk]:
|
) -> Iterator[PreChunk]:
|
||||||
"""Implements "pre-chunker" responsibilities.
|
"""Implements "pre-chunker" responsibilities.
|
||||||
|
|
||||||
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
|
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
|
||||||
@ -102,7 +96,7 @@ def _split_elements_by_title_and_table(
|
|||||||
|
|
||||||
A Table or Checkbox element is placed into a pre-chunk by itself.
|
A Table or Checkbox element is placed into a pre-chunk by itself.
|
||||||
"""
|
"""
|
||||||
pre_chunk_builder = TextPreChunkBuilder(opts)
|
pre_chunk_builder = PreChunkBuilder(opts)
|
||||||
|
|
||||||
prior_element = None
|
prior_element = None
|
||||||
|
|
||||||
@ -156,396 +150,3 @@ def _metadata_differs(
|
|||||||
if ignore_page_numbers:
|
if ignore_page_numbers:
|
||||||
return False
|
return False
|
||||||
return metadata1.page_number != metadata2.page_number
|
return metadata1.page_number != metadata2.page_number
|
||||||
|
|
||||||
|
|
||||||
# == PreChunks ===================================================================================
|
|
||||||
|
|
||||||
|
|
||||||
class TablePreChunk:
    """A pre-chunk composed of a single Table element."""

    def __init__(self, table: Table, opts: ChunkingOptions) -> None:
        self._table = table
        self._opts = opts

    def iter_chunks(self) -> Iterator[Table | TableChunk]:
        """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
        maxlen = self._opts.hard_max
        remaining_text = self._table.text
        remaining_html = self._table.metadata.text_as_html or ""

        # -- a table small enough to fit the window whole is emitted unsplit --
        if len(remaining_text) <= maxlen and len(remaining_html) <= maxlen:
            yield self._table
            return

        first_chunk = True
        while remaining_text or remaining_html:
            # -- peel off up to maxlen characters of text for this TableChunk --
            table_chunk = TableChunk(
                text=remaining_text[:maxlen], metadata=copy.deepcopy(self._table.metadata)
            )
            remaining_text = remaining_text[maxlen:]

            # -- attach up to maxlen of the HTML. Note no attempt is made to align the HTML
            # -- fragment with the text fragment it accompanies. --
            if remaining_html:
                table_chunk.metadata.text_as_html = remaining_html[:maxlen]
                remaining_html = remaining_html[maxlen:]

            # -- every chunk after the first is marked as a continuation --
            if not first_chunk:
                table_chunk.metadata.is_continuation = True

            yield table_chunk
            first_chunk = False
|
|
||||||
|
|
||||||
|
|
||||||
class TextPreChunk:
    """A sequence of elements that belong to the same semantic unit within a document.

    The name "section" derives from the idea of a document-section, a heading followed by the
    paragraphs "under" that heading. That structure is not found in all documents and actual section
    content can vary, but that's the concept.

    This object is purposely immutable.
    """

    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
        self._elements = list(elements)
        self._opts = opts

    def __eq__(self, other: Any) -> bool:
        # -- NOTE: defining __eq__ without __hash__ makes instances unhashable (Python sets the
        # -- implicit __hash__ to None), which is fine for this value-comparison use --
        if not isinstance(other, TextPreChunk):
            return False
        return self._elements == other._elements

    def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
        """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
        return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)

    def iter_chunks(self) -> Iterator[CompositeElement]:
        """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
        text = self._text
        text_len = len(text)
        maxlen = self._opts.hard_max
        start = 0
        remaining = text_len

        # -- split on fixed-size character boundaries; no attempt is made to break on a word or
        # -- sentence boundary. All chunks share the same consolidated metadata. --
        while remaining > 0:
            end = min(start + maxlen, text_len)
            yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
            start = end
            remaining = text_len - end

    @lazyproperty
    def text_length(self) -> int:
        """Length of concatenated text of this pre-chunk, including separators."""
        # -- used by pre-chunk-combiner to identify combination candidates --
        return len(self._text)

    @lazyproperty
    def _all_metadata_values(self) -> Dict[str, List[Any]]:
        """Collection of all populated metadata values across elements.

        The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
        at least one of the elements in this pre-chunk. The value of that key is a list of all those
        populated values, in element order, for example:

            {
                "filename": ["sample.docx", "sample.docx"],
                "languages": [["lat"], ["lat", "eng"]]
                ...
            }

        This preprocessing step provides the input for a specified consolidation strategy that will
        resolve the list of values for each field to a single consolidated value.
        """

        def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
            """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
            return (
                (field_name, value)
                for field_name, value in metadata.known_fields.items()
                if value is not None
            )

        field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)

        # -- collect all non-None field values in a list for each field, in element-order --
        for e in self._elements:
            for field_name, value in iter_populated_fields(e.metadata):
                field_values[field_name].append(value)

        return dict(field_values)

    @lazyproperty
    def _consolidated_metadata(self) -> ElementMetadata:
        """Metadata applicable to this pre-chunk as a single chunk.

        Formed by applying consolidation rules to all metadata fields across the elements of this
        pre-chunk.

        For the sake of consistency, the same rules are applied (for example, for dropping values)
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
        return ElementMetadata(**self._meta_kwargs)

    @lazyproperty
    def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.

        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
        offsets of each regex match are also adjusted for their new positions.
        """
        chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
        separator_len = len(self._opts.text_separator)
        # -- running_text_len tracks where each element's text begins within the concatenated
        # -- chunk text, so match offsets can be shifted from element-relative to chunk-relative --
        running_text_len = 0
        start_offset = 0

        for element in self._elements:
            text_len = len(element.text)
            # -- skip empty elements like `PageBreak("")` --
            if not text_len:
                continue
            # -- account for blank line between "squashed" elements, but not before first element --
            running_text_len += separator_len if running_text_len else 0
            start_offset = running_text_len
            running_text_len += text_len

            if not element.metadata.regex_metadata:
                continue

            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets.
            # -- deepcopy so the source element's metadata is not mutated. --
            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
            for regex_name, matches in element_regex_metadata.items():
                for m in matches:
                    m["start"] += start_offset
                    m["end"] += start_offset
                chunk_matches = chunk_regex_metadata.get(regex_name, [])
                chunk_matches.extend(matches)
                chunk_regex_metadata[regex_name] = chunk_matches

        return chunk_regex_metadata

    @lazyproperty
    def _meta_kwargs(self) -> Dict[str, Any]:
        """The consolidated metadata values as a dict suitable for constructing ElementMetadata.

        This is where consolidation strategies are actually applied. The output is suitable for use
        in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
        """
        CS = ConsolidationStrategy
        field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
            """Generate (field-name, value) pairs for each field in consolidated metadata."""
            for field_name, values in self._all_metadata_values.items():
                strategy = field_consolidation_strategies.get(field_name)
                if strategy is CS.FIRST:
                    yield field_name, values[0]
                # -- concatenate lists from each element that had one, in order --
                elif strategy is CS.LIST_CONCATENATE:
                    yield field_name, sum(values, cast(List[Any], []))
                # -- union lists from each element, preserving order of appearance --
                elif strategy is CS.LIST_UNIQUE:
                    # -- Python 3.7+ maintains dict insertion order --
                    ordered_unique_keys = {key: None for val_list in values for key in val_list}
                    yield field_name, list(ordered_unique_keys.keys())
                elif strategy is CS.REGEX:
                    yield field_name, self._consolidated_regex_meta
                elif strategy is CS.DROP:
                    continue
                else:
                    # -- not likely to hit this since we have a test in `text_elements.py` that
                    # -- ensures every ElementMetadata fields has an assigned strategy.
                    raise NotImplementedError(
                        f"metadata field {repr(field_name)} has no defined consolidation strategy"
                    )

        return dict(iter_kwarg_pairs())

    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.

        Each element-text is separated from the next by `opts.text_separator`. Elements with empty
        text (e.g. `PageBreak("")`) are excluded.
        """
        text_separator = self._opts.text_separator
        return text_separator.join(e.text for e in self._elements if e.text)
|
|
||||||
|
|
||||||
|
|
||||||
class TextPreChunkBuilder:
    """An element accumulator suitable for incrementally forming a pre-chunk.

    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can
    use to determine whether it should add the next element in the element stream.

    `.flush()` is used to build a `TextPreChunk` object from the accumulated elements. This
    method returns an iterator that generates zero-or-one `TextPreChunk` object and is used like
    so:

        yield from builder.flush()

    If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder
    clears the elements it contains so it is ready to build the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._separator_len = len(opts.text_separator)
        self._elements: List[Element] = []

        # -- excludes empty element text, e.g. PageBreak.text == "" is not included --
        self._text_segments: List[str] = []
        # -- combined length of the text-segments alone, not including separators --
        self._text_len: int = 0

    def add_element(self, element: Element) -> None:
        """Add `element` to this section."""
        self._elements.append(element)
        element_text = element.text
        if element_text:
            self._text_segments.append(element_text)
            self._text_len += len(element_text)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate zero-or-one `PreChunk` object and clear the accumulator.

        Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
        boundary has been reached. Also to clear out a terminal pre-chunk at the end of an
        element stream.
        """
        if not self._elements:
            return
        # -- snapshot and reset state *before* yielding so the builder is immediately ready to
        # -- accept elements for the next pre-chunk, regardless of when (or whether) this
        # -- iterator is exhausted --
        accumulated_elements = list(self._elements)
        self._elements.clear()
        self._text_segments.clear()
        self._text_len = 0
        yield TextPreChunk(accumulated_elements, self._opts)

    @property
    def remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- charge one separator per existing segment, accounting for the separator that will
        # -- precede the next element's text --
        occupied = self._text_len + self._separator_len * len(self._text_segments)
        return self._opts.hard_max - occupied

    @property
    def text_length(self) -> int:
        """Length of the text in this pre-chunk.

        This value represents the chunk-size that would result if this pre-chunk was flushed in
        its current state. In particular, it does not include the length of a trailing separator
        (since that would only appear if an additional element was added).

        Not suitable for judging remaining space, use `.remaining_space` for that value.
        """
        # -- only separators *between* text segments count, never one at the end. Note there are
        # -- zero separators for both 0 and 1 text-segments. --
        segment_count = len(self._text_segments)
        if not segment_count:
            return self._text_len
        return self._text_len + self._separator_len * (segment_count - 1)
|
|
||||||
|
|
||||||
|
|
||||||
# == PreChunkCombiner ============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
class PreChunkCombiner:
    """Filters pre-chunk stream to combine small pre-chunks where possible."""

    def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
        self._pre_chunks = pre_chunks
        self._opts = opts

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
        accum = TextPreChunkAccumulator(self._opts)
        soft_max = self._opts.combine_text_under_n_chars

        for pre_chunk in self._pre_chunks:
            # -- flush the accumulator whenever this pre-chunk cannot be combined into it:
            # -- a table pre-chunk is never combined; an accumulator that has reached the
            # -- combination soft-max takes no more pre-chunks; combining must not exceed the
            # -- hard-max window --
            if (
                isinstance(pre_chunk, TablePreChunk)
                or accum.text_length >= soft_max
                or accum.remaining_space < pre_chunk.text_length
            ):
                yield from accum.flush()

            if isinstance(pre_chunk, TablePreChunk):
                # -- a table pre-chunk passes straight through without accumulating --
                yield pre_chunk
            else:
                accum.add_pre_chunk(pre_chunk)

        # -- flush any pre-chunks remaining in the accumulator at end of stream --
        yield from accum.flush()
|
|
||||||
|
|
||||||
|
|
||||||
class TextPreChunkAccumulator:
    """Accumulates, measures, and combines pre-chunk objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another pre-chunk.

    `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk`
    object. This method returns an iterator that generates zero-or-one `TextPreChunk` objects
    and is used like so:

        yield from accum.flush()

    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder
    clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
        """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
        self._pre_chunks.append(pre_chunk)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate all accumulated pre-chunks as a single combined pre-chunk."""
        # -- nothing to do if no pre-chunks have been accumulated --
        if not self._pre_chunks:
            return

        # -- fold all accumulated pre-chunks into the first one --
        combined_pre_chunk = self._pre_chunks[0]
        for successor in self._pre_chunks[1:]:
            combined_pre_chunk = combined_pre_chunk.combine(successor)
        yield combined_pre_chunk

        # -- and reset the accumulator (to empty) --
        self._pre_chunks.clear()

    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
        maxlen = self._opts.hard_max
        if not self._pre_chunks:
            return maxlen
        # -- an additional pre-chunk will also incur an additional separator --
        return maxlen - self.text_length - len(self._opts.text_separator)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator."""
        pre_chunk_count = len(self._pre_chunks)

        if pre_chunk_count == 0:
            return 0

        # -- total is the text of every pre-chunk plus one separator between each adjacent pair --
        combined_text_length = sum(p.text_length for p in self._pre_chunks)
        separators_length = len(self._opts.text_separator) * (pre_chunk_count - 1)
        return combined_text_length + separators_length
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user