rfctr: rename pre chunk (#2261)

The original naming for the precursor to a chunk in `chunk_by_title()`
was conflated with the idea of how these element subsequences were
bounded (by document-section) for that strategy. I mistakenly picked
that up as a universal concept but in fact no notion of section arises
in the `by_character` or other chunking strategies.

Fix this misconception by using the name `pre-chunk` for this concept
throughout.
This commit is contained in:
Steve Canny 2023-12-13 15:13:57 -08:00 committed by GitHub
parent 74d089d942
commit cbeaed21ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 298 additions and 298 deletions

View File

@ -1,4 +1,4 @@
## 0.11.4-dev10
## 0.11.4-dev11
### Enhancements

View File

@ -5,12 +5,12 @@ from typing import List
import pytest
from unstructured.chunking.title import (
_SectionCombiner,
PreChunkCombiner,
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
TextPreChunkBuilder,
_split_elements_by_title_and_table,
_TableSection,
_TextSection,
_TextSectionAccumulator,
_TextSectionBuilder,
chunk_by_title,
)
from unstructured.documents.coordinates import CoordinateSystem
@ -130,7 +130,7 @@ def test_it_does_not_complain_when_specifying_new_after_n_chars_by_itself():
def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk():
"""Specifying `new_after_n_chars=0` places each element into its own section.
"""Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
This puts each element into its own chunk, although long chunks are still split.
"""
@ -166,7 +166,7 @@ def test_it_silently_accepts_new_after_n_chars_greater_than_maxchars():
# ================================================================================================
def test_it_splits_a_large_section_into_multiple_chunks():
def test_it_splits_a_large_element_into_multiple_chunks():
elements: List[Element] = [
Title("Introduction"),
Text(
@ -199,36 +199,36 @@ def test_split_elements_by_title_and_table():
CheckBox(),
]
sections = _split_elements_by_title_and_table(
pre_chunks = _split_elements_by_title_and_table(
elements,
multipage_sections=True,
new_after_n_chars=500,
max_characters=500,
)
section = next(sections)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
]
# --
section = next(sections)
assert isinstance(section, _TableSection)
assert section._table == Table("Heading\nCell text")
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TablePreChunk)
assert pre_chunk._table == Table("Heading\nCell text")
# ==
section = next(sections)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
]
# --
section = next(sections)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
@ -236,7 +236,7 @@ def test_split_elements_by_title_and_table():
]
# --
with pytest.raises(StopIteration):
next(sections)
next(pre_chunks)
def test_chunk_by_title():
@ -351,9 +351,9 @@ def test_chunk_by_title_separates_by_page_number():
def test_chunk_by_title_does_not_break_on_regex_metadata_change():
"""Sectioner is insensitive to regex-metadata changes.
"""PreChunker is insensitive to regex-metadata changes.
A regex-metadata match in an element does not signify a semantic boundary and a section should
A regex-metadata match in an element does not signify a semantic boundary and a pre-chunk should
not be split based on such a difference.
"""
elements: List[Element] = [
@ -672,8 +672,8 @@ def test_chunk_by_title_drops_extra_metadata():
assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
def test_it_considers_separator_length_when_sectioning():
"""Sectioner includes length of separators when computing remaining space."""
def test_it_considers_separator_length_when_pre_chunking():
"""PreChunker includes length of separators when computing remaining space."""
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
@ -693,11 +693,11 @@ def test_it_considers_separator_length_when_sectioning():
]
# == Sections ====================================================================================
# == PreChunks ===================================================================================
class Describe_TableSection:
"""Unit-test suite for `unstructured.chunking.title._TableSection objects."""
class DescribeTablePreChunk:
"""Unit-test suite for `unstructured.chunking.title.TablePreChunk` objects."""
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
html_table = (
@ -711,11 +711,11 @@ class Describe_TableSection:
"</table>"
)
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
section = _TableSection(
pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table))
)
chunk_iter = section.iter_chunks(maxlen=175)
chunk_iter = pre_chunk.iter_chunks(maxlen=175)
chunk = next(chunk_iter)
assert isinstance(chunk, Table)
@ -756,11 +756,11 @@ class Describe_TableSection:
"Nunc aliquam id enim nec molestie\n"
"Vivamus quis nunc ipsum donec ac fermentum"
)
section = _TableSection(
pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table))
)
chunk_iter = section.iter_chunks(maxlen=100)
chunk_iter = pre_chunk.iter_chunks(maxlen=100)
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
@ -810,30 +810,30 @@ class Describe_TableSection:
next(chunk_iter)
class Describe_TextSection:
"""Unit-test suite for `unstructured.chunking.title._TextSection objects."""
class DescribeTextPreChunk:
"""Unit-test suite for `unstructured.chunking.title.TextPreChunk` objects."""
def it_can_combine_itself_with_another_TextSection_instance(self):
""".combine() produces a new section by appending the elements of `other_section`.
def it_can_combine_itself_with_another_TextPreChunk_instance(self):
""".combine() produces a new pre-chunk by appending the elements of `other_pre_chunk`.
Note that neither the original or other section are mutated.
Note that neither the original nor the other pre-chunk is mutated.
"""
section = _TextSection(
pre_chunk = TextPreChunk(
[
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."),
]
)
other_section = _TextSection(
other_pre_chunk = TextPreChunk(
[
Text("Donec semper facilisis metus finibus malesuada."),
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
]
)
new_section = section.combine(other_section)
new_pre_chunk = pre_chunk.combine(other_pre_chunk)
assert new_section == _TextSection(
assert new_pre_chunk == TextPreChunk(
[
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."),
@ -841,13 +841,13 @@ class Describe_TextSection:
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
]
)
assert section == _TextSection(
assert pre_chunk == TextPreChunk(
[
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."),
]
)
assert other_section == _TextSection(
assert other_pre_chunk == TextPreChunk(
[
Text("Donec semper facilisis metus finibus malesuada."),
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
@ -855,7 +855,7 @@ class Describe_TextSection:
)
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
section = _TextSection(
pre_chunk = TextPreChunk(
[
Title("Introduction"),
Text(
@ -865,19 +865,19 @@ class Describe_TextSection:
]
)
chunk_iter = section.iter_chunks(maxlen=200)
chunk_iter = pre_chunk.iter_chunks(maxlen=200)
chunk = next(chunk_iter)
assert chunk == CompositeElement(
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
" In rhoncus ipsum sedlectus porta volutpat.",
)
assert chunk.metadata is section._consolidated_metadata
assert chunk.metadata is pre_chunk._consolidated_metadata
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
# -- The sectioner will isolate that element in a section of its own.
section = _TextSection(
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
pre_chunk = TextPreChunk(
[
Text(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
@ -888,7 +888,7 @@ class Describe_TextSection:
]
)
chunk_iter = section.iter_chunks(maxlen=200)
chunk_iter = pre_chunk.iter_chunks(maxlen=200)
chunk = next(chunk_iter)
assert chunk == CompositeElement(
@ -896,22 +896,22 @@ class Describe_TextSection:
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
" veniam, quis nostrud exercitation ullamco laboris nisi ut a"
)
assert chunk.metadata is section._consolidated_metadata
assert chunk.metadata is pre_chunk._consolidated_metadata
# --
chunk = next(chunk_iter)
assert chunk == CompositeElement("liquip ex ea commodo consequat.")
assert chunk.metadata is section._consolidated_metadata
assert chunk.metadata is pre_chunk._consolidated_metadata
# --
with pytest.raises(StopIteration):
next(chunk_iter)
def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
""".text_length is the size of chunk this section will produce (before any splitting)."""
section = _TextSection([PageBreak(""), Text("foo"), Text("bar")])
assert section.text_length == 8
""".text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
pre_chunk = TextPreChunk([PageBreak(""), Text("foo"), Text("bar")])
assert pre_chunk.text_length == 8
def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
section = _TextSection(
pre_chunk = TextPreChunk(
[
Title(
"Lorem Ipsum",
@ -934,7 +934,7 @@ class Describe_TextSection:
]
)
assert section._all_metadata_values == {
assert pre_chunk._all_metadata_values == {
# -- scalar values are accumulated in a list in element order --
"category_depth": [0, 1],
# -- all values are accumulated, not only unique ones --
@ -963,7 +963,7 @@ class Describe_TextSection:
)
metadata_2.quotient = 1.74
section = _TextSection(
pre_chunk = TextPreChunk(
[
Title("Lorem Ipsum", metadata=metadata),
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
@ -971,7 +971,7 @@ class Describe_TextSection:
)
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
assert section._all_metadata_values == {
assert pre_chunk._all_metadata_values == {
"category_depth": [0, 1],
"filename": ["foo.docx", "foo.docx"],
"image_path": ["sprite.png"],
@ -985,7 +985,7 @@ class Describe_TextSection:
Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
position in the chunk after element text has been concatenated.
"""
section = _TextSection(
pre_chunk = TextPreChunk(
[
Title(
"Lorem Ipsum",
@ -1011,7 +1011,7 @@ class Describe_TextSection:
]
)
regex_metadata = section._consolidated_regex_meta
regex_metadata = pre_chunk._consolidated_regex_meta
assert regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
@ -1026,9 +1026,9 @@ class Describe_TextSection:
"""._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
Only non-None fields should appear in the dict and each field value should be the
consolidation of the values across the section elements.
consolidation of the values across the pre_chunk elements.
"""
section = _TextSection(
pre_chunk = TextPreChunk(
[
PageBreak(""),
Title(
@ -1065,7 +1065,7 @@ class Describe_TextSection:
]
)
meta_kwargs = section._meta_kwargs
meta_kwargs = pre_chunk._meta_kwargs
assert meta_kwargs == {
"filename": "foo.docx",
@ -1090,29 +1090,29 @@ class Describe_TextSection:
([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
],
)
def it_knows_the_concatenated_text_of_the_section(
def it_knows_the_concatenated_text_of_the_pre_chunk(
self, elements: List[Text], expected_value: str
):
"""._text is the "joined" text of the section elements.
"""._text is the "joined" text of the pre-chunk elements.
The text-segment contributed by each element is separated from the next by a blank line
("\n\n"). An element that contributes no text does not give rise to a separator.
"""
section = _TextSection(elements)
assert section._text == expected_value
pre_chunk = TextPreChunk(elements)
assert pre_chunk._text == expected_value
class Describe_TextSectionBuilder:
"""Unit-test suite for `unstructured.chunking.title._TextSectionBuilder`."""
class DescribeTextPreChunkBuilder:
"""Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`."""
def it_is_empty_on_construction(self):
builder = _TextSectionBuilder(maxlen=50)
builder = TextPreChunkBuilder(maxlen=50)
assert builder.text_length == 0
assert builder.remaining_space == 50
def it_accumulates_elements_added_to_it(self):
builder = _TextSectionBuilder(maxlen=150)
builder = TextPreChunkBuilder(maxlen=150)
builder.add_element(Title("Introduction"))
assert builder.text_length == 12
@ -1127,8 +1127,8 @@ class Describe_TextSectionBuilder:
assert builder.text_length == 112
assert builder.remaining_space == 36
def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self):
builder = _TextSectionBuilder(maxlen=150)
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
builder = TextPreChunkBuilder(maxlen=150)
builder.add_element(Title("Introduction"))
builder.add_element(
Text(
@ -1137,10 +1137,10 @@ class Describe_TextSectionBuilder:
),
)
section = next(builder.flush())
pre_chunk = next(builder.flush())
assert isinstance(section, _TextSection)
assert section._elements == [
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Introduction"),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
@ -1150,17 +1150,17 @@ class Describe_TextSectionBuilder:
assert builder.text_length == 0
assert builder.remaining_space == 150
def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self):
builder = _TextSectionBuilder(maxlen=150)
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
builder = TextPreChunkBuilder(maxlen=150)
sections = list(builder.flush())
pre_chunks = list(builder.flush())
assert sections == []
assert pre_chunks == []
assert builder.text_length == 0
assert builder.remaining_space == 150
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
builder = _TextSectionBuilder(maxlen=50)
builder = TextPreChunkBuilder(maxlen=50)
builder.add_element(Text("abcde"))
builder.add_element(Text("fghij"))
@ -1173,27 +1173,27 @@ class Describe_TextSectionBuilder:
assert builder.remaining_space == 36
# == SectionCombiner =============================================================================
# == PreChunkCombiner =============================================================================
class Describe_SectionCombiner:
"""Unit-test suite for `unstructured.chunking.title._SectionCombiner`."""
class DescribePreChunkCombiner:
"""Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`."""
def it_combines_sequential_small_text_sections(self):
sections = [
_TextSection(
def it_combines_sequential_small_text_pre_chunks(self):
pre_chunks = [
TextPreChunk(
[
Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
]
),
_TextSection(
TextPreChunk(
[
Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
]
),
_TextSection(
TextPreChunk(
[
Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@ -1201,13 +1201,13 @@ class Describe_SectionCombiner:
),
]
section_iter = _SectionCombiner(
sections, maxlen=250, combine_text_under_n_chars=250
).iter_combined_sections()
pre_chunk_iter = PreChunkCombiner(
pre_chunks, maxlen=250, combine_text_under_n_chars=250
).iter_combined_pre_chunks()
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Title("Mauris Nec"),
@ -1216,18 +1216,18 @@ class Describe_SectionCombiner:
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
]
with pytest.raises(StopIteration):
next(section_iter)
next(pre_chunk_iter)
def but_it_does_not_combine_table_sections(self):
sections = [
_TextSection(
def but_it_does_not_combine_table_pre_chunks(self):
pre_chunks = [
TextPreChunk(
[
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
]
),
_TableSection(Table("Heading\nCell text")),
_TextSection(
TablePreChunk(Table("Heading\nCell text")),
TextPreChunk(
[
Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
@ -1235,47 +1235,47 @@ class Describe_SectionCombiner:
),
]
section_iter = _SectionCombiner(
sections, maxlen=250, combine_text_under_n_chars=250
).iter_combined_sections()
pre_chunk_iter = PreChunkCombiner(
pre_chunks, maxlen=250, combine_text_under_n_chars=250
).iter_combined_pre_chunks()
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
]
# --
section = next(section_iter)
assert isinstance(section, _TableSection)
assert section._table == Table("Heading\nCell text")
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TablePreChunk)
assert pre_chunk._table == Table("Heading\nCell text")
# --
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
]
# --
with pytest.raises(StopIteration):
next(section_iter)
next(pre_chunk_iter)
def it_respects_the_specified_combination_threshold(self):
sections = [
_TextSection( # 68
pre_chunks = [
TextPreChunk( # 68
[
Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
]
),
_TextSection( # 71
TextPreChunk( # 71
[
Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
]
),
# -- len == 139
_TextSection(
TextPreChunk(
[
Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@ -1283,45 +1283,45 @@ class Describe_SectionCombiner:
),
]
section_iter = _SectionCombiner(
sections, maxlen=250, combine_text_under_n_chars=80
).iter_combined_sections()
pre_chunk_iter = PreChunkCombiner(
pre_chunks, maxlen=250, combine_text_under_n_chars=80
).iter_combined_pre_chunks()
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
]
# --
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
]
# --
with pytest.raises(StopIteration):
next(section_iter)
next(pre_chunk_iter)
def it_respects_the_hard_maximum_window_length(self):
sections = [
_TextSection( # 68
pre_chunks = [
TextPreChunk( # 68
[
Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
]
),
_TextSection( # 71
TextPreChunk( # 71
[
Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
]
),
# -- len == 139
_TextSection(
TextPreChunk(
[
Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@ -1330,35 +1330,35 @@ class Describe_SectionCombiner:
# -- len == 214
]
section_iter = _SectionCombiner(
sections, maxlen=200, combine_text_under_n_chars=200
).iter_combined_sections()
pre_chunk_iter = PreChunkCombiner(
pre_chunks, maxlen=200, combine_text_under_n_chars=200
).iter_combined_pre_chunks()
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
]
# --
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
]
# --
with pytest.raises(StopIteration):
next(section_iter)
next(pre_chunk_iter)
def it_accommodates_and_isolates_an_oversized_section(self):
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
"""Such as occurs when a single element exceeds the window size."""
sections = [
_TextSection([Title("Lorem Ipsum")]),
_TextSection( # 179
pre_chunks = [
TextPreChunk([Title("Lorem Ipsum")]),
TextPreChunk( # 179
[
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
@ -1367,20 +1367,20 @@ class Describe_SectionCombiner:
)
]
),
_TextSection([Title("Vulputate Consequat")]),
TextPreChunk([Title("Vulputate Consequat")]),
]
section_iter = _SectionCombiner(
sections, maxlen=150, combine_text_under_n_chars=150
).iter_combined_sections()
pre_chunk_iter = PreChunkCombiner(
pre_chunks, maxlen=150, combine_text_under_n_chars=150
).iter_combined_pre_chunks()
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [Title("Lorem Ipsum")]
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [Title("Lorem Ipsum")]
# --
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit."
" Mauris nec urna non augue vulputate consequat eget et nisi."
@ -1388,28 +1388,28 @@ class Describe_SectionCombiner:
)
]
# --
section = next(section_iter)
assert isinstance(section, _TextSection)
assert section._elements == [Title("Vulputate Consequat")]
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [Title("Vulputate Consequat")]
# --
with pytest.raises(StopIteration):
next(section_iter)
next(pre_chunk_iter)
class Describe_TextSectionAccumulator:
"""Unit-test suite for `unstructured.chunking.title._TextSectionAccumulator`."""
class DescribeTextPreChunkAccumulator:
"""Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`."""
def it_is_empty_on_construction(self):
accum = _TextSectionAccumulator(maxlen=100)
accum = TextPreChunkAccumulator(maxlen=100)
assert accum.text_length == 0
assert accum.remaining_space == 100
def it_accumulates_sections_added_to_it(self):
accum = _TextSectionAccumulator(maxlen=500)
def it_accumulates_pre_chunks_added_to_it(self):
accum = TextPreChunkAccumulator(maxlen=500)
accum.add_section(
_TextSection(
accum.add_pre_chunk(
TextPreChunk(
[
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@ -1419,8 +1419,8 @@ class Describe_TextSectionAccumulator:
assert accum.text_length == 68
assert accum.remaining_space == 430
accum.add_section(
_TextSection(
accum.add_pre_chunk(
TextPreChunk(
[
Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
@ -1430,26 +1430,26 @@ class Describe_TextSectionAccumulator:
assert accum.text_length == 141
assert accum.remaining_space == 357
def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self):
accum = _TextSectionAccumulator(maxlen=150)
accum.add_section(
_TextSection(
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
accum = TextPreChunkAccumulator(maxlen=150)
accum.add_pre_chunk(
TextPreChunk(
[
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
]
)
)
accum.add_section(
_TextSection(
accum.add_pre_chunk(
TextPreChunk(
[
Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
]
)
)
accum.add_section(
_TextSection(
accum.add_pre_chunk(
TextPreChunk(
[
Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
@ -1457,15 +1457,15 @@ class Describe_TextSectionAccumulator:
)
)
section_iter = accum.flush()
pre_chunk_iter = accum.flush()
# -- iterator generates exactly one section --
section = next(section_iter)
# -- iterator generates exactly one pre_chunk --
pre_chunk = next(pre_chunk_iter)
with pytest.raises(StopIteration):
next(section_iter)
# -- and it is a _TextSection containing all the elements --
assert isinstance(section, _TextSection)
assert section._elements == [
next(pre_chunk_iter)
# -- and it is a TextPreChunk containing all the elements --
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Title("Mauris Nec"),
@ -1476,24 +1476,24 @@ class Describe_TextSectionAccumulator:
assert accum.text_length == 0
assert accum.remaining_space == 150
def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self):
accum = _TextSectionAccumulator(maxlen=150)
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
accum = TextPreChunkAccumulator(maxlen=150)
sections = list(accum.flush())
pre_chunks = list(accum.flush())
assert sections == []
assert pre_chunks == []
assert accum.text_length == 0
assert accum.remaining_space == 150
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
accum = _TextSectionAccumulator(maxlen=100)
accum.add_section(_TextSection([Text("abcde")]))
accum.add_section(_TextSection([Text("fghij")]))
accum = TextPreChunkAccumulator(maxlen=100)
accum.add_pre_chunk(TextPreChunk([Text("abcde")]))
accum.add_pre_chunk(TextPreChunk([Text("fghij")]))
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
assert accum.text_length == 12
# -- .remaining_space is reduced by the length (2) of the trailing separator which would
# -- go between the current text and that of the next section if one was added.
# -- go between the current text and that of the next pre-chunk if one was added.
# -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
assert accum.remaining_space == 86

View File

@ -1 +1 @@
__version__ = "0.11.4-dev10" # pragma: no cover
__version__ = "0.11.4-dev11" # pragma: no cover

View File

@ -25,7 +25,7 @@ from unstructured.documents.elements import (
)
from unstructured.utils import lazyproperty
_Section: TypeAlias = "_TableSection | _TextSection"
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
# -- goes between text of each element when element-text is concatenated to form chunk --
TEXT_SEPARATOR = "\n\n"
@ -98,7 +98,7 @@ def chunk_by_title(
# ----------------------------------------------------------------
sections = _SectionCombiner(
pre_chunks = PreChunkCombiner(
_split_elements_by_title_and_table(
elements,
multipage_sections=multipage_sections,
@ -107,9 +107,9 @@ def chunk_by_title(
),
max_characters,
combine_text_under_n_chars,
).iter_combined_sections()
).iter_combined_pre_chunks()
return [chunk for section in sections for chunk in section.iter_chunks(max_characters)]
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks(max_characters)]
def _split_elements_by_title_and_table(
@ -117,31 +117,31 @@ def _split_elements_by_title_and_table(
multipage_sections: bool,
new_after_n_chars: int,
max_characters: int,
) -> Iterator[_TextSection | _TableSection]:
"""Implements "sectioner" responsibilities.
) -> Iterator[TextPreChunk | TablePreChunk]:
"""Implements "pre-chunker" responsibilities.
A _pre-chunk_ is a sequence of elements that generally determines the size and contents of a
chunk formed by the subsequent "chunker" process. The only exception occurs when a single
element is too big to fit in the chunk window and the chunker splits it into two or more chunks
divided mid-text. The sectioner never divides an element mid-text.
divided mid-text. The pre-chunker never divides an element mid-text.
The sectioner's responsibilities are:
The pre-chunker's responsibilities are:
* **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
either side of those boundaries into different sections. In this case, the primary
either side of those boundaries into different pre-chunks. In this case, the primary
indicator of a semantic boundary is a `Title` element. A page-break (change in
page-number) is also a semantic boundary when `multipage_sections` is `False`.
* **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
into sections as big as possible without exceeding the chunk window size.
into pre-chunks as big as possible without exceeding the chunk window size.
* **Minimize chunks that must be split mid-text.** Precompute the text length of each
section and only produce a section that exceeds the chunk window size when there is a
pre-chunk and only produce a pre-chunk that exceeds the chunk window size when there is a
single element with text longer than that window.
A Table or Checkbox element is placed into a section by itself.
A Table or Checkbox element is placed into a pre-chunk by itself.
"""
section_builder = _TextSectionBuilder(max_characters)
pre_chunk_builder = TextPreChunkBuilder(max_characters)
prior_element = None
@ -152,31 +152,31 @@ def _split_elements_by_title_and_table(
else False
)
# -- start new section when necessary --
# -- start new pre_chunk when necessary --
if (
# -- Title and Table both start a new section --
# -- Title and Table both start a new pre_chunk --
isinstance(element, (Title, Table))
# -- adding this element would exceed hard-maxlen for section --
or section_builder.remaining_space < len(str(element))
# -- section already meets or exceeds soft-maxlen --
or section_builder.text_length >= new_after_n_chars
# -- adding this element would exceed hard-maxlen for pre_chunk --
or pre_chunk_builder.remaining_space < len(str(element))
# -- pre_chunk already meets or exceeds soft-maxlen --
or pre_chunk_builder.text_length >= new_after_n_chars
# -- a semantic boundary is indicated by metadata change since prior element --
or metadata_differs
):
# -- complete any work-in-progress section --
yield from section_builder.flush()
# -- complete any work-in-progress pre_chunk --
yield from pre_chunk_builder.flush()
# -- emit table and checkbox immediately since they are always isolated --
if isinstance(element, Table):
yield _TableSection(table=element)
yield TablePreChunk(table=element)
# -- but accumulate text elements for consolidation into a composite chunk --
else:
section_builder.add_element(element)
pre_chunk_builder.add_element(element)
prior_element = element
# -- flush "tail" section, any partially-filled section after last element is processed --
yield from section_builder.flush()
# -- flush "tail" pre_chunk, any partially-filled pre_chunk after last element is processed --
yield from pre_chunk_builder.flush()
def _metadata_differs(
@ -255,17 +255,17 @@ def add_chunking_strategy() -> Callable[[Callable[_P, List[Element]]], Callable[
return decorator
# == Sections ====================================================================================
# == PreChunks ===================================================================================
class _TableSection:
"""A section composed of a single Table element."""
class TablePreChunk:
"""A pre-chunk composed of a single Table element."""
def __init__(self, table: Table) -> None:
self._table = table
def iter_chunks(self, maxlen: int) -> Iterator[Table | TableChunk]:
"""Split this section into one or more `Table` or `TableChunk` objects maxlen or smaller."""
"""Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
text = self._table.text
html = self._table.metadata.text_as_html or ""
@ -296,7 +296,7 @@ class _TableSection:
is_continuation = True
class _TextSection:
class TextPreChunk:
"""A sequence of elements that belong to the same semantic unit within a document.
The name "section" derives from the idea of a document-section, a heading followed by the
@ -310,16 +310,16 @@ class _TextSection:
self._elements = list(elements)
def __eq__(self, other: Any) -> bool:
    """True when `other` is a `TextPreChunk` whose element list equals this one's."""
    # -- an object of any other type never compares equal --
    return isinstance(other, TextPreChunk) and self._elements == other._elements
def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
    """Return new `TextPreChunk` holding this pre-chunk's elements followed by the other's.

    A fresh element list is built, so neither this pre-chunk nor `other_pre_chunk` is modified.
    """
    merged_elements = [*self._elements, *other_pre_chunk._elements]
    return TextPreChunk(merged_elements)
def iter_chunks(self, maxlen: int) -> Iterator[CompositeElement]:
"""Split this section into one or more `CompositeElement` objects maxlen or smaller."""
"""Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
text = self._text
text_len = len(text)
start = 0
@ -333,8 +333,8 @@ class _TextSection:
@lazyproperty
def text_length(self) -> int:
    """Length of concatenated text of this pre-chunk, including separators.

    Measured as the length of the `._text` value of this pre-chunk.
    """
    # -- used by pre-chunk-combiner to identify combination candidates --
    return len(self._text)
@lazyproperty
@ -342,7 +342,7 @@ class _TextSection:
"""Collection of all populated metadata values across elements.
The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
at least one of the elements in this section. The value of that key is a list of all those
at least one of the elements in this pre-chunk. The value of that key is a list of all those
populated values, in element order, for example:
{
@ -374,13 +374,13 @@ class _TextSection:
@lazyproperty
def _consolidated_metadata(self) -> ElementMetadata:
    """Metadata applicable to this pre-chunk as a single chunk.

    Formed by applying consolidation rules to all metadata fields across the elements of this
    pre-chunk.

    For the sake of consistency, the same rules are applied (for example, for dropping values)
    to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
    "consolidated".
    """
    # -- `._meta_kwargs` is expanded into the keyword arguments of the new `ElementMetadata` --
    return ElementMetadata(**self._meta_kwargs)
@ -460,26 +460,26 @@ class _TextSection:
@lazyproperty
def _text(self) -> str:
    """Text of every element in this pre-chunk joined into a single string.

    Elements whose text is empty are skipped. The remaining element-texts are joined with
    `TEXT_SEPARATOR`, which places a blank line between one element-text and the next.
    """
    populated_texts = [e.text for e in self._elements if e.text]
    return TEXT_SEPARATOR.join(populated_texts)
class _TextSectionBuilder:
"""An element accumulator suitable for incrementally forming a section.
class TextPreChunkBuilder:
"""An element accumulator suitable for incrementally forming a pre-chunk.
Provides monitoring properties like `.remaining_space` and `.text_length` a sectioner can use
Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
to determine whether it should add the next element in the element stream.
`.flush()` is used to build a `TextSection` object from the accumulated elements. This method
returns an interator that generates zero-or-one `TextSection` object and is used like so:
`.flush()` is used to build a `TextPreChunk` object from the accumulated elements. This method
returns an iterator that generates zero-or-one `TextPreChunk` object and is used like so:
yield from builder.flush()
If no elements have been accumulated, no `TextSection` is generated. Flushing the builder clears
the elements it contains so it is ready to build the next text-section.
If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder
clears the elements it contains so it is ready to build the next text-pre-chunk.
"""
def __init__(self, maxlen: int) -> None:
@ -502,22 +502,22 @@ class _TextSectionBuilder:
self._text_segments.append(element.text)
self._text_len += len(element.text)
def flush(self) -> Iterator[TextPreChunk]:
    """Generate zero-or-one `TextPreChunk` object and reset this builder to empty.

    Used to emit a pre-chunk when the maximum size or a semantic boundary has been reached, and
    also to clear out a terminal pre-chunk at the end of an element stream. Nothing is generated
    when no elements have been added since the last flush.
    """
    if self._elements:
        # -- snapshot the elements and reset all builder state *before* the yield so the
        # -- builder can accept elements for the next pre-chunk immediately, regardless of
        # -- how or when the caller exhausts this iterator.
        flushed_elements = list(self._elements)
        self._elements.clear()
        self._text_segments.clear()
        self._text_len = 0
        yield TextPreChunk(flushed_elements)
@property
def remaining_space(self) -> int:
@ -528,9 +528,9 @@ class _TextSectionBuilder:
@property
def text_length(self) -> int:
"""Length of the text in this section.
"""Length of the text in this pre-chunk.
This value represents the chunk-size that would result if this section was flushed in its
This value represents the chunk-size that would result if this pre-chunk was flushed in its
current state. In particular, it does not include the length of a trailing separator (since
that would only appear if an additional element was added).
@ -544,104 +544,104 @@ class _TextSectionBuilder:
return self._text_len + (separator_count * self._separator_len)
# == SectionCombiner =============================================================================
# == PreChunkCombiner ============================================================================
class PreChunkCombiner:
    """Filters a pre-chunk stream, merging small text pre-chunks where they fit together."""

    def __init__(
        self,
        pre_chunks: Iterable[PreChunk],
        maxlen: int,
        combine_text_under_n_chars: int,
    ) -> None:
        self._pre_chunks = pre_chunks
        self._maxlen = maxlen
        self._combine_text_under_n_chars = combine_text_under_n_chars

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunks, combining consecutive text pre-chunks that fit in the window.

        A `TablePreChunk` is never combined; it is emitted as-is, after any text pre-chunks
        accumulated ahead of it have been emitted as one combined pre-chunk.
        """
        accum = TextPreChunkAccumulator(self._maxlen)

        for pre_chunk in self._pre_chunks:
            # -- a table pre-chunk is a hard boundary: emit pending text, then the table --
            if isinstance(pre_chunk, TablePreChunk):
                yield from accum.flush()
                yield pre_chunk
                continue

            # -- start a new combined pre-chunk when the accumulated text has reached the
            # -- combination soft-max or adding this pre-chunk would exceed the hard-max --
            if (
                accum.text_length >= self._combine_text_under_n_chars
                or accum.remaining_space < pre_chunk.text_length
            ):
                yield from accum.flush()

            accum.add_pre_chunk(pre_chunk)

        # -- emit whatever is still accumulated once the stream is exhausted --
        yield from accum.flush()
class TextPreChunkAccumulator:
    """Accumulates, measures, and combines pre-chunk objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another pre-chunk.

    `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
    This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used
    like so:

        yield from accum.flush()

    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the
    accumulator clears the pre-chunks it contains so it is ready to accept the next
    text-pre-chunk.
    """

    def __init__(self, maxlen: int) -> None:
        self._maxlen = maxlen
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
        """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
        self._pre_chunks.append(pre_chunk)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate all accumulated pre-chunks as a single combined pre-chunk.

        Generates nothing when the accumulator is empty. The accumulator is reset to empty
        before the combined pre-chunk is yielded.
        """
        pre_chunks = self._pre_chunks

        # -- nothing to do if no pre-chunks have been accumulated --
        if not pre_chunks:
            return

        # -- otherwise combine all accumulated pre-chunks into one --
        combined = pre_chunks[0]
        for other_pre_chunk in pre_chunks[1:]:
            combined = combined.combine(other_pre_chunk)

        # -- reset the accumulator *before* the yield so its state does not depend on how or
        # -- when the caller exhausts this iterator; a caller that takes the single item and
        # -- abandons the iterator would otherwise leave stale pre-chunks behind.
        pre_chunks.clear()

        yield combined

    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
        if not self._pre_chunks:
            return self._maxlen
        # -- an additional pre-chunk will also incur an additional separator --
        return self._maxlen - self.text_length - len(TEXT_SEPARATOR)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator, including separators."""
        count = len(self._pre_chunks)
        if count == 0:
            return 0
        # -- one separator appears between each pair of adjacent pre-chunks --
        separators_len = len(TEXT_SEPARATOR) * (count - 1)
        return sum(pc.text_length for pc in self._pre_chunks) + separators_len