rfctr: rename pre chunk (#2261)

The original naming for the precursor to a chunk in `chunk_by_title()`
was conflated with the idea of how these element subsequences were
bounded (by document-section) for that strategy. I mistakenly picked
that up as a universal concept, but in fact no notion of section arises
in the `by_character` or other chunking strategies.

Fix this misconception by using the name `pre-chunk` for this concept
throughout.
This commit is contained in:
Steve Canny 2023-12-13 15:13:57 -08:00 committed by GitHub
parent 74d089d942
commit cbeaed21ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 298 additions and 298 deletions

View File

@ -1,4 +1,4 @@
## 0.11.4-dev10 ## 0.11.4-dev11
### Enhancements ### Enhancements

View File

@ -5,12 +5,12 @@ from typing import List
import pytest import pytest
from unstructured.chunking.title import ( from unstructured.chunking.title import (
_SectionCombiner, PreChunkCombiner,
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
TextPreChunkBuilder,
_split_elements_by_title_and_table, _split_elements_by_title_and_table,
_TableSection,
_TextSection,
_TextSectionAccumulator,
_TextSectionBuilder,
chunk_by_title, chunk_by_title,
) )
from unstructured.documents.coordinates import CoordinateSystem from unstructured.documents.coordinates import CoordinateSystem
@ -130,7 +130,7 @@ def test_it_does_not_complain_when_specifying_new_after_n_chars_by_itself():
def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(): def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk():
"""Specifying `new_after_n_chars=0` places each element into its own section. """Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
This puts each element into its own chunk, although long chunks are still split. This puts each element into its own chunk, although long chunks are still split.
""" """
@ -166,7 +166,7 @@ def test_it_silently_accepts_new_after_n_chars_greater_than_maxchars():
# ================================================================================================ # ================================================================================================
def test_it_splits_a_large_section_into_multiple_chunks(): def test_it_splits_a_large_element_into_multiple_chunks():
elements: List[Element] = [ elements: List[Element] = [
Title("Introduction"), Title("Introduction"),
Text( Text(
@ -199,36 +199,36 @@ def test_split_elements_by_title_and_table():
CheckBox(), CheckBox(),
] ]
sections = _split_elements_by_title_and_table( pre_chunks = _split_elements_by_title_and_table(
elements, elements,
multipage_sections=True, multipage_sections=True,
new_after_n_chars=500, new_after_n_chars=500,
max_characters=500, max_characters=500,
) )
section = next(sections) pre_chunk = next(pre_chunks)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("A Great Day"), Title("A Great Day"),
Text("Today is a great day."), Text("Today is a great day."),
Text("It is sunny outside."), Text("It is sunny outside."),
] ]
# -- # --
section = next(sections) pre_chunk = next(pre_chunks)
assert isinstance(section, _TableSection) assert isinstance(pre_chunk, TablePreChunk)
assert section._table == Table("Heading\nCell text") assert pre_chunk._table == Table("Heading\nCell text")
# == # ==
section = next(sections) pre_chunk = next(pre_chunks)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("An Okay Day"), Title("An Okay Day"),
Text("Today is an okay day."), Text("Today is an okay day."),
Text("It is rainy outside."), Text("It is rainy outside."),
] ]
# -- # --
section = next(sections) pre_chunk = next(pre_chunks)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("A Bad Day"), Title("A Bad Day"),
Text("Today is a bad day."), Text("Today is a bad day."),
Text("It is storming outside."), Text("It is storming outside."),
@ -236,7 +236,7 @@ def test_split_elements_by_title_and_table():
] ]
# -- # --
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(sections) next(pre_chunks)
def test_chunk_by_title(): def test_chunk_by_title():
@ -351,9 +351,9 @@ def test_chunk_by_title_separates_by_page_number():
def test_chunk_by_title_does_not_break_on_regex_metadata_change(): def test_chunk_by_title_does_not_break_on_regex_metadata_change():
"""Sectioner is insensitive to regex-metadata changes. """PreChunker is insensitive to regex-metadata changes.
A regex-metadata match in an element does not signify a semantic boundary and a section should A regex-metadata match in an element does not signify a semantic boundary and a pre-chunk should
not be split based on such a difference. not be split based on such a difference.
""" """
elements: List[Element] = [ elements: List[Element] = [
@ -672,8 +672,8 @@ def test_chunk_by_title_drops_extra_metadata():
assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day.")) assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
def test_it_considers_separator_length_when_sectioning(): def test_it_considers_separator_length_when_pre_chunking():
"""Sectioner includes length of separators when computing remaining space.""" """PreChunker includes length of separators when computing remaining space."""
elements: List[Element] = [ elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars ListItem("Divide text into manageable chunks"), # 34 chars
@ -693,11 +693,11 @@ def test_it_considers_separator_length_when_sectioning():
] ]
# == Sections ==================================================================================== # == PreChunks ===================================================================================
class Describe_TableSection: class DescribeTablePreChunk:
"""Unit-test suite for `unstructured.chunking.title._TableSection objects.""" """Unit-test suite for `unstructured.chunking.title.TablePreChunk objects."""
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
html_table = ( html_table = (
@ -711,11 +711,11 @@ class Describe_TableSection:
"</table>" "</table>"
) )
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing" text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
section = _TableSection( pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)) Table(text_table, metadata=ElementMetadata(text_as_html=html_table))
) )
chunk_iter = section.iter_chunks(maxlen=175) chunk_iter = pre_chunk.iter_chunks(maxlen=175)
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert isinstance(chunk, Table) assert isinstance(chunk, Table)
@ -756,11 +756,11 @@ class Describe_TableSection:
"Nunc aliquam id enim nec molestie\n" "Nunc aliquam id enim nec molestie\n"
"Vivamus quis nunc ipsum donec ac fermentum" "Vivamus quis nunc ipsum donec ac fermentum"
) )
section = _TableSection( pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)) Table(text_table, metadata=ElementMetadata(text_as_html=html_table))
) )
chunk_iter = section.iter_chunks(maxlen=100) chunk_iter = pre_chunk.iter_chunks(maxlen=100)
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk) assert isinstance(chunk, TableChunk)
@ -810,30 +810,30 @@ class Describe_TableSection:
next(chunk_iter) next(chunk_iter)
class Describe_TextSection: class DescribeTextPreChunk:
"""Unit-test suite for `unstructured.chunking.title._TextSection objects.""" """Unit-test suite for `unstructured.chunking.title.TextPreChunk objects."""
def it_can_combine_itself_with_another_TextSection_instance(self): def it_can_combine_itself_with_another_TextPreChunk_instance(self):
""".combine() produces a new section by appending the elements of `other_section`. """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
Note that neither the original or other section are mutated. Note that neither the original or other pre_chunk are mutated.
""" """
section = _TextSection( pre_chunk = TextPreChunk(
[ [
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."), Text("In rhoncus ipsum sed lectus porta volutpat."),
] ]
) )
other_section = _TextSection( other_pre_chunk = TextPreChunk(
[ [
Text("Donec semper facilisis metus finibus malesuada."), Text("Donec semper facilisis metus finibus malesuada."),
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
] ]
) )
new_section = section.combine(other_section) new_pre_chunk = pre_chunk.combine(other_pre_chunk)
assert new_section == _TextSection( assert new_pre_chunk == TextPreChunk(
[ [
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."), Text("In rhoncus ipsum sed lectus porta volutpat."),
@ -841,13 +841,13 @@ class Describe_TextSection:
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
] ]
) )
assert section == _TextSection( assert pre_chunk == TextPreChunk(
[ [
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Text("In rhoncus ipsum sed lectus porta volutpat."), Text("In rhoncus ipsum sed lectus porta volutpat."),
] ]
) )
assert other_section == _TextSection( assert other_pre_chunk == TextPreChunk(
[ [
Text("Donec semper facilisis metus finibus malesuada."), Text("Donec semper facilisis metus finibus malesuada."),
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
@ -855,7 +855,7 @@ class Describe_TextSection:
) )
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
section = _TextSection( pre_chunk = TextPreChunk(
[ [
Title("Introduction"), Title("Introduction"),
Text( Text(
@ -865,19 +865,19 @@ class Describe_TextSection:
] ]
) )
chunk_iter = section.iter_chunks(maxlen=200) chunk_iter = pre_chunk.iter_chunks(maxlen=200)
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert chunk == CompositeElement( assert chunk == CompositeElement(
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit." "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
" In rhoncus ipsum sedlectus porta volutpat.", " In rhoncus ipsum sedlectus porta volutpat.",
) )
assert chunk.metadata is section._consolidated_metadata assert chunk.metadata is pre_chunk._consolidated_metadata
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self): def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window. # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
# -- The sectioner will isolate that element in a section of its own. # -- The pre-chunker will isolate that element in a pre_chunk of its own.
section = _TextSection( pre_chunk = TextPreChunk(
[ [
Text( Text(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
@ -888,7 +888,7 @@ class Describe_TextSection:
] ]
) )
chunk_iter = section.iter_chunks(maxlen=200) chunk_iter = pre_chunk.iter_chunks(maxlen=200)
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert chunk == CompositeElement( assert chunk == CompositeElement(
@ -896,22 +896,22 @@ class Describe_TextSection:
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
" veniam, quis nostrud exercitation ullamco laboris nisi ut a" " veniam, quis nostrud exercitation ullamco laboris nisi ut a"
) )
assert chunk.metadata is section._consolidated_metadata assert chunk.metadata is pre_chunk._consolidated_metadata
# -- # --
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert chunk == CompositeElement("liquip ex ea commodo consequat.") assert chunk == CompositeElement("liquip ex ea commodo consequat.")
assert chunk.metadata is section._consolidated_metadata assert chunk.metadata is pre_chunk._consolidated_metadata
# -- # --
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(chunk_iter) next(chunk_iter)
def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self): def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
""".text_length is the size of chunk this section will produce (before any splitting).""" """.text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
section = _TextSection([PageBreak(""), Text("foo"), Text("bar")]) pre_chunk = TextPreChunk([PageBreak(""), Text("foo"), Text("bar")])
assert section.text_length == 8 assert pre_chunk.text_length == 8
def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self): def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
section = _TextSection( pre_chunk = TextPreChunk(
[ [
Title( Title(
"Lorem Ipsum", "Lorem Ipsum",
@ -934,7 +934,7 @@ class Describe_TextSection:
] ]
) )
assert section._all_metadata_values == { assert pre_chunk._all_metadata_values == {
# -- scalar values are accumulated in a list in element order -- # -- scalar values are accumulated in a list in element order --
"category_depth": [0, 1], "category_depth": [0, 1],
# -- all values are accumulated, not only unique ones -- # -- all values are accumulated, not only unique ones --
@ -963,7 +963,7 @@ class Describe_TextSection:
) )
metadata_2.quotient = 1.74 metadata_2.quotient = 1.74
section = _TextSection( pre_chunk = TextPreChunk(
[ [
Title("Lorem Ipsum", metadata=metadata), Title("Lorem Ipsum", metadata=metadata),
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
@ -971,7 +971,7 @@ class Describe_TextSection:
) )
# -- ad-hoc fields "coefficient" and "quotient" do not appear -- # -- ad-hoc fields "coefficient" and "quotient" do not appear --
assert section._all_metadata_values == { assert pre_chunk._all_metadata_values == {
"category_depth": [0, 1], "category_depth": [0, 1],
"filename": ["foo.docx", "foo.docx"], "filename": ["foo.docx", "foo.docx"],
"image_path": ["sprite.png"], "image_path": ["sprite.png"],
@ -985,7 +985,7 @@ class Describe_TextSection:
Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
position in the chunk after element text has been concatenated. position in the chunk after element text has been concatenated.
""" """
section = _TextSection( pre_chunk = TextPreChunk(
[ [
Title( Title(
"Lorem Ipsum", "Lorem Ipsum",
@ -1011,7 +1011,7 @@ class Describe_TextSection:
] ]
) )
regex_metadata = section._consolidated_regex_meta regex_metadata = pre_chunk._consolidated_regex_meta
assert regex_metadata == { assert regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)], "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
@ -1026,9 +1026,9 @@ class Describe_TextSection:
"""._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata. """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
Only non-None fields should appear in the dict and each field value should be the Only non-None fields should appear in the dict and each field value should be the
consolidation of the values across the section elements. consolidation of the values across the pre_chunk elements.
""" """
section = _TextSection( pre_chunk = TextPreChunk(
[ [
PageBreak(""), PageBreak(""),
Title( Title(
@ -1065,7 +1065,7 @@ class Describe_TextSection:
] ]
) )
meta_kwargs = section._meta_kwargs meta_kwargs = pre_chunk._meta_kwargs
assert meta_kwargs == { assert meta_kwargs == {
"filename": "foo.docx", "filename": "foo.docx",
@ -1090,29 +1090,29 @@ class Describe_TextSection:
([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"), ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
], ],
) )
def it_knows_the_concatenated_text_of_the_section( def it_knows_the_concatenated_text_of_the_pre_chunk(
self, elements: List[Text], expected_value: str self, elements: List[Text], expected_value: str
): ):
"""._text is the "joined" text of the section elements. """._text is the "joined" text of the pre-chunk elements.
The text-segment contributed by each element is separated from the next by a blank line The text-segment contributed by each element is separated from the next by a blank line
("\n\n"). An element that contributes no text does not give rise to a separator. ("\n\n"). An element that contributes no text does not give rise to a separator.
""" """
section = _TextSection(elements) pre_chunk = TextPreChunk(elements)
assert section._text == expected_value assert pre_chunk._text == expected_value
class Describe_TextSectionBuilder: class DescribeTextPreChunkBuilder:
"""Unit-test suite for `unstructured.chunking.title._TextSectionBuilder`.""" """Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`."""
def it_is_empty_on_construction(self): def it_is_empty_on_construction(self):
builder = _TextSectionBuilder(maxlen=50) builder = TextPreChunkBuilder(maxlen=50)
assert builder.text_length == 0 assert builder.text_length == 0
assert builder.remaining_space == 50 assert builder.remaining_space == 50
def it_accumulates_elements_added_to_it(self): def it_accumulates_elements_added_to_it(self):
builder = _TextSectionBuilder(maxlen=150) builder = TextPreChunkBuilder(maxlen=150)
builder.add_element(Title("Introduction")) builder.add_element(Title("Introduction"))
assert builder.text_length == 12 assert builder.text_length == 12
@ -1127,8 +1127,8 @@ class Describe_TextSectionBuilder:
assert builder.text_length == 112 assert builder.text_length == 112
assert builder.remaining_space == 36 assert builder.remaining_space == 36
def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self): def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
builder = _TextSectionBuilder(maxlen=150) builder = TextPreChunkBuilder(maxlen=150)
builder.add_element(Title("Introduction")) builder.add_element(Title("Introduction"))
builder.add_element( builder.add_element(
Text( Text(
@ -1137,10 +1137,10 @@ class Describe_TextSectionBuilder:
), ),
) )
section = next(builder.flush()) pre_chunk = next(builder.flush())
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Introduction"), Title("Introduction"),
Text( Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
@ -1150,17 +1150,17 @@ class Describe_TextSectionBuilder:
assert builder.text_length == 0 assert builder.text_length == 0
assert builder.remaining_space == 150 assert builder.remaining_space == 150
def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self): def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
builder = _TextSectionBuilder(maxlen=150) builder = TextPreChunkBuilder(maxlen=150)
sections = list(builder.flush()) pre_chunks = list(builder.flush())
assert sections == [] assert pre_chunks == []
assert builder.text_length == 0 assert builder.text_length == 0
assert builder.remaining_space == 150 assert builder.remaining_space == 150
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
builder = _TextSectionBuilder(maxlen=50) builder = TextPreChunkBuilder(maxlen=50)
builder.add_element(Text("abcde")) builder.add_element(Text("abcde"))
builder.add_element(Text("fghij")) builder.add_element(Text("fghij"))
@ -1173,27 +1173,27 @@ class Describe_TextSectionBuilder:
assert builder.remaining_space == 36 assert builder.remaining_space == 36
# == SectionCombiner ============================================================================= # == PreChunkCombiner =============================================================================
class Describe_SectionCombiner: class DescribePreChunkCombiner:
"""Unit-test suite for `unstructured.chunking.title._SectionCombiner`.""" """Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`."""
def it_combines_sequential_small_text_sections(self): def it_combines_sequential_small_text_pre_chunks(self):
sections = [ pre_chunks = [
_TextSection( TextPreChunk(
[ [
Title("Lorem Ipsum"), # 11 Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
] ]
), ),
_TextSection( TextPreChunk(
[ [
Title("Mauris Nec"), # 10 Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
] ]
), ),
_TextSection( TextPreChunk(
[ [
Title("Sed Orci"), # 8 Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@ -1201,13 +1201,13 @@ class Describe_SectionCombiner:
), ),
] ]
section_iter = _SectionCombiner( pre_chunk_iter = PreChunkCombiner(
sections, maxlen=250, combine_text_under_n_chars=250 pre_chunks, maxlen=250, combine_text_under_n_chars=250
).iter_combined_sections() ).iter_combined_pre_chunks()
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Lorem Ipsum"), Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Title("Mauris Nec"), Title("Mauris Nec"),
@ -1216,18 +1216,18 @@ class Describe_SectionCombiner:
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
] ]
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(section_iter) next(pre_chunk_iter)
def but_it_does_not_combine_table_sections(self): def but_it_does_not_combine_table_pre_chunks(self):
sections = [ pre_chunks = [
_TextSection( TextPreChunk(
[ [
Title("Lorem Ipsum"), Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
] ]
), ),
_TableSection(Table("Heading\nCell text")), TablePreChunk(Table("Heading\nCell text")),
_TextSection( TextPreChunk(
[ [
Title("Mauris Nec"), Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
@ -1235,47 +1235,47 @@ class Describe_SectionCombiner:
), ),
] ]
section_iter = _SectionCombiner( pre_chunk_iter = PreChunkCombiner(
sections, maxlen=250, combine_text_under_n_chars=250 pre_chunks, maxlen=250, combine_text_under_n_chars=250
).iter_combined_sections() ).iter_combined_pre_chunks()
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Lorem Ipsum"), Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
] ]
# -- # --
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TableSection) assert isinstance(pre_chunk, TablePreChunk)
assert section._table == Table("Heading\nCell text") assert pre_chunk._table == Table("Heading\nCell text")
# -- # --
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Mauris Nec"), Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
] ]
# -- # --
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(section_iter) next(pre_chunk_iter)
def it_respects_the_specified_combination_threshold(self): def it_respects_the_specified_combination_threshold(self):
sections = [ pre_chunks = [
_TextSection( # 68 TextPreChunk( # 68
[ [
Title("Lorem Ipsum"), # 11 Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
] ]
), ),
_TextSection( # 71 TextPreChunk( # 71
[ [
Title("Mauris Nec"), # 10 Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
] ]
), ),
# -- len == 139 # -- len == 139
_TextSection( TextPreChunk(
[ [
Title("Sed Orci"), # 8 Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@ -1283,45 +1283,45 @@ class Describe_SectionCombiner:
), ),
] ]
section_iter = _SectionCombiner( pre_chunk_iter = PreChunkCombiner(
sections, maxlen=250, combine_text_under_n_chars=80 pre_chunks, maxlen=250, combine_text_under_n_chars=80
).iter_combined_sections() ).iter_combined_pre_chunks()
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Lorem Ipsum"), Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Title("Mauris Nec"), Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
] ]
# -- # --
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Sed Orci"), Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
] ]
# -- # --
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(section_iter) next(pre_chunk_iter)
def it_respects_the_hard_maximum_window_length(self): def it_respects_the_hard_maximum_window_length(self):
sections = [ pre_chunks = [
_TextSection( # 68 TextPreChunk( # 68
[ [
Title("Lorem Ipsum"), # 11 Title("Lorem Ipsum"), # 11
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
] ]
), ),
_TextSection( # 71 TextPreChunk( # 71
[ [
Title("Mauris Nec"), # 10 Title("Mauris Nec"), # 10
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
] ]
), ),
# -- len == 139 # -- len == 139
_TextSection( TextPreChunk(
[ [
Title("Sed Orci"), # 8 Title("Sed Orci"), # 8
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
@ -1330,35 +1330,35 @@ class Describe_SectionCombiner:
# -- len == 214 # -- len == 214
] ]
section_iter = _SectionCombiner( pre_chunk_iter = PreChunkCombiner(
sections, maxlen=200, combine_text_under_n_chars=200 pre_chunks, maxlen=200, combine_text_under_n_chars=200
).iter_combined_sections() ).iter_combined_pre_chunks()
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Lorem Ipsum"), Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Title("Mauris Nec"), Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
] ]
# -- # --
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Sed Orci"), Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
] ]
# -- # --
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(section_iter) next(pre_chunk_iter)
def it_accommodates_and_isolates_an_oversized_section(self): def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
"""Such as occurs when a single element exceeds the window size.""" """Such as occurs when a single element exceeds the window size."""
sections = [ pre_chunks = [
_TextSection([Title("Lorem Ipsum")]), TextPreChunk([Title("Lorem Ipsum")]),
_TextSection( # 179 TextPreChunk( # 179
[ [
Text( Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55 "Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
@ -1367,20 +1367,20 @@ class Describe_SectionCombiner:
) )
] ]
), ),
_TextSection([Title("Vulputate Consequat")]), TextPreChunk([Title("Vulputate Consequat")]),
] ]
section_iter = _SectionCombiner( pre_chunk_iter = PreChunkCombiner(
sections, maxlen=150, combine_text_under_n_chars=150 pre_chunks, maxlen=150, combine_text_under_n_chars=150
).iter_combined_sections() ).iter_combined_pre_chunks()
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [Title("Lorem Ipsum")] assert pre_chunk._elements == [Title("Lorem Ipsum")]
# -- # --
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Text( Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit." "Lorem ipsum dolor sit amet consectetur adipiscing elit."
" Mauris nec urna non augue vulputate consequat eget et nisi." " Mauris nec urna non augue vulputate consequat eget et nisi."
@ -1388,28 +1388,28 @@ class Describe_SectionCombiner:
) )
] ]
# -- # --
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [Title("Vulputate Consequat")] assert pre_chunk._elements == [Title("Vulputate Consequat")]
# -- # --
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(section_iter) next(pre_chunk_iter)
class Describe_TextSectionAccumulator: class DescribeTextPreChunkAccumulator:
"""Unit-test suite for `unstructured.chunking.title._TextSectionAccumulator`.""" """Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`."""
def it_is_empty_on_construction(self): def it_is_empty_on_construction(self):
accum = _TextSectionAccumulator(maxlen=100) accum = TextPreChunkAccumulator(maxlen=100)
assert accum.text_length == 0 assert accum.text_length == 0
assert accum.remaining_space == 100 assert accum.remaining_space == 100
def it_accumulates_sections_added_to_it(self): def it_accumulates_pre_chunks_added_to_it(self):
accum = _TextSectionAccumulator(maxlen=500) accum = TextPreChunkAccumulator(maxlen=500)
accum.add_section( accum.add_pre_chunk(
_TextSection( TextPreChunk(
[ [
Title("Lorem Ipsum"), Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@ -1419,8 +1419,8 @@ class Describe_TextSectionAccumulator:
assert accum.text_length == 68 assert accum.text_length == 68
assert accum.remaining_space == 430 assert accum.remaining_space == 430
accum.add_section( accum.add_pre_chunk(
_TextSection( TextPreChunk(
[ [
Title("Mauris Nec"), Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
@ -1430,26 +1430,26 @@ class Describe_TextSectionAccumulator:
assert accum.text_length == 141 assert accum.text_length == 141
assert accum.remaining_space == 357 assert accum.remaining_space == 357
def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self): def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
accum = _TextSectionAccumulator(maxlen=150) accum = TextPreChunkAccumulator(maxlen=150)
accum.add_section( accum.add_pre_chunk(
_TextSection( TextPreChunk(
[ [
Title("Lorem Ipsum"), Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
] ]
) )
) )
accum.add_section( accum.add_pre_chunk(
_TextSection( TextPreChunk(
[ [
Title("Mauris Nec"), Title("Mauris Nec"),
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
] ]
) )
) )
accum.add_section( accum.add_pre_chunk(
_TextSection( TextPreChunk(
[ [
Title("Sed Orci"), Title("Sed Orci"),
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
@ -1457,15 +1457,15 @@ class Describe_TextSectionAccumulator:
) )
) )
section_iter = accum.flush() pre_chunk_iter = accum.flush()
# -- iterator generates exactly one section -- # -- iterator generates exactly one pre_chunk --
section = next(section_iter) pre_chunk = next(pre_chunk_iter)
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(section_iter) next(pre_chunk_iter)
# -- and it is a _TextSection containing all the elements -- # -- and it is a TextPreChunk containing all the elements --
assert isinstance(section, _TextSection) assert isinstance(pre_chunk, TextPreChunk)
assert section._elements == [ assert pre_chunk._elements == [
Title("Lorem Ipsum"), Title("Lorem Ipsum"),
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
Title("Mauris Nec"), Title("Mauris Nec"),
@ -1476,24 +1476,24 @@ class Describe_TextSectionAccumulator:
assert accum.text_length == 0 assert accum.text_length == 0
assert accum.remaining_space == 150 assert accum.remaining_space == 150
def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self): def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
accum = _TextSectionAccumulator(maxlen=150) accum = TextPreChunkAccumulator(maxlen=150)
sections = list(accum.flush()) pre_chunks = list(accum.flush())
assert sections == [] assert pre_chunks == []
assert accum.text_length == 0 assert accum.text_length == 0
assert accum.remaining_space == 150 assert accum.remaining_space == 150
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
accum = _TextSectionAccumulator(maxlen=100) accum = TextPreChunkAccumulator(maxlen=100)
accum.add_section(_TextSection([Text("abcde")])) accum.add_pre_chunk(TextPreChunk([Text("abcde")]))
accum.add_section(_TextSection([Text("fghij")])) accum.add_pre_chunk(TextPreChunk([Text("fghij")]))
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment, # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10 # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
assert accum.text_length == 12 assert accum.text_length == 12
# -- .remaining_space is reduced by the length (2) of the trailing separator which would # -- .remaining_space is reduced by the length (2) of the trailing separator which would
# -- go between the current text and that of the next section if one was added. # -- go between the current text and that of the next pre-chunk if one was added.
# -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88 # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
assert accum.remaining_space == 86 assert accum.remaining_space == 86

View File

@ -1 +1 @@
__version__ = "0.11.4-dev10" # pragma: no cover __version__ = "0.11.4-dev11" # pragma: no cover

View File

@ -25,7 +25,7 @@ from unstructured.documents.elements import (
) )
from unstructured.utils import lazyproperty from unstructured.utils import lazyproperty
_Section: TypeAlias = "_TableSection | _TextSection" PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
# -- goes between text of each element when element-text is concatenated to form chunk -- # -- goes between text of each element when element-text is concatenated to form chunk --
TEXT_SEPARATOR = "\n\n" TEXT_SEPARATOR = "\n\n"
@ -98,7 +98,7 @@ def chunk_by_title(
# ---------------------------------------------------------------- # ----------------------------------------------------------------
sections = _SectionCombiner( pre_chunks = PreChunkCombiner(
_split_elements_by_title_and_table( _split_elements_by_title_and_table(
elements, elements,
multipage_sections=multipage_sections, multipage_sections=multipage_sections,
@ -107,9 +107,9 @@ def chunk_by_title(
), ),
max_characters, max_characters,
combine_text_under_n_chars, combine_text_under_n_chars,
).iter_combined_sections() ).iter_combined_pre_chunks()
return [chunk for section in sections for chunk in section.iter_chunks(max_characters)] return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks(max_characters)]
def _split_elements_by_title_and_table( def _split_elements_by_title_and_table(
@ -117,31 +117,31 @@ def _split_elements_by_title_and_table(
multipage_sections: bool, multipage_sections: bool,
new_after_n_chars: int, new_after_n_chars: int,
max_characters: int, max_characters: int,
) -> Iterator[_TextSection | _TableSection]: ) -> Iterator[TextPreChunk | TablePreChunk]:
"""Implements "sectioner" responsibilities. """Implements "pre-chunker" responsibilities.
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
chunk formed by the subsequent "chunker" process. The only exception occurs when a single chunk formed by the subsequent "chunker" process. The only exception occurs when a single
element is too big to fit in the chunk window and the chunker splits it into two or more chunks element is too big to fit in the chunk window and the chunker splits it into two or more chunks
divided mid-text. The sectioner never divides an element mid-text. divided mid-text. The pre-chunker never divides an element mid-text.
The sectioner's responsibilities are: The pre-chunker's responsibilities are:
* **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on * **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
either side of those boundaries into different sections. In this case, the primary either side of those boundaries into different pre-chunks. In this case, the primary
indicator of a semantic boundary is a `Title` element. A page-break (change in indicator of a semantic boundary is a `Title` element. A page-break (change in
page-number) is also a semantic boundary when `multipage_sections` is `False`. page-number) is also a semantic boundary when `multipage_sections` is `False`.
* **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit * **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
into sections as big as possible without exceeding the chunk window size. into pre-chunks as big as possible without exceeding the chunk window size.
* **Minimize chunks that must be split mid-text.** Precompute the text length of each * **Minimize chunks that must be split mid-text.** Precompute the text length of each
section and only produce a section that exceeds the chunk window size when there is a pre-chunk and only produce a pre-chunk that exceeds the chunk window size when there is a
single element with text longer than that window. single element with text longer than that window.
A Table or Checkbox element is placed into a section by itself. A Table or Checkbox element is placed into a pre-chunk by itself.
""" """
section_builder = _TextSectionBuilder(max_characters) pre_chunk_builder = TextPreChunkBuilder(max_characters)
prior_element = None prior_element = None
@ -152,31 +152,31 @@ def _split_elements_by_title_and_table(
else False else False
) )
# -- start new section when necessary -- # -- start new pre_chunk when necessary --
if ( if (
# -- Title and Table both start a new section -- # -- Title and Table both start a new pre_chunk --
isinstance(element, (Title, Table)) isinstance(element, (Title, Table))
# -- adding this element would exceed hard-maxlen for section -- # -- adding this element would exceed hard-maxlen for pre_chunk --
or section_builder.remaining_space < len(str(element)) or pre_chunk_builder.remaining_space < len(str(element))
# -- section already meets or exceeds soft-maxlen -- # -- pre_chunk already meets or exceeds soft-maxlen --
or section_builder.text_length >= new_after_n_chars or pre_chunk_builder.text_length >= new_after_n_chars
# -- a semantic boundary is indicated by metadata change since prior element -- # -- a semantic boundary is indicated by metadata change since prior element --
or metadata_differs or metadata_differs
): ):
# -- complete any work-in-progress section -- # -- complete any work-in-progress pre_chunk --
yield from section_builder.flush() yield from pre_chunk_builder.flush()
# -- emit table and checkbox immediately since they are always isolated -- # -- emit table and checkbox immediately since they are always isolated --
if isinstance(element, Table): if isinstance(element, Table):
yield _TableSection(table=element) yield TablePreChunk(table=element)
# -- but accumulate text elements for consolidation into a composite chunk -- # -- but accumulate text elements for consolidation into a composite chunk --
else: else:
section_builder.add_element(element) pre_chunk_builder.add_element(element)
prior_element = element prior_element = element
# -- flush "tail" section, any partially-filled section after last element is processed -- # -- flush "tail" pre_chunk, any partially-filled pre_chunk after last element is processed --
yield from section_builder.flush() yield from pre_chunk_builder.flush()
def _metadata_differs( def _metadata_differs(
@ -255,17 +255,17 @@ def add_chunking_strategy() -> Callable[[Callable[_P, List[Element]]], Callable[
return decorator return decorator
# == Sections ==================================================================================== # == PreChunks ===================================================================================
class _TableSection: class TablePreChunk:
"""A section composed of a single Table element.""" """A pre-chunk composed of a single Table element."""
def __init__(self, table: Table) -> None: def __init__(self, table: Table) -> None:
self._table = table self._table = table
def iter_chunks(self, maxlen: int) -> Iterator[Table | TableChunk]: def iter_chunks(self, maxlen: int) -> Iterator[Table | TableChunk]:
"""Split this section into one or more `Table` or `TableChunk` objects maxlen or smaller.""" """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
text = self._table.text text = self._table.text
html = self._table.metadata.text_as_html or "" html = self._table.metadata.text_as_html or ""
@ -296,7 +296,7 @@ class _TableSection:
is_continuation = True is_continuation = True
class _TextSection: class TextPreChunk:
"""A sequence of elements that belong to the same semantic unit within a document. """A sequence of elements that belong to the same semantic unit within a document.
The name "section" derives from the idea of a document-section, a heading followed by the The name "section" derives from the idea of a document-section, a heading followed by the
@ -310,16 +310,16 @@ class _TextSection:
self._elements = list(elements) self._elements = list(elements)
def __eq__(self, other: Any) -> bool: def __eq__(self, other: Any) -> bool:
if not isinstance(other, _TextSection): if not isinstance(other, TextPreChunk):
return False return False
return self._elements == other._elements return self._elements == other._elements
def combine(self, other_section: _TextSection) -> _TextSection: def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
"""Return new `_TextSection` that combines this and `other_section`.""" """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
return _TextSection(self._elements + other_section._elements) return TextPreChunk(self._elements + other_pre_chunk._elements)
def iter_chunks(self, maxlen: int) -> Iterator[CompositeElement]: def iter_chunks(self, maxlen: int) -> Iterator[CompositeElement]:
"""Split this section into one or more `CompositeElement` objects maxlen or smaller.""" """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
text = self._text text = self._text
text_len = len(text) text_len = len(text)
start = 0 start = 0
@ -333,8 +333,8 @@ class _TextSection:
@lazyproperty @lazyproperty
def text_length(self) -> int: def text_length(self) -> int:
"""Length of concatenated text of this section, including separators.""" """Length of concatenated text of this pre-chunk, including separators."""
# -- used by section-combiner to identify combination candidates -- # -- used by pre-chunk-combiner to identify combination candidates --
return len(self._text) return len(self._text)
@lazyproperty @lazyproperty
@ -342,7 +342,7 @@ class _TextSection:
"""Collection of all populated metadata values across elements. """Collection of all populated metadata values across elements.
The resulting dict has one key for each `ElementMetadata` field that had a non-None value in The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
at least one of the elements in this section. The value of that key is a list of all those at least one of the elements in this pre-chunk. The value of that key is a list of all those
populated values, in element order, for example: populated values, in element order, for example:
{ {
@ -374,13 +374,13 @@ class _TextSection:
@lazyproperty @lazyproperty
def _consolidated_metadata(self) -> ElementMetadata: def _consolidated_metadata(self) -> ElementMetadata:
"""Metadata applicable to this section as a single chunk. """Metadata applicable to this pre-chunk as a single chunk.
Formed by applying consolidation rules to all metadata fields across the elements of this Formed by applying consolidation rules to all metadata fields across the elements of this
section. pre-chunk.
For the sake of consistency, the same rules are applied (for example, for dropping values) For the sake of consistency, the same rules are applied (for example, for dropping values)
to a single-element section too, even though metadata for such a section is already to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
"consolidated". "consolidated".
""" """
return ElementMetadata(**self._meta_kwargs) return ElementMetadata(**self._meta_kwargs)
@ -460,26 +460,26 @@ class _TextSection:
@lazyproperty @lazyproperty
def _text(self) -> str: def _text(self) -> str:
"""The concatenated text of all elements in this section. """The concatenated text of all elements in this pre-chunk.
Each element-text is separated from the next by a blank line ("\n\n"). Each element-text is separated from the next by a blank line ("\n\n").
""" """
return TEXT_SEPARATOR.join(e.text for e in self._elements if e.text) return TEXT_SEPARATOR.join(e.text for e in self._elements if e.text)
class _TextSectionBuilder: class TextPreChunkBuilder:
"""An element accumulator suitable for incrementally forming a section. """An element accumulator suitable for incrementally forming a pre-chunk.
Provides monitoring properties like `.remaining_space` and `.text_length` a sectioner can use Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
to determine whether it should add the next element in the element stream. to determine whether it should add the next element in the element stream.
`.flush()` is used to build a `TextSection` object from the accumulated elements. This method `.flush()` is used to build a `TextPreChunk` object from the accumulated elements. This method
returns an iterator that generates zero-or-one `TextSection` object and is used like so: returns an iterator that generates zero-or-one `TextPreChunk` object and is used like so:
yield from builder.flush() yield from builder.flush()
If no elements have been accumulated, no `TextSection` is generated. Flushing the builder clears If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder
the elements it contains so it is ready to build the next text-section. clears the elements it contains so it is ready to build the next text-pre-chunk.
""" """
def __init__(self, maxlen: int) -> None: def __init__(self, maxlen: int) -> None:
@ -502,22 +502,22 @@ class _TextSectionBuilder:
self._text_segments.append(element.text) self._text_segments.append(element.text)
self._text_len += len(element.text) self._text_len += len(element.text)
def flush(self) -> Iterator[_TextSection]: def flush(self) -> Iterator[TextPreChunk]:
"""Generate zero-or-one `Section` object and clear the accumulator. """Generate zero-or-one `PreChunk` object and clear the accumulator.
Suitable for use to emit a Section when the maximum size has been reached or a semantic Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
boundary has been reached. Also to clear out a terminal section at the end of an element boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
stream. stream.
""" """
if not self._elements: if not self._elements:
return return
# -- clear builder before yield so we're not sensitive to the timing of how/when this # -- clear builder before yield so we're not sensitive to the timing of how/when this
# -- iterator is exhausted and can add elements for the next section immediately. # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
elements = self._elements[:] elements = self._elements[:]
self._elements.clear() self._elements.clear()
self._text_segments.clear() self._text_segments.clear()
self._text_len = 0 self._text_len = 0
yield _TextSection(elements) yield TextPreChunk(elements)
@property @property
def remaining_space(self) -> int: def remaining_space(self) -> int:
@ -528,9 +528,9 @@ class _TextSectionBuilder:
@property @property
def text_length(self) -> int: def text_length(self) -> int:
"""Length of the text in this section. """Length of the text in this pre-chunk.
This value represents the chunk-size that would result if this section was flushed in its This value represents the chunk-size that would result if this pre-chunk was flushed in its
current state. In particular, it does not include the length of a trailing separator (since current state. In particular, it does not include the length of a trailing separator (since
that would only appear if an additional element was added). that would only appear if an additional element was added).
@ -544,104 +544,104 @@ class _TextSectionBuilder:
return self._text_len + (separator_count * self._separator_len) return self._text_len + (separator_count * self._separator_len)
# == SectionCombiner ============================================================================= # == PreChunkCombiner ============================================================================
class _SectionCombiner: class PreChunkCombiner:
"""Filters section stream to combine small sections where possible.""" """Filters pre-chunk stream to combine small pre-chunks where possible."""
def __init__( def __init__(
self, self,
sections: Iterable[_Section], pre_chunks: Iterable[PreChunk],
maxlen: int, maxlen: int,
combine_text_under_n_chars: int, combine_text_under_n_chars: int,
): ):
self._sections = sections self._pre_chunks = pre_chunks
self._maxlen = maxlen self._maxlen = maxlen
self._combine_text_under_n_chars = combine_text_under_n_chars self._combine_text_under_n_chars = combine_text_under_n_chars
def iter_combined_sections(self) -> Iterator[_Section]: def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
"""Generate section objects, combining TextSection objects when they will fit in window.""" """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
accum = _TextSectionAccumulator(self._maxlen) accum = TextPreChunkAccumulator(self._maxlen)
for section in self._sections: for pre_chunk in self._pre_chunks:
# -- start new section under these conditions -- # -- start new pre-chunk under these conditions --
if ( if (
# -- a table section is never combined -- # -- a table pre-chunk is never combined --
isinstance(section, _TableSection) isinstance(pre_chunk, TablePreChunk)
# -- don't add another section once length has reached combination soft-max -- # -- don't add another pre-chunk once length has reached combination soft-max --
or accum.text_length >= self._combine_text_under_n_chars or accum.text_length >= self._combine_text_under_n_chars
# -- combining would exceed hard-max -- # -- combining would exceed hard-max --
or accum.remaining_space < section.text_length or accum.remaining_space < pre_chunk.text_length
): ):
yield from accum.flush() yield from accum.flush()
# -- a table section is never combined so don't accumulate -- # -- a table pre-chunk is never combined so don't accumulate --
if isinstance(section, _TableSection): if isinstance(pre_chunk, TablePreChunk):
yield section yield pre_chunk
else: else:
accum.add_section(section) accum.add_pre_chunk(pre_chunk)
yield from accum.flush() yield from accum.flush()
class _TextSectionAccumulator: class TextPreChunkAccumulator:
"""Accumulates, measures, and combines section objects. """Accumulates, measures, and combines pre-chunk objects.
Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
whether to add another section. whether to add another pre-chunk.
`.flush()` is used to combine the accumulated sections into a single `TextSection` object. This `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
method returns an iterator that generates zero-or-one `TextSection` objects and is used like This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used
so: like so:
yield from accum.flush() yield from accum.flush()
If no sections have been accumulated, no `TextSection` is generated. Flushing the accumulator clears If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the accumulator
the sections it contains so it is ready to accept the next text-section. clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
""" """
def __init__(self, maxlen: int) -> None: def __init__(self, maxlen: int) -> None:
self._maxlen = maxlen self._maxlen = maxlen
self._sections: List[_TextSection] = [] self._pre_chunks: List[TextPreChunk] = []
def add_section(self, section: _TextSection) -> None: def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
"""Add a section to the accumulator for possible combination with next section.""" """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
self._sections.append(section) self._pre_chunks.append(pre_chunk)
def flush(self) -> Iterator[_TextSection]: def flush(self) -> Iterator[TextPreChunk]:
"""Generate all accumulated sections as a single combined section.""" """Generate all accumulated pre-chunks as a single combined pre-chunk."""
sections = self._sections pre_chunks = self._pre_chunks
# -- nothing to do if no sections have been accumulated -- # -- nothing to do if no pre-chunks have been accumulated --
if not sections: if not pre_chunks:
return return
# -- otherwise combine all accumulated sections into one -- # -- otherwise combine all accumulated pre-chunks into one --
section = sections[0] pre_chunk = pre_chunks[0]
for other_section in sections[1:]: for other_pre_chunk in pre_chunks[1:]:
section = section.combine(other_section) pre_chunk = pre_chunk.combine(other_pre_chunk)
yield section yield pre_chunk
# -- and reset the accumulator (to empty) -- # -- and reset the accumulator (to empty) --
sections.clear() pre_chunks.clear()
@property @property
def remaining_space(self) -> int: def remaining_space(self) -> int:
"""Maximum size of section that can be added without exceeding maxlen.""" """Maximum size of pre-chunk that can be added without exceeding maxlen."""
return ( return (
self._maxlen self._maxlen
if not self._sections if not self._pre_chunks
# -- an additional section will also incur an additional separator -- # -- an additional pre-chunk will also incur an additional separator --
else self._maxlen - self.text_length - len(TEXT_SEPARATOR) else self._maxlen - self.text_length - len(TEXT_SEPARATOR)
) )
@property @property
def text_length(self) -> int: def text_length(self) -> int:
"""Size of concatenated text in all sections in accumulator.""" """Size of concatenated text in all pre-chunks in accumulator."""
n = len(self._sections) n = len(self._pre_chunks)
return ( return (
0 0
if n == 0 if n == 0
else sum(s.text_length for s in self._sections) + len(TEXT_SEPARATOR) * (n - 1) else sum(s.text_length for s in self._pre_chunks) + len(TEXT_SEPARATOR) * (n - 1)
) )