From cbeaed21ef552192e4ed1aec984480135ee37242 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Wed, 13 Dec 2023 15:13:57 -0800 Subject: [PATCH] rfctr: rename pre chunk (#2261) The original naming for the precursor to a chunk in `chunk_by_title()` was conflated with the idea of how these element subsequences were bounded (by document-section) for that strategy. I mistakenly picked that up as a universal concept, but in fact no notion of section arises in the `by_character` or other chunking strategies. Fix this misconception by using the name `pre-chunk` for this concept throughout. --- CHANGELOG.md | 2 +- test_unstructured/chunking/test_title.py | 384 +++++++++++------------ unstructured/__version__.py | 2 +- unstructured/chunking/title.py | 208 ++++++------ 4 files changed, 298 insertions(+), 298 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6525da7ee..9dc82d2ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.11.4-dev10 +## 0.11.4-dev11 ### Enhancements diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 2934b0547..709057c58 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -5,12 +5,12 @@ from typing import List import pytest from unstructured.chunking.title import ( - _SectionCombiner, + PreChunkCombiner, + TablePreChunk, + TextPreChunk, + TextPreChunkAccumulator, + TextPreChunkBuilder, _split_elements_by_title_and_table, - _TableSection, - _TextSection, - _TextSectionAccumulator, - _TextSectionBuilder, chunk_by_title, ) from unstructured.documents.coordinates import CoordinateSystem @@ -130,7 +130,7 @@ def test_it_does_not_complain_when_specifying_new_after_n_chars_by_itself(): def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(): - """Specifying `new_after_n_chars=0` places each element into its own section. + """Specifying `new_after_n_chars=0` places each element into its own pre-chunk. 
This puts each element into its own chunk, although long chunks are still split. """ @@ -166,7 +166,7 @@ def test_it_silently_accepts_new_after_n_chars_greater_than_maxchars(): # ================================================================================================ -def test_it_splits_a_large_section_into_multiple_chunks(): +def test_it_splits_a_large_element_into_multiple_chunks(): elements: List[Element] = [ Title("Introduction"), Text( @@ -199,36 +199,36 @@ def test_split_elements_by_title_and_table(): CheckBox(), ] - sections = _split_elements_by_title_and_table( + pre_chunks = _split_elements_by_title_and_table( elements, multipage_sections=True, new_after_n_chars=500, max_characters=500, ) - section = next(sections) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunks) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("A Great Day"), Text("Today is a great day."), Text("It is sunny outside."), ] # -- - section = next(sections) - assert isinstance(section, _TableSection) - assert section._table == Table("Heading\nCell text") + pre_chunk = next(pre_chunks) + assert isinstance(pre_chunk, TablePreChunk) + assert pre_chunk._table == Table("Heading\nCell text") # == - section = next(sections) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunks) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("An Okay Day"), Text("Today is an okay day."), Text("It is rainy outside."), ] # -- - section = next(sections) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunks) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("A Bad Day"), Text("Today is a bad day."), Text("It is storming outside."), @@ -236,7 +236,7 @@ def test_split_elements_by_title_and_table(): ] # -- with pytest.raises(StopIteration): - 
next(sections) + next(pre_chunks) def test_chunk_by_title(): @@ -351,9 +351,9 @@ def test_chunk_by_title_separates_by_page_number(): def test_chunk_by_title_does_not_break_on_regex_metadata_change(): - """Sectioner is insensitive to regex-metadata changes. + """PreChunker is insensitive to regex-metadata changes. - A regex-metadata match in an element does not signify a semantic boundary and a section should + A regex-metadata match in an element does not signify a semantic boundary and a pre-chunk should not be split based on such a difference. """ elements: List[Element] = [ @@ -672,8 +672,8 @@ def test_chunk_by_title_drops_extra_metadata(): assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day.")) -def test_it_considers_separator_length_when_sectioning(): - """Sectioner includes length of separators when computing remaining space.""" +def test_it_considers_separator_length_when_pre_chunking(): + """PreChunker includes length of separators when computing remaining space.""" elements: List[Element] = [ Title("Chunking Priorities"), # 19 chars ListItem("Divide text into manageable chunks"), # 34 chars @@ -693,11 +693,11 @@ def test_it_considers_separator_length_when_sectioning(): ] -# == Sections ==================================================================================== +# == PreChunks =================================================================================== -class Describe_TableSection: - """Unit-test suite for `unstructured.chunking.title._TableSection objects.""" +class DescribeTablePreChunk: + """Unit-test suite for `unstructured.chunking.title.TablePreChunk objects.""" def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): html_table = ( @@ -711,11 +711,11 @@ class Describe_TableSection: "" ) text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing" - section = _TableSection( + pre_chunk = TablePreChunk( Table(text_table, metadata=ElementMetadata(text_as_html=html_table)) ) - 
chunk_iter = section.iter_chunks(maxlen=175) + chunk_iter = pre_chunk.iter_chunks(maxlen=175) chunk = next(chunk_iter) assert isinstance(chunk, Table) @@ -756,11 +756,11 @@ class Describe_TableSection: "Nunc aliquam id enim nec molestie\n" "Vivamus quis nunc ipsum donec ac fermentum" ) - section = _TableSection( + pre_chunk = TablePreChunk( Table(text_table, metadata=ElementMetadata(text_as_html=html_table)) ) - chunk_iter = section.iter_chunks(maxlen=100) + chunk_iter = pre_chunk.iter_chunks(maxlen=100) chunk = next(chunk_iter) assert isinstance(chunk, TableChunk) @@ -810,30 +810,30 @@ class Describe_TableSection: next(chunk_iter) -class Describe_TextSection: - """Unit-test suite for `unstructured.chunking.title._TextSection objects.""" +class DescribeTextPreChunk: + """Unit-test suite for `unstructured.chunking.title.TextPreChunk objects.""" - def it_can_combine_itself_with_another_TextSection_instance(self): - """.combine() produces a new section by appending the elements of `other_section`. + def it_can_combine_itself_with_another_TextPreChunk_instance(self): + """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`. - Note that neither the original or other section are mutated. + Note that neither the original or other pre_chunk are mutated. 
""" - section = _TextSection( + pre_chunk = TextPreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("In rhoncus ipsum sed lectus porta volutpat."), ] ) - other_section = _TextSection( + other_pre_chunk = TextPreChunk( [ Text("Donec semper facilisis metus finibus malesuada."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), ] ) - new_section = section.combine(other_section) + new_pre_chunk = pre_chunk.combine(other_pre_chunk) - assert new_section == _TextSection( + assert new_pre_chunk == TextPreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("In rhoncus ipsum sed lectus porta volutpat."), @@ -841,13 +841,13 @@ class Describe_TextSection: Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), ] ) - assert section == _TextSection( + assert pre_chunk == TextPreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("In rhoncus ipsum sed lectus porta volutpat."), ] ) - assert other_section == _TextSection( + assert other_pre_chunk == TextPreChunk( [ Text("Donec semper facilisis metus finibus malesuada."), Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."), @@ -855,7 +855,7 @@ class Describe_TextSection: ) def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): - section = _TextSection( + pre_chunk = TextPreChunk( [ Title("Introduction"), Text( @@ -865,19 +865,19 @@ class Describe_TextSection: ] ) - chunk_iter = section.iter_chunks(maxlen=200) + chunk_iter = pre_chunk.iter_chunks(maxlen=200) chunk = next(chunk_iter) assert chunk == CompositeElement( "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit." 
" In rhoncus ipsum sedlectus porta volutpat.", ) - assert chunk.metadata is section._consolidated_metadata + assert chunk.metadata is pre_chunk._consolidated_metadata def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self): # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window. - # -- The sectioner will isolate that element in a section of its own. - section = _TextSection( + # -- The pre-chunker will isolate that element in a pre_chunk of its own. + pre_chunk = TextPreChunk( [ Text( "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" @@ -888,7 +888,7 @@ class Describe_TextSection: ] ) - chunk_iter = section.iter_chunks(maxlen=200) + chunk_iter = pre_chunk.iter_chunks(maxlen=200) chunk = next(chunk_iter) assert chunk == CompositeElement( @@ -896,22 +896,22 @@ class Describe_TextSection: " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" " veniam, quis nostrud exercitation ullamco laboris nisi ut a" ) - assert chunk.metadata is section._consolidated_metadata + assert chunk.metadata is pre_chunk._consolidated_metadata # -- chunk = next(chunk_iter) assert chunk == CompositeElement("liquip ex ea commodo consequat.") - assert chunk.metadata is section._consolidated_metadata + assert chunk.metadata is pre_chunk._consolidated_metadata # -- with pytest.raises(StopIteration): next(chunk_iter) def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self): - """.text_length is the size of chunk this section will produce (before any splitting).""" - section = _TextSection([PageBreak(""), Text("foo"), Text("bar")]) - assert section.text_length == 8 + """.text_length is the size of chunk this pre-chunk will produce (before any splitting).""" + pre_chunk = TextPreChunk([PageBreak(""), Text("foo"), Text("bar")]) + assert pre_chunk.text_length == 8 def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self): - section = 
_TextSection( + pre_chunk = TextPreChunk( [ Title( "Lorem Ipsum", @@ -934,7 +934,7 @@ class Describe_TextSection: ] ) - assert section._all_metadata_values == { + assert pre_chunk._all_metadata_values == { # -- scalar values are accumulated in a list in element order -- "category_depth": [0, 1], # -- all values are accumulated, not only unique ones -- @@ -963,7 +963,7 @@ class Describe_TextSection: ) metadata_2.quotient = 1.74 - section = _TextSection( + pre_chunk = TextPreChunk( [ Title("Lorem Ipsum", metadata=metadata), Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), @@ -971,7 +971,7 @@ class Describe_TextSection: ) # -- ad-hoc fields "coefficient" and "quotient" do not appear -- - assert section._all_metadata_values == { + assert pre_chunk._all_metadata_values == { "category_depth": [0, 1], "filename": ["foo.docx", "foo.docx"], "image_path": ["sprite.png"], @@ -985,7 +985,7 @@ class Describe_TextSection: Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new position in the chunk after element text has been concatenated. """ - section = _TextSection( + pre_chunk = TextPreChunk( [ Title( "Lorem Ipsum", @@ -1011,7 +1011,7 @@ class Describe_TextSection: ] ) - regex_metadata = section._consolidated_regex_meta + regex_metadata = pre_chunk._consolidated_regex_meta assert regex_metadata == { "dolor": [RegexMetadata(text="dolor", start=25, end=30)], @@ -1026,9 +1026,9 @@ class Describe_TextSection: """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata. Only non-None fields should appear in the dict and each field value should be the - consolidation of the values across the section elements. + consolidation of the values across the pre_chunk elements. 
""" - section = _TextSection( + pre_chunk = TextPreChunk( [ PageBreak(""), Title( @@ -1065,7 +1065,7 @@ class Describe_TextSection: ] ) - meta_kwargs = section._meta_kwargs + meta_kwargs = pre_chunk._meta_kwargs assert meta_kwargs == { "filename": "foo.docx", @@ -1090,29 +1090,29 @@ class Describe_TextSection: ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"), ], ) - def it_knows_the_concatenated_text_of_the_section( + def it_knows_the_concatenated_text_of_the_pre_chunk( self, elements: List[Text], expected_value: str ): - """._text is the "joined" text of the section elements. + """._text is the "joined" text of the pre-chunk elements. The text-segment contributed by each element is separated from the next by a blank line ("\n\n"). An element that contributes no text does not give rise to a separator. """ - section = _TextSection(elements) - assert section._text == expected_value + pre_chunk = TextPreChunk(elements) + assert pre_chunk._text == expected_value -class Describe_TextSectionBuilder: - """Unit-test suite for `unstructured.chunking.title._TextSectionBuilder`.""" +class DescribeTextPreChunkBuilder: + """Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`.""" def it_is_empty_on_construction(self): - builder = _TextSectionBuilder(maxlen=50) + builder = TextPreChunkBuilder(maxlen=50) assert builder.text_length == 0 assert builder.remaining_space == 50 def it_accumulates_elements_added_to_it(self): - builder = _TextSectionBuilder(maxlen=150) + builder = TextPreChunkBuilder(maxlen=150) builder.add_element(Title("Introduction")) assert builder.text_length == 12 @@ -1127,8 +1127,8 @@ class Describe_TextSectionBuilder: assert builder.text_length == 112 assert builder.remaining_space == 36 - def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self): - builder = _TextSectionBuilder(maxlen=150) + def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): + builder = TextPreChunkBuilder(maxlen=150) 
builder.add_element(Title("Introduction")) builder.add_element( Text( @@ -1137,10 +1137,10 @@ class Describe_TextSectionBuilder: ), ) - section = next(builder.flush()) + pre_chunk = next(builder.flush()) - assert isinstance(section, _TextSection) - assert section._elements == [ + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("Introduction"), Text( "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" @@ -1150,17 +1150,17 @@ class Describe_TextSectionBuilder: assert builder.text_length == 0 assert builder.remaining_space == 150 - def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self): - builder = _TextSectionBuilder(maxlen=150) + def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): + builder = TextPreChunkBuilder(maxlen=150) - sections = list(builder.flush()) + pre_chunks = list(builder.flush()) - assert sections == [] + assert pre_chunks == [] assert builder.text_length == 0 assert builder.remaining_space == 150 def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): - builder = _TextSectionBuilder(maxlen=50) + builder = TextPreChunkBuilder(maxlen=50) builder.add_element(Text("abcde")) builder.add_element(Text("fghij")) @@ -1173,27 +1173,27 @@ class Describe_TextSectionBuilder: assert builder.remaining_space == 36 -# == SectionCombiner ============================================================================= +# == PreChunkCombiner ============================================================================= -class Describe_SectionCombiner: - """Unit-test suite for `unstructured.chunking.title._SectionCombiner`.""" +class DescribePreChunkCombiner: + """Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`.""" - def it_combines_sequential_small_text_sections(self): - sections = [ - _TextSection( + def it_combines_sequential_small_text_pre_chunks(self): + pre_chunks = [ + TextPreChunk( [ Title("Lorem Ipsum"), # 11 Text("Lorem 
ipsum dolor sit amet consectetur adipiscing elit."), # 55 ] ), - _TextSection( + TextPreChunk( [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 ] ), - _TextSection( + TextPreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 @@ -1201,13 +1201,13 @@ class Describe_SectionCombiner: ), ] - section_iter = _SectionCombiner( - sections, maxlen=250, combine_text_under_n_chars=250 - ).iter_combined_sections() + pre_chunk_iter = PreChunkCombiner( + pre_chunks, maxlen=250, combine_text_under_n_chars=250 + ).iter_combined_pre_chunks() - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Title("Mauris Nec"), @@ -1216,18 +1216,18 @@ class Describe_SectionCombiner: Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), ] with pytest.raises(StopIteration): - next(section_iter) + next(pre_chunk_iter) - def but_it_does_not_combine_table_sections(self): - sections = [ - _TextSection( + def but_it_does_not_combine_table_pre_chunks(self): + pre_chunks = [ + TextPreChunk( [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), ] ), - _TableSection(Table("Heading\nCell text")), - _TextSection( + TablePreChunk(Table("Heading\nCell text")), + TextPreChunk( [ Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), @@ -1235,47 +1235,47 @@ class Describe_SectionCombiner: ), ] - section_iter = _SectionCombiner( - sections, maxlen=250, combine_text_under_n_chars=250 - ).iter_combined_sections() + pre_chunk_iter = PreChunkCombiner( + pre_chunks, maxlen=250, combine_text_under_n_chars=250 + ).iter_combined_pre_chunks() - section = next(section_iter) - 
assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), ] # -- - section = next(section_iter) - assert isinstance(section, _TableSection) - assert section._table == Table("Heading\nCell text") + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TablePreChunk) + assert pre_chunk._table == Table("Heading\nCell text") # -- - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), ] # -- with pytest.raises(StopIteration): - next(section_iter) + next(pre_chunk_iter) def it_respects_the_specified_combination_threshold(self): - sections = [ - _TextSection( # 68 + pre_chunks = [ + TextPreChunk( # 68 [ Title("Lorem Ipsum"), # 11 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 ] ), - _TextSection( # 71 + TextPreChunk( # 71 [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 ] ), # -- len == 139 - _TextSection( + TextPreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 @@ -1283,45 +1283,45 @@ class Describe_SectionCombiner: ), ] - section_iter = _SectionCombiner( - sections, maxlen=250, combine_text_under_n_chars=80 - ).iter_combined_sections() + pre_chunk_iter = PreChunkCombiner( + pre_chunks, maxlen=250, combine_text_under_n_chars=80 + ).iter_combined_pre_chunks() - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert 
pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), ] # -- - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("Sed Orci"), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), ] # -- with pytest.raises(StopIteration): - next(section_iter) + next(pre_chunk_iter) def it_respects_the_hard_maximum_window_length(self): - sections = [ - _TextSection( # 68 + pre_chunks = [ + TextPreChunk( # 68 [ Title("Lorem Ipsum"), # 11 Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55 ] ), - _TextSection( # 71 + TextPreChunk( # 71 [ Title("Mauris Nec"), # 10 Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59 ] ), # -- len == 139 - _TextSection( + TextPreChunk( [ Title("Sed Orci"), # 8 Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63 @@ -1330,35 +1330,35 @@ class Describe_SectionCombiner: # -- len == 214 ] - section_iter = _SectionCombiner( - sections, maxlen=200, combine_text_under_n_chars=200 - ).iter_combined_sections() + pre_chunk_iter = PreChunkCombiner( + pre_chunks, maxlen=200, combine_text_under_n_chars=200 + ).iter_combined_pre_chunks() - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), ] # -- - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = 
next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("Sed Orci"), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), ] # -- with pytest.raises(StopIteration): - next(section_iter) + next(pre_chunk_iter) - def it_accommodates_and_isolates_an_oversized_section(self): + def it_accommodates_and_isolates_an_oversized_pre_chunk(self): """Such as occurs when a single element exceeds the window size.""" - sections = [ - _TextSection([Title("Lorem Ipsum")]), - _TextSection( # 179 + pre_chunks = [ + TextPreChunk([Title("Lorem Ipsum")]), + TextPreChunk( # 179 [ Text( "Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55 @@ -1367,20 +1367,20 @@ class Describe_SectionCombiner: ) ] ), - _TextSection([Title("Vulputate Consequat")]), + TextPreChunk([Title("Vulputate Consequat")]), ] - section_iter = _SectionCombiner( - sections, maxlen=150, combine_text_under_n_chars=150 - ).iter_combined_sections() + pre_chunk_iter = PreChunkCombiner( + pre_chunks, maxlen=150, combine_text_under_n_chars=150 + ).iter_combined_pre_chunks() - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [Title("Lorem Ipsum")] + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [Title("Lorem Ipsum")] # -- - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [ + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Text( "Lorem ipsum dolor sit amet consectetur adipiscing elit." " Mauris nec urna non augue vulputate consequat eget et nisi." 
@@ -1388,28 +1388,28 @@ class Describe_SectionCombiner: ) ] # -- - section = next(section_iter) - assert isinstance(section, _TextSection) - assert section._elements == [Title("Vulputate Consequat")] + pre_chunk = next(pre_chunk_iter) + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [Title("Vulputate Consequat")] # -- with pytest.raises(StopIteration): - next(section_iter) + next(pre_chunk_iter) -class Describe_TextSectionAccumulator: - """Unit-test suite for `unstructured.chunking.title._TextSectionAccumulator`.""" +class DescribeTextPreChunkAccumulator: + """Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`.""" def it_is_empty_on_construction(self): - accum = _TextSectionAccumulator(maxlen=100) + accum = TextPreChunkAccumulator(maxlen=100) assert accum.text_length == 0 assert accum.remaining_space == 100 - def it_accumulates_sections_added_to_it(self): - accum = _TextSectionAccumulator(maxlen=500) + def it_accumulates_pre_chunks_added_to_it(self): + accum = TextPreChunkAccumulator(maxlen=500) - accum.add_section( - _TextSection( + accum.add_pre_chunk( + TextPreChunk( [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), @@ -1419,8 +1419,8 @@ class Describe_TextSectionAccumulator: assert accum.text_length == 68 assert accum.remaining_space == 430 - accum.add_section( - _TextSection( + accum.add_pre_chunk( + TextPreChunk( [ Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), @@ -1430,26 +1430,26 @@ class Describe_TextSectionAccumulator: assert accum.text_length == 141 assert accum.remaining_space == 357 - def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self): - accum = _TextSectionAccumulator(maxlen=150) - accum.add_section( - _TextSection( + def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): + accum = TextPreChunkAccumulator(maxlen=150) + accum.add_pre_chunk( + TextPreChunk( [ Title("Lorem 
Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), ] ) ) - accum.add_section( - _TextSection( + accum.add_pre_chunk( + TextPreChunk( [ Title("Mauris Nec"), Text("Mauris nec urna non augue vulputate consequat eget et nisi."), ] ) ) - accum.add_section( - _TextSection( + accum.add_pre_chunk( + TextPreChunk( [ Title("Sed Orci"), Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."), @@ -1457,15 +1457,15 @@ class Describe_TextSectionAccumulator: ) ) - section_iter = accum.flush() + pre_chunk_iter = accum.flush() - # -- iterator generates exactly one section -- - section = next(section_iter) + # -- iterator generates exactly one pre_chunk -- + pre_chunk = next(pre_chunk_iter) with pytest.raises(StopIteration): - next(section_iter) - # -- and it is a _TextSection containing all the elements -- - assert isinstance(section, _TextSection) - assert section._elements == [ + next(pre_chunk_iter) + # -- and it is a _TextPreChunk containing all the elements -- + assert isinstance(pre_chunk, TextPreChunk) + assert pre_chunk._elements == [ Title("Lorem Ipsum"), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Title("Mauris Nec"), @@ -1476,24 +1476,24 @@ class Describe_TextSectionAccumulator: assert accum.text_length == 0 assert accum.remaining_space == 150 - def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self): - accum = _TextSectionAccumulator(maxlen=150) + def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): + accum = TextPreChunkAccumulator(maxlen=150) - sections = list(accum.flush()) + pre_chunks = list(accum.flush()) - assert sections == [] + assert pre_chunks == [] assert accum.text_length == 0 assert accum.remaining_space == 150 def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): - accum = _TextSectionAccumulator(maxlen=100) - accum.add_section(_TextSection([Text("abcde")])) - accum.add_section(_TextSection([Text("fghij")])) + accum = 
TextPreChunkAccumulator(maxlen=100) + accum.add_pre_chunk(TextPreChunk([Text("abcde")])) + accum.add_pre_chunk(TextPreChunk([Text("fghij")])) # -- .text_length includes a separator ("\n\n", len==2) between each text-segment, # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10 assert accum.text_length == 12 # -- .remaining_space is reduced by the length (2) of the trailing separator which would - # -- go between the current text and that of the next section if one was added. + # -- go between the current text and that of the next pre-chunk if one was added. # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88 assert accum.remaining_space == 86 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6525ce9a6..357d43325 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.4-dev10" # pragma: no cover +__version__ = "0.11.4-dev11" # pragma: no cover diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index c13591d3d..4ff18fea2 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -25,7 +25,7 @@ from unstructured.documents.elements import ( ) from unstructured.utils import lazyproperty -_Section: TypeAlias = "_TableSection | _TextSection" +PreChunk: TypeAlias = "TablePreChunk | TextPreChunk" # -- goes between text of each element when element-text is concatenated to form chunk -- TEXT_SEPARATOR = "\n\n" @@ -98,7 +98,7 @@ def chunk_by_title( # ---------------------------------------------------------------- - sections = _SectionCombiner( + pre_chunks = PreChunkCombiner( _split_elements_by_title_and_table( elements, multipage_sections=multipage_sections, @@ -107,9 +107,9 @@ def chunk_by_title( ), max_characters, combine_text_under_n_chars, - ).iter_combined_sections() + ).iter_combined_pre_chunks() - return [chunk for section in sections for chunk in section.iter_chunks(max_characters)] + return [chunk for pre_chunk in pre_chunks for chunk in 
pre_chunk.iter_chunks(max_characters)] def _split_elements_by_title_and_table( @@ -117,31 +117,31 @@ def _split_elements_by_title_and_table( multipage_sections: bool, new_after_n_chars: int, max_characters: int, -) -> Iterator[_TextSection | _TableSection]: - """Implements "sectioner" responsibilities. +) -> Iterator[TextPreChunk | TablePreChunk]: + """Implements "pre-chunker" responsibilities. A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a chunk formed by the subsequent "chunker" process. The only exception occurs when a single element is too big to fit in the chunk window and the chunker splits it into two or more chunks - divided mid-text. The sectioner never divides an element mid-text. + divided mid-text. The pre-chunker never divides an element mid-text. - The sectioner's responsibilities are: + The pre-chunker's responsibilities are: * **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on - either side of those boundaries into different sections. In this case, the primary + either side of those boundaries into different pre-chunks. In this case, the primary indicator of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a semantic boundary when `multipage_sections` is `False`. * **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit - into sections as big as possible without exceeding the chunk window size. + into pre-chunks as big as possible without exceeding the chunk window size. * **Minimize chunks that must be split mid-text.** Precompute the text length of each - section and only produce a section that exceeds the chunk window size when there is a + pre-chunk and only produce a pre-chunk that exceeds the chunk window size when there is a single element with text longer than that window. - A Table or Checkbox element is placed into a section by itself. 
+ A Table or Checkbox element is placed into a pre-chunk by itself. """ - section_builder = _TextSectionBuilder(max_characters) + pre_chunk_builder = TextPreChunkBuilder(max_characters) prior_element = None @@ -152,31 +152,31 @@ def _split_elements_by_title_and_table( else False ) - # -- start new section when necessary -- + # -- start new pre_chunk when necessary -- if ( - # -- Title and Table both start a new section -- + # -- Title and Table both start a new pre_chunk -- isinstance(element, (Title, Table)) - # -- adding this element would exceed hard-maxlen for section -- - or section_builder.remaining_space < len(str(element)) - # -- section already meets or exceeds soft-maxlen -- - or section_builder.text_length >= new_after_n_chars + # -- adding this element would exceed hard-maxlen for pre_chunk -- + or pre_chunk_builder.remaining_space < len(str(element)) + # -- pre_chunk already meets or exceeds soft-maxlen -- + or pre_chunk_builder.text_length >= new_after_n_chars # -- a semantic boundary is indicated by metadata change since prior element -- or metadata_differs ): - # -- complete any work-in-progress section -- - yield from section_builder.flush() + # -- complete any work-in-progress pre_chunk -- + yield from pre_chunk_builder.flush() # -- emit table and checkbox immediately since they are always isolated -- if isinstance(element, Table): - yield _TableSection(table=element) + yield TablePreChunk(table=element) # -- but accumulate text elements for consolidation into a composite chunk -- else: - section_builder.add_element(element) + pre_chunk_builder.add_element(element) prior_element = element - # -- flush "tail" section, any partially-filled section after last element is processed -- - yield from section_builder.flush() + # -- flush "tail" pre_chunk, any partially-filled pre_chunk after last element is processed -- + yield from pre_chunk_builder.flush() def _metadata_differs( @@ -255,17 +255,17 @@ def add_chunking_strategy() -> Callable[[Callable[_P, 
List[Element]]], Callable[ return decorator -# == Sections ==================================================================================== +# == PreChunks =================================================================================== -class _TableSection: - """A section composed of a single Table element.""" +class TablePreChunk: + """A pre-chunk composed of a single Table element.""" def __init__(self, table: Table) -> None: self._table = table def iter_chunks(self, maxlen: int) -> Iterator[Table | TableChunk]: - """Split this section into one or more `Table` or `TableChunk` objects maxlen or smaller.""" + """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller.""" text = self._table.text html = self._table.metadata.text_as_html or "" @@ -296,7 +296,7 @@ class _TableSection: is_continuation = True -class _TextSection: +class TextPreChunk: """A sequence of elements that belong to the same semantic unit within a document. The name "section" derives from the idea of a document-section, a heading followed by the @@ -310,16 +310,16 @@ class _TextSection: self._elements = list(elements) def __eq__(self, other: Any) -> bool: - if not isinstance(other, _TextSection): + if not isinstance(other, TextPreChunk): return False return self._elements == other._elements - def combine(self, other_section: _TextSection) -> _TextSection: - """Return new `_TextSection` that combines this and `other_section`.""" - return _TextSection(self._elements + other_section._elements) + def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk: + """Return new `TextPreChunk` that combines this and `other_pre_chunk`.""" + return TextPreChunk(self._elements + other_pre_chunk._elements) def iter_chunks(self, maxlen: int) -> Iterator[CompositeElement]: - """Split this section into one or more `CompositeElement` objects maxlen or smaller.""" + """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller.""" text = self._text text_len = 
len(text) start = 0 @@ -333,8 +333,8 @@ class _TextSection: @lazyproperty def text_length(self) -> int: - """Length of concatenated text of this section, including separators.""" - # -- used by section-combiner to identify combination candidates -- + """Length of concatenated text of this pre-chunk, including separators.""" + # -- used by pre-chunk-combiner to identify combination candidates -- return len(self._text) @lazyproperty @@ -342,7 +342,7 @@ class _TextSection: """Collection of all populated metadata values across elements. The resulting dict has one key for each `ElementMetadata` field that had a non-None value in - at least one of the elements in this section. The value of that key is a list of all those + at least one of the elements in this pre-chunk. The value of that key is a list of all those populated values, in element order, for example: { @@ -374,13 +374,13 @@ class _TextSection: @lazyproperty def _consolidated_metadata(self) -> ElementMetadata: - """Metadata applicable to this section as a single chunk. + """Metadata applicable to this pre-chunk as a single chunk. Formed by applying consolidation rules to all metadata fields across the elements of this - section. + pre-chunk. For the sake of consistency, the same rules are applied (for example, for dropping values) - to a single-element section too, even though metadata for such a section is already + to a single-element pre-chunk too, even though metadata for such a pre-chunk is already "consolidated". """ return ElementMetadata(**self._meta_kwargs) @@ -460,26 +460,26 @@ class _TextSection: @lazyproperty def _text(self) -> str: - """The concatenated text of all elements in this section. + """The concatenated text of all elements in this pre-chunk. Each element-text is separated from the next by a blank line ("\n\n"). """ return TEXT_SEPARATOR.join(e.text for e in self._elements if e.text) -class _TextSectionBuilder: - """An element accumulator suitable for incrementally forming a section. 
+class TextPreChunkBuilder: + """An element accumulator suitable for incrementally forming a pre-chunk. - Provides monitoring properties like `.remaining_space` and `.text_length` a sectioner can use + Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use to determine whether it should add the next element in the element stream. - `.flush()` is used to build a `TextSection` object from the accumulated elements. This method - returns an interator that generates zero-or-one `TextSection` object and is used like so: + `.flush()` is used to build a `TextPreChunk` object from the accumulated elements. This method + returns an interator that generates zero-or-one `TextPreChunk` object and is used like so: yield from builder.flush() - If no elements have been accumulated, no `TextSection` is generated. Flushing the builder clears - the elements it contains so it is ready to build the next text-section. + If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder + clears the elements it contains so it is ready to build the next text-pre-chunk. """ def __init__(self, maxlen: int) -> None: @@ -502,22 +502,22 @@ class _TextSectionBuilder: self._text_segments.append(element.text) self._text_len += len(element.text) - def flush(self) -> Iterator[_TextSection]: - """Generate zero-or-one `Section` object and clear the accumulator. + def flush(self) -> Iterator[TextPreChunk]: + """Generate zero-or-one `PreChunk` object and clear the accumulator. - Suitable for use to emit a Section when the maximum size has been reached or a semantic - boundary has been reached. Also to clear out a terminal section at the end of an element + Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic + boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element stream. 
""" if not self._elements: return # -- clear builder before yield so we're not sensitive to the timing of how/when this - # -- iterator is exhausted and can add eleemnts for the next section immediately. + # -- iterator is exhausted and can add eleemnts for the next pre-chunk immediately. elements = self._elements[:] self._elements.clear() self._text_segments.clear() self._text_len = 0 - yield _TextSection(elements) + yield TextPreChunk(elements) @property def remaining_space(self) -> int: @@ -528,9 +528,9 @@ class _TextSectionBuilder: @property def text_length(self) -> int: - """Length of the text in this section. + """Length of the text in this pre-chunk. - This value represents the chunk-size that would result if this section was flushed in its + This value represents the chunk-size that would result if this pre-chunk was flushed in its current state. In particular, it does not include the length of a trailing separator (since that would only appear if an additional element was added). 
@@ -544,104 +544,104 @@ class _TextSectionBuilder: return self._text_len + (separator_count * self._separator_len) -# == SectionCombiner ============================================================================= +# == PreChunkCombiner ============================================================================ -class _SectionCombiner: - """Filters section stream to combine small sections where possible.""" +class PreChunkCombiner: + """Filters pre-chunk stream to combine small pre-chunks where possible.""" def __init__( self, - sections: Iterable[_Section], + pre_chunks: Iterable[PreChunk], maxlen: int, combine_text_under_n_chars: int, ): - self._sections = sections + self._pre_chunks = pre_chunks self._maxlen = maxlen self._combine_text_under_n_chars = combine_text_under_n_chars - def iter_combined_sections(self) -> Iterator[_Section]: - """Generate section objects, combining TextSection objects when they will fit in window.""" - accum = _TextSectionAccumulator(self._maxlen) + def iter_combined_pre_chunks(self) -> Iterator[PreChunk]: + """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window.""" + accum = TextPreChunkAccumulator(self._maxlen) - for section in self._sections: - # -- start new section under these conditions -- + for pre_chunk in self._pre_chunks: + # -- start new pre-chunk under these conditions -- if ( - # -- a table section is never combined -- - isinstance(section, _TableSection) - # -- don't add another section once length has reached combination soft-max -- + # -- a table pre-chunk is never combined -- + isinstance(pre_chunk, TablePreChunk) + # -- don't add another pre-chunk once length has reached combination soft-max -- or accum.text_length >= self._combine_text_under_n_chars # -- combining would exceed hard-max -- - or accum.remaining_space < section.text_length + or accum.remaining_space < pre_chunk.text_length ): yield from accum.flush() - # -- a table section is never combined so don't accumulate -- - if 
isinstance(section, _TableSection): - yield section + # -- a table pre-chunk is never combined so don't accumulate -- + if isinstance(pre_chunk, TablePreChunk): + yield pre_chunk else: - accum.add_section(section) + accum.add_pre_chunk(pre_chunk) yield from accum.flush() -class _TextSectionAccumulator: - """Accumulates, measures, and combines section objects. +class TextPreChunkAccumulator: + """Accumulates, measures, and combines pre-chunk objects. Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding - whether to add another section. + whether to add another pre-chunk. - `.flush()` is used to combine the accumulated sections into a single `TextSection` object. This - method returns an interator that generates zero-or-one `TextSection` objects and is used like - so: + `.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object. + This method returns an interator that generates zero-or-one `TextPreChunk` objects and is used + like so: yield from accum.flush() - If no sections have been accumulated, no `TextSection` is generated. Flushing the builder clears - the sections it contains so it is ready to accept the next text-section. + If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder + clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk. 
""" def __init__(self, maxlen: int) -> None: self._maxlen = maxlen - self._sections: List[_TextSection] = [] + self._pre_chunks: List[TextPreChunk] = [] - def add_section(self, section: _TextSection) -> None: - """Add a section to the accumulator for possible combination with next section.""" - self._sections.append(section) + def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None: + """Add a pre-chunk to the accumulator for possible combination with next pre-chunk.""" + self._pre_chunks.append(pre_chunk) - def flush(self) -> Iterator[_TextSection]: - """Generate all accumulated sections as a single combined section.""" - sections = self._sections + def flush(self) -> Iterator[TextPreChunk]: + """Generate all accumulated pre-chunks as a single combined pre-chunk.""" + pre_chunks = self._pre_chunks - # -- nothing to do if no sections have been accumulated -- - if not sections: + # -- nothing to do if no pre-chunks have been accumulated -- + if not pre_chunks: return - # -- otherwise combine all accumulated section into one -- - section = sections[0] - for other_section in sections[1:]: - section = section.combine(other_section) - yield section + # -- otherwise combine all accumulated pre-chunk into one -- + pre_chunk = pre_chunks[0] + for other_pre_chunk in pre_chunks[1:]: + pre_chunk = pre_chunk.combine(other_pre_chunk) + yield pre_chunk # -- and reset the accumulator (to empty) -- - sections.clear() + pre_chunks.clear() @property def remaining_space(self) -> int: - """Maximum size of section that can be added without exceeding maxlen.""" + """Maximum size of pre-chunk that can be added without exceeding maxlen.""" return ( self._maxlen - if not self._sections - # -- an additional section will also incur an additional separator -- + if not self._pre_chunks + # -- an additional pre-chunk will also incur an additional separator -- else self._maxlen - self.text_length - len(TEXT_SEPARATOR) ) @property def text_length(self) -> int: - """Size of concatenated text in 
all sections in accumulator.""" - n = len(self._sections) + """Size of concatenated text in all pre-chunks in accumulator.""" + n = len(self._pre_chunks) return ( 0 if n == 0 - else sum(s.text_length for s in self._sections) + len(TEXT_SEPARATOR) * (n - 1) + else sum(s.text_length for s in self._pre_chunks) + len(TEXT_SEPARATOR) * (n - 1) )