From 51cf6bf7161d56eee4e108ada5dfa3c1c314f2cc Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Fri, 23 Feb 2024 10:22:44 -0800 Subject: [PATCH] rfctr(chunking): extract strategy-specific chunking options (#2556) **Summary** A pluggable chunking strategy needs its own local set of chunking options that subclasses a base-class in `unstructured`. Extract distinct `_ByTitleChunkingOptions` and `_BasicChunkingOptions` for the existing two chunking strategies and move their strategy-specific option setting and validation to the respective subclass. This was also a good opportunity for us to clean up a few odds and ends we'd been meaning to. Might be worth looking at the commits individually as they are cohesive incremental steps toward the goal. --- test_unstructured/chunking/test_base.py | 187 ++++++++++------------- test_unstructured/chunking/test_basic.py | 2 +- test_unstructured/chunking/test_title.py | 105 ++++++++++++- unstructured/chunking/base.py | 91 +++-------- unstructured/chunking/basic.py | 36 ++++- unstructured/chunking/title.py | 124 +++++++++++++-- 6 files changed, 340 insertions(+), 205 deletions(-) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 22ebf68e7..3a3674ced 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -4,15 +4,15 @@ from __future__ import annotations -from typing import Sequence +from typing import Optional, Sequence import pytest from unstructured.chunking.base import ( - BasePreChunker, ChunkingOptions, PreChunkBuilder, PreChunkCombiner, + PreChunker, TablePreChunk, TextPreChunk, TextPreChunkAccumulator, @@ -48,7 +48,7 @@ class DescribeChunkingOptions: ValueError, match=f"'max_characters' argument must be > 0, got {max_characters}", ): - ChunkingOptions.new(max_characters=max_characters) + ChunkingOptions(max_characters=max_characters)._validate() def it_does_not_complain_when_specifying_max_characters_by_itself(self): """Caller can specify `max_characters` arg without specifying any others. @@ -58,44 +58,25 @@ class DescribeChunkingOptions: and trigger an exception. """ try: - ChunkingOptions.new(max_characters=50) + ChunkingOptions(max_characters=50)._validate() except ValueError: pytest.fail("did not accept `max_characters` as option by itself") - @pytest.mark.parametrize("n_chars", [-1, -42]) - def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int): - with pytest.raises( - ValueError, - match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}", - ): - ChunkingOptions.new(combine_text_under_n_chars=n_chars) + @pytest.mark.parametrize( + ("combine_text_under_n_chars", "expected_value"), [(None, 0), (42, 42)] + ) + def it_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combining( + self, combine_text_under_n_chars: Optional[int], expected_value: int + ): + """Subclasses can store `combine_text_under_n_chars` but must validate and enable it. - def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self): - """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining.""" - opts = ChunkingOptions.new(combine_text_under_n_chars=0) - assert opts.combine_text_under_n_chars == 0 - - def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self): - """Caller can specify `combine_text_under_n_chars` arg without specifying other options.""" - try: - opts = ChunkingOptions.new(combine_text_under_n_chars=50) - except ValueError: - pytest.fail("did not accept `combine_text_under_n_chars` as option by itself") - - assert opts.combine_text_under_n_chars == 50 - - def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self): - """`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior. - - So rather than raising an exception or warning, we just cap that value at `max_characters` - which is the behavioral equivalent. + The `combine_text_under_n_chars` option is not used by all chunkers and its behavior can + differ between subtypes. It is present in and stored by the contructur but it defaults to + `0` (no pre-chunk combining) and must be overridden by subclasses to give it the desired + behavior. """ - try: - opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600) - except ValueError: - pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`") - - assert opts.combine_text_under_n_chars == 500 + opts = ChunkingOptions(combine_text_under_n_chars=combine_text_under_n_chars) + assert opts.combine_text_under_n_chars == expected_value @pytest.mark.parametrize("n_chars", [-1, -42]) def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int): @@ -103,7 +84,7 @@ class DescribeChunkingOptions: ValueError, match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}", ): - ChunkingOptions.new(new_after_n_chars=n_chars) + ChunkingOptions(new_after_n_chars=n_chars)._validate() def it_rejects_overlap_not_less_than_max_characters(self): with pytest.raises( @@ -113,26 +94,23 @@ class DescribeChunkingOptions: ChunkingOptions(max_characters=200, overlap=300)._validate() def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self): - """Caller can specify `new_after_n_chars` arg without specifying any other options. - - In particular, `combine_text_under_n_chars` value is adjusted down to the - `new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the - value of `new_after_n_chars`. - """ + """Caller can specify `new_after_n_chars` arg without specifying any other options.""" + opts = ChunkingOptions(new_after_n_chars=200) try: - opts = ChunkingOptions.new(new_after_n_chars=200) + opts._validate() except ValueError: pytest.fail("did not accept `new_after_n_chars` as option by itself") assert opts.soft_max == 200 - assert opts.combine_text_under_n_chars == 200 def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self): """Specifying `new_after_n_chars=0` places each element into its own pre-chunk. This puts each element into its own chunk, although long chunks are still split. """ - opts = ChunkingOptions.new(new_after_n_chars=0) + opts = ChunkingOptions(new_after_n_chars=0) + opts._validate() + assert opts.soft_max == 0 def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self): @@ -141,33 +119,32 @@ class DescribeChunkingOptions: So rather than raising an exception or warning, we just cap that value at `max_characters` which is the behavioral equivalent. """ + opts = ChunkingOptions(max_characters=444, new_after_n_chars=555) try: - opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555) + opts._validate() except ValueError: pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`") assert opts.soft_max == 444 def it_knows_how_much_overlap_to_apply_to_split_chunks(self): - assert ChunkingOptions.new(overlap=10).overlap == 10 + assert ChunkingOptions(overlap=10).overlap == 10 def and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks(self): - assert ChunkingOptions.new(overlap=10, overlap_all=True).inter_chunk_overlap == 10 + assert ChunkingOptions(overlap=10, overlap_all=True).inter_chunk_overlap == 10 def but_it_does_not_overlap_pre_chunks_by_default(self): - assert ChunkingOptions.new(overlap=10).inter_chunk_overlap == 0 + assert ChunkingOptions(overlap=10).inter_chunk_overlap == 0 def it_knows_the_text_separator_string(self): - assert ChunkingOptions.new().text_separator == "\n\n" + assert ChunkingOptions().text_separator == "\n\n" class Describe_TextSplitter: """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects.""" def it_splits_on_a_preferred_separator_when_it_can(self): - opts = ChunkingOptions.new( - max_characters=50, text_splitting_separators=("\n", " "), overlap=10 - ) + opts = ChunkingOptions(max_characters=50, text_splitting_separators=("\n", " "), overlap=10) split = _TextSplitter(opts) text = ( "Lorem ipsum dolor amet consectetur adipiscing. \n " @@ -189,9 +166,7 @@ class Describe_TextSplitter: assert remainder == "" def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self): - opts = ChunkingOptions.new( - max_characters=40, text_splitting_separators=("\n", " "), overlap=10 - ) + opts = ChunkingOptions(max_characters=40, text_splitting_separators=("\n", " "), overlap=10) split = _TextSplitter(opts) text = ( "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta" @@ -211,9 +186,7 @@ class Describe_TextSplitter: assert remainder == "" def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self): - opts = ChunkingOptions.new( - max_characters=30, text_splitting_separators=("\n", " "), overlap=10 - ) + opts = ChunkingOptions(max_characters=30, text_splitting_separators=("\n", " "), overlap=10) split = _TextSplitter(opts) text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta." @@ -237,7 +210,7 @@ class Describe_TextSplitter: ], ) def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str): - opts = ChunkingOptions.new(max_characters=46, overlap=10) + opts = ChunkingOptions(max_characters=46, overlap=10) split = _TextSplitter(opts) s, remainder = split(text) @@ -246,7 +219,7 @@ class Describe_TextSplitter: assert remainder == "" def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self): - opts = ChunkingOptions.new(max_characters=38, overlap=10) + opts = ChunkingOptions(max_characters=38, overlap=10) split = _TextSplitter(opts) text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta." @@ -257,9 +230,7 @@ class Describe_TextSplitter: @pytest.mark.parametrize("separators", [("\n", " "), (" ",)]) def it_strips_whitespace_around_the_split(self, separators: Sequence[str]): - opts = ChunkingOptions.new( - max_characters=50, text_splitting_separators=separators, overlap=10 - ) + opts = ChunkingOptions(max_characters=50, text_splitting_separators=separators, overlap=10) split = _TextSplitter(opts) text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus." # |-------------------------------------------------^ 50-chars @@ -271,12 +242,12 @@ class Describe_TextSplitter: # ================================================================================================ -# BASE PRE-CHUNKER +# PRE-CHUNKER # ================================================================================================ -class DescribeBasePreChunker: - """Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects.""" +class DescribePreChunker: + """Unit-test suite for `unstructured.chunking.base.PreChunker` objects.""" def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self): elements = [ @@ -289,9 +260,9 @@ class DescribeBasePreChunker: CheckBox(), ] - opts = ChunkingOptions.new(max_characters=150, new_after_n_chars=65) + opts = ChunkingOptions(max_characters=150, new_after_n_chars=65) - pre_chunk_iter = BasePreChunker.iter_pre_chunks(elements, opts=opts) + pre_chunk_iter = PreChunker.iter_pre_chunks(elements, opts=opts) pre_chunk = next(pre_chunk_iter) assert isinstance(pre_chunk, TextPreChunk) @@ -344,7 +315,7 @@ class DescribeTablePreChunk: pre_chunk = TablePreChunk( Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), overlap_prefix="ctus porta volutpat.", - opts=ChunkingOptions.new(max_characters=175), + opts=ChunkingOptions(max_characters=175), ) chunk_iter = pre_chunk.iter_chunks() @@ -393,7 +364,7 @@ class DescribeTablePreChunk: pre_chunk = TablePreChunk( Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), overlap_prefix="", - opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")), + opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")), ) chunk_iter = pre_chunk.iter_chunks() @@ -457,7 +428,7 @@ class DescribeTablePreChunk: self, text: str, expected_value: str ): pre_chunk = TablePreChunk( - Table(text), overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True) + Table(text), overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True) ) assert pre_chunk.overlap_tail == expected_value @@ -480,7 +451,7 @@ class DescribeTablePreChunk: self, text: str, overlap_prefix: str, expected_value: str ): pre_chunk = TablePreChunk( - Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions.new() + Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions() ) assert pre_chunk._text == expected_value @@ -536,7 +507,7 @@ class DescribeTextPreChunk: self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool ): """This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals.""" - opts = ChunkingOptions.new( + opts = ChunkingOptions( max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars, overlap=20, @@ -560,7 +531,7 @@ class DescribeTextPreChunk: Note that neither the original or other pre_chunk are mutated. """ - opts = ChunkingOptions.new() + opts = ChunkingOptions() pre_chunk = TextPreChunk( [ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), @@ -623,7 +594,7 @@ class DescribeTextPreChunk: ), ], overlap_prefix="e feugiat efficitur.", - opts=ChunkingOptions.new(max_characters=200), + opts=ChunkingOptions(max_characters=200), ) chunk_iter = pre_chunk.iter_chunks() @@ -648,7 +619,7 @@ class DescribeTextPreChunk: ), ], overlap_prefix="", - opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")), + opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")), ) chunk_iter = pre_chunk.iter_chunks() @@ -681,7 +652,7 @@ class DescribeTextPreChunk: self, text: str, expected_value: str ): pre_chunk = TextPreChunk( - [Text(text)], overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True) + [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True) ) assert pre_chunk.overlap_tail == expected_value @@ -708,7 +679,7 @@ class DescribeTextPreChunk: ), ], overlap_prefix="", - opts=ChunkingOptions.new(), + opts=ChunkingOptions(), ) assert pre_chunk._all_metadata_values == { @@ -746,7 +717,7 @@ class DescribeTextPreChunk: Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), ], overlap_prefix="", - opts=ChunkingOptions.new(), + opts=ChunkingOptions(), ) # -- ad-hoc fields "coefficient" and "quotient" do not appear -- @@ -789,7 +760,7 @@ class DescribeTextPreChunk: ), ], overlap_prefix="ficitur.", # len == 8 - opts=ChunkingOptions.new(), + opts=ChunkingOptions(), ) regex_metadata = pre_chunk._consolidated_regex_meta @@ -845,7 +816,7 @@ class DescribeTextPreChunk: ), ], overlap_prefix="", - opts=ChunkingOptions.new(), + opts=ChunkingOptions(), ) meta_kwargs = pre_chunk._meta_kwargs @@ -881,9 +852,7 @@ class DescribeTextPreChunk: The text-segment contributed by each element is separated from the next by a blank line ("\n\n"). An element that contributes no text does not give rise to a separator. """ - pre_chunk = TextPreChunk( - elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions.new() - ) + pre_chunk = TextPreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions()) assert pre_chunk._text == expected_value @@ -896,13 +865,13 @@ class DescribePreChunkBuilder: """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`.""" def it_is_empty_on_construction(self): - builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50)) assert builder._text_length == 0 assert builder._remaining_space == 50 def it_accumulates_elements_added_to_it(self): - builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150)) builder.add_element(Title("Introduction")) assert builder._text_length == 12 @@ -919,7 +888,7 @@ class DescribePreChunkBuilder: @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)]) def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element): - builder = PreChunkBuilder(opts=ChunkingOptions.new()) + builder = PreChunkBuilder(opts=ChunkingOptions()) assert builder.will_fit(element) @pytest.mark.parametrize( @@ -934,22 +903,20 @@ class DescribePreChunkBuilder: def but_not_when_it_already_contains_an_element_of_any_kind( self, existing_element: Element, next_element: Element ): - builder = PreChunkBuilder(opts=ChunkingOptions.new()) + builder = PreChunkBuilder(opts=ChunkingOptions()) builder.add_element(existing_element) assert not builder.will_fit(next_element) @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")]) def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element): - builder = PreChunkBuilder(opts=ChunkingOptions.new()) + builder = PreChunkBuilder(opts=ChunkingOptions()) builder.add_element(Table("Heading\nCell text")) assert not builder.will_fit(element) def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self): - builder = PreChunkBuilder( - opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50) - ) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50)) builder.add_element( Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars ) @@ -957,7 +924,7 @@ class DescribePreChunkBuilder: assert not builder.will_fit(Text("In rhoncus ipsum.")) def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self): - builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100)) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100)) builder.add_element( Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars ) @@ -968,7 +935,7 @@ class DescribePreChunkBuilder: ) def but_it_will_fit_an_element_that_fits(self): - builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100)) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100)) builder.add_element( Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars ) @@ -977,7 +944,7 @@ class DescribePreChunkBuilder: assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): - builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150)) builder.add_element(Title("Introduction")) builder.add_element( Text( @@ -1000,7 +967,7 @@ class DescribePreChunkBuilder: assert builder._remaining_space == 150 def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self): - builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150)) builder.add_element(Table("Heading\nCell text")) pre_chunk = next(builder.flush()) @@ -1016,7 +983,7 @@ class DescribePreChunkBuilder: assert pre_chunk._table == Table("Heading\nCell text") def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self): - builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150)) pre_chunks = list(builder.flush()) @@ -1025,7 +992,7 @@ class DescribePreChunkBuilder: assert builder._remaining_space == 150 def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self): - opts = ChunkingOptions.new(overlap=15, overlap_all=True) + opts = ChunkingOptions(overlap=15, overlap_all=True) builder = PreChunkBuilder(opts=opts) builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")) @@ -1044,7 +1011,7 @@ class DescribePreChunkBuilder: assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus." def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): - builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) + builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50)) builder.add_element(Text("abcde")) builder.add_element(Text("fghij")) @@ -1061,7 +1028,7 @@ class DescribePreChunkCombiner: """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`.""" def it_combines_sequential_small_text_pre_chunks(self): - opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) + opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250) pre_chunks = [ TextPreChunk( [ @@ -1105,7 +1072,7 @@ class DescribePreChunkCombiner: next(pre_chunk_iter) def but_it_does_not_combine_table_pre_chunks(self): - opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) + opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250) pre_chunks = [ TextPreChunk( [ @@ -1127,7 +1094,7 @@ class DescribePreChunkCombiner: ] pre_chunk_iter = PreChunkCombiner( - pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) + pre_chunks, ChunkingOptions(max_characters=250, combine_text_under_n_chars=250) ).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) @@ -1152,7 +1119,7 @@ class DescribePreChunkCombiner: next(pre_chunk_iter) def it_respects_the_specified_combination_threshold(self): - opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80) + opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=80) pre_chunks = [ TextPreChunk( # 68 [ @@ -1203,7 +1170,7 @@ class DescribePreChunkCombiner: next(pre_chunk_iter) def it_respects_the_hard_maximum_window_length(self): - opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200) + opts = ChunkingOptions(max_characters=200, combine_text_under_n_chars=200) pre_chunks = [ TextPreChunk( # 68 [ @@ -1256,7 +1223,7 @@ class DescribePreChunkCombiner: def it_accommodates_and_isolates_an_oversized_pre_chunk(self): """Such as occurs when a single element exceeds the window size.""" - opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) + opts = ChunkingOptions(max_characters=150, combine_text_under_n_chars=150) pre_chunks = [ TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts), TextPreChunk( # 179 @@ -1274,7 +1241,7 @@ class DescribePreChunkCombiner: ] pre_chunk_iter = PreChunkCombiner( - pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) + pre_chunks, ChunkingOptions(max_characters=150, combine_text_under_n_chars=150) ).iter_combined_pre_chunks() pre_chunk = next(pre_chunk_iter) @@ -1303,7 +1270,7 @@ class DescribeTextPreChunkAccumulator: """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`.""" def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): - opts = ChunkingOptions.new() + opts = ChunkingOptions(combine_text_under_n_chars=500) accum = TextPreChunkAccumulator(opts=opts) pre_chunk = TextPreChunk( @@ -1362,7 +1329,7 @@ class DescribeTextPreChunkAccumulator: next(accum.flush()) def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): - accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150)) + accum = TextPreChunkAccumulator(opts=ChunkingOptions(max_characters=150)) assert list(accum.flush()) == [] diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py index b74f28c53..acc6b049d 100644 --- a/test_unstructured/chunking/test_basic.py +++ b/test_unstructured/chunking/test_basic.py @@ -1,4 +1,4 @@ -"""Unit-test suite for the `unstructured.chunking.basic` module. +"""Test suite for the `unstructured.chunking.basic` module. That module implements the baseline chunking strategy. The baseline strategy has all behaviors shared by all chunking strategies and no extra rules like perserve section or page boundaries. diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 319d31d01..54db22a58 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -1,13 +1,20 @@ # pyright: reportPrivateUsage=false -"""Unit-test suite for the `unstructured.chunking.title` module.""" +"""Test suite for the `unstructured.chunking.title` module.""" from __future__ import annotations +from typing import Optional + import pytest -from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk -from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title +from unstructured.chunking.base import ( + CHUNK_MULTI_PAGE_DEFAULT, + PreChunker, + TablePreChunk, + TextPreChunk, +) +from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title from unstructured.documents.coordinates import CoordinateSystem from unstructured.documents.elements import ( CheckBox, @@ -23,6 +30,13 @@ from unstructured.documents.elements import ( ) from unstructured.partition.html import partition_html +# ================================================================================================ +# INTEGRATION-TESTS +# ================================================================================================ +# These test `chunk_by_title()` as an integrated whole, calling `chunk_by_title()` and inspecting +# the outputs. +# ================================================================================================ + def test_it_splits_a_large_element_into_multiple_chunks(): elements: list[Element] = [ @@ -57,7 +71,7 @@ def test_split_elements_by_title_and_table(): CheckBox(), ] - pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new()) + pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new()) pre_chunk = next(pre_chunks) assert isinstance(pre_chunk, TextPreChunk) @@ -544,3 +558,86 @@ def test_it_considers_separator_length_when_pre_chunking(): ), CompositeElement("Minimize mid-text chunk-splitting"), ] + + +# ================================================================================================ +# UNIT-TESTS +# ================================================================================================ +# These test individual components in isolation so can exercise all edge cases while still +# performing well. +# ================================================================================================ + + +class Describe_ByTitleChunkingOptions: + """Unit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects.""" + + @pytest.mark.parametrize("n_chars", [-1, -42]) + def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int): + with pytest.raises( + ValueError, + match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}", + ): + _ByTitleChunkingOptions.new(combine_text_under_n_chars=n_chars) + + def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self): + """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining.""" + opts = _ByTitleChunkingOptions(combine_text_under_n_chars=0) + assert opts.combine_text_under_n_chars == 0 + + def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self): + """Caller can specify `combine_text_under_n_chars` arg without specifying other options.""" + try: + opts = _ByTitleChunkingOptions(combine_text_under_n_chars=50) + except ValueError: + pytest.fail("did not accept `combine_text_under_n_chars` as option by itself") + + assert opts.combine_text_under_n_chars == 50 + + @pytest.mark.parametrize( + ("combine_text_under_n_chars", "max_characters", "expected_hard_max"), + [(600, None, 500), (600, 450, 450)], + ) + def it_rejects_combine_text_under_n_chars_greater_than_maxchars( + self, combine_text_under_n_chars: int, max_characters: Optional[int], expected_hard_max: int + ): + """`combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users. + + The behavior is no different from `combine_text_under_n_chars == max_characters`, but if + `max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a + larger number like 1500 then it can look like chunk-combining isn't working. + """ + with pytest.raises( + ValueError, + match=( + "'combine_text_under_n_chars' argument must not exceed `max_characters` value," + f" got {combine_text_under_n_chars} > {expected_hard_max}" + ), + ): + _ByTitleChunkingOptions.new( + max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars + ) + + def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self): + """Caller can specify `new_after_n_chars` arg without specifying any other options. + + In particular, `combine_text_under_n_chars` value is adjusted down to the + `new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the + value of `new_after_n_chars`. + """ + try: + opts = _ByTitleChunkingOptions(new_after_n_chars=200) + except ValueError: + pytest.fail("did not accept `new_after_n_chars` as option by itself") + + assert opts.soft_max == 200 + assert opts.combine_text_under_n_chars == 200 + + @pytest.mark.parametrize( + ("multipage_sections", "expected_value"), + [(True, True), (False, False), (None, CHUNK_MULTI_PAGE_DEFAULT)], + ) + def it_knows_whether_to_break_chunks_on_page_boundaries( + self, multipage_sections: bool, expected_value: bool + ): + opts = _ByTitleChunkingOptions(multipage_sections=multipage_sections) + assert opts.multipage_sections is expected_value diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index c9958b461..66be1ce96 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -7,7 +7,7 @@ import copy from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, Sequence, cast import regex -from typing_extensions import Self, TypeAlias +from typing_extensions import TypeAlias from unstructured.documents.elements import ( CompositeElement, @@ -68,9 +68,6 @@ class ChunkingOptions: when not specified, which effectively disables this behavior. Specifying 0 for this argument causes each element to appear in a chunk by itself (although an element with text longer than `max_characters` will be still be split into two or more chunks). - multipage_sections - Indicates that page-boundaries should not be respected while chunking, i.e. elements - appearing on two different pages can appear in the same chunk. combine_text_under_n_chars Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only relevant for a chunking strategy that specifies higher-level semantic boundaries to be @@ -101,9 +98,9 @@ class ChunkingOptions: def __init__( self, + *, combine_text_under_n_chars: Optional[int] = None, max_characters: Optional[int] = None, - multipage_sections: Optional[bool] = None, new_after_n_chars: Optional[int] = None, overlap: Optional[int] = None, overlap_all: Optional[bool] = None, @@ -111,59 +108,28 @@ class ChunkingOptions: ): self._combine_text_under_n_chars_arg = combine_text_under_n_chars self._max_characters_arg = max_characters - self._multipage_sections_arg = multipage_sections self._new_after_n_chars_arg = new_after_n_chars self._overlap_arg = overlap self._overlap_all_arg = overlap_all self._text_splitting_separators = text_splitting_separators - @classmethod - def new( - cls, - combine_text_under_n_chars: Optional[int] = None, - max_characters: Optional[int] = None, - multipage_sections: Optional[bool] = None, - new_after_n_chars: Optional[int] = None, - overlap: Optional[int] = None, - overlap_all: Optional[bool] = None, - text_splitting_separators: Sequence[str] = ("\n", " "), - ) -> Self: - """Construct validated instance. + @lazyproperty + def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: + """The semantic-boundary detectors to be applied to break pre-chunks. - Raises `ValueError` on invalid arguments like overlap > max_chars. + Overridden by sub-typs to provide semantic-boundary isolation behaviors. """ - self = cls( - combine_text_under_n_chars, - max_characters, - multipage_sections, - new_after_n_chars, - overlap, - overlap_all, - text_splitting_separators, - ) - self._validate() - return self + return () @lazyproperty def combine_text_under_n_chars(self) -> int: - """Combine consecutive text pre-chunks if former is smaller than this and both will fit. + """Combine two consecutive text pre-chunks if first is smaller than this and both will fit. - - Does not combine table chunks with text chunks even if they would both fit in the - chunking window. - - Does not combine text chunks if together they would exceed the chunking window. - - Defaults to `max_characters` when not specified. - - Is reduced to `new_after_n_chars` when it exceeds that value. + Default applied here is `0` which essentially disables chunk combining. Must be overridden + by subclass where combining behavior is supported. """ - max_characters = self.hard_max - soft_max = self.soft_max - arg = self._combine_text_under_n_chars_arg - - # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is - # -- capped at max-chars - combine_text_under_n_chars = max_characters if arg is None or arg > max_characters else arg - - # -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` -- - return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars + arg_value = self._combine_text_under_n_chars_arg + return arg_value if arg_value is not None else 0 @lazyproperty def hard_max(self) -> int: @@ -185,12 +151,6 @@ class ChunkingOptions: """ return self.overlap if self._overlap_all_arg else 0 - @lazyproperty - def multipage_sections(self) -> bool: - """When False, break pre-chunks on page-boundaries.""" - arg_value = self._multipage_sections_arg - return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value) - @lazyproperty def overlap(self) -> int: """The number of characters to overlap text when splitting chunks mid-text. @@ -256,28 +216,15 @@ class ChunkingOptions: if max_characters <= 0: raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}") - # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination) - # -- but a negative value is not - combine_text_under_n_chars = self._combine_text_under_n_chars_arg - if combine_text_under_n_chars is not None and combine_text_under_n_chars < 0: - raise ValueError( - f"'combine_text_under_n_chars' argument must be >= 0," - f" got {combine_text_under_n_chars}" - ) - - # -- a negative value for `new_after_n_chars` is assumed to - # -- be a mistake the caller will want to know about + # -- a negative value for `new_after_n_chars` is assumed to be a mistake the caller will + # -- want to know about new_after_n_chars = self._new_after_n_chars_arg if new_after_n_chars is not None and new_after_n_chars < 0: raise ValueError( f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}" ) - # -- overlap must be less than max-chars or the chunk text will - # -- never be consumed - # TODO: consider a heuristic like never overlap more than half, - # otherwise there could be corner cases leading to an infinite - # loop (I think). + # -- overlap must be less than max-chars or the chunk text will never be consumed -- if self.overlap >= max_characters: raise ValueError( f"'overlap' argument must be less than `max_characters`," @@ -402,12 +349,12 @@ class _TextSplitter: # ================================================================================================ -# BASE PRE-CHUNKER +# PRE-CHUNKER # ================================================================================================ -class BasePreChunker: - """Base-class for per-strategy pre-chunkers. +class PreChunker: + """Gathers sequential elements into pre-chunks as length constraints allow. The pre-chunker's responsibilities are: @@ -465,7 +412,7 @@ class BasePreChunker: @lazyproperty def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: """The semantic-boundary detectors to be applied to break pre-chunks.""" - return () + return self._opts.boundary_predicates def _is_in_new_semantic_unit(self, element: Element) -> bool: """True when `element` begins a new semantic unit such as a section or page.""" diff --git a/unstructured/chunking/basic.py b/unstructured/chunking/basic.py index d15e7eac2..6f4ae49de 100644 --- a/unstructured/chunking/basic.py +++ b/unstructured/chunking/basic.py @@ -17,7 +17,9 @@ from __future__ import annotations from typing import Iterable, Optional -from unstructured.chunking.base import BasePreChunker, ChunkingOptions +from typing_extensions import Self + +from unstructured.chunking.base import ChunkingOptions, PreChunker from unstructured.documents.elements import Element @@ -58,7 +60,7 @@ def chunk_elements( level of "pollution" of otherwise clean semantic chunk boundaries. """ # -- raises ValueError on invalid parameters -- - opts = ChunkingOptions.new( + opts = _BasicChunkingOptions.new( max_characters=max_characters, new_after_n_chars=new_after_n_chars, overlap=overlap, @@ -67,14 +69,32 @@ def chunk_elements( return [ chunk - for pre_chunk in BasicPreChunker.iter_pre_chunks(elements, opts) + for pre_chunk in PreChunker.iter_pre_chunks(elements, opts) for chunk in pre_chunk.iter_chunks() ] -class BasicPreChunker(BasePreChunker): - """Produces pre-chunks from a sequence of document-elements using the "basic" rule-set. +class _BasicChunkingOptions(ChunkingOptions): + """Options for `basic` chunking.""" - The "basic" rule-set is essentially "no-rules" other than `Table` is segregated into its own - pre-chunk. - """ + @classmethod + def new( + cls, + *, + max_characters: Optional[int] = None, + new_after_n_chars: Optional[int] = None, + overlap: Optional[int] = None, + overlap_all: Optional[bool] = None, + ) -> Self: + """Construct validated instance. + + Raises `ValueError` on invalid arguments like overlap > max_chars. + """ + self = cls( + max_characters=max_characters, + new_after_n_chars=new_after_n_chars, + overlap=overlap, + overlap_all=overlap_all, + ) + self._validate() + return self diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index 432019c25..a26dd5ebb 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -7,11 +7,14 @@ from __future__ import annotations from typing import Iterable, Iterator, Optional +from typing_extensions import Self + from unstructured.chunking.base import ( - BasePreChunker, + CHUNK_MULTI_PAGE_DEFAULT, BoundaryPredicate, ChunkingOptions, PreChunkCombiner, + PreChunker, is_in_next_section, is_on_next_page, is_title, @@ -22,6 +25,7 @@ from unstructured.utils import lazyproperty def chunk_by_title( elements: Iterable[Element], + *, max_characters: Optional[int] = None, multipage_sections: Optional[bool] = None, combine_text_under_n_chars: Optional[int] = None, @@ -65,7 +69,7 @@ def chunk_by_title( elements and not subject to text-splitting. Use this with caution as it entails a certain level of "pollution" of otherwise clean semantic chunk boundaries. """ - opts = ChunkingOptions.new( + opts = _ByTitleChunkingOptions.new( combine_text_under_n_chars=combine_text_under_n_chars, max_characters=max_characters, multipage_sections=multipage_sections, @@ -75,27 +79,127 @@ def chunk_by_title( ) pre_chunks = PreChunkCombiner( - _ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts + PreChunker.iter_pre_chunks(elements, opts), opts=opts ).iter_combined_pre_chunks() return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()] -class _ByTitlePreChunker(BasePreChunker): - """Pre-chunker for the "by_title" chunking strategy. +class _ByTitleChunkingOptions(ChunkingOptions): + """Adds the by-title-specific chunking options to the base case. - The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a - new "section", hence the "by-title" designation. + `by_title`-specific options: + + multipage_sections + Indicates that page-boundaries should not be respected while chunking, i.e. elements + appearing on two different pages can appear in the same chunk. """ + def __init__( + self, + *, + max_characters: Optional[int] = None, + combine_text_under_n_chars: Optional[int] = None, + multipage_sections: Optional[bool] = None, + new_after_n_chars: Optional[int] = None, + overlap: Optional[int] = None, + overlap_all: Optional[bool] = None, + ): + super().__init__( + combine_text_under_n_chars=combine_text_under_n_chars, + max_characters=max_characters, + new_after_n_chars=new_after_n_chars, + overlap=overlap, + overlap_all=overlap_all, + ) + self._multipage_sections_arg = multipage_sections + + @classmethod + def new( + cls, + *, + max_characters: Optional[int] = None, + combine_text_under_n_chars: Optional[int] = None, + multipage_sections: Optional[bool] = None, + new_after_n_chars: Optional[int] = None, + overlap: Optional[int] = None, + overlap_all: Optional[bool] = None, + ) -> Self: + """Return instance or raises `ValueError` on invalid arguments like overlap > max_chars.""" + self = cls( + max_characters=max_characters, + combine_text_under_n_chars=combine_text_under_n_chars, + multipage_sections=multipage_sections, + new_after_n_chars=new_after_n_chars, + overlap=overlap, + overlap_all=overlap_all, + ) + self._validate() + return self + @lazyproperty - def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: - """The semantic-boundary detectors to be applied to break pre-chunks.""" + def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: + """The semantic-boundary detectors to be applied to break pre-chunks. + + For the `by_title` strategy these are sections indicated by a title (section-heading), an + explicit section metadata item (only present for certain document types), and optionally + page boundaries. + """ def iter_boundary_predicates() -> Iterator[BoundaryPredicate]: yield is_title yield is_in_next_section() - if not self._opts.multipage_sections: + if not self.multipage_sections: yield is_on_next_page() return tuple(iter_boundary_predicates()) + + @lazyproperty + def combine_text_under_n_chars(self) -> int: + """Combine consecutive text pre-chunks if former is smaller than this and both will fit. + + - Does not combine table chunks with text chunks even if they would both fit in the + chunking window. + - Does not combine text chunks if together they would exceed the chunking window. + - Defaults to `max_characters` when not specified. + - Is reduced to `new_after_n_chars` when it exceeds that value. + """ + max_characters = self.hard_max + soft_max = self.soft_max + arg_value = self._combine_text_under_n_chars_arg + + # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified -- + combine_text_under_n_chars = max_characters if arg_value is None else arg_value + + # -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` -- + return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars + + @lazyproperty + def multipage_sections(self) -> bool: + """When False, break pre-chunks on page-boundaries.""" + arg_value = self._multipage_sections_arg + return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value) + + def _validate(self) -> None: + """Raise ValueError if request option-set is invalid.""" + # -- start with base-class validations -- + super()._validate() + + if (combine_text_under_n_chars_arg := self._combine_text_under_n_chars_arg) is not None: + # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination) + # -- but a negative value is not + if combine_text_under_n_chars_arg < 0: + raise ValueError( + f"'combine_text_under_n_chars' argument must be >= 0," + f" got {combine_text_under_n_chars_arg}" + ) + + # -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to + # -- users. The chunking behavior would be no different than when + # -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to + # -- default (500) then it can look like chunk-combining isn't working. + if combine_text_under_n_chars_arg > self.hard_max: + raise ValueError( + f"'combine_text_under_n_chars' argument must not exceed `max_characters`" + f" value, got {combine_text_under_n_chars_arg} > {self.hard_max}" + )