rfctr(chunking): extract strategy-specific chunking options (#2556)

**Summary**
A pluggable chunking strategy needs its own local chunking-options class
that subclasses a base class in `unstructured`.

Extract distinct `_ByTitleChunkingOptions` and `_BasicChunkingOptions`
for the existing two chunking strategies and move their
strategy-specific option setting and validation to the respective
subclass.

This was also a good opportunity for us to clean up a few odds and ends
we'd been meaning to address.

It might be worth looking at the commits individually, as they are cohesive
incremental steps toward the goal.
This commit is contained in:
Steve Canny 2024-02-23 10:22:44 -08:00 committed by GitHub
parent b4d9ad8130
commit 51cf6bf716
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 340 additions and 205 deletions

View File

@ -4,15 +4,15 @@
from __future__ import annotations from __future__ import annotations
from typing import Sequence from typing import Optional, Sequence
import pytest import pytest
from unstructured.chunking.base import ( from unstructured.chunking.base import (
BasePreChunker,
ChunkingOptions, ChunkingOptions,
PreChunkBuilder, PreChunkBuilder,
PreChunkCombiner, PreChunkCombiner,
PreChunker,
TablePreChunk, TablePreChunk,
TextPreChunk, TextPreChunk,
TextPreChunkAccumulator, TextPreChunkAccumulator,
@ -48,7 +48,7 @@ class DescribeChunkingOptions:
ValueError, ValueError,
match=f"'max_characters' argument must be > 0, got {max_characters}", match=f"'max_characters' argument must be > 0, got {max_characters}",
): ):
ChunkingOptions.new(max_characters=max_characters) ChunkingOptions(max_characters=max_characters)._validate()
def it_does_not_complain_when_specifying_max_characters_by_itself(self): def it_does_not_complain_when_specifying_max_characters_by_itself(self):
"""Caller can specify `max_characters` arg without specifying any others. """Caller can specify `max_characters` arg without specifying any others.
@ -58,44 +58,25 @@ class DescribeChunkingOptions:
and trigger an exception. and trigger an exception.
""" """
try: try:
ChunkingOptions.new(max_characters=50) ChunkingOptions(max_characters=50)._validate()
except ValueError: except ValueError:
pytest.fail("did not accept `max_characters` as option by itself") pytest.fail("did not accept `max_characters` as option by itself")
@pytest.mark.parametrize("n_chars", [-1, -42]) @pytest.mark.parametrize(
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int): ("combine_text_under_n_chars", "expected_value"), [(None, 0), (42, 42)]
with pytest.raises( )
ValueError, def it_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combining(
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}", self, combine_text_under_n_chars: Optional[int], expected_value: int
): ):
ChunkingOptions.new(combine_text_under_n_chars=n_chars) """Subclasses can store `combine_text_under_n_chars` but must validate and enable it.
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self): The `combine_text_under_n_chars` option is not used by all chunkers and its behavior can
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining.""" differ between subtypes. It is present in and stored by the contructur but it defaults to
opts = ChunkingOptions.new(combine_text_under_n_chars=0) `0` (no pre-chunk combining) and must be overridden by subclasses to give it the desired
assert opts.combine_text_under_n_chars == 0 behavior.
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
try:
opts = ChunkingOptions.new(combine_text_under_n_chars=50)
except ValueError:
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
assert opts.combine_text_under_n_chars == 50
def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self):
"""`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.
So rather than raising an exception or warning, we just cap that value at `max_characters`
which is the behavioral equivalent.
""" """
try: opts = ChunkingOptions(combine_text_under_n_chars=combine_text_under_n_chars)
opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600) assert opts.combine_text_under_n_chars == expected_value
except ValueError:
pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`")
assert opts.combine_text_under_n_chars == 500
@pytest.mark.parametrize("n_chars", [-1, -42]) @pytest.mark.parametrize("n_chars", [-1, -42])
def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int): def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
@ -103,7 +84,7 @@ class DescribeChunkingOptions:
ValueError, ValueError,
match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}", match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
): ):
ChunkingOptions.new(new_after_n_chars=n_chars) ChunkingOptions(new_after_n_chars=n_chars)._validate()
def it_rejects_overlap_not_less_than_max_characters(self): def it_rejects_overlap_not_less_than_max_characters(self):
with pytest.raises( with pytest.raises(
@ -113,26 +94,23 @@ class DescribeChunkingOptions:
ChunkingOptions(max_characters=200, overlap=300)._validate() ChunkingOptions(max_characters=200, overlap=300)._validate()
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self): def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
"""Caller can specify `new_after_n_chars` arg without specifying any other options. """Caller can specify `new_after_n_chars` arg without specifying any other options."""
opts = ChunkingOptions(new_after_n_chars=200)
In particular, `combine_text_under_n_chars` value is adjusted down to the
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
value of `new_after_n_chars`.
"""
try: try:
opts = ChunkingOptions.new(new_after_n_chars=200) opts._validate()
except ValueError: except ValueError:
pytest.fail("did not accept `new_after_n_chars` as option by itself") pytest.fail("did not accept `new_after_n_chars` as option by itself")
assert opts.soft_max == 200 assert opts.soft_max == 200
assert opts.combine_text_under_n_chars == 200
def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self): def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self):
"""Specifying `new_after_n_chars=0` places each element into its own pre-chunk. """Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
This puts each element into its own chunk, although long chunks are still split. This puts each element into its own chunk, although long chunks are still split.
""" """
opts = ChunkingOptions.new(new_after_n_chars=0) opts = ChunkingOptions(new_after_n_chars=0)
opts._validate()
assert opts.soft_max == 0 assert opts.soft_max == 0
def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self): def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
@ -141,33 +119,32 @@ class DescribeChunkingOptions:
So rather than raising an exception or warning, we just cap that value at `max_characters` So rather than raising an exception or warning, we just cap that value at `max_characters`
which is the behavioral equivalent. which is the behavioral equivalent.
""" """
opts = ChunkingOptions(max_characters=444, new_after_n_chars=555)
try: try:
opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555) opts._validate()
except ValueError: except ValueError:
pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`") pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
assert opts.soft_max == 444 assert opts.soft_max == 444
def it_knows_how_much_overlap_to_apply_to_split_chunks(self): def it_knows_how_much_overlap_to_apply_to_split_chunks(self):
assert ChunkingOptions.new(overlap=10).overlap == 10 assert ChunkingOptions(overlap=10).overlap == 10
def and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks(self): def and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks(self):
assert ChunkingOptions.new(overlap=10, overlap_all=True).inter_chunk_overlap == 10 assert ChunkingOptions(overlap=10, overlap_all=True).inter_chunk_overlap == 10
def but_it_does_not_overlap_pre_chunks_by_default(self): def but_it_does_not_overlap_pre_chunks_by_default(self):
assert ChunkingOptions.new(overlap=10).inter_chunk_overlap == 0 assert ChunkingOptions(overlap=10).inter_chunk_overlap == 0
def it_knows_the_text_separator_string(self): def it_knows_the_text_separator_string(self):
assert ChunkingOptions.new().text_separator == "\n\n" assert ChunkingOptions().text_separator == "\n\n"
class Describe_TextSplitter: class Describe_TextSplitter:
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects.""" """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
def it_splits_on_a_preferred_separator_when_it_can(self): def it_splits_on_a_preferred_separator_when_it_can(self):
opts = ChunkingOptions.new( opts = ChunkingOptions(max_characters=50, text_splitting_separators=("\n", " "), overlap=10)
max_characters=50, text_splitting_separators=("\n", " "), overlap=10
)
split = _TextSplitter(opts) split = _TextSplitter(opts)
text = ( text = (
"Lorem ipsum dolor amet consectetur adipiscing. \n " "Lorem ipsum dolor amet consectetur adipiscing. \n "
@ -189,9 +166,7 @@ class Describe_TextSplitter:
assert remainder == "" assert remainder == ""
def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self): def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
opts = ChunkingOptions.new( opts = ChunkingOptions(max_characters=40, text_splitting_separators=("\n", " "), overlap=10)
max_characters=40, text_splitting_separators=("\n", " "), overlap=10
)
split = _TextSplitter(opts) split = _TextSplitter(opts)
text = ( text = (
"Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta" "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
@ -211,9 +186,7 @@ class Describe_TextSplitter:
assert remainder == "" assert remainder == ""
def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self): def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
opts = ChunkingOptions.new( opts = ChunkingOptions(max_characters=30, text_splitting_separators=("\n", " "), overlap=10)
max_characters=30, text_splitting_separators=("\n", " "), overlap=10
)
split = _TextSplitter(opts) split = _TextSplitter(opts)
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta." text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
@ -237,7 +210,7 @@ class Describe_TextSplitter:
], ],
) )
def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str): def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
opts = ChunkingOptions.new(max_characters=46, overlap=10) opts = ChunkingOptions(max_characters=46, overlap=10)
split = _TextSplitter(opts) split = _TextSplitter(opts)
s, remainder = split(text) s, remainder = split(text)
@ -246,7 +219,7 @@ class Describe_TextSplitter:
assert remainder == "" assert remainder == ""
def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self): def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
opts = ChunkingOptions.new(max_characters=38, overlap=10) opts = ChunkingOptions(max_characters=38, overlap=10)
split = _TextSplitter(opts) split = _TextSplitter(opts)
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta." text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
@ -257,9 +230,7 @@ class Describe_TextSplitter:
@pytest.mark.parametrize("separators", [("\n", " "), (" ",)]) @pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
def it_strips_whitespace_around_the_split(self, separators: Sequence[str]): def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
opts = ChunkingOptions.new( opts = ChunkingOptions(max_characters=50, text_splitting_separators=separators, overlap=10)
max_characters=50, text_splitting_separators=separators, overlap=10
)
split = _TextSplitter(opts) split = _TextSplitter(opts)
text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus." text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus."
# |-------------------------------------------------^ 50-chars # |-------------------------------------------------^ 50-chars
@ -271,12 +242,12 @@ class Describe_TextSplitter:
# ================================================================================================ # ================================================================================================
# BASE PRE-CHUNKER # PRE-CHUNKER
# ================================================================================================ # ================================================================================================
class DescribeBasePreChunker: class DescribePreChunker:
"""Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects.""" """Unit-test suite for `unstructured.chunking.base.PreChunker` objects."""
def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self): def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
elements = [ elements = [
@ -289,9 +260,9 @@ class DescribeBasePreChunker:
CheckBox(), CheckBox(),
] ]
opts = ChunkingOptions.new(max_characters=150, new_after_n_chars=65) opts = ChunkingOptions(max_characters=150, new_after_n_chars=65)
pre_chunk_iter = BasePreChunker.iter_pre_chunks(elements, opts=opts) pre_chunk_iter = PreChunker.iter_pre_chunks(elements, opts=opts)
pre_chunk = next(pre_chunk_iter) pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk) assert isinstance(pre_chunk, TextPreChunk)
@ -344,7 +315,7 @@ class DescribeTablePreChunk:
pre_chunk = TablePreChunk( pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
overlap_prefix="ctus porta volutpat.", overlap_prefix="ctus porta volutpat.",
opts=ChunkingOptions.new(max_characters=175), opts=ChunkingOptions(max_characters=175),
) )
chunk_iter = pre_chunk.iter_chunks() chunk_iter = pre_chunk.iter_chunks()
@ -393,7 +364,7 @@ class DescribeTablePreChunk:
pre_chunk = TablePreChunk( pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
overlap_prefix="", overlap_prefix="",
opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")), opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
) )
chunk_iter = pre_chunk.iter_chunks() chunk_iter = pre_chunk.iter_chunks()
@ -457,7 +428,7 @@ class DescribeTablePreChunk:
self, text: str, expected_value: str self, text: str, expected_value: str
): ):
pre_chunk = TablePreChunk( pre_chunk = TablePreChunk(
Table(text), overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True) Table(text), overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
) )
assert pre_chunk.overlap_tail == expected_value assert pre_chunk.overlap_tail == expected_value
@ -480,7 +451,7 @@ class DescribeTablePreChunk:
self, text: str, overlap_prefix: str, expected_value: str self, text: str, overlap_prefix: str, expected_value: str
): ):
pre_chunk = TablePreChunk( pre_chunk = TablePreChunk(
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions.new() Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
) )
assert pre_chunk._text == expected_value assert pre_chunk._text == expected_value
@ -536,7 +507,7 @@ class DescribeTextPreChunk:
self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool
): ):
"""This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals.""" """This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals."""
opts = ChunkingOptions.new( opts = ChunkingOptions(
max_characters=max_characters, max_characters=max_characters,
combine_text_under_n_chars=combine_text_under_n_chars, combine_text_under_n_chars=combine_text_under_n_chars,
overlap=20, overlap=20,
@ -560,7 +531,7 @@ class DescribeTextPreChunk:
Note that neither the original or other pre_chunk are mutated. Note that neither the original or other pre_chunk are mutated.
""" """
opts = ChunkingOptions.new() opts = ChunkingOptions()
pre_chunk = TextPreChunk( pre_chunk = TextPreChunk(
[ [
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@ -623,7 +594,7 @@ class DescribeTextPreChunk:
), ),
], ],
overlap_prefix="e feugiat efficitur.", overlap_prefix="e feugiat efficitur.",
opts=ChunkingOptions.new(max_characters=200), opts=ChunkingOptions(max_characters=200),
) )
chunk_iter = pre_chunk.iter_chunks() chunk_iter = pre_chunk.iter_chunks()
@ -648,7 +619,7 @@ class DescribeTextPreChunk:
), ),
], ],
overlap_prefix="", overlap_prefix="",
opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")), opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
) )
chunk_iter = pre_chunk.iter_chunks() chunk_iter = pre_chunk.iter_chunks()
@ -681,7 +652,7 @@ class DescribeTextPreChunk:
self, text: str, expected_value: str self, text: str, expected_value: str
): ):
pre_chunk = TextPreChunk( pre_chunk = TextPreChunk(
[Text(text)], overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True) [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
) )
assert pre_chunk.overlap_tail == expected_value assert pre_chunk.overlap_tail == expected_value
@ -708,7 +679,7 @@ class DescribeTextPreChunk:
), ),
], ],
overlap_prefix="", overlap_prefix="",
opts=ChunkingOptions.new(), opts=ChunkingOptions(),
) )
assert pre_chunk._all_metadata_values == { assert pre_chunk._all_metadata_values == {
@ -746,7 +717,7 @@ class DescribeTextPreChunk:
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2), Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
], ],
overlap_prefix="", overlap_prefix="",
opts=ChunkingOptions.new(), opts=ChunkingOptions(),
) )
# -- ad-hoc fields "coefficient" and "quotient" do not appear -- # -- ad-hoc fields "coefficient" and "quotient" do not appear --
@ -789,7 +760,7 @@ class DescribeTextPreChunk:
), ),
], ],
overlap_prefix="ficitur.", # len == 8 overlap_prefix="ficitur.", # len == 8
opts=ChunkingOptions.new(), opts=ChunkingOptions(),
) )
regex_metadata = pre_chunk._consolidated_regex_meta regex_metadata = pre_chunk._consolidated_regex_meta
@ -845,7 +816,7 @@ class DescribeTextPreChunk:
), ),
], ],
overlap_prefix="", overlap_prefix="",
opts=ChunkingOptions.new(), opts=ChunkingOptions(),
) )
meta_kwargs = pre_chunk._meta_kwargs meta_kwargs = pre_chunk._meta_kwargs
@ -881,9 +852,7 @@ class DescribeTextPreChunk:
The text-segment contributed by each element is separated from the next by a blank line The text-segment contributed by each element is separated from the next by a blank line
("\n\n"). An element that contributes no text does not give rise to a separator. ("\n\n"). An element that contributes no text does not give rise to a separator.
""" """
pre_chunk = TextPreChunk( pre_chunk = TextPreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions())
elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
)
assert pre_chunk._text == expected_value assert pre_chunk._text == expected_value
@ -896,13 +865,13 @@ class DescribePreChunkBuilder:
"""Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`.""" """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
def it_is_empty_on_construction(self): def it_is_empty_on_construction(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
assert builder._text_length == 0 assert builder._text_length == 0
assert builder._remaining_space == 50 assert builder._remaining_space == 50
def it_accumulates_elements_added_to_it(self): def it_accumulates_elements_added_to_it(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
builder.add_element(Title("Introduction")) builder.add_element(Title("Introduction"))
assert builder._text_length == 12 assert builder._text_length == 12
@ -919,7 +888,7 @@ class DescribePreChunkBuilder:
@pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)]) @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element): def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
builder = PreChunkBuilder(opts=ChunkingOptions.new()) builder = PreChunkBuilder(opts=ChunkingOptions())
assert builder.will_fit(element) assert builder.will_fit(element)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -934,22 +903,20 @@ class DescribePreChunkBuilder:
def but_not_when_it_already_contains_an_element_of_any_kind( def but_not_when_it_already_contains_an_element_of_any_kind(
self, existing_element: Element, next_element: Element self, existing_element: Element, next_element: Element
): ):
builder = PreChunkBuilder(opts=ChunkingOptions.new()) builder = PreChunkBuilder(opts=ChunkingOptions())
builder.add_element(existing_element) builder.add_element(existing_element)
assert not builder.will_fit(next_element) assert not builder.will_fit(next_element)
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")]) @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element): def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
builder = PreChunkBuilder(opts=ChunkingOptions.new()) builder = PreChunkBuilder(opts=ChunkingOptions())
builder.add_element(Table("Heading\nCell text")) builder.add_element(Table("Heading\nCell text"))
assert not builder.will_fit(element) assert not builder.will_fit(element)
def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self): def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
builder = PreChunkBuilder( builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
)
builder.add_element( builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
) )
@ -957,7 +924,7 @@ class DescribePreChunkBuilder:
assert not builder.will_fit(Text("In rhoncus ipsum.")) assert not builder.will_fit(Text("In rhoncus ipsum."))
def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self): def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100)) builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
builder.add_element( builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
) )
@ -968,7 +935,7 @@ class DescribePreChunkBuilder:
) )
def but_it_will_fit_an_element_that_fits(self): def but_it_will_fit_an_element_that_fits(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100)) builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
builder.add_element( builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
) )
@ -977,7 +944,7 @@ class DescribePreChunkBuilder:
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
builder.add_element(Title("Introduction")) builder.add_element(Title("Introduction"))
builder.add_element( builder.add_element(
Text( Text(
@ -1000,7 +967,7 @@ class DescribePreChunkBuilder:
assert builder._remaining_space == 150 assert builder._remaining_space == 150
def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self): def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
builder.add_element(Table("Heading\nCell text")) builder.add_element(Table("Heading\nCell text"))
pre_chunk = next(builder.flush()) pre_chunk = next(builder.flush())
@ -1016,7 +983,7 @@ class DescribePreChunkBuilder:
assert pre_chunk._table == Table("Heading\nCell text") assert pre_chunk._table == Table("Heading\nCell text")
def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self): def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150)) builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
pre_chunks = list(builder.flush()) pre_chunks = list(builder.flush())
@ -1025,7 +992,7 @@ class DescribePreChunkBuilder:
assert builder._remaining_space == 150 assert builder._remaining_space == 150
def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self): def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
opts = ChunkingOptions.new(overlap=15, overlap_all=True) opts = ChunkingOptions(overlap=15, overlap_all=True)
builder = PreChunkBuilder(opts=opts) builder = PreChunkBuilder(opts=opts)
builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")) builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
@ -1044,7 +1011,7 @@ class DescribePreChunkBuilder:
assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus." assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self): def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50)) builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
builder.add_element(Text("abcde")) builder.add_element(Text("abcde"))
builder.add_element(Text("fghij")) builder.add_element(Text("fghij"))
@ -1061,7 +1028,7 @@ class DescribePreChunkCombiner:
"""Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`.""" """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
def it_combines_sequential_small_text_pre_chunks(self): def it_combines_sequential_small_text_pre_chunks(self):
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
pre_chunks = [ pre_chunks = [
TextPreChunk( TextPreChunk(
[ [
@ -1105,7 +1072,7 @@ class DescribePreChunkCombiner:
next(pre_chunk_iter) next(pre_chunk_iter)
def but_it_does_not_combine_table_pre_chunks(self): def but_it_does_not_combine_table_pre_chunks(self):
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
pre_chunks = [ pre_chunks = [
TextPreChunk( TextPreChunk(
[ [
@ -1127,7 +1094,7 @@ class DescribePreChunkCombiner:
] ]
pre_chunk_iter = PreChunkCombiner( pre_chunk_iter = PreChunkCombiner(
pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250) pre_chunks, ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
).iter_combined_pre_chunks() ).iter_combined_pre_chunks()
pre_chunk = next(pre_chunk_iter) pre_chunk = next(pre_chunk_iter)
@ -1152,7 +1119,7 @@ class DescribePreChunkCombiner:
next(pre_chunk_iter) next(pre_chunk_iter)
def it_respects_the_specified_combination_threshold(self): def it_respects_the_specified_combination_threshold(self):
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80) opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=80)
pre_chunks = [ pre_chunks = [
TextPreChunk( # 68 TextPreChunk( # 68
[ [
@ -1203,7 +1170,7 @@ class DescribePreChunkCombiner:
next(pre_chunk_iter) next(pre_chunk_iter)
def it_respects_the_hard_maximum_window_length(self): def it_respects_the_hard_maximum_window_length(self):
opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200) opts = ChunkingOptions(max_characters=200, combine_text_under_n_chars=200)
pre_chunks = [ pre_chunks = [
TextPreChunk( # 68 TextPreChunk( # 68
[ [
@ -1256,7 +1223,7 @@ class DescribePreChunkCombiner:
def it_accommodates_and_isolates_an_oversized_pre_chunk(self): def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
"""Such as occurs when a single element exceeds the window size.""" """Such as occurs when a single element exceeds the window size."""
opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) opts = ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
pre_chunks = [ pre_chunks = [
TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts), TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts),
TextPreChunk( # 179 TextPreChunk( # 179
@ -1274,7 +1241,7 @@ class DescribePreChunkCombiner:
] ]
pre_chunk_iter = PreChunkCombiner( pre_chunk_iter = PreChunkCombiner(
pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150) pre_chunks, ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
).iter_combined_pre_chunks() ).iter_combined_pre_chunks()
pre_chunk = next(pre_chunk_iter) pre_chunk = next(pre_chunk_iter)
@ -1303,7 +1270,7 @@ class DescribeTextPreChunkAccumulator:
"""Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`.""" """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self): def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
opts = ChunkingOptions.new() opts = ChunkingOptions(combine_text_under_n_chars=500)
accum = TextPreChunkAccumulator(opts=opts) accum = TextPreChunkAccumulator(opts=opts)
pre_chunk = TextPreChunk( pre_chunk = TextPreChunk(
@ -1362,7 +1329,7 @@ class DescribeTextPreChunkAccumulator:
next(accum.flush()) next(accum.flush())
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150)) accum = TextPreChunkAccumulator(opts=ChunkingOptions(max_characters=150))
assert list(accum.flush()) == [] assert list(accum.flush()) == []

View File

@ -1,4 +1,4 @@
"""Unit-test suite for the `unstructured.chunking.basic` module. """Test suite for the `unstructured.chunking.basic` module.
That module implements the baseline chunking strategy. The baseline strategy has all behaviors That module implements the baseline chunking strategy. The baseline strategy has all behaviors
shared by all chunking strategies and no extra rules like preserving section or page boundaries. shared by all chunking strategies and no extra rules like preserving section or page boundaries.

View File

@ -1,13 +1,20 @@
# pyright: reportPrivateUsage=false # pyright: reportPrivateUsage=false
"""Unit-test suite for the `unstructured.chunking.title` module.""" """Test suite for the `unstructured.chunking.title` module."""
from __future__ import annotations from __future__ import annotations
from typing import Optional
import pytest import pytest
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk from unstructured.chunking.base import (
from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title CHUNK_MULTI_PAGE_DEFAULT,
PreChunker,
TablePreChunk,
TextPreChunk,
)
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import ( from unstructured.documents.elements import (
CheckBox, CheckBox,
@ -23,6 +30,13 @@ from unstructured.documents.elements import (
) )
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
# ================================================================================================
# INTEGRATION-TESTS
# ================================================================================================
# These test `chunk_by_title()` as an integrated whole, calling `chunk_by_title()` and inspecting
# the outputs.
# ================================================================================================
def test_it_splits_a_large_element_into_multiple_chunks(): def test_it_splits_a_large_element_into_multiple_chunks():
elements: list[Element] = [ elements: list[Element] = [
@ -57,7 +71,7 @@ def test_split_elements_by_title_and_table():
CheckBox(), CheckBox(),
] ]
pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new()) pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
pre_chunk = next(pre_chunks) pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk) assert isinstance(pre_chunk, TextPreChunk)
@ -544,3 +558,86 @@ def test_it_considers_separator_length_when_pre_chunking():
), ),
CompositeElement("Minimize mid-text chunk-splitting"), CompositeElement("Minimize mid-text chunk-splitting"),
] ]
# ================================================================================================
# UNIT-TESTS
# ================================================================================================
# These test individual components in isolation so they can exercise all edge cases while still
# performing well.
# ================================================================================================
class Describe_ByTitleChunkingOptions:
    """Unit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects."""

    @pytest.mark.parametrize("n_chars", [-1, -42])
    def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
        with pytest.raises(
            ValueError,
            match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
        ):
            _ByTitleChunkingOptions.new(combine_text_under_n_chars=n_chars)

    def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
        """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
        opts = _ByTitleChunkingOptions(combine_text_under_n_chars=0)
        assert opts.combine_text_under_n_chars == 0

    def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
        """Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
        try:
            opts = _ByTitleChunkingOptions(combine_text_under_n_chars=50)
        except ValueError:
            pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")

        assert opts.combine_text_under_n_chars == 50

    @pytest.mark.parametrize(
        ("combine_text_under_n_chars", "max_characters", "expected_hard_max"),
        [(600, None, 500), (600, 450, 450)],
    )
    def it_rejects_combine_text_under_n_chars_greater_than_maxchars(
        self, combine_text_under_n_chars: int, max_characters: Optional[int], expected_hard_max: int
    ):
        """`combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.

        The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
        `max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
        larger number like 1500 then it can look like chunk-combining isn't working.
        """
        with pytest.raises(
            ValueError,
            match=(
                "'combine_text_under_n_chars' argument must not exceed `max_characters` value,"
                f" got {combine_text_under_n_chars} > {expected_hard_max}"
            ),
        ):
            _ByTitleChunkingOptions.new(
                max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars
            )

    def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
        """Caller can specify `new_after_n_chars` arg without specifying any other options.

        In particular, `combine_text_under_n_chars` value is adjusted down to the
        `new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
        value of `new_after_n_chars`.
        """
        try:
            opts = _ByTitleChunkingOptions(new_after_n_chars=200)
        except ValueError:
            pytest.fail("did not accept `new_after_n_chars` as option by itself")

        assert opts.soft_max == 200
        assert opts.combine_text_under_n_chars == 200

    @pytest.mark.parametrize(
        ("multipage_sections", "expected_value"),
        [(True, True), (False, False), (None, CHUNK_MULTI_PAGE_DEFAULT)],
    )
    def it_knows_whether_to_break_chunks_on_page_boundaries(
        self, multipage_sections: Optional[bool], expected_value: bool
    ):
        # -- `None` is one of the parametrized inputs (argument omitted), hence `Optional[bool]` --
        opts = _ByTitleChunkingOptions(multipage_sections=multipage_sections)
        assert opts.multipage_sections is expected_value

View File

@ -7,7 +7,7 @@ import copy
from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, Sequence, cast from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, Sequence, cast
import regex import regex
from typing_extensions import Self, TypeAlias from typing_extensions import TypeAlias
from unstructured.documents.elements import ( from unstructured.documents.elements import (
CompositeElement, CompositeElement,
@ -68,9 +68,6 @@ class ChunkingOptions:
when not specified, which effectively disables this behavior. Specifying 0 for this when not specified, which effectively disables this behavior. Specifying 0 for this
argument causes each element to appear in a chunk by itself (although an element with text argument causes each element to appear in a chunk by itself (although an element with text
longer than `max_characters` will still be split into two or more chunks). longer than `max_characters` will still be split into two or more chunks).
multipage_sections
Indicates that page-boundaries should not be respected while chunking, i.e. elements
appearing on two different pages can appear in the same chunk.
combine_text_under_n_chars combine_text_under_n_chars
Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
relevant for a chunking strategy that specifies higher-level semantic boundaries to be relevant for a chunking strategy that specifies higher-level semantic boundaries to be
@ -101,9 +98,9 @@ class ChunkingOptions:
def __init__( def __init__(
self, self,
*,
combine_text_under_n_chars: Optional[int] = None, combine_text_under_n_chars: Optional[int] = None,
max_characters: Optional[int] = None, max_characters: Optional[int] = None,
multipage_sections: Optional[bool] = None,
new_after_n_chars: Optional[int] = None, new_after_n_chars: Optional[int] = None,
overlap: Optional[int] = None, overlap: Optional[int] = None,
overlap_all: Optional[bool] = None, overlap_all: Optional[bool] = None,
@ -111,59 +108,28 @@ class ChunkingOptions:
): ):
self._combine_text_under_n_chars_arg = combine_text_under_n_chars self._combine_text_under_n_chars_arg = combine_text_under_n_chars
self._max_characters_arg = max_characters self._max_characters_arg = max_characters
self._multipage_sections_arg = multipage_sections
self._new_after_n_chars_arg = new_after_n_chars self._new_after_n_chars_arg = new_after_n_chars
self._overlap_arg = overlap self._overlap_arg = overlap
self._overlap_all_arg = overlap_all self._overlap_all_arg = overlap_all
self._text_splitting_separators = text_splitting_separators self._text_splitting_separators = text_splitting_separators
@classmethod @lazyproperty
def new( def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
cls, """The semantic-boundary detectors to be applied to break pre-chunks.
combine_text_under_n_chars: Optional[int] = None,
max_characters: Optional[int] = None,
multipage_sections: Optional[bool] = None,
new_after_n_chars: Optional[int] = None,
overlap: Optional[int] = None,
overlap_all: Optional[bool] = None,
text_splitting_separators: Sequence[str] = ("\n", " "),
) -> Self:
"""Construct validated instance.
Raises `ValueError` on invalid arguments like overlap > max_chars. Overridden by sub-types to provide semantic-boundary isolation behaviors.
""" """
self = cls( return ()
combine_text_under_n_chars,
max_characters,
multipage_sections,
new_after_n_chars,
overlap,
overlap_all,
text_splitting_separators,
)
self._validate()
return self
@lazyproperty @lazyproperty
def combine_text_under_n_chars(self) -> int: def combine_text_under_n_chars(self) -> int:
"""Combine consecutive text pre-chunks if former is smaller than this and both will fit. """Combine two consecutive text pre-chunks if first is smaller than this and both will fit.
- Does not combine table chunks with text chunks even if they would both fit in the Default applied here is `0` which essentially disables chunk combining. Must be overridden
chunking window. by subclass where combining behavior is supported.
- Does not combine text chunks if together they would exceed the chunking window.
- Defaults to `max_characters` when not specified.
- Is reduced to `new_after_n_chars` when it exceeds that value.
""" """
max_characters = self.hard_max arg_value = self._combine_text_under_n_chars_arg
soft_max = self.soft_max return arg_value if arg_value is not None else 0
arg = self._combine_text_under_n_chars_arg
# -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is
# -- capped at max-chars
combine_text_under_n_chars = max_characters if arg is None or arg > max_characters else arg
# -- `new_after_n_chars` takes precedence on conflict with `combine_text_under_n_chars` --
return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
@lazyproperty @lazyproperty
def hard_max(self) -> int: def hard_max(self) -> int:
@ -185,12 +151,6 @@ class ChunkingOptions:
""" """
return self.overlap if self._overlap_all_arg else 0 return self.overlap if self._overlap_all_arg else 0
@lazyproperty
def multipage_sections(self) -> bool:
"""When False, break pre-chunks on page-boundaries."""
arg_value = self._multipage_sections_arg
return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value)
@lazyproperty @lazyproperty
def overlap(self) -> int: def overlap(self) -> int:
"""The number of characters to overlap text when splitting chunks mid-text. """The number of characters to overlap text when splitting chunks mid-text.
@ -256,28 +216,15 @@ class ChunkingOptions:
if max_characters <= 0: if max_characters <= 0:
raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}") raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}")
# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination) # -- a negative value for `new_after_n_chars` is assumed to be a mistake the caller will
# -- but a negative value is not # -- want to know about
combine_text_under_n_chars = self._combine_text_under_n_chars_arg
if combine_text_under_n_chars is not None and combine_text_under_n_chars < 0:
raise ValueError(
f"'combine_text_under_n_chars' argument must be >= 0,"
f" got {combine_text_under_n_chars}"
)
# -- a negative value for `new_after_n_chars` is assumed to
# -- be a mistake the caller will want to know about
new_after_n_chars = self._new_after_n_chars_arg new_after_n_chars = self._new_after_n_chars_arg
if new_after_n_chars is not None and new_after_n_chars < 0: if new_after_n_chars is not None and new_after_n_chars < 0:
raise ValueError( raise ValueError(
f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}" f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}"
) )
# -- overlap must be less than max-chars or the chunk text will # -- overlap must be less than max-chars or the chunk text will never be consumed --
# -- never be consumed
# TODO: consider a heuristic like never overlap more than half,
# otherwise there could be corner cases leading to an infinite
# loop (I think).
if self.overlap >= max_characters: if self.overlap >= max_characters:
raise ValueError( raise ValueError(
f"'overlap' argument must be less than `max_characters`," f"'overlap' argument must be less than `max_characters`,"
@ -402,12 +349,12 @@ class _TextSplitter:
# ================================================================================================ # ================================================================================================
# BASE PRE-CHUNKER # PRE-CHUNKER
# ================================================================================================ # ================================================================================================
class BasePreChunker: class PreChunker:
"""Base-class for per-strategy pre-chunkers. """Gathers sequential elements into pre-chunks as length constraints allow.
The pre-chunker's responsibilities are: The pre-chunker's responsibilities are:
@ -465,7 +412,7 @@ class BasePreChunker:
@lazyproperty @lazyproperty
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks.""" """The semantic-boundary detectors to be applied to break pre-chunks."""
return () return self._opts.boundary_predicates
def _is_in_new_semantic_unit(self, element: Element) -> bool: def _is_in_new_semantic_unit(self, element: Element) -> bool:
"""True when `element` begins a new semantic unit such as a section or page.""" """True when `element` begins a new semantic unit such as a section or page."""

View File

@ -17,7 +17,9 @@ from __future__ import annotations
from typing import Iterable, Optional from typing import Iterable, Optional
from unstructured.chunking.base import BasePreChunker, ChunkingOptions from typing_extensions import Self
from unstructured.chunking.base import ChunkingOptions, PreChunker
from unstructured.documents.elements import Element from unstructured.documents.elements import Element
@ -58,7 +60,7 @@ def chunk_elements(
level of "pollution" of otherwise clean semantic chunk boundaries. level of "pollution" of otherwise clean semantic chunk boundaries.
""" """
# -- raises ValueError on invalid parameters -- # -- raises ValueError on invalid parameters --
opts = ChunkingOptions.new( opts = _BasicChunkingOptions.new(
max_characters=max_characters, max_characters=max_characters,
new_after_n_chars=new_after_n_chars, new_after_n_chars=new_after_n_chars,
overlap=overlap, overlap=overlap,
@ -67,14 +69,32 @@ def chunk_elements(
return [ return [
chunk chunk
for pre_chunk in BasicPreChunker.iter_pre_chunks(elements, opts) for pre_chunk in PreChunker.iter_pre_chunks(elements, opts)
for chunk in pre_chunk.iter_chunks() for chunk in pre_chunk.iter_chunks()
] ]
class BasicPreChunker(BasePreChunker): class _BasicChunkingOptions(ChunkingOptions):
"""Produces pre-chunks from a sequence of document-elements using the "basic" rule-set. """Options for `basic` chunking."""
The "basic" rule-set is essentially "no-rules" other than `Table` is segregated into its own @classmethod
pre-chunk. def new(
cls,
*,
max_characters: Optional[int] = None,
new_after_n_chars: Optional[int] = None,
overlap: Optional[int] = None,
overlap_all: Optional[bool] = None,
) -> Self:
"""Construct validated instance.
Raises `ValueError` on invalid arguments like overlap > max_chars.
""" """
self = cls(
max_characters=max_characters,
new_after_n_chars=new_after_n_chars,
overlap=overlap,
overlap_all=overlap_all,
)
self._validate()
return self

View File

@ -7,11 +7,14 @@ from __future__ import annotations
from typing import Iterable, Iterator, Optional from typing import Iterable, Iterator, Optional
from typing_extensions import Self
from unstructured.chunking.base import ( from unstructured.chunking.base import (
BasePreChunker, CHUNK_MULTI_PAGE_DEFAULT,
BoundaryPredicate, BoundaryPredicate,
ChunkingOptions, ChunkingOptions,
PreChunkCombiner, PreChunkCombiner,
PreChunker,
is_in_next_section, is_in_next_section,
is_on_next_page, is_on_next_page,
is_title, is_title,
@ -22,6 +25,7 @@ from unstructured.utils import lazyproperty
def chunk_by_title( def chunk_by_title(
elements: Iterable[Element], elements: Iterable[Element],
*,
max_characters: Optional[int] = None, max_characters: Optional[int] = None,
multipage_sections: Optional[bool] = None, multipage_sections: Optional[bool] = None,
combine_text_under_n_chars: Optional[int] = None, combine_text_under_n_chars: Optional[int] = None,
@ -65,7 +69,7 @@ def chunk_by_title(
elements and not subject to text-splitting. Use this with caution as it entails a certain elements and not subject to text-splitting. Use this with caution as it entails a certain
level of "pollution" of otherwise clean semantic chunk boundaries. level of "pollution" of otherwise clean semantic chunk boundaries.
""" """
opts = ChunkingOptions.new( opts = _ByTitleChunkingOptions.new(
combine_text_under_n_chars=combine_text_under_n_chars, combine_text_under_n_chars=combine_text_under_n_chars,
max_characters=max_characters, max_characters=max_characters,
multipage_sections=multipage_sections, multipage_sections=multipage_sections,
@ -75,27 +79,127 @@ def chunk_by_title(
) )
pre_chunks = PreChunkCombiner( pre_chunks = PreChunkCombiner(
_ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts PreChunker.iter_pre_chunks(elements, opts), opts=opts
).iter_combined_pre_chunks() ).iter_combined_pre_chunks()
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()] return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]
class _ByTitlePreChunker(BasePreChunker): class _ByTitleChunkingOptions(ChunkingOptions):
"""Pre-chunker for the "by_title" chunking strategy. """Adds the by-title-specific chunking options to the base case.
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a `by_title`-specific options:
new "section", hence the "by-title" designation.
multipage_sections
Indicates that page-boundaries should not be respected while chunking, i.e. elements
appearing on two different pages can appear in the same chunk.
""" """
def __init__(
    self,
    *,
    max_characters: Optional[int] = None,
    combine_text_under_n_chars: Optional[int] = None,
    multipage_sections: Optional[bool] = None,
    new_after_n_chars: Optional[int] = None,
    overlap: Optional[int] = None,
    overlap_all: Optional[bool] = None,
):
    """Record the `by_title` option arguments exactly as the caller supplied them.

    All parameters are keyword-only and optional. No validation happens here; the instance
    only stores the raw argument values.
    """
    # -- options shared with the base class are forwarded to its initializer --
    super().__init__(
        max_characters=max_characters,
        combine_text_under_n_chars=combine_text_under_n_chars,
        new_after_n_chars=new_after_n_chars,
        overlap=overlap,
        overlap_all=overlap_all,
    )
    # -- `multipage_sections` is specific to this strategy so it is kept locally --
    self._multipage_sections_arg = multipage_sections
@classmethod
def new(
    cls,
    *,
    max_characters: Optional[int] = None,
    combine_text_under_n_chars: Optional[int] = None,
    multipage_sections: Optional[bool] = None,
    new_after_n_chars: Optional[int] = None,
    overlap: Optional[int] = None,
    overlap_all: Optional[bool] = None,
) -> Self:
    """Construct an instance of `cls` from the given options and validate it.

    Returns the new instance once `_validate()` has accepted it. Raises `ValueError` when
    the arguments are invalid, e.g. overlap > max_chars.
    """
    opts = cls(
        combine_text_under_n_chars=combine_text_under_n_chars,
        max_characters=max_characters,
        multipage_sections=multipage_sections,
        new_after_n_chars=new_after_n_chars,
        overlap=overlap,
        overlap_all=overlap_all,
    )
    # -- validation is a separate step from construction; only a validated instance is returned --
    opts._validate()
    return opts
@lazyproperty @lazyproperty
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks.""" """The semantic-boundary detectors to be applied to break pre-chunks.
For the `by_title` strategy these are sections indicated by a title (section-heading), an
explicit section metadata item (only present for certain document types), and optionally
page boundaries.
"""
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]: def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
yield is_title yield is_title
yield is_in_next_section() yield is_in_next_section()
if not self._opts.multipage_sections: if not self.multipage_sections:
yield is_on_next_page() yield is_on_next_page()
return tuple(iter_boundary_predicates()) return tuple(iter_boundary_predicates())
@lazyproperty
def combine_text_under_n_chars(self) -> int:
    """Size threshold below which a text pre-chunk is combined with the one that follows.

    - Table chunks are not combined with text chunks even if both would fit in the chunking
      window.
    - Text chunks are not combined if together they would exceed the chunking window.
    - Falls back to `max_characters` (`hard_max`) when the argument was not specified.
    - Never exceeds `new_after_n_chars` (`soft_max`).
    """
    requested = self._combine_text_under_n_chars_arg

    # -- an unspecified value defaults to `max_characters` --
    if requested is None:
        requested = self.hard_max

    # -- `new_after_n_chars` takes precedence on conflict with `combine_text_under_n_chars` --
    return min(requested, self.soft_max)
@lazyproperty
def multipage_sections(self) -> bool:
    """False when pre-chunks must be broken on page-boundaries, True otherwise.

    Resolves to `CHUNK_MULTI_PAGE_DEFAULT` when the caller did not specify the option.
    """
    requested = self._multipage_sections_arg
    if requested is None:
        return CHUNK_MULTI_PAGE_DEFAULT
    # -- coerce so a truthy/falsy non-bool argument still yields a real `bool` --
    return bool(requested)
def _validate(self) -> None:
    """Raise `ValueError` if the requested option-set is invalid.

    Runs the base-class validations first, then checks the `combine_text_under_n_chars`
    argument when one was specified.
    """
    # -- base-class validations come first --
    super()._validate()

    arg_value = self._combine_text_under_n_chars_arg

    # -- nothing strategy-specific to check when the argument was not specified --
    if arg_value is None:
        return

    # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
    # -- but a negative value is not
    if arg_value < 0:
        raise ValueError(
            f"'combine_text_under_n_chars' argument must be >= 0," f" got {arg_value}"
        )

    # -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
    # -- users. The chunking behavior would be no different than when
    # -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
    # -- default (500) then it can look like chunk-combining isn't working.
    hard_max = self.hard_max
    if arg_value > hard_max:
        raise ValueError(
            f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
            f" value, got {arg_value} > {hard_max}"
        )