rfctr(chunking): extract strategy-specific chunking options (#2556)

**Summary**
A pluggable chunking strategy needs its own local set of chunking
options that subclasses a base-class in `unstructured`.

Extract distinct `_ByTitleChunkingOptions` and `_BasicChunkingOptions`
for the existing two chunking strategies and move their
strategy-specific option setting and validation to the respective
subclass.

This was also a good opportunity for us to clean up a few odds and ends
we'd been meaning to address.

Might be worth looking at the commits individually as they are cohesive
incremental steps toward the goal.
This commit is contained in:
Steve Canny 2024-02-23 10:22:44 -08:00 committed by GitHub
parent b4d9ad8130
commit 51cf6bf716
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 340 additions and 205 deletions

View File

@ -4,15 +4,15 @@
from __future__ import annotations
from typing import Sequence
from typing import Optional, Sequence
import pytest
from unstructured.chunking.base import (
BasePreChunker,
ChunkingOptions,
PreChunkBuilder,
PreChunkCombiner,
PreChunker,
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
@ -48,7 +48,7 @@ class DescribeChunkingOptions:
ValueError,
match=f"'max_characters' argument must be > 0, got {max_characters}",
):
ChunkingOptions.new(max_characters=max_characters)
ChunkingOptions(max_characters=max_characters)._validate()
def it_does_not_complain_when_specifying_max_characters_by_itself(self):
"""Caller can specify `max_characters` arg without specifying any others.
@ -58,44 +58,25 @@ class DescribeChunkingOptions:
and trigger an exception.
"""
try:
ChunkingOptions.new(max_characters=50)
ChunkingOptions(max_characters=50)._validate()
except ValueError:
pytest.fail("did not accept `max_characters` as option by itself")
@pytest.mark.parametrize("n_chars", [-1, -42])
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
with pytest.raises(
ValueError,
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
@pytest.mark.parametrize(
("combine_text_under_n_chars", "expected_value"), [(None, 0), (42, 42)]
)
def it_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combining(
self, combine_text_under_n_chars: Optional[int], expected_value: int
):
ChunkingOptions.new(combine_text_under_n_chars=n_chars)
"""Subclasses can store `combine_text_under_n_chars` but must validate and enable it.
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
opts = ChunkingOptions.new(combine_text_under_n_chars=0)
assert opts.combine_text_under_n_chars == 0
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
try:
opts = ChunkingOptions.new(combine_text_under_n_chars=50)
except ValueError:
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
assert opts.combine_text_under_n_chars == 50
def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self):
"""`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.
So rather than raising an exception or warning, we just cap that value at `max_characters`
which is the behavioral equivalent.
The `combine_text_under_n_chars` option is not used by all chunkers and its behavior can
differ between subtypes. It is present in and stored by the constructor but it defaults to
`0` (no pre-chunk combining) and must be overridden by subclasses to give it the desired
behavior.
"""
try:
opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600)
except ValueError:
pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`")
assert opts.combine_text_under_n_chars == 500
opts = ChunkingOptions(combine_text_under_n_chars=combine_text_under_n_chars)
assert opts.combine_text_under_n_chars == expected_value
@pytest.mark.parametrize("n_chars", [-1, -42])
def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
@ -103,7 +84,7 @@ class DescribeChunkingOptions:
ValueError,
match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
):
ChunkingOptions.new(new_after_n_chars=n_chars)
ChunkingOptions(new_after_n_chars=n_chars)._validate()
def it_rejects_overlap_not_less_than_max_characters(self):
with pytest.raises(
@ -113,26 +94,23 @@ class DescribeChunkingOptions:
ChunkingOptions(max_characters=200, overlap=300)._validate()
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
In particular, `combine_text_under_n_chars` value is adjusted down to the
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
value of `new_after_n_chars`.
"""
"""Caller can specify `new_after_n_chars` arg without specifying any other options."""
opts = ChunkingOptions(new_after_n_chars=200)
try:
opts = ChunkingOptions.new(new_after_n_chars=200)
opts._validate()
except ValueError:
pytest.fail("did not accept `new_after_n_chars` as option by itself")
assert opts.soft_max == 200
assert opts.combine_text_under_n_chars == 200
def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self):
"""Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
This puts each element into its own chunk, although long chunks are still split.
"""
opts = ChunkingOptions.new(new_after_n_chars=0)
opts = ChunkingOptions(new_after_n_chars=0)
opts._validate()
assert opts.soft_max == 0
def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
@ -141,33 +119,32 @@ class DescribeChunkingOptions:
So rather than raising an exception or warning, we just cap that value at `max_characters`
which is the behavioral equivalent.
"""
opts = ChunkingOptions(max_characters=444, new_after_n_chars=555)
try:
opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555)
opts._validate()
except ValueError:
pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
assert opts.soft_max == 444
def it_knows_how_much_overlap_to_apply_to_split_chunks(self):
assert ChunkingOptions.new(overlap=10).overlap == 10
assert ChunkingOptions(overlap=10).overlap == 10
def and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks(self):
assert ChunkingOptions.new(overlap=10, overlap_all=True).inter_chunk_overlap == 10
assert ChunkingOptions(overlap=10, overlap_all=True).inter_chunk_overlap == 10
def but_it_does_not_overlap_pre_chunks_by_default(self):
assert ChunkingOptions.new(overlap=10).inter_chunk_overlap == 0
assert ChunkingOptions(overlap=10).inter_chunk_overlap == 0
def it_knows_the_text_separator_string(self):
assert ChunkingOptions.new().text_separator == "\n\n"
assert ChunkingOptions().text_separator == "\n\n"
class Describe_TextSplitter:
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
def it_splits_on_a_preferred_separator_when_it_can(self):
opts = ChunkingOptions.new(
max_characters=50, text_splitting_separators=("\n", " "), overlap=10
)
opts = ChunkingOptions(max_characters=50, text_splitting_separators=("\n", " "), overlap=10)
split = _TextSplitter(opts)
text = (
"Lorem ipsum dolor amet consectetur adipiscing. \n "
@ -189,9 +166,7 @@ class Describe_TextSplitter:
assert remainder == ""
def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
opts = ChunkingOptions.new(
max_characters=40, text_splitting_separators=("\n", " "), overlap=10
)
opts = ChunkingOptions(max_characters=40, text_splitting_separators=("\n", " "), overlap=10)
split = _TextSplitter(opts)
text = (
"Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
@ -211,9 +186,7 @@ class Describe_TextSplitter:
assert remainder == ""
def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
opts = ChunkingOptions.new(
max_characters=30, text_splitting_separators=("\n", " "), overlap=10
)
opts = ChunkingOptions(max_characters=30, text_splitting_separators=("\n", " "), overlap=10)
split = _TextSplitter(opts)
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
@ -237,7 +210,7 @@ class Describe_TextSplitter:
],
)
def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
opts = ChunkingOptions.new(max_characters=46, overlap=10)
opts = ChunkingOptions(max_characters=46, overlap=10)
split = _TextSplitter(opts)
s, remainder = split(text)
@ -246,7 +219,7 @@ class Describe_TextSplitter:
assert remainder == ""
def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
opts = ChunkingOptions.new(max_characters=38, overlap=10)
opts = ChunkingOptions(max_characters=38, overlap=10)
split = _TextSplitter(opts)
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
@ -257,9 +230,7 @@ class Describe_TextSplitter:
@pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
opts = ChunkingOptions.new(
max_characters=50, text_splitting_separators=separators, overlap=10
)
opts = ChunkingOptions(max_characters=50, text_splitting_separators=separators, overlap=10)
split = _TextSplitter(opts)
text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus."
# |-------------------------------------------------^ 50-chars
@ -271,12 +242,12 @@ class Describe_TextSplitter:
# ================================================================================================
# BASE PRE-CHUNKER
# PRE-CHUNKER
# ================================================================================================
class DescribeBasePreChunker:
"""Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects."""
class DescribePreChunker:
"""Unit-test suite for `unstructured.chunking.base.PreChunker` objects."""
def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
elements = [
@ -289,9 +260,9 @@ class DescribeBasePreChunker:
CheckBox(),
]
opts = ChunkingOptions.new(max_characters=150, new_after_n_chars=65)
opts = ChunkingOptions(max_characters=150, new_after_n_chars=65)
pre_chunk_iter = BasePreChunker.iter_pre_chunks(elements, opts=opts)
pre_chunk_iter = PreChunker.iter_pre_chunks(elements, opts=opts)
pre_chunk = next(pre_chunk_iter)
assert isinstance(pre_chunk, TextPreChunk)
@ -344,7 +315,7 @@ class DescribeTablePreChunk:
pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
overlap_prefix="ctus porta volutpat.",
opts=ChunkingOptions.new(max_characters=175),
opts=ChunkingOptions(max_characters=175),
)
chunk_iter = pre_chunk.iter_chunks()
@ -393,7 +364,7 @@ class DescribeTablePreChunk:
pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
overlap_prefix="",
opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")),
opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
)
chunk_iter = pre_chunk.iter_chunks()
@ -457,7 +428,7 @@ class DescribeTablePreChunk:
self, text: str, expected_value: str
):
pre_chunk = TablePreChunk(
Table(text), overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True)
Table(text), overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
)
assert pre_chunk.overlap_tail == expected_value
@ -480,7 +451,7 @@ class DescribeTablePreChunk:
self, text: str, overlap_prefix: str, expected_value: str
):
pre_chunk = TablePreChunk(
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
)
assert pre_chunk._text == expected_value
@ -536,7 +507,7 @@ class DescribeTextPreChunk:
self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool
):
"""This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals."""
opts = ChunkingOptions.new(
opts = ChunkingOptions(
max_characters=max_characters,
combine_text_under_n_chars=combine_text_under_n_chars,
overlap=20,
@ -560,7 +531,7 @@ class DescribeTextPreChunk:
Note that neither the original nor the other pre_chunk is mutated.
"""
opts = ChunkingOptions.new()
opts = ChunkingOptions()
pre_chunk = TextPreChunk(
[
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@ -623,7 +594,7 @@ class DescribeTextPreChunk:
),
],
overlap_prefix="e feugiat efficitur.",
opts=ChunkingOptions.new(max_characters=200),
opts=ChunkingOptions(max_characters=200),
)
chunk_iter = pre_chunk.iter_chunks()
@ -648,7 +619,7 @@ class DescribeTextPreChunk:
),
],
overlap_prefix="",
opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")),
opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
)
chunk_iter = pre_chunk.iter_chunks()
@ -681,7 +652,7 @@ class DescribeTextPreChunk:
self, text: str, expected_value: str
):
pre_chunk = TextPreChunk(
[Text(text)], overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True)
[Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
)
assert pre_chunk.overlap_tail == expected_value
@ -708,7 +679,7 @@ class DescribeTextPreChunk:
),
],
overlap_prefix="",
opts=ChunkingOptions.new(),
opts=ChunkingOptions(),
)
assert pre_chunk._all_metadata_values == {
@ -746,7 +717,7 @@ class DescribeTextPreChunk:
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
],
overlap_prefix="",
opts=ChunkingOptions.new(),
opts=ChunkingOptions(),
)
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
@ -789,7 +760,7 @@ class DescribeTextPreChunk:
),
],
overlap_prefix="ficitur.", # len == 8
opts=ChunkingOptions.new(),
opts=ChunkingOptions(),
)
regex_metadata = pre_chunk._consolidated_regex_meta
@ -845,7 +816,7 @@ class DescribeTextPreChunk:
),
],
overlap_prefix="",
opts=ChunkingOptions.new(),
opts=ChunkingOptions(),
)
meta_kwargs = pre_chunk._meta_kwargs
@ -881,9 +852,7 @@ class DescribeTextPreChunk:
The text-segment contributed by each element is separated from the next by a blank line
("\n\n"). An element that contributes no text does not give rise to a separator.
"""
pre_chunk = TextPreChunk(
elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
)
pre_chunk = TextPreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions())
assert pre_chunk._text == expected_value
@ -896,13 +865,13 @@ class DescribePreChunkBuilder:
"""Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
def it_is_empty_on_construction(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
assert builder._text_length == 0
assert builder._remaining_space == 50
def it_accumulates_elements_added_to_it(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
builder.add_element(Title("Introduction"))
assert builder._text_length == 12
@ -919,7 +888,7 @@ class DescribePreChunkBuilder:
@pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
builder = PreChunkBuilder(opts=ChunkingOptions())
assert builder.will_fit(element)
@pytest.mark.parametrize(
@ -934,22 +903,20 @@ class DescribePreChunkBuilder:
def but_not_when_it_already_contains_an_element_of_any_kind(
self, existing_element: Element, next_element: Element
):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
builder = PreChunkBuilder(opts=ChunkingOptions())
builder.add_element(existing_element)
assert not builder.will_fit(next_element)
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
builder = PreChunkBuilder(opts=ChunkingOptions.new())
builder = PreChunkBuilder(opts=ChunkingOptions())
builder.add_element(Table("Heading\nCell text"))
assert not builder.will_fit(element)
def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
builder = PreChunkBuilder(
opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
)
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)
@ -957,7 +924,7 @@ class DescribePreChunkBuilder:
assert not builder.will_fit(Text("In rhoncus ipsum."))
def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)
@ -968,7 +935,7 @@ class DescribePreChunkBuilder:
)
def but_it_will_fit_an_element_that_fits(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
builder.add_element(
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)
@ -977,7 +944,7 @@ class DescribePreChunkBuilder:
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
builder.add_element(Title("Introduction"))
builder.add_element(
Text(
@ -1000,7 +967,7 @@ class DescribePreChunkBuilder:
assert builder._remaining_space == 150
def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
builder.add_element(Table("Heading\nCell text"))
pre_chunk = next(builder.flush())
@ -1016,7 +983,7 @@ class DescribePreChunkBuilder:
assert pre_chunk._table == Table("Heading\nCell text")
def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
pre_chunks = list(builder.flush())
@ -1025,7 +992,7 @@ class DescribePreChunkBuilder:
assert builder._remaining_space == 150
def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
opts = ChunkingOptions.new(overlap=15, overlap_all=True)
opts = ChunkingOptions(overlap=15, overlap_all=True)
builder = PreChunkBuilder(opts=opts)
builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
@ -1044,7 +1011,7 @@ class DescribePreChunkBuilder:
assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
builder.add_element(Text("abcde"))
builder.add_element(Text("fghij"))
@ -1061,7 +1028,7 @@ class DescribePreChunkCombiner:
"""Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
def it_combines_sequential_small_text_pre_chunks(self):
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
pre_chunks = [
TextPreChunk(
[
@ -1105,7 +1072,7 @@ class DescribePreChunkCombiner:
next(pre_chunk_iter)
def but_it_does_not_combine_table_pre_chunks(self):
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
pre_chunks = [
TextPreChunk(
[
@ -1127,7 +1094,7 @@ class DescribePreChunkCombiner:
]
pre_chunk_iter = PreChunkCombiner(
pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
pre_chunks, ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
).iter_combined_pre_chunks()
pre_chunk = next(pre_chunk_iter)
@ -1152,7 +1119,7 @@ class DescribePreChunkCombiner:
next(pre_chunk_iter)
def it_respects_the_specified_combination_threshold(self):
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=80)
pre_chunks = [
TextPreChunk( # 68
[
@ -1203,7 +1170,7 @@ class DescribePreChunkCombiner:
next(pre_chunk_iter)
def it_respects_the_hard_maximum_window_length(self):
opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
opts = ChunkingOptions(max_characters=200, combine_text_under_n_chars=200)
pre_chunks = [
TextPreChunk( # 68
[
@ -1256,7 +1223,7 @@ class DescribePreChunkCombiner:
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
"""Such as occurs when a single element exceeds the window size."""
opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
opts = ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
pre_chunks = [
TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts),
TextPreChunk( # 179
@ -1274,7 +1241,7 @@ class DescribePreChunkCombiner:
]
pre_chunk_iter = PreChunkCombiner(
pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
pre_chunks, ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
).iter_combined_pre_chunks()
pre_chunk = next(pre_chunk_iter)
@ -1303,7 +1270,7 @@ class DescribeTextPreChunkAccumulator:
"""Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
opts = ChunkingOptions.new()
opts = ChunkingOptions(combine_text_under_n_chars=500)
accum = TextPreChunkAccumulator(opts=opts)
pre_chunk = TextPreChunk(
@ -1362,7 +1329,7 @@ class DescribeTextPreChunkAccumulator:
next(accum.flush())
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
accum = TextPreChunkAccumulator(opts=ChunkingOptions(max_characters=150))
assert list(accum.flush()) == []

View File

@ -1,4 +1,4 @@
"""Unit-test suite for the `unstructured.chunking.basic` module.
"""Test suite for the `unstructured.chunking.basic` module.
That module implements the baseline chunking strategy. The baseline strategy has all behaviors
shared by all chunking strategies and no extra rules like preserving section or page boundaries.

View File

@ -1,13 +1,20 @@
# pyright: reportPrivateUsage=false
"""Unit-test suite for the `unstructured.chunking.title` module."""
"""Test suite for the `unstructured.chunking.title` module."""
from __future__ import annotations
from typing import Optional
import pytest
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title
from unstructured.chunking.base import (
CHUNK_MULTI_PAGE_DEFAULT,
PreChunker,
TablePreChunk,
TextPreChunk,
)
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
CheckBox,
@ -23,6 +30,13 @@ from unstructured.documents.elements import (
)
from unstructured.partition.html import partition_html
# ================================================================================================
# INTEGRATION-TESTS
# ================================================================================================
# These test `chunk_by_title()` as an integrated whole, calling `chunk_by_title()` and inspecting
# the outputs.
# ================================================================================================
def test_it_splits_a_large_element_into_multiple_chunks():
elements: list[Element] = [
@ -57,7 +71,7 @@ def test_split_elements_by_title_and_table():
CheckBox(),
]
pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new())
pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk)
@ -544,3 +558,86 @@ def test_it_considers_separator_length_when_pre_chunking():
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
# ================================================================================================
# UNIT-TESTS
# ================================================================================================
# These test individual components in isolation so they can exercise all edge cases while still
# performing well.
# ================================================================================================
class Describe_ByTitleChunkingOptions:
"""Unit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects."""
@pytest.mark.parametrize("n_chars", [-1, -42])
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
with pytest.raises(
ValueError,
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
):
_ByTitleChunkingOptions.new(combine_text_under_n_chars=n_chars)
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
opts = _ByTitleChunkingOptions(combine_text_under_n_chars=0)
assert opts.combine_text_under_n_chars == 0
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
try:
opts = _ByTitleChunkingOptions(combine_text_under_n_chars=50)
except ValueError:
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
assert opts.combine_text_under_n_chars == 50
@pytest.mark.parametrize(
("combine_text_under_n_chars", "max_characters", "expected_hard_max"),
[(600, None, 500), (600, 450, 450)],
)
def it_rejects_combine_text_under_n_chars_greater_than_maxchars(
self, combine_text_under_n_chars: int, max_characters: Optional[int], expected_hard_max: int
):
"""`combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.
The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
`max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
larger number like 1500 then it can look like chunk-combining isn't working.
"""
with pytest.raises(
ValueError,
match=(
"'combine_text_under_n_chars' argument must not exceed `max_characters` value,"
f" got {combine_text_under_n_chars} > {expected_hard_max}"
),
):
_ByTitleChunkingOptions.new(
max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars
)
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
In particular, `combine_text_under_n_chars` value is adjusted down to the
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
value of `new_after_n_chars`.
"""
try:
opts = _ByTitleChunkingOptions(new_after_n_chars=200)
except ValueError:
pytest.fail("did not accept `new_after_n_chars` as option by itself")
assert opts.soft_max == 200
assert opts.combine_text_under_n_chars == 200
@pytest.mark.parametrize(
("multipage_sections", "expected_value"),
[(True, True), (False, False), (None, CHUNK_MULTI_PAGE_DEFAULT)],
)
def it_knows_whether_to_break_chunks_on_page_boundaries(
self, multipage_sections: bool, expected_value: bool
):
opts = _ByTitleChunkingOptions(multipage_sections=multipage_sections)
assert opts.multipage_sections is expected_value

View File

@ -7,7 +7,7 @@ import copy
from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, Sequence, cast
import regex
from typing_extensions import Self, TypeAlias
from typing_extensions import TypeAlias
from unstructured.documents.elements import (
CompositeElement,
@ -68,9 +68,6 @@ class ChunkingOptions:
when not specified, which effectively disables this behavior. Specifying 0 for this
argument causes each element to appear in a chunk by itself (although an element with text
longer than `max_characters` will still be split into two or more chunks).
multipage_sections
Indicates that page-boundaries should not be respected while chunking, i.e. elements
appearing on two different pages can appear in the same chunk.
combine_text_under_n_chars
Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
relevant for a chunking strategy that specifies higher-level semantic boundaries to be
@ -101,9 +98,9 @@ class ChunkingOptions:
def __init__(
self,
*,
combine_text_under_n_chars: Optional[int] = None,
max_characters: Optional[int] = None,
multipage_sections: Optional[bool] = None,
new_after_n_chars: Optional[int] = None,
overlap: Optional[int] = None,
overlap_all: Optional[bool] = None,
@ -111,59 +108,28 @@ class ChunkingOptions:
):
self._combine_text_under_n_chars_arg = combine_text_under_n_chars
self._max_characters_arg = max_characters
self._multipage_sections_arg = multipage_sections
self._new_after_n_chars_arg = new_after_n_chars
self._overlap_arg = overlap
self._overlap_all_arg = overlap_all
self._text_splitting_separators = text_splitting_separators
@classmethod
def new(
cls,
combine_text_under_n_chars: Optional[int] = None,
max_characters: Optional[int] = None,
multipage_sections: Optional[bool] = None,
new_after_n_chars: Optional[int] = None,
overlap: Optional[int] = None,
overlap_all: Optional[bool] = None,
text_splitting_separators: Sequence[str] = ("\n", " "),
) -> Self:
"""Construct validated instance.
@lazyproperty
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks.
Raises `ValueError` on invalid arguments like overlap > max_chars.
Overridden by sub-types to provide semantic-boundary isolation behaviors.
"""
self = cls(
combine_text_under_n_chars,
max_characters,
multipage_sections,
new_after_n_chars,
overlap,
overlap_all,
text_splitting_separators,
)
self._validate()
return self
return ()
@lazyproperty
def combine_text_under_n_chars(self) -> int:
"""Combine consecutive text pre-chunks if former is smaller than this and both will fit.
"""Combine two consecutive text pre-chunks if first is smaller than this and both will fit.
- Does not combine table chunks with text chunks even if they would both fit in the
chunking window.
- Does not combine text chunks if together they would exceed the chunking window.
- Defaults to `max_characters` when not specified.
- Is reduced to `new_after_n_chars` when it exceeds that value.
Default applied here is `0` which essentially disables chunk combining. Must be overridden
by subclass where combining behavior is supported.
"""
max_characters = self.hard_max
soft_max = self.soft_max
arg = self._combine_text_under_n_chars_arg
# -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is
# -- capped at max-chars
combine_text_under_n_chars = max_characters if arg is None or arg > max_characters else arg
# -- `new_after_n_chars` takes precedence on conflict with `combine_text_under_n_chars` --
return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
arg_value = self._combine_text_under_n_chars_arg
return arg_value if arg_value is not None else 0
@lazyproperty
def hard_max(self) -> int:
@ -185,12 +151,6 @@ class ChunkingOptions:
"""
return self.overlap if self._overlap_all_arg else 0
@lazyproperty
def multipage_sections(self) -> bool:
    """Whether a pre-chunk may span a page boundary.

    When False, pre-chunks are broken on page-boundaries. Falls back to
    `CHUNK_MULTI_PAGE_DEFAULT` when the caller did not specify a value.
    """
    requested = self._multipage_sections_arg
    # -- an unspecified option takes the module-level default --
    if requested is None:
        return CHUNK_MULTI_PAGE_DEFAULT
    # -- coerce so a truthy/falsy non-bool argument still yields a real `bool` --
    return bool(requested)
@lazyproperty
def overlap(self) -> int:
"""The number of characters to overlap text when splitting chunks mid-text.
@ -256,28 +216,15 @@ class ChunkingOptions:
if max_characters <= 0:
raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}")
# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
# -- but a negative value is not
combine_text_under_n_chars = self._combine_text_under_n_chars_arg
if combine_text_under_n_chars is not None and combine_text_under_n_chars < 0:
raise ValueError(
f"'combine_text_under_n_chars' argument must be >= 0,"
f" got {combine_text_under_n_chars}"
)
# -- a negative value for `new_after_n_chars` is assumed to
# -- be a mistake the caller will want to know about
# -- a negative value for `new_after_n_chars` is assumed to be a mistake the caller will
# -- want to know about
new_after_n_chars = self._new_after_n_chars_arg
if new_after_n_chars is not None and new_after_n_chars < 0:
raise ValueError(
f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}"
)
# -- overlap must be less than max-chars or the chunk text will
# -- never be consumed
# TODO: consider a heuristic like never overlap more than half,
# otherwise there could be corner cases leading to an infinite
# loop (I think).
# -- overlap must be less than max-chars or the chunk text will never be consumed --
if self.overlap >= max_characters:
raise ValueError(
f"'overlap' argument must be less than `max_characters`,"
@ -402,12 +349,12 @@ class _TextSplitter:
# ================================================================================================
# BASE PRE-CHUNKER
# PRE-CHUNKER
# ================================================================================================
class BasePreChunker:
"""Base-class for per-strategy pre-chunkers.
class PreChunker:
"""Gathers sequential elements into pre-chunks as length constraints allow.
The pre-chunker's responsibilities are:
@ -465,7 +412,7 @@ class BasePreChunker:
@lazyproperty
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks."""
return ()
return self._opts.boundary_predicates
def _is_in_new_semantic_unit(self, element: Element) -> bool:
"""True when `element` begins a new semantic unit such as a section or page."""

View File

@ -17,7 +17,9 @@ from __future__ import annotations
from typing import Iterable, Optional
from unstructured.chunking.base import BasePreChunker, ChunkingOptions
from typing_extensions import Self
from unstructured.chunking.base import ChunkingOptions, PreChunker
from unstructured.documents.elements import Element
@ -58,7 +60,7 @@ def chunk_elements(
level of "pollution" of otherwise clean semantic chunk boundaries.
"""
# -- raises ValueError on invalid parameters --
opts = ChunkingOptions.new(
opts = _BasicChunkingOptions.new(
max_characters=max_characters,
new_after_n_chars=new_after_n_chars,
overlap=overlap,
@ -67,14 +69,32 @@ def chunk_elements(
return [
chunk
for pre_chunk in BasicPreChunker.iter_pre_chunks(elements, opts)
for pre_chunk in PreChunker.iter_pre_chunks(elements, opts)
for chunk in pre_chunk.iter_chunks()
]
class BasicPreChunker(BasePreChunker):
"""Produces pre-chunks from a sequence of document-elements using the "basic" rule-set.
class _BasicChunkingOptions(ChunkingOptions):
"""Options for `basic` chunking."""
The "basic" rule-set is essentially "no-rules" other than `Table` is segregated into its own
pre-chunk.
@classmethod
def new(
    cls,
    *,
    max_characters: Optional[int] = None,
    new_after_n_chars: Optional[int] = None,
    overlap: Optional[int] = None,
    overlap_all: Optional[bool] = None,
) -> Self:
    """Build an options object for `basic` chunking and validate it before returning.

    All arguments are keyword-only and are passed through to the constructor unchanged.

    Raises `ValueError` on invalid arguments like overlap > max_chars.
    """
    opts = cls(
        max_characters=max_characters,
        new_after_n_chars=new_after_n_chars,
        overlap=overlap,
        overlap_all=overlap_all,
    )
    # -- validation is a separate step so the bare constructor never raises on bad values --
    opts._validate()
    return opts

View File

@ -7,11 +7,14 @@ from __future__ import annotations
from typing import Iterable, Iterator, Optional
from typing_extensions import Self
from unstructured.chunking.base import (
BasePreChunker,
CHUNK_MULTI_PAGE_DEFAULT,
BoundaryPredicate,
ChunkingOptions,
PreChunkCombiner,
PreChunker,
is_in_next_section,
is_on_next_page,
is_title,
@ -22,6 +25,7 @@ from unstructured.utils import lazyproperty
def chunk_by_title(
elements: Iterable[Element],
*,
max_characters: Optional[int] = None,
multipage_sections: Optional[bool] = None,
combine_text_under_n_chars: Optional[int] = None,
@ -65,7 +69,7 @@ def chunk_by_title(
elements and not subject to text-splitting. Use this with caution as it entails a certain
level of "pollution" of otherwise clean semantic chunk boundaries.
"""
opts = ChunkingOptions.new(
opts = _ByTitleChunkingOptions.new(
combine_text_under_n_chars=combine_text_under_n_chars,
max_characters=max_characters,
multipage_sections=multipage_sections,
@ -75,27 +79,127 @@ def chunk_by_title(
)
pre_chunks = PreChunkCombiner(
_ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts
PreChunker.iter_pre_chunks(elements, opts), opts=opts
).iter_combined_pre_chunks()
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]
class _ByTitlePreChunker(BasePreChunker):
"""Pre-chunker for the "by_title" chunking strategy.
class _ByTitleChunkingOptions(ChunkingOptions):
"""Adds the by-title-specific chunking options to the base case.
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a
new "section", hence the "by-title" designation.
`by_title`-specific options:
multipage_sections
Indicates that page-boundaries should not be respected while chunking, i.e. elements
appearing on two different pages can appear in the same chunk.
"""
def __init__(
    self,
    *,
    max_characters: Optional[int] = None,
    combine_text_under_n_chars: Optional[int] = None,
    multipage_sections: Optional[bool] = None,
    new_after_n_chars: Optional[int] = None,
    overlap: Optional[int] = None,
    overlap_all: Optional[bool] = None,
):
    """Store the raw `by_title` option arguments.

    Every argument except `multipage_sections` is forwarded to the base-class initializer;
    `multipage_sections` is specific to this strategy and is kept on this instance. No
    validation happens here, that is the job of `_validate()`.
    """
    # -- options handled by the base class --
    super().__init__(
        max_characters=max_characters,
        combine_text_under_n_chars=combine_text_under_n_chars,
        new_after_n_chars=new_after_n_chars,
        overlap=overlap,
        overlap_all=overlap_all,
    )
    # -- strategy-specific option, resolved lazily by the `multipage_sections` property --
    self._multipage_sections_arg = multipage_sections
@classmethod
def new(
    cls,
    *,
    max_characters: Optional[int] = None,
    combine_text_under_n_chars: Optional[int] = None,
    multipage_sections: Optional[bool] = None,
    new_after_n_chars: Optional[int] = None,
    overlap: Optional[int] = None,
    overlap_all: Optional[bool] = None,
) -> Self:
    """Build an options object for `by_title` chunking and validate it before returning.

    All arguments are keyword-only and are passed through to the constructor unchanged.

    Raises `ValueError` on invalid arguments like overlap > max_chars.
    """
    opts = cls(
        max_characters=max_characters,
        combine_text_under_n_chars=combine_text_under_n_chars,
        multipage_sections=multipage_sections,
        new_after_n_chars=new_after_n_chars,
        overlap=overlap,
        overlap_all=overlap_all,
    )
    # -- validation is a separate step so the bare constructor never raises on bad values --
    opts._validate()
    return opts
@lazyproperty
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks."""
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks.
For the `by_title` strategy these are sections indicated by a title (section-heading), an
explicit section metadata item (only present for certain document types), and optionally
page boundaries.
"""
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
yield is_title
yield is_in_next_section()
if not self._opts.multipage_sections:
if not self.multipage_sections:
yield is_on_next_page()
return tuple(iter_boundary_predicates())
@lazyproperty
def combine_text_under_n_chars(self) -> int:
    """Size threshold below which a text pre-chunk is merged with the one that follows it.

    - Table chunks are never combined with text chunks, even if both would fit in the
      chunking window.
    - Text chunks are not combined when together they would exceed the chunking window.
    - Defaults to `max_characters` when not specified.
    - Is reduced to `new_after_n_chars` when it exceeds that value.
    """
    hard_max = self.hard_max
    soft_max = self.soft_max
    requested = self._combine_text_under_n_chars_arg

    # -- an unspecified value falls back to `max_characters` --
    threshold = hard_max if requested is None else requested

    # -- `new_after_n_chars` takes precedence on conflict, so cap the threshold at soft-max --
    return min(threshold, soft_max)
@lazyproperty
def multipage_sections(self) -> bool:
    """Whether a pre-chunk may span a page boundary.

    When False, pre-chunks are broken on page-boundaries. Falls back to
    `CHUNK_MULTI_PAGE_DEFAULT` when the caller did not specify a value.
    """
    requested = self._multipage_sections_arg
    # -- an unspecified option takes the module-level default --
    if requested is None:
        return CHUNK_MULTI_PAGE_DEFAULT
    # -- coerce so a truthy/falsy non-bool argument still yields a real `bool` --
    return bool(requested)
def _validate(self) -> None:
    """Raise `ValueError` when the requested option-set is invalid.

    Runs the base-class validations, then checks the `combine_text_under_n_chars` argument:
    it must be non-negative and must not exceed `max_characters`.
    """
    # -- base-class validations come first --
    super()._validate()

    requested = self._combine_text_under_n_chars_arg
    # -- nothing more to check when the caller left this option unspecified --
    if requested is None:
        return

    # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
    # -- but a negative value is not
    if requested < 0:
        raise ValueError(
            f"'combine_text_under_n_chars' argument must be >= 0,"
            f" got {requested}"
        )

    # -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
    # -- users. The chunking behavior would be no different than when
    # -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
    # -- default (500) then it can look like chunk-combining isn't working.
    hard_max = self.hard_max
    if requested > hard_max:
        raise ValueError(
            f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
            f" value, got {requested} > {hard_max}"
        )