mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 11:03:38 +00:00
rfctr(chunking): extract strategy-specific chunking options (#2556)
**Summary** A pluggable chunking strategy needs its own local set of chunking options that subclasses a base-class in `unstructured`. Extract distinct `_ByTitleChunkingOptions` and `_BasicChunkingOptions` for the existing two chunking strategies and move their strategy-specific option setting and validation to the respective subclass. This was also a good opportunity for us to clean up a few odds and ends we'd been meaning to. Might be worth looking at the commits individually as they are cohesive incremental steps toward the goal.
This commit is contained in:
parent
b4d9ad8130
commit
51cf6bf716
@ -4,15 +4,15 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Sequence
|
||||
from typing import Optional, Sequence
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.chunking.base import (
|
||||
BasePreChunker,
|
||||
ChunkingOptions,
|
||||
PreChunkBuilder,
|
||||
PreChunkCombiner,
|
||||
PreChunker,
|
||||
TablePreChunk,
|
||||
TextPreChunk,
|
||||
TextPreChunkAccumulator,
|
||||
@ -48,7 +48,7 @@ class DescribeChunkingOptions:
|
||||
ValueError,
|
||||
match=f"'max_characters' argument must be > 0, got {max_characters}",
|
||||
):
|
||||
ChunkingOptions.new(max_characters=max_characters)
|
||||
ChunkingOptions(max_characters=max_characters)._validate()
|
||||
|
||||
def it_does_not_complain_when_specifying_max_characters_by_itself(self):
|
||||
"""Caller can specify `max_characters` arg without specifying any others.
|
||||
@ -58,44 +58,25 @@ class DescribeChunkingOptions:
|
||||
and trigger an exception.
|
||||
"""
|
||||
try:
|
||||
ChunkingOptions.new(max_characters=50)
|
||||
ChunkingOptions(max_characters=50)._validate()
|
||||
except ValueError:
|
||||
pytest.fail("did not accept `max_characters` as option by itself")
|
||||
|
||||
@pytest.mark.parametrize("n_chars", [-1, -42])
|
||||
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
|
||||
):
|
||||
ChunkingOptions.new(combine_text_under_n_chars=n_chars)
|
||||
@pytest.mark.parametrize(
|
||||
("combine_text_under_n_chars", "expected_value"), [(None, 0), (42, 42)]
|
||||
)
|
||||
def it_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combining(
|
||||
self, combine_text_under_n_chars: Optional[int], expected_value: int
|
||||
):
|
||||
"""Subclasses can store `combine_text_under_n_chars` but must validate and enable it.
|
||||
|
||||
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
|
||||
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
|
||||
opts = ChunkingOptions.new(combine_text_under_n_chars=0)
|
||||
assert opts.combine_text_under_n_chars == 0
|
||||
|
||||
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
|
||||
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
|
||||
try:
|
||||
opts = ChunkingOptions.new(combine_text_under_n_chars=50)
|
||||
except ValueError:
|
||||
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
|
||||
|
||||
assert opts.combine_text_under_n_chars == 50
|
||||
|
||||
def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self):
|
||||
"""`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.
|
||||
|
||||
So rather than raising an exception or warning, we just cap that value at `max_characters`
|
||||
which is the behavioral equivalent.
|
||||
The `combine_text_under_n_chars` option is not used by all chunkers and its behavior can
|
||||
differ between subtypes. It is present in and stored by the contructur but it defaults to
|
||||
`0` (no pre-chunk combining) and must be overridden by subclasses to give it the desired
|
||||
behavior.
|
||||
"""
|
||||
try:
|
||||
opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600)
|
||||
except ValueError:
|
||||
pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`")
|
||||
|
||||
assert opts.combine_text_under_n_chars == 500
|
||||
opts = ChunkingOptions(combine_text_under_n_chars=combine_text_under_n_chars)
|
||||
assert opts.combine_text_under_n_chars == expected_value
|
||||
|
||||
@pytest.mark.parametrize("n_chars", [-1, -42])
|
||||
def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
|
||||
@ -103,7 +84,7 @@ class DescribeChunkingOptions:
|
||||
ValueError,
|
||||
match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
|
||||
):
|
||||
ChunkingOptions.new(new_after_n_chars=n_chars)
|
||||
ChunkingOptions(new_after_n_chars=n_chars)._validate()
|
||||
|
||||
def it_rejects_overlap_not_less_than_max_characters(self):
|
||||
with pytest.raises(
|
||||
@ -113,26 +94,23 @@ class DescribeChunkingOptions:
|
||||
ChunkingOptions(max_characters=200, overlap=300)._validate()
|
||||
|
||||
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
|
||||
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
|
||||
|
||||
In particular, `combine_text_under_n_chars` value is adjusted down to the
|
||||
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
|
||||
value of `new_after_n_chars`.
|
||||
"""
|
||||
"""Caller can specify `new_after_n_chars` arg without specifying any other options."""
|
||||
opts = ChunkingOptions(new_after_n_chars=200)
|
||||
try:
|
||||
opts = ChunkingOptions.new(new_after_n_chars=200)
|
||||
opts._validate()
|
||||
except ValueError:
|
||||
pytest.fail("did not accept `new_after_n_chars` as option by itself")
|
||||
|
||||
assert opts.soft_max == 200
|
||||
assert opts.combine_text_under_n_chars == 200
|
||||
|
||||
def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self):
|
||||
"""Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
|
||||
|
||||
This puts each element into its own chunk, although long chunks are still split.
|
||||
"""
|
||||
opts = ChunkingOptions.new(new_after_n_chars=0)
|
||||
opts = ChunkingOptions(new_after_n_chars=0)
|
||||
opts._validate()
|
||||
|
||||
assert opts.soft_max == 0
|
||||
|
||||
def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
|
||||
@ -141,33 +119,32 @@ class DescribeChunkingOptions:
|
||||
So rather than raising an exception or warning, we just cap that value at `max_characters`
|
||||
which is the behavioral equivalent.
|
||||
"""
|
||||
opts = ChunkingOptions(max_characters=444, new_after_n_chars=555)
|
||||
try:
|
||||
opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555)
|
||||
opts._validate()
|
||||
except ValueError:
|
||||
pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
|
||||
|
||||
assert opts.soft_max == 444
|
||||
|
||||
def it_knows_how_much_overlap_to_apply_to_split_chunks(self):
|
||||
assert ChunkingOptions.new(overlap=10).overlap == 10
|
||||
assert ChunkingOptions(overlap=10).overlap == 10
|
||||
|
||||
def and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks(self):
|
||||
assert ChunkingOptions.new(overlap=10, overlap_all=True).inter_chunk_overlap == 10
|
||||
assert ChunkingOptions(overlap=10, overlap_all=True).inter_chunk_overlap == 10
|
||||
|
||||
def but_it_does_not_overlap_pre_chunks_by_default(self):
|
||||
assert ChunkingOptions.new(overlap=10).inter_chunk_overlap == 0
|
||||
assert ChunkingOptions(overlap=10).inter_chunk_overlap == 0
|
||||
|
||||
def it_knows_the_text_separator_string(self):
|
||||
assert ChunkingOptions.new().text_separator == "\n\n"
|
||||
assert ChunkingOptions().text_separator == "\n\n"
|
||||
|
||||
|
||||
class Describe_TextSplitter:
|
||||
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
|
||||
|
||||
def it_splits_on_a_preferred_separator_when_it_can(self):
|
||||
opts = ChunkingOptions.new(
|
||||
max_characters=50, text_splitting_separators=("\n", " "), overlap=10
|
||||
)
|
||||
opts = ChunkingOptions(max_characters=50, text_splitting_separators=("\n", " "), overlap=10)
|
||||
split = _TextSplitter(opts)
|
||||
text = (
|
||||
"Lorem ipsum dolor amet consectetur adipiscing. \n "
|
||||
@ -189,9 +166,7 @@ class Describe_TextSplitter:
|
||||
assert remainder == ""
|
||||
|
||||
def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
|
||||
opts = ChunkingOptions.new(
|
||||
max_characters=40, text_splitting_separators=("\n", " "), overlap=10
|
||||
)
|
||||
opts = ChunkingOptions(max_characters=40, text_splitting_separators=("\n", " "), overlap=10)
|
||||
split = _TextSplitter(opts)
|
||||
text = (
|
||||
"Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
|
||||
@ -211,9 +186,7 @@ class Describe_TextSplitter:
|
||||
assert remainder == ""
|
||||
|
||||
def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
|
||||
opts = ChunkingOptions.new(
|
||||
max_characters=30, text_splitting_separators=("\n", " "), overlap=10
|
||||
)
|
||||
opts = ChunkingOptions(max_characters=30, text_splitting_separators=("\n", " "), overlap=10)
|
||||
split = _TextSplitter(opts)
|
||||
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
|
||||
|
||||
@ -237,7 +210,7 @@ class Describe_TextSplitter:
|
||||
],
|
||||
)
|
||||
def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
|
||||
opts = ChunkingOptions.new(max_characters=46, overlap=10)
|
||||
opts = ChunkingOptions(max_characters=46, overlap=10)
|
||||
split = _TextSplitter(opts)
|
||||
|
||||
s, remainder = split(text)
|
||||
@ -246,7 +219,7 @@ class Describe_TextSplitter:
|
||||
assert remainder == ""
|
||||
|
||||
def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
|
||||
opts = ChunkingOptions.new(max_characters=38, overlap=10)
|
||||
opts = ChunkingOptions(max_characters=38, overlap=10)
|
||||
split = _TextSplitter(opts)
|
||||
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
|
||||
|
||||
@ -257,9 +230,7 @@ class Describe_TextSplitter:
|
||||
|
||||
@pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
|
||||
def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
|
||||
opts = ChunkingOptions.new(
|
||||
max_characters=50, text_splitting_separators=separators, overlap=10
|
||||
)
|
||||
opts = ChunkingOptions(max_characters=50, text_splitting_separators=separators, overlap=10)
|
||||
split = _TextSplitter(opts)
|
||||
text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus."
|
||||
# |-------------------------------------------------^ 50-chars
|
||||
@ -271,12 +242,12 @@ class Describe_TextSplitter:
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# BASE PRE-CHUNKER
|
||||
# PRE-CHUNKER
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class DescribeBasePreChunker:
|
||||
"""Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects."""
|
||||
class DescribePreChunker:
|
||||
"""Unit-test suite for `unstructured.chunking.base.PreChunker` objects."""
|
||||
|
||||
def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
|
||||
elements = [
|
||||
@ -289,9 +260,9 @@ class DescribeBasePreChunker:
|
||||
CheckBox(),
|
||||
]
|
||||
|
||||
opts = ChunkingOptions.new(max_characters=150, new_after_n_chars=65)
|
||||
opts = ChunkingOptions(max_characters=150, new_after_n_chars=65)
|
||||
|
||||
pre_chunk_iter = BasePreChunker.iter_pre_chunks(elements, opts=opts)
|
||||
pre_chunk_iter = PreChunker.iter_pre_chunks(elements, opts=opts)
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
@ -344,7 +315,7 @@ class DescribeTablePreChunk:
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||
overlap_prefix="ctus porta volutpat.",
|
||||
opts=ChunkingOptions.new(max_characters=175),
|
||||
opts=ChunkingOptions(max_characters=175),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
@ -393,7 +364,7 @@ class DescribeTablePreChunk:
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||
overlap_prefix="",
|
||||
opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")),
|
||||
opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
@ -457,7 +428,7 @@ class DescribeTablePreChunk:
|
||||
self, text: str, expected_value: str
|
||||
):
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text), overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True)
|
||||
Table(text), overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
|
||||
)
|
||||
assert pre_chunk.overlap_tail == expected_value
|
||||
|
||||
@ -480,7 +451,7 @@ class DescribeTablePreChunk:
|
||||
self, text: str, overlap_prefix: str, expected_value: str
|
||||
):
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
|
||||
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
|
||||
)
|
||||
assert pre_chunk._text == expected_value
|
||||
|
||||
@ -536,7 +507,7 @@ class DescribeTextPreChunk:
|
||||
self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool
|
||||
):
|
||||
"""This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals."""
|
||||
opts = ChunkingOptions.new(
|
||||
opts = ChunkingOptions(
|
||||
max_characters=max_characters,
|
||||
combine_text_under_n_chars=combine_text_under_n_chars,
|
||||
overlap=20,
|
||||
@ -560,7 +531,7 @@ class DescribeTextPreChunk:
|
||||
|
||||
Note that neither the original or other pre_chunk are mutated.
|
||||
"""
|
||||
opts = ChunkingOptions.new()
|
||||
opts = ChunkingOptions()
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
@ -623,7 +594,7 @@ class DescribeTextPreChunk:
|
||||
),
|
||||
],
|
||||
overlap_prefix="e feugiat efficitur.",
|
||||
opts=ChunkingOptions.new(max_characters=200),
|
||||
opts=ChunkingOptions(max_characters=200),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
@ -648,7 +619,7 @@ class DescribeTextPreChunk:
|
||||
),
|
||||
],
|
||||
overlap_prefix="",
|
||||
opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")),
|
||||
opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
|
||||
)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
@ -681,7 +652,7 @@ class DescribeTextPreChunk:
|
||||
self, text: str, expected_value: str
|
||||
):
|
||||
pre_chunk = TextPreChunk(
|
||||
[Text(text)], overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True)
|
||||
[Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
|
||||
)
|
||||
assert pre_chunk.overlap_tail == expected_value
|
||||
|
||||
@ -708,7 +679,7 @@ class DescribeTextPreChunk:
|
||||
),
|
||||
],
|
||||
overlap_prefix="",
|
||||
opts=ChunkingOptions.new(),
|
||||
opts=ChunkingOptions(),
|
||||
)
|
||||
|
||||
assert pre_chunk._all_metadata_values == {
|
||||
@ -746,7 +717,7 @@ class DescribeTextPreChunk:
|
||||
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
|
||||
],
|
||||
overlap_prefix="",
|
||||
opts=ChunkingOptions.new(),
|
||||
opts=ChunkingOptions(),
|
||||
)
|
||||
|
||||
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
|
||||
@ -789,7 +760,7 @@ class DescribeTextPreChunk:
|
||||
),
|
||||
],
|
||||
overlap_prefix="ficitur.", # len == 8
|
||||
opts=ChunkingOptions.new(),
|
||||
opts=ChunkingOptions(),
|
||||
)
|
||||
|
||||
regex_metadata = pre_chunk._consolidated_regex_meta
|
||||
@ -845,7 +816,7 @@ class DescribeTextPreChunk:
|
||||
),
|
||||
],
|
||||
overlap_prefix="",
|
||||
opts=ChunkingOptions.new(),
|
||||
opts=ChunkingOptions(),
|
||||
)
|
||||
|
||||
meta_kwargs = pre_chunk._meta_kwargs
|
||||
@ -881,9 +852,7 @@ class DescribeTextPreChunk:
|
||||
The text-segment contributed by each element is separated from the next by a blank line
|
||||
("\n\n"). An element that contributes no text does not give rise to a separator.
|
||||
"""
|
||||
pre_chunk = TextPreChunk(
|
||||
elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
|
||||
)
|
||||
pre_chunk = TextPreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions())
|
||||
assert pre_chunk._text == expected_value
|
||||
|
||||
|
||||
@ -896,13 +865,13 @@ class DescribePreChunkBuilder:
|
||||
"""Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
|
||||
|
||||
def it_is_empty_on_construction(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
|
||||
|
||||
assert builder._text_length == 0
|
||||
assert builder._remaining_space == 50
|
||||
|
||||
def it_accumulates_elements_added_to_it(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
|
||||
|
||||
builder.add_element(Title("Introduction"))
|
||||
assert builder._text_length == 12
|
||||
@ -919,7 +888,7 @@ class DescribePreChunkBuilder:
|
||||
|
||||
@pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
|
||||
def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions())
|
||||
assert builder.will_fit(element)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -934,22 +903,20 @@ class DescribePreChunkBuilder:
|
||||
def but_not_when_it_already_contains_an_element_of_any_kind(
|
||||
self, existing_element: Element, next_element: Element
|
||||
):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions())
|
||||
builder.add_element(existing_element)
|
||||
|
||||
assert not builder.will_fit(next_element)
|
||||
|
||||
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
|
||||
def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions())
|
||||
builder.add_element(Table("Heading\nCell text"))
|
||||
|
||||
assert not builder.will_fit(element)
|
||||
|
||||
def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
|
||||
builder = PreChunkBuilder(
|
||||
opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
|
||||
)
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
|
||||
builder.add_element(
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||
)
|
||||
@ -957,7 +924,7 @@ class DescribePreChunkBuilder:
|
||||
assert not builder.will_fit(Text("In rhoncus ipsum."))
|
||||
|
||||
def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
|
||||
builder.add_element(
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||
)
|
||||
@ -968,7 +935,7 @@ class DescribePreChunkBuilder:
|
||||
)
|
||||
|
||||
def but_it_will_fit_an_element_that_fits(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
|
||||
builder.add_element(
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||
)
|
||||
@ -977,7 +944,7 @@ class DescribePreChunkBuilder:
|
||||
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
|
||||
|
||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
|
||||
builder.add_element(Title("Introduction"))
|
||||
builder.add_element(
|
||||
Text(
|
||||
@ -1000,7 +967,7 @@ class DescribePreChunkBuilder:
|
||||
assert builder._remaining_space == 150
|
||||
|
||||
def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
|
||||
builder.add_element(Table("Heading\nCell text"))
|
||||
|
||||
pre_chunk = next(builder.flush())
|
||||
@ -1016,7 +983,7 @@ class DescribePreChunkBuilder:
|
||||
assert pre_chunk._table == Table("Heading\nCell text")
|
||||
|
||||
def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
|
||||
|
||||
pre_chunks = list(builder.flush())
|
||||
|
||||
@ -1025,7 +992,7 @@ class DescribePreChunkBuilder:
|
||||
assert builder._remaining_space == 150
|
||||
|
||||
def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
|
||||
opts = ChunkingOptions.new(overlap=15, overlap_all=True)
|
||||
opts = ChunkingOptions(overlap=15, overlap_all=True)
|
||||
builder = PreChunkBuilder(opts=opts)
|
||||
|
||||
builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
|
||||
@ -1044,7 +1011,7 @@ class DescribePreChunkBuilder:
|
||||
assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
|
||||
|
||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
|
||||
builder.add_element(Text("abcde"))
|
||||
builder.add_element(Text("fghij"))
|
||||
|
||||
@ -1061,7 +1028,7 @@ class DescribePreChunkCombiner:
|
||||
"""Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
|
||||
|
||||
def it_combines_sequential_small_text_pre_chunks(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
|
||||
pre_chunks = [
|
||||
TextPreChunk(
|
||||
[
|
||||
@ -1105,7 +1072,7 @@ class DescribePreChunkCombiner:
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def but_it_does_not_combine_table_pre_chunks(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
|
||||
pre_chunks = [
|
||||
TextPreChunk(
|
||||
[
|
||||
@ -1127,7 +1094,7 @@ class DescribePreChunkCombiner:
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
||||
pre_chunks, ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
@ -1152,7 +1119,7 @@ class DescribePreChunkCombiner:
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_respects_the_specified_combination_threshold(self):
|
||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
|
||||
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=80)
|
||||
pre_chunks = [
|
||||
TextPreChunk( # 68
|
||||
[
|
||||
@ -1203,7 +1170,7 @@ class DescribePreChunkCombiner:
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_respects_the_hard_maximum_window_length(self):
|
||||
opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
|
||||
opts = ChunkingOptions(max_characters=200, combine_text_under_n_chars=200)
|
||||
pre_chunks = [
|
||||
TextPreChunk( # 68
|
||||
[
|
||||
@ -1256,7 +1223,7 @@ class DescribePreChunkCombiner:
|
||||
|
||||
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
|
||||
"""Such as occurs when a single element exceeds the window size."""
|
||||
opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
||||
opts = ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
|
||||
pre_chunks = [
|
||||
TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts),
|
||||
TextPreChunk( # 179
|
||||
@ -1274,7 +1241,7 @@ class DescribePreChunkCombiner:
|
||||
]
|
||||
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
||||
pre_chunks, ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
@ -1303,7 +1270,7 @@ class DescribeTextPreChunkAccumulator:
|
||||
"""Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
|
||||
|
||||
def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
opts = ChunkingOptions.new()
|
||||
opts = ChunkingOptions(combine_text_under_n_chars=500)
|
||||
accum = TextPreChunkAccumulator(opts=opts)
|
||||
|
||||
pre_chunk = TextPreChunk(
|
||||
@ -1362,7 +1329,7 @@ class DescribeTextPreChunkAccumulator:
|
||||
next(accum.flush())
|
||||
|
||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
|
||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions(max_characters=150))
|
||||
assert list(accum.flush()) == []
|
||||
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""Unit-test suite for the `unstructured.chunking.basic` module.
|
||||
"""Test suite for the `unstructured.chunking.basic` module.
|
||||
|
||||
That module implements the baseline chunking strategy. The baseline strategy has all behaviors
|
||||
shared by all chunking strategies and no extra rules like perserve section or page boundaries.
|
||||
|
||||
@ -1,13 +1,20 @@
|
||||
# pyright: reportPrivateUsage=false
|
||||
|
||||
"""Unit-test suite for the `unstructured.chunking.title` module."""
|
||||
"""Test suite for the `unstructured.chunking.title` module."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
|
||||
from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title
|
||||
from unstructured.chunking.base import (
|
||||
CHUNK_MULTI_PAGE_DEFAULT,
|
||||
PreChunker,
|
||||
TablePreChunk,
|
||||
TextPreChunk,
|
||||
)
|
||||
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
|
||||
from unstructured.documents.coordinates import CoordinateSystem
|
||||
from unstructured.documents.elements import (
|
||||
CheckBox,
|
||||
@ -23,6 +30,13 @@ from unstructured.documents.elements import (
|
||||
)
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
# ================================================================================================
|
||||
# INTEGRATION-TESTS
|
||||
# ================================================================================================
|
||||
# These test `chunk_by_title()` as an integrated whole, calling `chunk_by_title()` and inspecting
|
||||
# the outputs.
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def test_it_splits_a_large_element_into_multiple_chunks():
|
||||
elements: list[Element] = [
|
||||
@ -57,7 +71,7 @@ def test_split_elements_by_title_and_table():
|
||||
CheckBox(),
|
||||
]
|
||||
|
||||
pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new())
|
||||
pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
|
||||
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
@ -544,3 +558,86 @@ def test_it_considers_separator_length_when_pre_chunking():
|
||||
),
|
||||
CompositeElement("Minimize mid-text chunk-splitting"),
|
||||
]
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# UNIT-TESTS
|
||||
# ================================================================================================
|
||||
# These test individual components in isolation so can exercise all edge cases while still
|
||||
# performing well.
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class Describe_ByTitleChunkingOptions:
|
||||
"""Unit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects."""
|
||||
|
||||
@pytest.mark.parametrize("n_chars", [-1, -42])
|
||||
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
|
||||
):
|
||||
_ByTitleChunkingOptions.new(combine_text_under_n_chars=n_chars)
|
||||
|
||||
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
|
||||
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
|
||||
opts = _ByTitleChunkingOptions(combine_text_under_n_chars=0)
|
||||
assert opts.combine_text_under_n_chars == 0
|
||||
|
||||
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
|
||||
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
|
||||
try:
|
||||
opts = _ByTitleChunkingOptions(combine_text_under_n_chars=50)
|
||||
except ValueError:
|
||||
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
|
||||
|
||||
assert opts.combine_text_under_n_chars == 50
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("combine_text_under_n_chars", "max_characters", "expected_hard_max"),
|
||||
[(600, None, 500), (600, 450, 450)],
|
||||
)
|
||||
def it_rejects_combine_text_under_n_chars_greater_than_maxchars(
|
||||
self, combine_text_under_n_chars: int, max_characters: Optional[int], expected_hard_max: int
|
||||
):
|
||||
"""`combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.
|
||||
|
||||
The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
|
||||
`max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
|
||||
larger number like 1500 then it can look like chunk-combining isn't working.
|
||||
"""
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=(
|
||||
"'combine_text_under_n_chars' argument must not exceed `max_characters` value,"
|
||||
f" got {combine_text_under_n_chars} > {expected_hard_max}"
|
||||
),
|
||||
):
|
||||
_ByTitleChunkingOptions.new(
|
||||
max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars
|
||||
)
|
||||
|
||||
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
|
||||
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
|
||||
|
||||
In particular, `combine_text_under_n_chars` value is adjusted down to the
|
||||
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
|
||||
value of `new_after_n_chars`.
|
||||
"""
|
||||
try:
|
||||
opts = _ByTitleChunkingOptions(new_after_n_chars=200)
|
||||
except ValueError:
|
||||
pytest.fail("did not accept `new_after_n_chars` as option by itself")
|
||||
|
||||
assert opts.soft_max == 200
|
||||
assert opts.combine_text_under_n_chars == 200
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("multipage_sections", "expected_value"),
|
||||
[(True, True), (False, False), (None, CHUNK_MULTI_PAGE_DEFAULT)],
|
||||
)
|
||||
def it_knows_whether_to_break_chunks_on_page_boundaries(
|
||||
self, multipage_sections: bool, expected_value: bool
|
||||
):
|
||||
opts = _ByTitleChunkingOptions(multipage_sections=multipage_sections)
|
||||
assert opts.multipage_sections is expected_value
|
||||
|
||||
@ -7,7 +7,7 @@ import copy
|
||||
from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, Sequence, cast
|
||||
|
||||
import regex
|
||||
from typing_extensions import Self, TypeAlias
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
CompositeElement,
|
||||
@ -68,9 +68,6 @@ class ChunkingOptions:
|
||||
when not specified, which effectively disables this behavior. Specifying 0 for this
|
||||
argument causes each element to appear in a chunk by itself (although an element with text
|
||||
longer than `max_characters` will be still be split into two or more chunks).
|
||||
multipage_sections
|
||||
Indicates that page-boundaries should not be respected while chunking, i.e. elements
|
||||
appearing on two different pages can appear in the same chunk.
|
||||
combine_text_under_n_chars
|
||||
Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
|
||||
relevant for a chunking strategy that specifies higher-level semantic boundaries to be
|
||||
@ -101,9 +98,9 @@ class ChunkingOptions:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
combine_text_under_n_chars: Optional[int] = None,
|
||||
max_characters: Optional[int] = None,
|
||||
multipage_sections: Optional[bool] = None,
|
||||
new_after_n_chars: Optional[int] = None,
|
||||
overlap: Optional[int] = None,
|
||||
overlap_all: Optional[bool] = None,
|
||||
@ -111,59 +108,28 @@ class ChunkingOptions:
|
||||
):
|
||||
self._combine_text_under_n_chars_arg = combine_text_under_n_chars
|
||||
self._max_characters_arg = max_characters
|
||||
self._multipage_sections_arg = multipage_sections
|
||||
self._new_after_n_chars_arg = new_after_n_chars
|
||||
self._overlap_arg = overlap
|
||||
self._overlap_all_arg = overlap_all
|
||||
self._text_splitting_separators = text_splitting_separators
|
||||
|
||||
@classmethod
|
||||
def new(
|
||||
cls,
|
||||
combine_text_under_n_chars: Optional[int] = None,
|
||||
max_characters: Optional[int] = None,
|
||||
multipage_sections: Optional[bool] = None,
|
||||
new_after_n_chars: Optional[int] = None,
|
||||
overlap: Optional[int] = None,
|
||||
overlap_all: Optional[bool] = None,
|
||||
text_splitting_separators: Sequence[str] = ("\n", " "),
|
||||
) -> Self:
|
||||
"""Construct validated instance.
|
||||
@lazyproperty
|
||||
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||
"""The semantic-boundary detectors to be applied to break pre-chunks.
|
||||
|
||||
Raises `ValueError` on invalid arguments like overlap > max_chars.
|
||||
Overridden by sub-typs to provide semantic-boundary isolation behaviors.
|
||||
"""
|
||||
self = cls(
|
||||
combine_text_under_n_chars,
|
||||
max_characters,
|
||||
multipage_sections,
|
||||
new_after_n_chars,
|
||||
overlap,
|
||||
overlap_all,
|
||||
text_splitting_separators,
|
||||
)
|
||||
self._validate()
|
||||
return self
|
||||
return ()
|
||||
|
||||
@lazyproperty
|
||||
def combine_text_under_n_chars(self) -> int:
|
||||
"""Combine consecutive text pre-chunks if former is smaller than this and both will fit.
|
||||
"""Combine two consecutive text pre-chunks if first is smaller than this and both will fit.
|
||||
|
||||
- Does not combine table chunks with text chunks even if they would both fit in the
|
||||
chunking window.
|
||||
- Does not combine text chunks if together they would exceed the chunking window.
|
||||
- Defaults to `max_characters` when not specified.
|
||||
- Is reduced to `new_after_n_chars` when it exceeds that value.
|
||||
Default applied here is `0` which essentially disables chunk combining. Must be overridden
|
||||
by subclass where combining behavior is supported.
|
||||
"""
|
||||
max_characters = self.hard_max
|
||||
soft_max = self.soft_max
|
||||
arg = self._combine_text_under_n_chars_arg
|
||||
|
||||
# -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is
|
||||
# -- capped at max-chars
|
||||
combine_text_under_n_chars = max_characters if arg is None or arg > max_characters else arg
|
||||
|
||||
# -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
|
||||
return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
|
||||
arg_value = self._combine_text_under_n_chars_arg
|
||||
return arg_value if arg_value is not None else 0
|
||||
|
||||
@lazyproperty
|
||||
def hard_max(self) -> int:
|
||||
@ -185,12 +151,6 @@ class ChunkingOptions:
|
||||
"""
|
||||
return self.overlap if self._overlap_all_arg else 0
|
||||
|
||||
@lazyproperty
|
||||
def multipage_sections(self) -> bool:
|
||||
"""When False, break pre-chunks on page-boundaries."""
|
||||
arg_value = self._multipage_sections_arg
|
||||
return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value)
|
||||
|
||||
@lazyproperty
|
||||
def overlap(self) -> int:
|
||||
"""The number of characters to overlap text when splitting chunks mid-text.
|
||||
@ -256,28 +216,15 @@ class ChunkingOptions:
|
||||
if max_characters <= 0:
|
||||
raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}")
|
||||
|
||||
# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
|
||||
# -- but a negative value is not
|
||||
combine_text_under_n_chars = self._combine_text_under_n_chars_arg
|
||||
if combine_text_under_n_chars is not None and combine_text_under_n_chars < 0:
|
||||
raise ValueError(
|
||||
f"'combine_text_under_n_chars' argument must be >= 0,"
|
||||
f" got {combine_text_under_n_chars}"
|
||||
)
|
||||
|
||||
# -- a negative value for `new_after_n_chars` is assumed to
|
||||
# -- be a mistake the caller will want to know about
|
||||
# -- a negative value for `new_after_n_chars` is assumed to be a mistake the caller will
|
||||
# -- want to know about
|
||||
new_after_n_chars = self._new_after_n_chars_arg
|
||||
if new_after_n_chars is not None and new_after_n_chars < 0:
|
||||
raise ValueError(
|
||||
f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}"
|
||||
)
|
||||
|
||||
# -- overlap must be less than max-chars or the chunk text will
|
||||
# -- never be consumed
|
||||
# TODO: consider a heuristic like never overlap more than half,
|
||||
# otherwise there could be corner cases leading to an infinite
|
||||
# loop (I think).
|
||||
# -- overlap must be less than max-chars or the chunk text will never be consumed --
|
||||
if self.overlap >= max_characters:
|
||||
raise ValueError(
|
||||
f"'overlap' argument must be less than `max_characters`,"
|
||||
@ -402,12 +349,12 @@ class _TextSplitter:
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# BASE PRE-CHUNKER
|
||||
# PRE-CHUNKER
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class BasePreChunker:
|
||||
"""Base-class for per-strategy pre-chunkers.
|
||||
class PreChunker:
|
||||
"""Gathers sequential elements into pre-chunks as length constraints allow.
|
||||
|
||||
The pre-chunker's responsibilities are:
|
||||
|
||||
@ -465,7 +412,7 @@ class BasePreChunker:
|
||||
@lazyproperty
|
||||
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
||||
return ()
|
||||
return self._opts.boundary_predicates
|
||||
|
||||
def _is_in_new_semantic_unit(self, element: Element) -> bool:
|
||||
"""True when `element` begins a new semantic unit such as a section or page."""
|
||||
|
||||
@ -17,7 +17,9 @@ from __future__ import annotations
|
||||
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from unstructured.chunking.base import BasePreChunker, ChunkingOptions
|
||||
from typing_extensions import Self
|
||||
|
||||
from unstructured.chunking.base import ChunkingOptions, PreChunker
|
||||
from unstructured.documents.elements import Element
|
||||
|
||||
|
||||
@ -58,7 +60,7 @@ def chunk_elements(
|
||||
level of "pollution" of otherwise clean semantic chunk boundaries.
|
||||
"""
|
||||
# -- raises ValueError on invalid parameters --
|
||||
opts = ChunkingOptions.new(
|
||||
opts = _BasicChunkingOptions.new(
|
||||
max_characters=max_characters,
|
||||
new_after_n_chars=new_after_n_chars,
|
||||
overlap=overlap,
|
||||
@ -67,14 +69,32 @@ def chunk_elements(
|
||||
|
||||
return [
|
||||
chunk
|
||||
for pre_chunk in BasicPreChunker.iter_pre_chunks(elements, opts)
|
||||
for pre_chunk in PreChunker.iter_pre_chunks(elements, opts)
|
||||
for chunk in pre_chunk.iter_chunks()
|
||||
]
|
||||
|
||||
|
||||
class BasicPreChunker(BasePreChunker):
|
||||
"""Produces pre-chunks from a sequence of document-elements using the "basic" rule-set.
|
||||
class _BasicChunkingOptions(ChunkingOptions):
|
||||
"""Options for `basic` chunking."""
|
||||
|
||||
The "basic" rule-set is essentially "no-rules" other than `Table` is segregated into its own
|
||||
pre-chunk.
|
||||
"""
|
||||
@classmethod
|
||||
def new(
|
||||
cls,
|
||||
*,
|
||||
max_characters: Optional[int] = None,
|
||||
new_after_n_chars: Optional[int] = None,
|
||||
overlap: Optional[int] = None,
|
||||
overlap_all: Optional[bool] = None,
|
||||
) -> Self:
|
||||
"""Construct validated instance.
|
||||
|
||||
Raises `ValueError` on invalid arguments like overlap > max_chars.
|
||||
"""
|
||||
self = cls(
|
||||
max_characters=max_characters,
|
||||
new_after_n_chars=new_after_n_chars,
|
||||
overlap=overlap,
|
||||
overlap_all=overlap_all,
|
||||
)
|
||||
self._validate()
|
||||
return self
|
||||
|
||||
@ -7,11 +7,14 @@ from __future__ import annotations
|
||||
|
||||
from typing import Iterable, Iterator, Optional
|
||||
|
||||
from typing_extensions import Self
|
||||
|
||||
from unstructured.chunking.base import (
|
||||
BasePreChunker,
|
||||
CHUNK_MULTI_PAGE_DEFAULT,
|
||||
BoundaryPredicate,
|
||||
ChunkingOptions,
|
||||
PreChunkCombiner,
|
||||
PreChunker,
|
||||
is_in_next_section,
|
||||
is_on_next_page,
|
||||
is_title,
|
||||
@ -22,6 +25,7 @@ from unstructured.utils import lazyproperty
|
||||
|
||||
def chunk_by_title(
|
||||
elements: Iterable[Element],
|
||||
*,
|
||||
max_characters: Optional[int] = None,
|
||||
multipage_sections: Optional[bool] = None,
|
||||
combine_text_under_n_chars: Optional[int] = None,
|
||||
@ -65,7 +69,7 @@ def chunk_by_title(
|
||||
elements and not subject to text-splitting. Use this with caution as it entails a certain
|
||||
level of "pollution" of otherwise clean semantic chunk boundaries.
|
||||
"""
|
||||
opts = ChunkingOptions.new(
|
||||
opts = _ByTitleChunkingOptions.new(
|
||||
combine_text_under_n_chars=combine_text_under_n_chars,
|
||||
max_characters=max_characters,
|
||||
multipage_sections=multipage_sections,
|
||||
@ -75,27 +79,127 @@ def chunk_by_title(
|
||||
)
|
||||
|
||||
pre_chunks = PreChunkCombiner(
|
||||
_ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts
|
||||
PreChunker.iter_pre_chunks(elements, opts), opts=opts
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]
|
||||
|
||||
|
||||
class _ByTitlePreChunker(BasePreChunker):
|
||||
"""Pre-chunker for the "by_title" chunking strategy.
|
||||
class _ByTitleChunkingOptions(ChunkingOptions):
|
||||
"""Adds the by-title-specific chunking options to the base case.
|
||||
|
||||
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a
|
||||
new "section", hence the "by-title" designation.
|
||||
`by_title`-specific options:
|
||||
|
||||
multipage_sections
|
||||
Indicates that page-boundaries should not be respected while chunking, i.e. elements
|
||||
appearing on two different pages can appear in the same chunk.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
max_characters: Optional[int] = None,
|
||||
combine_text_under_n_chars: Optional[int] = None,
|
||||
multipage_sections: Optional[bool] = None,
|
||||
new_after_n_chars: Optional[int] = None,
|
||||
overlap: Optional[int] = None,
|
||||
overlap_all: Optional[bool] = None,
|
||||
):
|
||||
super().__init__(
|
||||
combine_text_under_n_chars=combine_text_under_n_chars,
|
||||
max_characters=max_characters,
|
||||
new_after_n_chars=new_after_n_chars,
|
||||
overlap=overlap,
|
||||
overlap_all=overlap_all,
|
||||
)
|
||||
self._multipage_sections_arg = multipage_sections
|
||||
|
||||
@classmethod
|
||||
def new(
|
||||
cls,
|
||||
*,
|
||||
max_characters: Optional[int] = None,
|
||||
combine_text_under_n_chars: Optional[int] = None,
|
||||
multipage_sections: Optional[bool] = None,
|
||||
new_after_n_chars: Optional[int] = None,
|
||||
overlap: Optional[int] = None,
|
||||
overlap_all: Optional[bool] = None,
|
||||
) -> Self:
|
||||
"""Return instance or raises `ValueError` on invalid arguments like overlap > max_chars."""
|
||||
self = cls(
|
||||
max_characters=max_characters,
|
||||
combine_text_under_n_chars=combine_text_under_n_chars,
|
||||
multipage_sections=multipage_sections,
|
||||
new_after_n_chars=new_after_n_chars,
|
||||
overlap=overlap,
|
||||
overlap_all=overlap_all,
|
||||
)
|
||||
self._validate()
|
||||
return self
|
||||
|
||||
@lazyproperty
|
||||
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
||||
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||
"""The semantic-boundary detectors to be applied to break pre-chunks.
|
||||
|
||||
For the `by_title` strategy these are sections indicated by a title (section-heading), an
|
||||
explicit section metadata item (only present for certain document types), and optionally
|
||||
page boundaries.
|
||||
"""
|
||||
|
||||
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
|
||||
yield is_title
|
||||
yield is_in_next_section()
|
||||
if not self._opts.multipage_sections:
|
||||
if not self.multipage_sections:
|
||||
yield is_on_next_page()
|
||||
|
||||
return tuple(iter_boundary_predicates())
|
||||
|
||||
@lazyproperty
|
||||
def combine_text_under_n_chars(self) -> int:
|
||||
"""Combine consecutive text pre-chunks if former is smaller than this and both will fit.
|
||||
|
||||
- Does not combine table chunks with text chunks even if they would both fit in the
|
||||
chunking window.
|
||||
- Does not combine text chunks if together they would exceed the chunking window.
|
||||
- Defaults to `max_characters` when not specified.
|
||||
- Is reduced to `new_after_n_chars` when it exceeds that value.
|
||||
"""
|
||||
max_characters = self.hard_max
|
||||
soft_max = self.soft_max
|
||||
arg_value = self._combine_text_under_n_chars_arg
|
||||
|
||||
# -- `combine_text_under_n_chars` defaults to `max_characters` when not specified --
|
||||
combine_text_under_n_chars = max_characters if arg_value is None else arg_value
|
||||
|
||||
# -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
|
||||
return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
|
||||
|
||||
@lazyproperty
|
||||
def multipage_sections(self) -> bool:
|
||||
"""When False, break pre-chunks on page-boundaries."""
|
||||
arg_value = self._multipage_sections_arg
|
||||
return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value)
|
||||
|
||||
def _validate(self) -> None:
|
||||
"""Raise ValueError if request option-set is invalid."""
|
||||
# -- start with base-class validations --
|
||||
super()._validate()
|
||||
|
||||
if (combine_text_under_n_chars_arg := self._combine_text_under_n_chars_arg) is not None:
|
||||
# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
|
||||
# -- but a negative value is not
|
||||
if combine_text_under_n_chars_arg < 0:
|
||||
raise ValueError(
|
||||
f"'combine_text_under_n_chars' argument must be >= 0,"
|
||||
f" got {combine_text_under_n_chars_arg}"
|
||||
)
|
||||
|
||||
# -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
|
||||
# -- users. The chunking behavior would be no different than when
|
||||
# -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
|
||||
# -- default (500) then it can look like chunk-combining isn't working.
|
||||
if combine_text_under_n_chars_arg > self.hard_max:
|
||||
raise ValueError(
|
||||
f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
|
||||
f" value, got {combine_text_under_n_chars_arg} > {self.hard_max}"
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user