mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-06 13:01:56 +00:00
rfctr(chunking): extract strategy-specific chunking options (#2556)
**Summary** A pluggable chunking strategy needs its own local set of chunking options that subclasses a base-class in `unstructured`. Extract distinct `_ByTitleChunkingOptions` and `_BasicChunkingOptions` for the existing two chunking strategies and move their strategy-specific option setting and validation to the respective subclass. This was also a good opportunity for us to clean up a few odds and ends we'd been meaning to. Might be worth looking at the commits individually as they are cohesive incremental steps toward the goal.
This commit is contained in:
parent
b4d9ad8130
commit
51cf6bf716
@ -4,15 +4,15 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Sequence
|
from typing import Optional, Sequence
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from unstructured.chunking.base import (
|
from unstructured.chunking.base import (
|
||||||
BasePreChunker,
|
|
||||||
ChunkingOptions,
|
ChunkingOptions,
|
||||||
PreChunkBuilder,
|
PreChunkBuilder,
|
||||||
PreChunkCombiner,
|
PreChunkCombiner,
|
||||||
|
PreChunker,
|
||||||
TablePreChunk,
|
TablePreChunk,
|
||||||
TextPreChunk,
|
TextPreChunk,
|
||||||
TextPreChunkAccumulator,
|
TextPreChunkAccumulator,
|
||||||
@ -48,7 +48,7 @@ class DescribeChunkingOptions:
|
|||||||
ValueError,
|
ValueError,
|
||||||
match=f"'max_characters' argument must be > 0, got {max_characters}",
|
match=f"'max_characters' argument must be > 0, got {max_characters}",
|
||||||
):
|
):
|
||||||
ChunkingOptions.new(max_characters=max_characters)
|
ChunkingOptions(max_characters=max_characters)._validate()
|
||||||
|
|
||||||
def it_does_not_complain_when_specifying_max_characters_by_itself(self):
|
def it_does_not_complain_when_specifying_max_characters_by_itself(self):
|
||||||
"""Caller can specify `max_characters` arg without specifying any others.
|
"""Caller can specify `max_characters` arg without specifying any others.
|
||||||
@ -58,44 +58,25 @@ class DescribeChunkingOptions:
|
|||||||
and trigger an exception.
|
and trigger an exception.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
ChunkingOptions.new(max_characters=50)
|
ChunkingOptions(max_characters=50)._validate()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pytest.fail("did not accept `max_characters` as option by itself")
|
pytest.fail("did not accept `max_characters` as option by itself")
|
||||||
|
|
||||||
@pytest.mark.parametrize("n_chars", [-1, -42])
|
@pytest.mark.parametrize(
|
||||||
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
|
("combine_text_under_n_chars", "expected_value"), [(None, 0), (42, 42)]
|
||||||
with pytest.raises(
|
)
|
||||||
ValueError,
|
def it_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combining(
|
||||||
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
|
self, combine_text_under_n_chars: Optional[int], expected_value: int
|
||||||
):
|
):
|
||||||
ChunkingOptions.new(combine_text_under_n_chars=n_chars)
|
"""Subclasses can store `combine_text_under_n_chars` but must validate and enable it.
|
||||||
|
|
||||||
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
|
The `combine_text_under_n_chars` option is not used by all chunkers and its behavior can
|
||||||
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
|
differ between subtypes. It is present in and stored by the contructur but it defaults to
|
||||||
opts = ChunkingOptions.new(combine_text_under_n_chars=0)
|
`0` (no pre-chunk combining) and must be overridden by subclasses to give it the desired
|
||||||
assert opts.combine_text_under_n_chars == 0
|
behavior.
|
||||||
|
|
||||||
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
|
|
||||||
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
|
|
||||||
try:
|
|
||||||
opts = ChunkingOptions.new(combine_text_under_n_chars=50)
|
|
||||||
except ValueError:
|
|
||||||
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
|
|
||||||
|
|
||||||
assert opts.combine_text_under_n_chars == 50
|
|
||||||
|
|
||||||
def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self):
|
|
||||||
"""`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.
|
|
||||||
|
|
||||||
So rather than raising an exception or warning, we just cap that value at `max_characters`
|
|
||||||
which is the behavioral equivalent.
|
|
||||||
"""
|
"""
|
||||||
try:
|
opts = ChunkingOptions(combine_text_under_n_chars=combine_text_under_n_chars)
|
||||||
opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600)
|
assert opts.combine_text_under_n_chars == expected_value
|
||||||
except ValueError:
|
|
||||||
pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`")
|
|
||||||
|
|
||||||
assert opts.combine_text_under_n_chars == 500
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("n_chars", [-1, -42])
|
@pytest.mark.parametrize("n_chars", [-1, -42])
|
||||||
def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
|
def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
|
||||||
@ -103,7 +84,7 @@ class DescribeChunkingOptions:
|
|||||||
ValueError,
|
ValueError,
|
||||||
match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
|
match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
|
||||||
):
|
):
|
||||||
ChunkingOptions.new(new_after_n_chars=n_chars)
|
ChunkingOptions(new_after_n_chars=n_chars)._validate()
|
||||||
|
|
||||||
def it_rejects_overlap_not_less_than_max_characters(self):
|
def it_rejects_overlap_not_less_than_max_characters(self):
|
||||||
with pytest.raises(
|
with pytest.raises(
|
||||||
@ -113,26 +94,23 @@ class DescribeChunkingOptions:
|
|||||||
ChunkingOptions(max_characters=200, overlap=300)._validate()
|
ChunkingOptions(max_characters=200, overlap=300)._validate()
|
||||||
|
|
||||||
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
|
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
|
||||||
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
|
"""Caller can specify `new_after_n_chars` arg without specifying any other options."""
|
||||||
|
opts = ChunkingOptions(new_after_n_chars=200)
|
||||||
In particular, `combine_text_under_n_chars` value is adjusted down to the
|
|
||||||
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
|
|
||||||
value of `new_after_n_chars`.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
opts = ChunkingOptions.new(new_after_n_chars=200)
|
opts._validate()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pytest.fail("did not accept `new_after_n_chars` as option by itself")
|
pytest.fail("did not accept `new_after_n_chars` as option by itself")
|
||||||
|
|
||||||
assert opts.soft_max == 200
|
assert opts.soft_max == 200
|
||||||
assert opts.combine_text_under_n_chars == 200
|
|
||||||
|
|
||||||
def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self):
|
def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self):
|
||||||
"""Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
|
"""Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
|
||||||
|
|
||||||
This puts each element into its own chunk, although long chunks are still split.
|
This puts each element into its own chunk, although long chunks are still split.
|
||||||
"""
|
"""
|
||||||
opts = ChunkingOptions.new(new_after_n_chars=0)
|
opts = ChunkingOptions(new_after_n_chars=0)
|
||||||
|
opts._validate()
|
||||||
|
|
||||||
assert opts.soft_max == 0
|
assert opts.soft_max == 0
|
||||||
|
|
||||||
def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
|
def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
|
||||||
@ -141,33 +119,32 @@ class DescribeChunkingOptions:
|
|||||||
So rather than raising an exception or warning, we just cap that value at `max_characters`
|
So rather than raising an exception or warning, we just cap that value at `max_characters`
|
||||||
which is the behavioral equivalent.
|
which is the behavioral equivalent.
|
||||||
"""
|
"""
|
||||||
|
opts = ChunkingOptions(max_characters=444, new_after_n_chars=555)
|
||||||
try:
|
try:
|
||||||
opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555)
|
opts._validate()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
|
pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
|
||||||
|
|
||||||
assert opts.soft_max == 444
|
assert opts.soft_max == 444
|
||||||
|
|
||||||
def it_knows_how_much_overlap_to_apply_to_split_chunks(self):
|
def it_knows_how_much_overlap_to_apply_to_split_chunks(self):
|
||||||
assert ChunkingOptions.new(overlap=10).overlap == 10
|
assert ChunkingOptions(overlap=10).overlap == 10
|
||||||
|
|
||||||
def and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks(self):
|
def and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks(self):
|
||||||
assert ChunkingOptions.new(overlap=10, overlap_all=True).inter_chunk_overlap == 10
|
assert ChunkingOptions(overlap=10, overlap_all=True).inter_chunk_overlap == 10
|
||||||
|
|
||||||
def but_it_does_not_overlap_pre_chunks_by_default(self):
|
def but_it_does_not_overlap_pre_chunks_by_default(self):
|
||||||
assert ChunkingOptions.new(overlap=10).inter_chunk_overlap == 0
|
assert ChunkingOptions(overlap=10).inter_chunk_overlap == 0
|
||||||
|
|
||||||
def it_knows_the_text_separator_string(self):
|
def it_knows_the_text_separator_string(self):
|
||||||
assert ChunkingOptions.new().text_separator == "\n\n"
|
assert ChunkingOptions().text_separator == "\n\n"
|
||||||
|
|
||||||
|
|
||||||
class Describe_TextSplitter:
|
class Describe_TextSplitter:
|
||||||
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
|
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
|
||||||
|
|
||||||
def it_splits_on_a_preferred_separator_when_it_can(self):
|
def it_splits_on_a_preferred_separator_when_it_can(self):
|
||||||
opts = ChunkingOptions.new(
|
opts = ChunkingOptions(max_characters=50, text_splitting_separators=("\n", " "), overlap=10)
|
||||||
max_characters=50, text_splitting_separators=("\n", " "), overlap=10
|
|
||||||
)
|
|
||||||
split = _TextSplitter(opts)
|
split = _TextSplitter(opts)
|
||||||
text = (
|
text = (
|
||||||
"Lorem ipsum dolor amet consectetur adipiscing. \n "
|
"Lorem ipsum dolor amet consectetur adipiscing. \n "
|
||||||
@ -189,9 +166,7 @@ class Describe_TextSplitter:
|
|||||||
assert remainder == ""
|
assert remainder == ""
|
||||||
|
|
||||||
def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
|
def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
|
||||||
opts = ChunkingOptions.new(
|
opts = ChunkingOptions(max_characters=40, text_splitting_separators=("\n", " "), overlap=10)
|
||||||
max_characters=40, text_splitting_separators=("\n", " "), overlap=10
|
|
||||||
)
|
|
||||||
split = _TextSplitter(opts)
|
split = _TextSplitter(opts)
|
||||||
text = (
|
text = (
|
||||||
"Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
|
"Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
|
||||||
@ -211,9 +186,7 @@ class Describe_TextSplitter:
|
|||||||
assert remainder == ""
|
assert remainder == ""
|
||||||
|
|
||||||
def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
|
def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
|
||||||
opts = ChunkingOptions.new(
|
opts = ChunkingOptions(max_characters=30, text_splitting_separators=("\n", " "), overlap=10)
|
||||||
max_characters=30, text_splitting_separators=("\n", " "), overlap=10
|
|
||||||
)
|
|
||||||
split = _TextSplitter(opts)
|
split = _TextSplitter(opts)
|
||||||
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
|
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
|
||||||
|
|
||||||
@ -237,7 +210,7 @@ class Describe_TextSplitter:
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
|
def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
|
||||||
opts = ChunkingOptions.new(max_characters=46, overlap=10)
|
opts = ChunkingOptions(max_characters=46, overlap=10)
|
||||||
split = _TextSplitter(opts)
|
split = _TextSplitter(opts)
|
||||||
|
|
||||||
s, remainder = split(text)
|
s, remainder = split(text)
|
||||||
@ -246,7 +219,7 @@ class Describe_TextSplitter:
|
|||||||
assert remainder == ""
|
assert remainder == ""
|
||||||
|
|
||||||
def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
|
def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
|
||||||
opts = ChunkingOptions.new(max_characters=38, overlap=10)
|
opts = ChunkingOptions(max_characters=38, overlap=10)
|
||||||
split = _TextSplitter(opts)
|
split = _TextSplitter(opts)
|
||||||
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
|
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
|
||||||
|
|
||||||
@ -257,9 +230,7 @@ class Describe_TextSplitter:
|
|||||||
|
|
||||||
@pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
|
@pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
|
||||||
def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
|
def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
|
||||||
opts = ChunkingOptions.new(
|
opts = ChunkingOptions(max_characters=50, text_splitting_separators=separators, overlap=10)
|
||||||
max_characters=50, text_splitting_separators=separators, overlap=10
|
|
||||||
)
|
|
||||||
split = _TextSplitter(opts)
|
split = _TextSplitter(opts)
|
||||||
text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus."
|
text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus."
|
||||||
# |-------------------------------------------------^ 50-chars
|
# |-------------------------------------------------^ 50-chars
|
||||||
@ -271,12 +242,12 @@ class Describe_TextSplitter:
|
|||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
# BASE PRE-CHUNKER
|
# PRE-CHUNKER
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
class DescribeBasePreChunker:
|
class DescribePreChunker:
|
||||||
"""Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects."""
|
"""Unit-test suite for `unstructured.chunking.base.PreChunker` objects."""
|
||||||
|
|
||||||
def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
|
def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
|
||||||
elements = [
|
elements = [
|
||||||
@ -289,9 +260,9 @@ class DescribeBasePreChunker:
|
|||||||
CheckBox(),
|
CheckBox(),
|
||||||
]
|
]
|
||||||
|
|
||||||
opts = ChunkingOptions.new(max_characters=150, new_after_n_chars=65)
|
opts = ChunkingOptions(max_characters=150, new_after_n_chars=65)
|
||||||
|
|
||||||
pre_chunk_iter = BasePreChunker.iter_pre_chunks(elements, opts=opts)
|
pre_chunk_iter = PreChunker.iter_pre_chunks(elements, opts=opts)
|
||||||
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
pre_chunk = next(pre_chunk_iter)
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
@ -344,7 +315,7 @@ class DescribeTablePreChunk:
|
|||||||
pre_chunk = TablePreChunk(
|
pre_chunk = TablePreChunk(
|
||||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||||
overlap_prefix="ctus porta volutpat.",
|
overlap_prefix="ctus porta volutpat.",
|
||||||
opts=ChunkingOptions.new(max_characters=175),
|
opts=ChunkingOptions(max_characters=175),
|
||||||
)
|
)
|
||||||
|
|
||||||
chunk_iter = pre_chunk.iter_chunks()
|
chunk_iter = pre_chunk.iter_chunks()
|
||||||
@ -393,7 +364,7 @@ class DescribeTablePreChunk:
|
|||||||
pre_chunk = TablePreChunk(
|
pre_chunk = TablePreChunk(
|
||||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
|
||||||
overlap_prefix="",
|
overlap_prefix="",
|
||||||
opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")),
|
opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
|
||||||
)
|
)
|
||||||
|
|
||||||
chunk_iter = pre_chunk.iter_chunks()
|
chunk_iter = pre_chunk.iter_chunks()
|
||||||
@ -457,7 +428,7 @@ class DescribeTablePreChunk:
|
|||||||
self, text: str, expected_value: str
|
self, text: str, expected_value: str
|
||||||
):
|
):
|
||||||
pre_chunk = TablePreChunk(
|
pre_chunk = TablePreChunk(
|
||||||
Table(text), overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True)
|
Table(text), overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
|
||||||
)
|
)
|
||||||
assert pre_chunk.overlap_tail == expected_value
|
assert pre_chunk.overlap_tail == expected_value
|
||||||
|
|
||||||
@ -480,7 +451,7 @@ class DescribeTablePreChunk:
|
|||||||
self, text: str, overlap_prefix: str, expected_value: str
|
self, text: str, overlap_prefix: str, expected_value: str
|
||||||
):
|
):
|
||||||
pre_chunk = TablePreChunk(
|
pre_chunk = TablePreChunk(
|
||||||
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
|
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
|
||||||
)
|
)
|
||||||
assert pre_chunk._text == expected_value
|
assert pre_chunk._text == expected_value
|
||||||
|
|
||||||
@ -536,7 +507,7 @@ class DescribeTextPreChunk:
|
|||||||
self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool
|
self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool
|
||||||
):
|
):
|
||||||
"""This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals."""
|
"""This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals."""
|
||||||
opts = ChunkingOptions.new(
|
opts = ChunkingOptions(
|
||||||
max_characters=max_characters,
|
max_characters=max_characters,
|
||||||
combine_text_under_n_chars=combine_text_under_n_chars,
|
combine_text_under_n_chars=combine_text_under_n_chars,
|
||||||
overlap=20,
|
overlap=20,
|
||||||
@ -560,7 +531,7 @@ class DescribeTextPreChunk:
|
|||||||
|
|
||||||
Note that neither the original or other pre_chunk are mutated.
|
Note that neither the original or other pre_chunk are mutated.
|
||||||
"""
|
"""
|
||||||
opts = ChunkingOptions.new()
|
opts = ChunkingOptions()
|
||||||
pre_chunk = TextPreChunk(
|
pre_chunk = TextPreChunk(
|
||||||
[
|
[
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||||
@ -623,7 +594,7 @@ class DescribeTextPreChunk:
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
overlap_prefix="e feugiat efficitur.",
|
overlap_prefix="e feugiat efficitur.",
|
||||||
opts=ChunkingOptions.new(max_characters=200),
|
opts=ChunkingOptions(max_characters=200),
|
||||||
)
|
)
|
||||||
|
|
||||||
chunk_iter = pre_chunk.iter_chunks()
|
chunk_iter = pre_chunk.iter_chunks()
|
||||||
@ -648,7 +619,7 @@ class DescribeTextPreChunk:
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
overlap_prefix="",
|
overlap_prefix="",
|
||||||
opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")),
|
opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
|
||||||
)
|
)
|
||||||
|
|
||||||
chunk_iter = pre_chunk.iter_chunks()
|
chunk_iter = pre_chunk.iter_chunks()
|
||||||
@ -681,7 +652,7 @@ class DescribeTextPreChunk:
|
|||||||
self, text: str, expected_value: str
|
self, text: str, expected_value: str
|
||||||
):
|
):
|
||||||
pre_chunk = TextPreChunk(
|
pre_chunk = TextPreChunk(
|
||||||
[Text(text)], overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True)
|
[Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
|
||||||
)
|
)
|
||||||
assert pre_chunk.overlap_tail == expected_value
|
assert pre_chunk.overlap_tail == expected_value
|
||||||
|
|
||||||
@ -708,7 +679,7 @@ class DescribeTextPreChunk:
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
overlap_prefix="",
|
overlap_prefix="",
|
||||||
opts=ChunkingOptions.new(),
|
opts=ChunkingOptions(),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert pre_chunk._all_metadata_values == {
|
assert pre_chunk._all_metadata_values == {
|
||||||
@ -746,7 +717,7 @@ class DescribeTextPreChunk:
|
|||||||
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
|
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
|
||||||
],
|
],
|
||||||
overlap_prefix="",
|
overlap_prefix="",
|
||||||
opts=ChunkingOptions.new(),
|
opts=ChunkingOptions(),
|
||||||
)
|
)
|
||||||
|
|
||||||
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
|
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
|
||||||
@ -789,7 +760,7 @@ class DescribeTextPreChunk:
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
overlap_prefix="ficitur.", # len == 8
|
overlap_prefix="ficitur.", # len == 8
|
||||||
opts=ChunkingOptions.new(),
|
opts=ChunkingOptions(),
|
||||||
)
|
)
|
||||||
|
|
||||||
regex_metadata = pre_chunk._consolidated_regex_meta
|
regex_metadata = pre_chunk._consolidated_regex_meta
|
||||||
@ -845,7 +816,7 @@ class DescribeTextPreChunk:
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
overlap_prefix="",
|
overlap_prefix="",
|
||||||
opts=ChunkingOptions.new(),
|
opts=ChunkingOptions(),
|
||||||
)
|
)
|
||||||
|
|
||||||
meta_kwargs = pre_chunk._meta_kwargs
|
meta_kwargs = pre_chunk._meta_kwargs
|
||||||
@ -881,9 +852,7 @@ class DescribeTextPreChunk:
|
|||||||
The text-segment contributed by each element is separated from the next by a blank line
|
The text-segment contributed by each element is separated from the next by a blank line
|
||||||
("\n\n"). An element that contributes no text does not give rise to a separator.
|
("\n\n"). An element that contributes no text does not give rise to a separator.
|
||||||
"""
|
"""
|
||||||
pre_chunk = TextPreChunk(
|
pre_chunk = TextPreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions())
|
||||||
elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
|
|
||||||
)
|
|
||||||
assert pre_chunk._text == expected_value
|
assert pre_chunk._text == expected_value
|
||||||
|
|
||||||
|
|
||||||
@ -896,13 +865,13 @@ class DescribePreChunkBuilder:
|
|||||||
"""Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
|
"""Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
|
||||||
|
|
||||||
def it_is_empty_on_construction(self):
|
def it_is_empty_on_construction(self):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
|
||||||
|
|
||||||
assert builder._text_length == 0
|
assert builder._text_length == 0
|
||||||
assert builder._remaining_space == 50
|
assert builder._remaining_space == 50
|
||||||
|
|
||||||
def it_accumulates_elements_added_to_it(self):
|
def it_accumulates_elements_added_to_it(self):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
|
||||||
|
|
||||||
builder.add_element(Title("Introduction"))
|
builder.add_element(Title("Introduction"))
|
||||||
assert builder._text_length == 12
|
assert builder._text_length == 12
|
||||||
@ -919,7 +888,7 @@ class DescribePreChunkBuilder:
|
|||||||
|
|
||||||
@pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
|
@pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
|
||||||
def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
|
def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
builder = PreChunkBuilder(opts=ChunkingOptions())
|
||||||
assert builder.will_fit(element)
|
assert builder.will_fit(element)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -934,22 +903,20 @@ class DescribePreChunkBuilder:
|
|||||||
def but_not_when_it_already_contains_an_element_of_any_kind(
|
def but_not_when_it_already_contains_an_element_of_any_kind(
|
||||||
self, existing_element: Element, next_element: Element
|
self, existing_element: Element, next_element: Element
|
||||||
):
|
):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
builder = PreChunkBuilder(opts=ChunkingOptions())
|
||||||
builder.add_element(existing_element)
|
builder.add_element(existing_element)
|
||||||
|
|
||||||
assert not builder.will_fit(next_element)
|
assert not builder.will_fit(next_element)
|
||||||
|
|
||||||
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
|
@pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
|
||||||
def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
|
def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new())
|
builder = PreChunkBuilder(opts=ChunkingOptions())
|
||||||
builder.add_element(Table("Heading\nCell text"))
|
builder.add_element(Table("Heading\nCell text"))
|
||||||
|
|
||||||
assert not builder.will_fit(element)
|
assert not builder.will_fit(element)
|
||||||
|
|
||||||
def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
|
def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
|
||||||
builder = PreChunkBuilder(
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
|
||||||
opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
|
|
||||||
)
|
|
||||||
builder.add_element(
|
builder.add_element(
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||||
)
|
)
|
||||||
@ -957,7 +924,7 @@ class DescribePreChunkBuilder:
|
|||||||
assert not builder.will_fit(Text("In rhoncus ipsum."))
|
assert not builder.will_fit(Text("In rhoncus ipsum."))
|
||||||
|
|
||||||
def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
|
def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
|
||||||
builder.add_element(
|
builder.add_element(
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||||
)
|
)
|
||||||
@ -968,7 +935,7 @@ class DescribePreChunkBuilder:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def but_it_will_fit_an_element_that_fits(self):
|
def but_it_will_fit_an_element_that_fits(self):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
|
||||||
builder.add_element(
|
builder.add_element(
|
||||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
|
||||||
)
|
)
|
||||||
@ -977,7 +944,7 @@ class DescribePreChunkBuilder:
|
|||||||
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
|
assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
|
||||||
|
|
||||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
|
||||||
builder.add_element(Title("Introduction"))
|
builder.add_element(Title("Introduction"))
|
||||||
builder.add_element(
|
builder.add_element(
|
||||||
Text(
|
Text(
|
||||||
@ -1000,7 +967,7 @@ class DescribePreChunkBuilder:
|
|||||||
assert builder._remaining_space == 150
|
assert builder._remaining_space == 150
|
||||||
|
|
||||||
def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
|
def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
|
||||||
builder.add_element(Table("Heading\nCell text"))
|
builder.add_element(Table("Heading\nCell text"))
|
||||||
|
|
||||||
pre_chunk = next(builder.flush())
|
pre_chunk = next(builder.flush())
|
||||||
@ -1016,7 +983,7 @@ class DescribePreChunkBuilder:
|
|||||||
assert pre_chunk._table == Table("Heading\nCell text")
|
assert pre_chunk._table == Table("Heading\nCell text")
|
||||||
|
|
||||||
def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
|
def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
|
||||||
|
|
||||||
pre_chunks = list(builder.flush())
|
pre_chunks = list(builder.flush())
|
||||||
|
|
||||||
@ -1025,7 +992,7 @@ class DescribePreChunkBuilder:
|
|||||||
assert builder._remaining_space == 150
|
assert builder._remaining_space == 150
|
||||||
|
|
||||||
def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
|
def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
|
||||||
opts = ChunkingOptions.new(overlap=15, overlap_all=True)
|
opts = ChunkingOptions(overlap=15, overlap_all=True)
|
||||||
builder = PreChunkBuilder(opts=opts)
|
builder = PreChunkBuilder(opts=opts)
|
||||||
|
|
||||||
builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
|
builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
|
||||||
@ -1044,7 +1011,7 @@ class DescribePreChunkBuilder:
|
|||||||
assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
|
assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
|
||||||
|
|
||||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||||
builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
|
builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
|
||||||
builder.add_element(Text("abcde"))
|
builder.add_element(Text("abcde"))
|
||||||
builder.add_element(Text("fghij"))
|
builder.add_element(Text("fghij"))
|
||||||
|
|
||||||
@ -1061,7 +1028,7 @@ class DescribePreChunkCombiner:
|
|||||||
"""Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
|
"""Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""
|
||||||
|
|
||||||
def it_combines_sequential_small_text_pre_chunks(self):
|
def it_combines_sequential_small_text_pre_chunks(self):
|
||||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
|
||||||
pre_chunks = [
|
pre_chunks = [
|
||||||
TextPreChunk(
|
TextPreChunk(
|
||||||
[
|
[
|
||||||
@ -1105,7 +1072,7 @@ class DescribePreChunkCombiner:
|
|||||||
next(pre_chunk_iter)
|
next(pre_chunk_iter)
|
||||||
|
|
||||||
def but_it_does_not_combine_table_pre_chunks(self):
|
def but_it_does_not_combine_table_pre_chunks(self):
|
||||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
|
||||||
pre_chunks = [
|
pre_chunks = [
|
||||||
TextPreChunk(
|
TextPreChunk(
|
||||||
[
|
[
|
||||||
@ -1127,7 +1094,7 @@ class DescribePreChunkCombiner:
|
|||||||
]
|
]
|
||||||
|
|
||||||
pre_chunk_iter = PreChunkCombiner(
|
pre_chunk_iter = PreChunkCombiner(
|
||||||
pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
|
pre_chunks, ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
|
||||||
).iter_combined_pre_chunks()
|
).iter_combined_pre_chunks()
|
||||||
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
pre_chunk = next(pre_chunk_iter)
|
||||||
@ -1152,7 +1119,7 @@ class DescribePreChunkCombiner:
|
|||||||
next(pre_chunk_iter)
|
next(pre_chunk_iter)
|
||||||
|
|
||||||
def it_respects_the_specified_combination_threshold(self):
|
def it_respects_the_specified_combination_threshold(self):
|
||||||
opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
|
opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=80)
|
||||||
pre_chunks = [
|
pre_chunks = [
|
||||||
TextPreChunk( # 68
|
TextPreChunk( # 68
|
||||||
[
|
[
|
||||||
@ -1203,7 +1170,7 @@ class DescribePreChunkCombiner:
|
|||||||
next(pre_chunk_iter)
|
next(pre_chunk_iter)
|
||||||
|
|
||||||
def it_respects_the_hard_maximum_window_length(self):
|
def it_respects_the_hard_maximum_window_length(self):
|
||||||
opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
|
opts = ChunkingOptions(max_characters=200, combine_text_under_n_chars=200)
|
||||||
pre_chunks = [
|
pre_chunks = [
|
||||||
TextPreChunk( # 68
|
TextPreChunk( # 68
|
||||||
[
|
[
|
||||||
@ -1256,7 +1223,7 @@ class DescribePreChunkCombiner:
|
|||||||
|
|
||||||
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
|
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
|
||||||
"""Such as occurs when a single element exceeds the window size."""
|
"""Such as occurs when a single element exceeds the window size."""
|
||||||
opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
opts = ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
|
||||||
pre_chunks = [
|
pre_chunks = [
|
||||||
TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts),
|
TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts),
|
||||||
TextPreChunk( # 179
|
TextPreChunk( # 179
|
||||||
@ -1274,7 +1241,7 @@ class DescribePreChunkCombiner:
|
|||||||
]
|
]
|
||||||
|
|
||||||
pre_chunk_iter = PreChunkCombiner(
|
pre_chunk_iter = PreChunkCombiner(
|
||||||
pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
|
pre_chunks, ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
|
||||||
).iter_combined_pre_chunks()
|
).iter_combined_pre_chunks()
|
||||||
|
|
||||||
pre_chunk = next(pre_chunk_iter)
|
pre_chunk = next(pre_chunk_iter)
|
||||||
@ -1303,7 +1270,7 @@ class DescribeTextPreChunkAccumulator:
|
|||||||
"""Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
|
"""Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""
|
||||||
|
|
||||||
def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||||
opts = ChunkingOptions.new()
|
opts = ChunkingOptions(combine_text_under_n_chars=500)
|
||||||
accum = TextPreChunkAccumulator(opts=opts)
|
accum = TextPreChunkAccumulator(opts=opts)
|
||||||
|
|
||||||
pre_chunk = TextPreChunk(
|
pre_chunk = TextPreChunk(
|
||||||
@ -1362,7 +1329,7 @@ class DescribeTextPreChunkAccumulator:
|
|||||||
next(accum.flush())
|
next(accum.flush())
|
||||||
|
|
||||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||||
accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
|
accum = TextPreChunkAccumulator(opts=ChunkingOptions(max_characters=150))
|
||||||
assert list(accum.flush()) == []
|
assert list(accum.flush()) == []
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
"""Unit-test suite for the `unstructured.chunking.basic` module.
|
"""Test suite for the `unstructured.chunking.basic` module.
|
||||||
|
|
||||||
That module implements the baseline chunking strategy. The baseline strategy has all behaviors
|
That module implements the baseline chunking strategy. The baseline strategy has all behaviors
|
||||||
shared by all chunking strategies and no extra rules like perserve section or page boundaries.
|
shared by all chunking strategies and no extra rules like perserve section or page boundaries.
|
||||||
|
|||||||
@ -1,13 +1,20 @@
|
|||||||
# pyright: reportPrivateUsage=false
|
# pyright: reportPrivateUsage=false
|
||||||
|
|
||||||
"""Unit-test suite for the `unstructured.chunking.title` module."""
|
"""Test suite for the `unstructured.chunking.title` module."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
|
from unstructured.chunking.base import (
|
||||||
from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title
|
CHUNK_MULTI_PAGE_DEFAULT,
|
||||||
|
PreChunker,
|
||||||
|
TablePreChunk,
|
||||||
|
TextPreChunk,
|
||||||
|
)
|
||||||
|
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
|
||||||
from unstructured.documents.coordinates import CoordinateSystem
|
from unstructured.documents.coordinates import CoordinateSystem
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
CheckBox,
|
CheckBox,
|
||||||
@ -23,6 +30,13 @@ from unstructured.documents.elements import (
|
|||||||
)
|
)
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# INTEGRATION-TESTS
|
||||||
|
# ================================================================================================
|
||||||
|
# These test `chunk_by_title()` as an integrated whole, calling `chunk_by_title()` and inspecting
|
||||||
|
# the outputs.
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
def test_it_splits_a_large_element_into_multiple_chunks():
|
def test_it_splits_a_large_element_into_multiple_chunks():
|
||||||
elements: list[Element] = [
|
elements: list[Element] = [
|
||||||
@ -57,7 +71,7 @@ def test_split_elements_by_title_and_table():
|
|||||||
CheckBox(),
|
CheckBox(),
|
||||||
]
|
]
|
||||||
|
|
||||||
pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new())
|
pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
|
||||||
|
|
||||||
pre_chunk = next(pre_chunks)
|
pre_chunk = next(pre_chunks)
|
||||||
assert isinstance(pre_chunk, TextPreChunk)
|
assert isinstance(pre_chunk, TextPreChunk)
|
||||||
@ -544,3 +558,86 @@ def test_it_considers_separator_length_when_pre_chunking():
|
|||||||
),
|
),
|
||||||
CompositeElement("Minimize mid-text chunk-splitting"),
|
CompositeElement("Minimize mid-text chunk-splitting"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# UNIT-TESTS
|
||||||
|
# ================================================================================================
|
||||||
|
# These test individual components in isolation so can exercise all edge cases while still
|
||||||
|
# performing well.
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class Describe_ByTitleChunkingOptions:
|
||||||
|
"""Unit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects."""
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("n_chars", [-1, -42])
|
||||||
|
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError,
|
||||||
|
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
|
||||||
|
):
|
||||||
|
_ByTitleChunkingOptions.new(combine_text_under_n_chars=n_chars)
|
||||||
|
|
||||||
|
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
|
||||||
|
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
|
||||||
|
opts = _ByTitleChunkingOptions(combine_text_under_n_chars=0)
|
||||||
|
assert opts.combine_text_under_n_chars == 0
|
||||||
|
|
||||||
|
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
|
||||||
|
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
|
||||||
|
try:
|
||||||
|
opts = _ByTitleChunkingOptions(combine_text_under_n_chars=50)
|
||||||
|
except ValueError:
|
||||||
|
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
|
||||||
|
|
||||||
|
assert opts.combine_text_under_n_chars == 50
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("combine_text_under_n_chars", "max_characters", "expected_hard_max"),
|
||||||
|
[(600, None, 500), (600, 450, 450)],
|
||||||
|
)
|
||||||
|
def it_rejects_combine_text_under_n_chars_greater_than_maxchars(
|
||||||
|
self, combine_text_under_n_chars: int, max_characters: Optional[int], expected_hard_max: int
|
||||||
|
):
|
||||||
|
"""`combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.
|
||||||
|
|
||||||
|
The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
|
||||||
|
`max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
|
||||||
|
larger number like 1500 then it can look like chunk-combining isn't working.
|
||||||
|
"""
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError,
|
||||||
|
match=(
|
||||||
|
"'combine_text_under_n_chars' argument must not exceed `max_characters` value,"
|
||||||
|
f" got {combine_text_under_n_chars} > {expected_hard_max}"
|
||||||
|
),
|
||||||
|
):
|
||||||
|
_ByTitleChunkingOptions.new(
|
||||||
|
max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars
|
||||||
|
)
|
||||||
|
|
||||||
|
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
|
||||||
|
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
|
||||||
|
|
||||||
|
In particular, `combine_text_under_n_chars` value is adjusted down to the
|
||||||
|
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
|
||||||
|
value of `new_after_n_chars`.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
opts = _ByTitleChunkingOptions(new_after_n_chars=200)
|
||||||
|
except ValueError:
|
||||||
|
pytest.fail("did not accept `new_after_n_chars` as option by itself")
|
||||||
|
|
||||||
|
assert opts.soft_max == 200
|
||||||
|
assert opts.combine_text_under_n_chars == 200
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("multipage_sections", "expected_value"),
|
||||||
|
[(True, True), (False, False), (None, CHUNK_MULTI_PAGE_DEFAULT)],
|
||||||
|
)
|
||||||
|
def it_knows_whether_to_break_chunks_on_page_boundaries(
|
||||||
|
self, multipage_sections: bool, expected_value: bool
|
||||||
|
):
|
||||||
|
opts = _ByTitleChunkingOptions(multipage_sections=multipage_sections)
|
||||||
|
assert opts.multipage_sections is expected_value
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import copy
|
|||||||
from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, Sequence, cast
|
from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, Sequence, cast
|
||||||
|
|
||||||
import regex
|
import regex
|
||||||
from typing_extensions import Self, TypeAlias
|
from typing_extensions import TypeAlias
|
||||||
|
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
CompositeElement,
|
CompositeElement,
|
||||||
@ -68,9 +68,6 @@ class ChunkingOptions:
|
|||||||
when not specified, which effectively disables this behavior. Specifying 0 for this
|
when not specified, which effectively disables this behavior. Specifying 0 for this
|
||||||
argument causes each element to appear in a chunk by itself (although an element with text
|
argument causes each element to appear in a chunk by itself (although an element with text
|
||||||
longer than `max_characters` will be still be split into two or more chunks).
|
longer than `max_characters` will be still be split into two or more chunks).
|
||||||
multipage_sections
|
|
||||||
Indicates that page-boundaries should not be respected while chunking, i.e. elements
|
|
||||||
appearing on two different pages can appear in the same chunk.
|
|
||||||
combine_text_under_n_chars
|
combine_text_under_n_chars
|
||||||
Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
|
Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
|
||||||
relevant for a chunking strategy that specifies higher-level semantic boundaries to be
|
relevant for a chunking strategy that specifies higher-level semantic boundaries to be
|
||||||
@ -101,9 +98,9 @@ class ChunkingOptions:
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
*,
|
||||||
combine_text_under_n_chars: Optional[int] = None,
|
combine_text_under_n_chars: Optional[int] = None,
|
||||||
max_characters: Optional[int] = None,
|
max_characters: Optional[int] = None,
|
||||||
multipage_sections: Optional[bool] = None,
|
|
||||||
new_after_n_chars: Optional[int] = None,
|
new_after_n_chars: Optional[int] = None,
|
||||||
overlap: Optional[int] = None,
|
overlap: Optional[int] = None,
|
||||||
overlap_all: Optional[bool] = None,
|
overlap_all: Optional[bool] = None,
|
||||||
@ -111,59 +108,28 @@ class ChunkingOptions:
|
|||||||
):
|
):
|
||||||
self._combine_text_under_n_chars_arg = combine_text_under_n_chars
|
self._combine_text_under_n_chars_arg = combine_text_under_n_chars
|
||||||
self._max_characters_arg = max_characters
|
self._max_characters_arg = max_characters
|
||||||
self._multipage_sections_arg = multipage_sections
|
|
||||||
self._new_after_n_chars_arg = new_after_n_chars
|
self._new_after_n_chars_arg = new_after_n_chars
|
||||||
self._overlap_arg = overlap
|
self._overlap_arg = overlap
|
||||||
self._overlap_all_arg = overlap_all
|
self._overlap_all_arg = overlap_all
|
||||||
self._text_splitting_separators = text_splitting_separators
|
self._text_splitting_separators = text_splitting_separators
|
||||||
|
|
||||||
@classmethod
|
@lazyproperty
|
||||||
def new(
|
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||||
cls,
|
"""The semantic-boundary detectors to be applied to break pre-chunks.
|
||||||
combine_text_under_n_chars: Optional[int] = None,
|
|
||||||
max_characters: Optional[int] = None,
|
|
||||||
multipage_sections: Optional[bool] = None,
|
|
||||||
new_after_n_chars: Optional[int] = None,
|
|
||||||
overlap: Optional[int] = None,
|
|
||||||
overlap_all: Optional[bool] = None,
|
|
||||||
text_splitting_separators: Sequence[str] = ("\n", " "),
|
|
||||||
) -> Self:
|
|
||||||
"""Construct validated instance.
|
|
||||||
|
|
||||||
Raises `ValueError` on invalid arguments like overlap > max_chars.
|
Overridden by sub-typs to provide semantic-boundary isolation behaviors.
|
||||||
"""
|
"""
|
||||||
self = cls(
|
return ()
|
||||||
combine_text_under_n_chars,
|
|
||||||
max_characters,
|
|
||||||
multipage_sections,
|
|
||||||
new_after_n_chars,
|
|
||||||
overlap,
|
|
||||||
overlap_all,
|
|
||||||
text_splitting_separators,
|
|
||||||
)
|
|
||||||
self._validate()
|
|
||||||
return self
|
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def combine_text_under_n_chars(self) -> int:
|
def combine_text_under_n_chars(self) -> int:
|
||||||
"""Combine consecutive text pre-chunks if former is smaller than this and both will fit.
|
"""Combine two consecutive text pre-chunks if first is smaller than this and both will fit.
|
||||||
|
|
||||||
- Does not combine table chunks with text chunks even if they would both fit in the
|
Default applied here is `0` which essentially disables chunk combining. Must be overridden
|
||||||
chunking window.
|
by subclass where combining behavior is supported.
|
||||||
- Does not combine text chunks if together they would exceed the chunking window.
|
|
||||||
- Defaults to `max_characters` when not specified.
|
|
||||||
- Is reduced to `new_after_n_chars` when it exceeds that value.
|
|
||||||
"""
|
"""
|
||||||
max_characters = self.hard_max
|
arg_value = self._combine_text_under_n_chars_arg
|
||||||
soft_max = self.soft_max
|
return arg_value if arg_value is not None else 0
|
||||||
arg = self._combine_text_under_n_chars_arg
|
|
||||||
|
|
||||||
# -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is
|
|
||||||
# -- capped at max-chars
|
|
||||||
combine_text_under_n_chars = max_characters if arg is None or arg > max_characters else arg
|
|
||||||
|
|
||||||
# -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
|
|
||||||
return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
|
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def hard_max(self) -> int:
|
def hard_max(self) -> int:
|
||||||
@ -185,12 +151,6 @@ class ChunkingOptions:
|
|||||||
"""
|
"""
|
||||||
return self.overlap if self._overlap_all_arg else 0
|
return self.overlap if self._overlap_all_arg else 0
|
||||||
|
|
||||||
@lazyproperty
|
|
||||||
def multipage_sections(self) -> bool:
|
|
||||||
"""When False, break pre-chunks on page-boundaries."""
|
|
||||||
arg_value = self._multipage_sections_arg
|
|
||||||
return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value)
|
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def overlap(self) -> int:
|
def overlap(self) -> int:
|
||||||
"""The number of characters to overlap text when splitting chunks mid-text.
|
"""The number of characters to overlap text when splitting chunks mid-text.
|
||||||
@ -256,28 +216,15 @@ class ChunkingOptions:
|
|||||||
if max_characters <= 0:
|
if max_characters <= 0:
|
||||||
raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}")
|
raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}")
|
||||||
|
|
||||||
# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
|
# -- a negative value for `new_after_n_chars` is assumed to be a mistake the caller will
|
||||||
# -- but a negative value is not
|
# -- want to know about
|
||||||
combine_text_under_n_chars = self._combine_text_under_n_chars_arg
|
|
||||||
if combine_text_under_n_chars is not None and combine_text_under_n_chars < 0:
|
|
||||||
raise ValueError(
|
|
||||||
f"'combine_text_under_n_chars' argument must be >= 0,"
|
|
||||||
f" got {combine_text_under_n_chars}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# -- a negative value for `new_after_n_chars` is assumed to
|
|
||||||
# -- be a mistake the caller will want to know about
|
|
||||||
new_after_n_chars = self._new_after_n_chars_arg
|
new_after_n_chars = self._new_after_n_chars_arg
|
||||||
if new_after_n_chars is not None and new_after_n_chars < 0:
|
if new_after_n_chars is not None and new_after_n_chars < 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}"
|
f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# -- overlap must be less than max-chars or the chunk text will
|
# -- overlap must be less than max-chars or the chunk text will never be consumed --
|
||||||
# -- never be consumed
|
|
||||||
# TODO: consider a heuristic like never overlap more than half,
|
|
||||||
# otherwise there could be corner cases leading to an infinite
|
|
||||||
# loop (I think).
|
|
||||||
if self.overlap >= max_characters:
|
if self.overlap >= max_characters:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"'overlap' argument must be less than `max_characters`,"
|
f"'overlap' argument must be less than `max_characters`,"
|
||||||
@ -402,12 +349,12 @@ class _TextSplitter:
|
|||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
# BASE PRE-CHUNKER
|
# PRE-CHUNKER
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
class BasePreChunker:
|
class PreChunker:
|
||||||
"""Base-class for per-strategy pre-chunkers.
|
"""Gathers sequential elements into pre-chunks as length constraints allow.
|
||||||
|
|
||||||
The pre-chunker's responsibilities are:
|
The pre-chunker's responsibilities are:
|
||||||
|
|
||||||
@ -465,7 +412,7 @@ class BasePreChunker:
|
|||||||
@lazyproperty
|
@lazyproperty
|
||||||
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||||
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
||||||
return ()
|
return self._opts.boundary_predicates
|
||||||
|
|
||||||
def _is_in_new_semantic_unit(self, element: Element) -> bool:
|
def _is_in_new_semantic_unit(self, element: Element) -> bool:
|
||||||
"""True when `element` begins a new semantic unit such as a section or page."""
|
"""True when `element` begins a new semantic unit such as a section or page."""
|
||||||
|
|||||||
@ -17,7 +17,9 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Iterable, Optional
|
from typing import Iterable, Optional
|
||||||
|
|
||||||
from unstructured.chunking.base import BasePreChunker, ChunkingOptions
|
from typing_extensions import Self
|
||||||
|
|
||||||
|
from unstructured.chunking.base import ChunkingOptions, PreChunker
|
||||||
from unstructured.documents.elements import Element
|
from unstructured.documents.elements import Element
|
||||||
|
|
||||||
|
|
||||||
@ -58,7 +60,7 @@ def chunk_elements(
|
|||||||
level of "pollution" of otherwise clean semantic chunk boundaries.
|
level of "pollution" of otherwise clean semantic chunk boundaries.
|
||||||
"""
|
"""
|
||||||
# -- raises ValueError on invalid parameters --
|
# -- raises ValueError on invalid parameters --
|
||||||
opts = ChunkingOptions.new(
|
opts = _BasicChunkingOptions.new(
|
||||||
max_characters=max_characters,
|
max_characters=max_characters,
|
||||||
new_after_n_chars=new_after_n_chars,
|
new_after_n_chars=new_after_n_chars,
|
||||||
overlap=overlap,
|
overlap=overlap,
|
||||||
@ -67,14 +69,32 @@ def chunk_elements(
|
|||||||
|
|
||||||
return [
|
return [
|
||||||
chunk
|
chunk
|
||||||
for pre_chunk in BasicPreChunker.iter_pre_chunks(elements, opts)
|
for pre_chunk in PreChunker.iter_pre_chunks(elements, opts)
|
||||||
for chunk in pre_chunk.iter_chunks()
|
for chunk in pre_chunk.iter_chunks()
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
class BasicPreChunker(BasePreChunker):
|
class _BasicChunkingOptions(ChunkingOptions):
|
||||||
"""Produces pre-chunks from a sequence of document-elements using the "basic" rule-set.
|
"""Options for `basic` chunking."""
|
||||||
|
|
||||||
The "basic" rule-set is essentially "no-rules" other than `Table` is segregated into its own
|
@classmethod
|
||||||
pre-chunk.
|
def new(
|
||||||
|
cls,
|
||||||
|
*,
|
||||||
|
max_characters: Optional[int] = None,
|
||||||
|
new_after_n_chars: Optional[int] = None,
|
||||||
|
overlap: Optional[int] = None,
|
||||||
|
overlap_all: Optional[bool] = None,
|
||||||
|
) -> Self:
|
||||||
|
"""Construct validated instance.
|
||||||
|
|
||||||
|
Raises `ValueError` on invalid arguments like overlap > max_chars.
|
||||||
"""
|
"""
|
||||||
|
self = cls(
|
||||||
|
max_characters=max_characters,
|
||||||
|
new_after_n_chars=new_after_n_chars,
|
||||||
|
overlap=overlap,
|
||||||
|
overlap_all=overlap_all,
|
||||||
|
)
|
||||||
|
self._validate()
|
||||||
|
return self
|
||||||
|
|||||||
@ -7,11 +7,14 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Iterable, Iterator, Optional
|
from typing import Iterable, Iterator, Optional
|
||||||
|
|
||||||
|
from typing_extensions import Self
|
||||||
|
|
||||||
from unstructured.chunking.base import (
|
from unstructured.chunking.base import (
|
||||||
BasePreChunker,
|
CHUNK_MULTI_PAGE_DEFAULT,
|
||||||
BoundaryPredicate,
|
BoundaryPredicate,
|
||||||
ChunkingOptions,
|
ChunkingOptions,
|
||||||
PreChunkCombiner,
|
PreChunkCombiner,
|
||||||
|
PreChunker,
|
||||||
is_in_next_section,
|
is_in_next_section,
|
||||||
is_on_next_page,
|
is_on_next_page,
|
||||||
is_title,
|
is_title,
|
||||||
@ -22,6 +25,7 @@ from unstructured.utils import lazyproperty
|
|||||||
|
|
||||||
def chunk_by_title(
|
def chunk_by_title(
|
||||||
elements: Iterable[Element],
|
elements: Iterable[Element],
|
||||||
|
*,
|
||||||
max_characters: Optional[int] = None,
|
max_characters: Optional[int] = None,
|
||||||
multipage_sections: Optional[bool] = None,
|
multipage_sections: Optional[bool] = None,
|
||||||
combine_text_under_n_chars: Optional[int] = None,
|
combine_text_under_n_chars: Optional[int] = None,
|
||||||
@ -65,7 +69,7 @@ def chunk_by_title(
|
|||||||
elements and not subject to text-splitting. Use this with caution as it entails a certain
|
elements and not subject to text-splitting. Use this with caution as it entails a certain
|
||||||
level of "pollution" of otherwise clean semantic chunk boundaries.
|
level of "pollution" of otherwise clean semantic chunk boundaries.
|
||||||
"""
|
"""
|
||||||
opts = ChunkingOptions.new(
|
opts = _ByTitleChunkingOptions.new(
|
||||||
combine_text_under_n_chars=combine_text_under_n_chars,
|
combine_text_under_n_chars=combine_text_under_n_chars,
|
||||||
max_characters=max_characters,
|
max_characters=max_characters,
|
||||||
multipage_sections=multipage_sections,
|
multipage_sections=multipage_sections,
|
||||||
@ -75,27 +79,127 @@ def chunk_by_title(
|
|||||||
)
|
)
|
||||||
|
|
||||||
pre_chunks = PreChunkCombiner(
|
pre_chunks = PreChunkCombiner(
|
||||||
_ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts
|
PreChunker.iter_pre_chunks(elements, opts), opts=opts
|
||||||
).iter_combined_pre_chunks()
|
).iter_combined_pre_chunks()
|
||||||
|
|
||||||
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]
|
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]
|
||||||
|
|
||||||
|
|
||||||
class _ByTitlePreChunker(BasePreChunker):
|
class _ByTitleChunkingOptions(ChunkingOptions):
|
||||||
"""Pre-chunker for the "by_title" chunking strategy.
|
"""Adds the by-title-specific chunking options to the base case.
|
||||||
|
|
||||||
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a
|
`by_title`-specific options:
|
||||||
new "section", hence the "by-title" designation.
|
|
||||||
|
multipage_sections
|
||||||
|
Indicates that page-boundaries should not be respected while chunking, i.e. elements
|
||||||
|
appearing on two different pages can appear in the same chunk.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
max_characters: Optional[int] = None,
|
||||||
|
combine_text_under_n_chars: Optional[int] = None,
|
||||||
|
multipage_sections: Optional[bool] = None,
|
||||||
|
new_after_n_chars: Optional[int] = None,
|
||||||
|
overlap: Optional[int] = None,
|
||||||
|
overlap_all: Optional[bool] = None,
|
||||||
|
):
|
||||||
|
super().__init__(
|
||||||
|
combine_text_under_n_chars=combine_text_under_n_chars,
|
||||||
|
max_characters=max_characters,
|
||||||
|
new_after_n_chars=new_after_n_chars,
|
||||||
|
overlap=overlap,
|
||||||
|
overlap_all=overlap_all,
|
||||||
|
)
|
||||||
|
self._multipage_sections_arg = multipage_sections
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def new(
|
||||||
|
cls,
|
||||||
|
*,
|
||||||
|
max_characters: Optional[int] = None,
|
||||||
|
combine_text_under_n_chars: Optional[int] = None,
|
||||||
|
multipage_sections: Optional[bool] = None,
|
||||||
|
new_after_n_chars: Optional[int] = None,
|
||||||
|
overlap: Optional[int] = None,
|
||||||
|
overlap_all: Optional[bool] = None,
|
||||||
|
) -> Self:
|
||||||
|
"""Return instance or raises `ValueError` on invalid arguments like overlap > max_chars."""
|
||||||
|
self = cls(
|
||||||
|
max_characters=max_characters,
|
||||||
|
combine_text_under_n_chars=combine_text_under_n_chars,
|
||||||
|
multipage_sections=multipage_sections,
|
||||||
|
new_after_n_chars=new_after_n_chars,
|
||||||
|
overlap=overlap,
|
||||||
|
overlap_all=overlap_all,
|
||||||
|
)
|
||||||
|
self._validate()
|
||||||
|
return self
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||||
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
"""The semantic-boundary detectors to be applied to break pre-chunks.
|
||||||
|
|
||||||
|
For the `by_title` strategy these are sections indicated by a title (section-heading), an
|
||||||
|
explicit section metadata item (only present for certain document types), and optionally
|
||||||
|
page boundaries.
|
||||||
|
"""
|
||||||
|
|
||||||
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
|
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
|
||||||
yield is_title
|
yield is_title
|
||||||
yield is_in_next_section()
|
yield is_in_next_section()
|
||||||
if not self._opts.multipage_sections:
|
if not self.multipage_sections:
|
||||||
yield is_on_next_page()
|
yield is_on_next_page()
|
||||||
|
|
||||||
return tuple(iter_boundary_predicates())
|
return tuple(iter_boundary_predicates())
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def combine_text_under_n_chars(self) -> int:
|
||||||
|
"""Combine consecutive text pre-chunks if former is smaller than this and both will fit.
|
||||||
|
|
||||||
|
- Does not combine table chunks with text chunks even if they would both fit in the
|
||||||
|
chunking window.
|
||||||
|
- Does not combine text chunks if together they would exceed the chunking window.
|
||||||
|
- Defaults to `max_characters` when not specified.
|
||||||
|
- Is reduced to `new_after_n_chars` when it exceeds that value.
|
||||||
|
"""
|
||||||
|
max_characters = self.hard_max
|
||||||
|
soft_max = self.soft_max
|
||||||
|
arg_value = self._combine_text_under_n_chars_arg
|
||||||
|
|
||||||
|
# -- `combine_text_under_n_chars` defaults to `max_characters` when not specified --
|
||||||
|
combine_text_under_n_chars = max_characters if arg_value is None else arg_value
|
||||||
|
|
||||||
|
# -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
|
||||||
|
return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def multipage_sections(self) -> bool:
|
||||||
|
"""When False, break pre-chunks on page-boundaries."""
|
||||||
|
arg_value = self._multipage_sections_arg
|
||||||
|
return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value)
|
||||||
|
|
||||||
|
def _validate(self) -> None:
|
||||||
|
"""Raise ValueError if request option-set is invalid."""
|
||||||
|
# -- start with base-class validations --
|
||||||
|
super()._validate()
|
||||||
|
|
||||||
|
if (combine_text_under_n_chars_arg := self._combine_text_under_n_chars_arg) is not None:
|
||||||
|
# -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
|
||||||
|
# -- but a negative value is not
|
||||||
|
if combine_text_under_n_chars_arg < 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"'combine_text_under_n_chars' argument must be >= 0,"
|
||||||
|
f" got {combine_text_under_n_chars_arg}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
|
||||||
|
# -- users. The chunking behavior would be no different than when
|
||||||
|
# -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
|
||||||
|
# -- default (500) then it can look like chunk-combining isn't working.
|
||||||
|
if combine_text_under_n_chars_arg > self.hard_max:
|
||||||
|
raise ValueError(
|
||||||
|
f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
|
||||||
|
f" value, got {combine_text_under_n_chars_arg} > {self.hard_max}"
|
||||||
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user