114 lines
4.9 KiB
Python
Raw Normal View History

"""Unit-test suite for the `unstructured.chunking.base` module."""
from __future__ import annotations
import pytest
from unstructured.chunking.base import ChunkingOptions
class DescribeChunkingOptions:
"""Unit-test suite for `unstructured.chunking.model.ChunkingOptions objects."""
@pytest.mark.parametrize("max_characters", [0, -1, -42])
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
with pytest.raises(
ValueError,
match=f"'max_characters' argument must be > 0, got {max_characters}",
):
ChunkingOptions.new(max_characters=max_characters)
def it_does_not_complain_when_specifying_max_characters_by_itself(self):
"""Caller can specify `max_characters` arg without specifying any others.
In particular, When `combine_text_under_n_chars` is not specified it defaults to the value
of `max_characters`; it has no fixed default value that can be greater than `max_characters`
and trigger an exception.
"""
try:
ChunkingOptions.new(max_characters=50)
except ValueError:
pytest.fail("did not accept `max_characters` as option by itself")
@pytest.mark.parametrize("n_chars", [-1, -42])
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
with pytest.raises(
ValueError,
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
):
ChunkingOptions.new(combine_text_under_n_chars=n_chars)
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
opts = ChunkingOptions.new(combine_text_under_n_chars=0)
assert opts.combine_text_under_n_chars == 0
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
try:
opts = ChunkingOptions.new(combine_text_under_n_chars=50)
except ValueError:
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
assert opts.combine_text_under_n_chars == 50
def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self):
"""`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.
So rather than raising an exception or warning, we just cap that value at `max_characters`
which is the behavioral equivalent.
"""
try:
opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600)
except ValueError:
pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`")
assert opts.combine_text_under_n_chars == 500
@pytest.mark.parametrize("n_chars", [-1, -42])
def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
with pytest.raises(
ValueError,
match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
):
ChunkingOptions.new(new_after_n_chars=n_chars)
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
"""Caller can specify `new_after_n_chars` arg without specifying any other options.
In particular, `combine_text_under_n_chars` value is adjusted down to the
`new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
value of `new_after_n_chars`.
"""
try:
opts = ChunkingOptions.new(new_after_n_chars=200)
except ValueError:
pytest.fail("did not accept `new_after_n_chars` as option by itself")
assert opts.soft_max == 200
assert opts.combine_text_under_n_chars == 200
def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self):
"""Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
This puts each element into its own chunk, although long chunks are still split.
"""
opts = ChunkingOptions.new(new_after_n_chars=0)
assert opts.soft_max == 0
def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
"""`new_after_n_chars` > `max_characters` doesn't affect chunking behavior.
So rather than raising an exception or warning, we just cap that value at `max_characters`
which is the behavioral equivalent.
"""
try:
opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555)
except ValueError:
pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
assert opts.soft_max == 444
def it_knows_the_text_separator_string(self):
assert ChunkingOptions.new().text_separator == "\n\n"