mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

**Summary** Step 2 in prep for pluggable auto-partitioners, remove `regex_metadata` field from `ElementMetadata`. **Additional Context** - "regex-metadata" was an experimental feature that didn't pan out. - It's implemented by one of the post-partitioning metadata decorators, so get rid of it as part of the cleanup before consolidating those decorators.
503 lines
18 KiB
Python
503 lines
18 KiB
Python
# pyright: reportPrivateUsage=false
|
|
|
|
"""Test suite for the `unstructured.chunking.title` module."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any, Optional
|
|
|
|
import pytest
|
|
|
|
from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
|
|
from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
|
|
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
|
|
from unstructured.documents.coordinates import CoordinateSystem
|
|
from unstructured.documents.elements import (
|
|
CheckBox,
|
|
CompositeElement,
|
|
CoordinatesMetadata,
|
|
Element,
|
|
ElementMetadata,
|
|
ListItem,
|
|
Table,
|
|
Text,
|
|
Title,
|
|
)
|
|
from unstructured.partition.html import partition_html
|
|
|
|
# ================================================================================================
|
|
# INTEGRATION-TESTS
|
|
# ================================================================================================
|
|
# These test `chunk_by_title()` as an integrated whole, calling `chunk_by_title()` and inspecting
|
|
# the outputs.
|
|
# ================================================================================================
|
|
|
|
|
|
def test_it_splits_a_large_element_into_multiple_chunks():
    """A `Text` element longer than `max_characters` is emitted as several smaller chunks."""
    heading = Title("Introduction")
    oversized_paragraph = Text(
        "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
        " porta volutpat.",
    )
    elements: list[Element] = [heading, oversized_paragraph]

    actual = chunk_by_title(elements, max_characters=50)

    # -- the title gets a chunk of its own and the paragraph is divided into two pieces --
    expected = [
        CompositeElement("Introduction"),
        CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing"),
        CompositeElement("elit. In rhoncus ipsum sed lectus porta volutpat."),
    ]
    assert actual == expected
|
|
|
|
|
|
def test_it_splits_elements_by_title_and_table():
    """Expects four chunks: three titled sections and one `Table`, each with its originals.

    Chunking is requested with `include_orig_elements=True`, so every chunk is checked both for
    its type and for the source elements recorded in `metadata.orig_elements`.
    """
    elements: list[Element] = [
        Title("A Great Day"),
        Text("Today is a great day."),
        Text("It is sunny outside."),
        Table("Heading\nCell text"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)

    assert len(chunks) == 4

    # -- one (chunk-type, original-elements) pair per expected chunk, in document order --
    expectations: list[tuple[type[Element], list[Element]]] = [
        (
            CompositeElement,
            [
                Title("A Great Day"),
                Text("Today is a great day."),
                Text("It is sunny outside."),
            ],
        ),
        (Table, [Table("Heading\nCell text")]),
        (
            CompositeElement,
            [
                Title("An Okay Day"),
                Text("Today is an okay day."),
                Text("It is rainy outside."),
            ],
        ),
        (
            CompositeElement,
            [
                Title("A Bad Day"),
                Text("Today is a bad day."),
                Text("It is storming outside."),
                CheckBox(),
            ],
        ),
    ]
    for chunk, (chunk_type, orig_elements) in zip(chunks, expectations):
        assert isinstance(chunk, chunk_type)
        assert chunk.metadata.orig_elements == orig_elements
|
|
|
|
|
|
def test_chunk_by_title():
    """Section text is joined with blank lines and emphasized-text metadata is combined.

    `include_orig_elements=False` is passed so the chunk metadata compared at the end carries
    only the `emphasized_text_contents` values taken from the source elements.
    """
    source_elements: list[Element] = [
        Title("A Great Day", metadata=ElementMetadata(emphasized_text_contents=["Day"])),
        Text("Today is a great day.", metadata=ElementMetadata(emphasized_text_contents=["day"])),
        Text("It is sunny outside."),
        Table("Heading\nCell text"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(
        source_elements, combine_text_under_n_chars=0, include_orig_elements=False
    )

    great_day = CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside.")
    table = Table("Heading\nCell text")
    okay_day = CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside.")
    bad_day = CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.")
    assert chunks == [great_day, table, okay_day, bad_day]

    # -- values from the title and the first paragraph both appear, in document order --
    combined_metadata = ElementMetadata(emphasized_text_contents=["Day", "day"])
    assert chunks[0].metadata == combined_metadata
|
|
|
|
|
|
def test_chunk_by_title_separates_by_page_number():
    """With `multipage_sections=False` a page-number change starts a new chunk."""
    page_1 = ElementMetadata(page_number=1)
    elements: list[Element] = [
        Title("A Great Day", metadata=page_1),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("Heading\nCell text"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, multipage_sections=False, combine_text_under_n_chars=0)

    # -- the page-1 title is split from the page-2 paragraphs that follow it --
    expected_chunks = [
        CompositeElement("A Great Day"),
        CompositeElement("Today is a great day.\n\nIt is sunny outside."),
        Table("Heading\nCell text"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."),
    ]
    assert chunks == expected_chunks
|
|
|
|
|
|
def test_chuck_by_title_respects_multipage():
    """With `multipage_sections=True` a section spanning pages 1 and 2 stays in one chunk."""
    on_page_1 = ElementMetadata(page_number=1)
    elements: list[Element] = [
        Title("A Great Day", metadata=on_page_1),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("Heading\nCell text"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    result = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)

    # -- the first chunk holds the page-1 title together with its page-2 paragraphs --
    first_section = CompositeElement(
        "A Great Day\n\nToday is a great day.\n\nIt is sunny outside."
    )
    table = Table("Heading\nCell text")
    second_section = CompositeElement(
        "An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."
    )
    third_section = CompositeElement(
        "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."
    )
    assert result == [first_section, table, second_section, third_section]
|
|
|
|
|
|
def test_chunk_by_title_groups_across_pages():
    """Elements on different pages are grouped under their title when multipage is enabled."""
    second_page_paragraphs = [
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
    ]
    elements: list[Element] = [
        Title("A Great Day", metadata=ElementMetadata(page_number=1)),
        *second_page_paragraphs,
        Table("Heading\nCell text"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)

    expected_texts = (
        "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
        "An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside.",
        "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
    )
    # -- the table chunk sits between the first and second titled sections --
    assert chunks == [
        CompositeElement(expected_texts[0]),
        Table("Heading\nCell text"),
        CompositeElement(expected_texts[1]),
        CompositeElement(expected_texts[2]),
    ]
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_html():
    """`partition_html(..., chunking_strategy="by_title")` matches chunking its plain output."""
    html_path = "example-docs/example-10k-1p.html"

    chunked_while_partitioning = partition_html(html_path, chunking_strategy="by_title")
    unchunked = partition_html(html_path)
    chunked_afterwards = chunk_by_title(unchunked)

    # -- chunking must have changed something, and both routes must agree on the result --
    assert chunked_while_partitioning != unchunked
    assert chunked_while_partitioning == chunked_afterwards
|
|
|
|
|
|
def test_add_chunking_strategy_respects_max_characters():
    """No chunk exceeds `max_characters`, whether chunked by the partitioner or directly."""
    html_path = "example-docs/example-10k-1p.html"
    # -- the same size options are handed to both chunking routes --
    size_options: dict[str, Any] = {
        "combine_text_under_n_chars": 0,
        "new_after_n_chars": 50,
        "max_characters": 100,
    }

    chunk_elements = partition_html(html_path, chunking_strategy="by_title", **size_options)
    elements = partition_html(html_path)
    chunks = chunk_by_title(elements, **size_options)

    # -- directly-produced chunks are checked first, then those from the partitioner --
    for candidate in [*chunks, *chunk_elements]:
        assert isinstance(candidate, Text)
        assert len(candidate.text) <= 100
    assert chunk_elements != elements
    assert chunk_elements == chunks
|
|
|
|
|
|
def test_chunk_by_title_drops_detection_class_prob():
    """Chunks built from elements carrying `detection_class_prob` have the expected text.

    Only the string form of each chunk is compared with that of a plain `CompositeElement`.
    """
    # -- (element-class, text, detection_class_prob) for each source element, in order --
    element_specs: list[tuple[type[Text], str, float]] = [
        (Title, "A Great Day", 0.5),
        (Text, "Today is a great day.", 0.62),
        (Text, "It is sunny outside.", 0.73),
        (Title, "An Okay Day", 0.84),
        (Text, "Today is an okay day.", 0.95),
    ]
    elements: list[Element] = [
        element_cls(text, metadata=ElementMetadata(detection_class_prob=prob))
        for element_cls, text, prob in element_specs
    ]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)

    first_expected = CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside.")
    assert str(chunks[0]) == str(first_expected)
    second_expected = CompositeElement("An Okay Day\n\nToday is an okay day.")
    assert str(chunks[1]) == str(second_expected)
|
|
|
|
|
|
def test_chunk_by_title_drops_extra_metadata():
    """Chunks built from elements carrying coordinates metadata have the expected text.

    Only the string form of each chunk is compared with that of a plain `CompositeElement`.
    """

    def square_coords(low: float, high: float, size: float) -> ElementMetadata:
        """Metadata whose points are the corners (low|high, low|high) in a `size`-square system."""
        return ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=(
                    (low, low),
                    (high, low),
                    (low, high),
                    (high, high),
                ),
                system=CoordinateSystem(width=size, height=size),
            ),
        )

    elements: list[Element] = [
        Title("A Great Day", metadata=square_coords(0.1, 0.2, 0.1)),
        Text("Today is a great day.", metadata=square_coords(0.2, 0.3, 0.2)),
        Text("It is sunny outside.", metadata=square_coords(0.3, 0.4, 0.3)),
        # -- the second title has the same coordinates as the paragraph just before it --
        Title("An Okay Day", metadata=square_coords(0.3, 0.4, 0.3)),
        Text("Today is an okay day.", metadata=square_coords(0.4, 0.5, 0.4)),
    ]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)

    first_expected = CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside.")
    assert str(chunks[0]) == str(first_expected)

    second_expected = CompositeElement("An Okay Day\n\nToday is an okay day.")
    assert str(chunks[1]) == str(second_expected)
|
|
|
|
|
|
def test_it_considers_separator_length_when_pre_chunking():
    """Separators between elements count toward the space available in a chunk.

    The four elements total 114 characters of text, which would fit within the 115-character
    limit, but the three blank-line separators bring the combined length to 120, so the last
    list item is expected in a chunk of its own.
    """
    title = Title("Chunking Priorities")  # 19 chars
    list_items = [
        ListItem("Divide text into manageable chunks"),  # 34 chars
        ListItem("Preserve semantic boundaries"),  # 28 chars
        ListItem("Minimize mid-text chunk-splitting"),  # 33 chars
    ]
    elements: list[Element] = [title, *list_items]

    chunks = chunk_by_title(elements, max_characters=115)

    first_chunk_text = "\n\n".join(
        [
            "Chunking Priorities",
            "Divide text into manageable chunks",
            "Preserve semantic boundaries",
        ]
    )
    assert chunks == [
        CompositeElement(first_chunk_text),
        CompositeElement("Minimize mid-text chunk-splitting"),
    ]
|
|
|
|
|
|
# ================================================================================================
|
|
# UNIT-TESTS
|
|
# ================================================================================================
|
|
# These test individual components in isolation so they can exercise all edge cases while still
|
|
# performing well.
|
|
# ================================================================================================
|
|
|
|
|
|
class Describe_chunk_by_title:
    """Unit-test suite for `unstructured.chunking.title.chunk_by_title()` function."""

    @pytest.mark.parametrize(
        ("kwargs", "expected_value"),
        [
            ({"include_orig_elements": True}, True),
            ({"include_orig_elements": False}, False),
            ({"include_orig_elements": None}, True),
            ({}, True),
        ],
    )
    def it_supports_the_include_orig_elements_option(
        self, kwargs: dict[str, Any], expected_value: bool, _chunk_by_title_: Mock
    ):
        """The options passed on to `_chunk_by_title` reflect `include_orig_elements`.

        `True` is expected both when the argument is omitted and when it is `None`.
        """
        # -- an unsupported keyword would make this call raise, so getting past it also shows
        # -- that `chunk_by_title()` accepts "include_orig_elements"
        chunk_by_title([], **kwargs)

        # -- the mock must have been called with exactly two positional arguments; the second
        # -- one is the options object under inspection
        positional_args = _chunk_by_title_.call_args.args
        _, opts = positional_args
        assert opts.include_orig_elements is expected_value

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def _chunk_by_title_(self, request: FixtureRequest):
        """Mock made by `function_mock()` for `unstructured.chunking.title._chunk_by_title`."""
        mocked_target = "unstructured.chunking.title._chunk_by_title"
        return function_mock(request, mocked_target)
|
|
|
|
|
|
class Describe_ByTitleChunkingOptions:
    """Unit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects."""

    @pytest.mark.parametrize("n_chars", [-1, -42])
    def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
        """A negative `combine_text_under_n_chars` makes `.new()` raise `ValueError`."""
        with pytest.raises(
            ValueError,
            match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
        ):
            _ByTitleChunkingOptions.new(combine_text_under_n_chars=n_chars)

    def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
        """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
        # -- built with `.new()`, the same constructor the rejection tests in this suite show
        # -- raising `ValueError`, so "accepts" means the value got through that construction
        opts = _ByTitleChunkingOptions.new(combine_text_under_n_chars=0)
        assert opts.combine_text_under_n_chars == 0

    def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
        """Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
        try:
            # -- `.new()` is used, as in the `new_after_n_chars` counterpart of this test, so
            # -- the `ValueError` guarded against here is one this call is able to raise
            opts = _ByTitleChunkingOptions.new(combine_text_under_n_chars=50)
        except ValueError:
            pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")

        assert opts.combine_text_under_n_chars == 50

    @pytest.mark.parametrize(
        ("combine_text_under_n_chars", "max_characters", "expected_hard_max"),
        [(600, None, 500), (600, 450, 450)],
    )
    def it_rejects_combine_text_under_n_chars_greater_than_maxchars(
        self, combine_text_under_n_chars: int, max_characters: Optional[int], expected_hard_max: int
    ):
        """`combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.

        The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
        `max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
        larger number like 1500 then it can look like chunk-combining isn't working.
        """
        with pytest.raises(
            ValueError,
            match=(
                "'combine_text_under_n_chars' argument must not exceed `max_characters` value,"
                f" got {combine_text_under_n_chars} > {expected_hard_max}"
            ),
        ):
            _ByTitleChunkingOptions.new(
                max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars
            )

    def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
        """Caller can specify `new_after_n_chars` arg without specifying any other options."""
        try:
            opts = _ByTitleChunkingOptions.new(new_after_n_chars=200)
        except ValueError:
            pytest.fail("did not accept `new_after_n_chars` as option by itself")

        assert opts.soft_max == 200

    @pytest.mark.parametrize(
        ("multipage_sections", "expected_value"),
        [(True, True), (False, False), (None, CHUNK_MULTI_PAGE_DEFAULT)],
    )
    def it_knows_whether_to_break_chunks_on_page_boundaries(
        self, multipage_sections: Optional[bool], expected_value: bool
    ):
        """`multipage_sections` echoes an explicit value and uses the default for `None`."""
        # -- `multipage_sections` is `Optional` because the `None` case is one of the parameters
        opts = _ByTitleChunkingOptions(multipage_sections=multipage_sections)
        assert opts.multipage_sections is expected_value
|