Amanda Cameron 0584e1d031
chore: fix infer_table bug (#1833)
Carrying `skip_infer_table_types` to `infer_table_structure` in
partition flow. Now PPT/X, DOC/X, etc. Table elements should not have a
`text_as_html` field.

Note: I've continued to exclude this var from partitioners that go
through html flow, I think if we've already got the html it doesn't make
sense to carry the infer variable along, since we're not 'infer-ing' the
html table in these cases.


TODO:
  add unit tests

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: amanda103 <amanda103@users.noreply.github.com>
2023-10-24 00:11:53 +00:00

556 lines
18 KiB
Python

# pyright: reportPrivateUsage=false
from typing import List
import pytest
from unstructured.chunking.title import (
_split_elements_by_title_and_table,
chunk_by_title,
)
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
CheckBox,
CompositeElement,
CoordinatesMetadata,
Element,
ElementMetadata,
RegexMetadata,
Table,
Text,
Title,
)
from unstructured.partition.html import partition_html
def test_it_splits_a_large_section_into_multiple_chunks():
elements: List[Element] = [
Title("Introduction"),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
" porta volutpat.",
),
]
chunks = chunk_by_title(elements, combine_text_under_n_chars=50, max_characters=50)
assert chunks == [
CompositeElement("Introduction"),
CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing "),
CompositeElement("elit. In rhoncus ipsum sed lectus porta volutpat."),
]
def test_split_elements_by_title_and_table():
elements: List[Element] = [
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
CheckBox(),
]
sections = _split_elements_by_title_and_table(elements, combine_text_under_n_chars=0)
assert sections == [
[
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
],
[
Table("<table></table>"),
],
[
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
],
[
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
],
[
CheckBox(),
],
]
def test_chunk_by_title():
elements: List[Element] = [
Title("A Great Day", metadata=ElementMetadata(emphasized_text_contents=["Day"])),
Text("Today is a great day.", metadata=ElementMetadata(emphasized_text_contents=["day"])),
Text("It is sunny outside."),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text(
"Today is a bad day.",
metadata=ElementMetadata(
regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
),
),
Text("It is storming outside."),
CheckBox(),
]
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
assert chunks == [
CompositeElement(
"A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
),
Table("<table></table>"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]
assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
assert chunks[3].metadata == ElementMetadata(
regex_metadata={"a": [RegexMetadata(text="A", start=11, end=12)]},
)
def test_chunk_by_title_respects_section_change():
elements: List[Element] = [
Title("A Great Day", metadata=ElementMetadata(section="first")),
Text("Today is a great day.", metadata=ElementMetadata(section="second")),
Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text(
"Today is a bad day.",
metadata=ElementMetadata(
regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
),
),
Text("It is storming outside."),
CheckBox(),
]
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
assert chunks == [
CompositeElement(
"A Great Day",
),
CompositeElement(
"Today is a great day.\n\nIt is sunny outside.",
),
Table("<table></table>"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]
def test_chunk_by_title_separates_by_page_number():
elements: List[Element] = [
Title("A Great Day", metadata=ElementMetadata(page_number=1)),
Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text(
"Today is a bad day.",
metadata=ElementMetadata(
regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
),
),
Text("It is storming outside."),
CheckBox(),
]
chunks = chunk_by_title(elements, multipage_sections=False, combine_text_under_n_chars=0)
assert chunks == [
CompositeElement(
"A Great Day",
),
CompositeElement(
"Today is a great day.\n\nIt is sunny outside.",
),
Table("<table></table>"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]
def test_chunk_by_title_does_not_break_on_regex_metadata_change():
"""Sectioner is insensitive to regex-metadata changes.
A regex-metadata match in an element does not signify a semantic boundary and a section should
not be split based on such a difference.
"""
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]},
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
),
),
]
chunks = chunk_by_title(elements)
assert chunks == [
CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat.",
),
]
def test_chunk_by_title_consolidates_and_adjusts_offsets_of_regex_metadata():
"""ElementMetadata.regex_metadata of chunk is union of regex_metadatas of its elements.
The `start` and `end` offsets of each regex-match are adjusted to reflect their new position in
the chunk after element text has been concatenated.
"""
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
},
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
),
),
]
chunks = chunk_by_title(elements)
assert len(chunks) == 1
chunk = chunks[0]
assert chunk == CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat.",
)
assert chunk.metadata.regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
"ipsum": [
RegexMetadata(text="Ipsum", start=6, end=11),
RegexMetadata(text="ipsum", start=19, end=24),
RegexMetadata(text="ipsum", start=81, end=86),
],
}
def test_chunk_by_title_groups_across_pages():
elements: List[Element] = [
Title("A Great Day", metadata=ElementMetadata(page_number=1)),
Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text(
"Today is a bad day.",
metadata=ElementMetadata(
regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
),
),
Text("It is storming outside."),
CheckBox(),
]
chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)
assert chunks == [
CompositeElement(
"A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
),
Table("<table></table>"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]
def test_add_chunking_strategy_on_partition_html():
filename = "example-docs/example-10k-1p.html"
chunk_elements = partition_html(filename, chunking_strategy="by_title")
elements = partition_html(filename)
chunks = chunk_by_title(elements)
assert chunk_elements != elements
assert chunk_elements == chunks
def test_add_chunking_strategy_respects_max_characters():
filename = "example-docs/example-10k-1p.html"
chunk_elements = partition_html(
filename,
chunking_strategy="by_title",
combine_text_under_n_chars=0,
new_after_n_chars=50,
max_characters=100,
)
elements = partition_html(filename)
chunks = chunk_by_title(
elements,
combine_text_under_n_chars=0,
new_after_n_chars=50,
max_characters=100,
)
for chunk in chunks:
assert isinstance(chunk, Text)
assert len(chunk.text) <= 100
for chunk_element in chunk_elements:
assert isinstance(chunk_element, Text)
assert len(chunk_element.text) <= 100
assert chunk_elements != elements
assert chunk_elements == chunks
def test_add_chunking_strategy_on_partition_html_respects_multipage():
filename = "example-docs/example-10k-1p.html"
partitioned_elements_multipage_false_combine_chars_0 = partition_html(
filename,
chunking_strategy="by_title",
multipage_sections=False,
combine_text_under_n_chars=0,
new_after_n_chars=300,
max_characters=400,
)
partitioned_elements_multipage_true_combine_chars_0 = partition_html(
filename,
chunking_strategy="by_title",
multipage_sections=True,
combine_text_under_n_chars=0,
new_after_n_chars=300,
max_characters=400,
)
elements = partition_html(filename)
cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title(
elements,
multipage_sections=False,
combine_text_under_n_chars=0,
new_after_n_chars=300,
max_characters=400,
)
cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title(
elements,
multipage_sections=True,
combine_text_under_n_chars=0,
new_after_n_chars=300,
max_characters=400,
)
assert (
partitioned_elements_multipage_false_combine_chars_0
== cleaned_elements_multipage_false_combine_chars_0
)
assert (
partitioned_elements_multipage_true_combine_chars_0
== cleaned_elements_multipage_true_combine_chars_0
)
assert len(partitioned_elements_multipage_true_combine_chars_0) != len(
partitioned_elements_multipage_false_combine_chars_0,
)
@pytest.mark.parametrize(
("combine_text_under_n_chars", "new_after_n_chars", "max_characters"),
[
(-1, -1, -1), # invalid chunk size
(0, 0, 0), # invalid max_characters
(-5666, -6777, -8999), # invalid chunk size
(-5, 40, 50), # invalid chunk size
(50, 70, 20), # max_characters needs to be greater than new_after_n_chars
(70, 50, 50), # combine_text_under_n_chars needs to be les than new_after_n_chars
],
)
def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
combine_text_under_n_chars: int,
new_after_n_chars: int,
max_characters: int,
):
elements: List[Element] = [
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text("It is storming outside."),
CheckBox(),
]
with pytest.raises(ValueError):
chunk_by_title(
elements,
combine_text_under_n_chars=combine_text_under_n_chars,
new_after_n_chars=new_after_n_chars,
max_characters=max_characters,
)
def test_chunk_by_title_drops_detection_class_prob():
elements: List[Element] = [
Title(
"A Great Day",
metadata=ElementMetadata(
detection_class_prob=0.5,
),
),
Text(
"Today is a great day.",
metadata=ElementMetadata(
detection_class_prob=0.62,
),
),
Text(
"It is sunny outside.",
metadata=ElementMetadata(
detection_class_prob=0.73,
),
),
Title(
"An Okay Day",
metadata=ElementMetadata(
detection_class_prob=0.84,
),
),
Text(
"Today is an okay day.",
metadata=ElementMetadata(
detection_class_prob=0.95,
),
),
]
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
assert str(chunks[0]) == str(
CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
)
assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
def test_chunk_by_title_drops_extra_metadata():
elements: List[Element] = [
Title(
"A Great Day",
metadata=ElementMetadata(
coordinates=CoordinatesMetadata(
points=(
(0.1, 0.1),
(0.2, 0.1),
(0.1, 0.2),
(0.2, 0.2),
),
system=CoordinateSystem(width=0.1, height=0.1),
),
),
),
Text(
"Today is a great day.",
metadata=ElementMetadata(
coordinates=CoordinatesMetadata(
points=(
(0.2, 0.2),
(0.3, 0.2),
(0.2, 0.3),
(0.3, 0.3),
),
system=CoordinateSystem(width=0.2, height=0.2),
),
),
),
Text(
"It is sunny outside.",
metadata=ElementMetadata(
coordinates=CoordinatesMetadata(
points=(
(0.3, 0.3),
(0.4, 0.3),
(0.3, 0.4),
(0.4, 0.4),
),
system=CoordinateSystem(width=0.3, height=0.3),
),
),
),
Title(
"An Okay Day",
metadata=ElementMetadata(
coordinates=CoordinatesMetadata(
points=(
(0.3, 0.3),
(0.4, 0.3),
(0.3, 0.4),
(0.4, 0.4),
),
system=CoordinateSystem(width=0.3, height=0.3),
),
),
),
Text(
"Today is an okay day.",
metadata=ElementMetadata(
coordinates=CoordinatesMetadata(
points=(
(0.4, 0.4),
(0.5, 0.4),
(0.4, 0.5),
(0.5, 0.5),
),
system=CoordinateSystem(width=0.4, height=0.4),
),
),
),
]
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
assert str(chunks[0]) == str(
CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
)
assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))