193 lines
6.1 KiB
Python
Raw Normal View History

from unstructured.chunking.title import (
_split_elements_by_title_and_table,
chunk_by_title,
)
from unstructured.documents.elements import (
CheckBox,
CompositeElement,
ElementMetadata,
Table,
Text,
Title,
)
def test_split_elements_by_title_and_table():
elements = [
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
CheckBox(),
]
sections = _split_elements_by_title_and_table(elements, combine_under_n_chars=0)
assert sections == [
[
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
],
[
Table("<table></table>"),
],
[
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
],
[
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
],
[
CheckBox(),
],
]
def test_chunk_by_title():
elements = [
Title("A Great Day", metadata=ElementMetadata(emphasized_text_contents=["Day"])),
Text("Today is a great day.", metadata=ElementMetadata(emphasized_text_contents=["day"])),
Text("It is sunny outside."),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text(
"Today is a bad day.",
metadata=ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}]),
),
Text("It is storming outside."),
CheckBox(),
]
chunks = chunk_by_title(elements, combine_under_n_chars=0)
assert chunks == [
CompositeElement(
"A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
),
Table("<table></table>"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]
assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
assert chunks[3].metadata == ElementMetadata(
regex_metadata=[{"text": "A", "start": 11, "end": 12}],
)
def test_chunk_by_title_respects_section_change():
elements = [
Title("A Great Day", metadata=ElementMetadata(section="first")),
Text("Today is a great day.", metadata=ElementMetadata(section="second")),
Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text(
"Today is a bad day.",
metadata=ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}]),
),
Text("It is storming outside."),
CheckBox(),
]
chunks = chunk_by_title(elements, combine_under_n_chars=0)
assert chunks == [
CompositeElement(
"A Great Day",
),
CompositeElement(
"Today is a great day.\n\nIt is sunny outside.",
),
Table("<table></table>"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]
def test_chunk_by_title_separates_by_page_number():
elements = [
Title("A Great Day", metadata=ElementMetadata(page_number=1)),
Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text(
"Today is a bad day.",
metadata=ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}]),
),
Text("It is storming outside."),
CheckBox(),
]
chunks = chunk_by_title(elements, multipage_sections=False, combine_under_n_chars=0)
assert chunks == [
CompositeElement(
"A Great Day",
),
CompositeElement(
"Today is a great day.\n\nIt is sunny outside.",
),
Table("<table></table>"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]
def test_chunk_by_title_groups_across_pages():
elements = [
Title("A Great Day", metadata=ElementMetadata(page_number=1)),
Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
Table("<table></table>"),
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
Title("A Bad Day"),
Text(
"Today is a bad day.",
metadata=ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}]),
),
Text("It is storming outside."),
CheckBox(),
]
chunks = chunk_by_title(elements, multipage_sections=True, combine_under_n_chars=0)
assert chunks == [
CompositeElement(
"A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
),
Table("<table></table>"),
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
CompositeElement(
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
),
CheckBox(),
]