mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
193 lines
6.1 KiB
Python
193 lines
6.1 KiB
Python
![]() |
from unstructured.chunking.title import (
|
||
|
_split_elements_by_title_and_table,
|
||
|
chunk_by_title,
|
||
|
)
|
||
|
from unstructured.documents.elements import (
|
||
|
CheckBox,
|
||
|
CompositeElement,
|
||
|
ElementMetadata,
|
||
|
Table,
|
||
|
Text,
|
||
|
Title,
|
||
|
)
|
||
|
|
||
|
|
||
|
def test_split_elements_by_title_and_table():
    """Check how a flat element list is partitioned into sections.

    With ``combine_under_n_chars=0`` the expected partition opens a new section
    at each ``Title`` and puts the ``Table`` and the ``CheckBox`` in
    single-element sections of their own.
    """
    document = [
        Title("A Great Day"),
        Text("Today is a great day."),
        Text("It is sunny outside."),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    actual_sections = _split_elements_by_title_and_table(document, combine_under_n_chars=0)

    # Expected sections are built from fresh element instances so the comparison
    # relies on element equality rather than object identity.
    great_day_section = [
        Title("A Great Day"),
        Text("Today is a great day."),
        Text("It is sunny outside."),
    ]
    table_section = [
        Table("<table></table>"),
    ]
    okay_day_section = [
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
    ]
    bad_day_section = [
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
    ]
    checkbox_section = [
        CheckBox(),
    ]

    assert actual_sections == [
        great_day_section,
        table_section,
        okay_day_section,
        bad_day_section,
        checkbox_section,
    ]
|
||
|
|
||
|
|
||
|
def test_chunk_by_title():
    """Check chunk text and merged metadata produced by ``chunk_by_title``.

    Each title-led run of text is expected to collapse into one
    ``CompositeElement`` whose text joins the members with a blank line, while
    the ``Table`` and ``CheckBox`` are expected to pass through unchanged.
    """
    title_emphasis = ElementMetadata(emphasized_text_contents=["Day"])
    body_emphasis = ElementMetadata(emphasized_text_contents=["day"])
    body_regex = ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}])

    document = [
        Title("A Great Day", metadata=title_emphasis),
        Text("Today is a great day.", metadata=body_emphasis),
        Text("It is sunny outside."),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day.", metadata=body_regex),
        Text("It is storming outside."),
        CheckBox(),
    ]

    result = chunk_by_title(document, combine_under_n_chars=0)

    expected_chunks = [
        CompositeElement(
            "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
        ),
        Table("<table></table>"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
        CheckBox(),
    ]
    assert result == expected_chunks

    # Emphasis entries from the title and the first text are expected to be
    # concatenated in document order.
    great_day_metadata = result[0].metadata
    assert great_day_metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])

    # The regex match is expected at offsets shifted by 11, the length of
    # "A Bad Day\n\n" that precedes the matched text inside the combined chunk.
    bad_day_metadata = result[3].metadata
    assert bad_day_metadata == ElementMetadata(
        regex_metadata=[{"text": "A", "start": 11, "end": 12}],
    )
|
||
|
|
||
|
|
||
|
def test_chunk_by_title_respects_section_change():
    """Check that a change in ``section`` metadata is expected to end a chunk.

    The first title carries ``section="first"`` and the two texts after it carry
    ``section="second"``, so the title is expected in a chunk by itself and the
    two texts in a chunk together.
    """
    first_section = ElementMetadata(section="first")
    body_regex = ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}])

    document = [
        Title("A Great Day", metadata=first_section),
        Text("Today is a great day.", metadata=ElementMetadata(section="second")),
        Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day.", metadata=body_regex),
        Text("It is storming outside."),
        CheckBox(),
    ]

    result = chunk_by_title(document, combine_under_n_chars=0)

    expected_chunks = [
        # Title alone: its section differs from the texts that follow it.
        CompositeElement(
            "A Great Day",
        ),
        CompositeElement(
            "Today is a great day.\n\nIt is sunny outside.",
        ),
        Table("<table></table>"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
        CheckBox(),
    ]
    assert result == expected_chunks
|
||
|
|
||
|
|
||
|
def test_chunk_by_title_separates_by_page_number():
    """Check that ``multipage_sections=False`` is expected to split on page change.

    The first title is on page 1 and the two texts after it are on page 2, so
    the title is expected in a chunk by itself and the two texts in a chunk
    together.
    """
    page_one = ElementMetadata(page_number=1)
    body_regex = ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}])

    document = [
        Title("A Great Day", metadata=page_one),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day.", metadata=body_regex),
        Text("It is storming outside."),
        CheckBox(),
    ]

    result = chunk_by_title(document, multipage_sections=False, combine_under_n_chars=0)

    expected_chunks = [
        # Page 1 content.
        CompositeElement(
            "A Great Day",
        ),
        # Page 2 content, kept apart from the title on page 1.
        CompositeElement(
            "Today is a great day.\n\nIt is sunny outside.",
        ),
        Table("<table></table>"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
        CheckBox(),
    ]
    assert result == expected_chunks
|
||
|
|
||
|
|
||
|
def test_chunk_by_title_groups_across_pages():
    """Check that ``multipage_sections=True`` is expected to ignore page changes.

    The first title is on page 1 and the two texts after it are on page 2, yet
    all three are expected to land in a single chunk.
    """
    page_one = ElementMetadata(page_number=1)
    body_regex = ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}])

    document = [
        Title("A Great Day", metadata=page_one),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day.", metadata=body_regex),
        Text("It is storming outside."),
        CheckBox(),
    ]

    result = chunk_by_title(document, multipage_sections=True, combine_under_n_chars=0)

    expected_chunks = [
        # Title (page 1) and texts (page 2) combined into one chunk.
        CompositeElement(
            "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
        ),
        Table("<table></table>"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
        CheckBox(),
    ]
    assert result == expected_chunks
|