mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-29 11:16:38 +00:00

### `chunk_by_title()` interface is "rude" **Executive Summary.** Perhaps the most commonly specified option for `chunk_by_title()` is `max_characters` (default: 500), which specifies the chunk window size. When a user specifies this value, they get an error message: ```python >>> chunks = chunk_by_title(elements, max_characters=100) ValueError: Invalid values for combine_text_under_n_chars, new_after_n_chars, and/or max_characters. ``` A few of the things that might reasonably pass through a user's mind at such a moment are: * "Is `100` not a valid value for `max_characters`? Why would that be?" * "I didn't specify a value for `combine_text_under_n_chars` or `new_after_n_chars`, in fact I don't know what they are because I haven't studied the documentation and would prefer not to; I just want smaller chunks! How could I supply an invalid value when I haven't supplied any value at all for these?" * "Which of these values is the problem? Why are you making me figure that out for myself? I'm sure the code knows which one is not valid, why doesn't it share that information with me? I'm busy here!" In this particular case, the problem is that `combine_text_under_n_chars` (defaults to 500) is greater than `max_characters`, which means it would never take effect (which is actually not a problem in itself). To fix this, once they had figured out that this was the problem, probably after opening an issue and maybe reading the source code, the user would need to specify: ```python >>> chunks = chunk_by_title( ... elements, max_characters=100, combine_text_under_n_chars=100 ... ) ``` This and other stressful user scenarios can be remedied by: * Using "active" defaults for the `combine_text_under_n_chars` and `new_after_n_chars` options. * Providing a specific error message for each way a constraint may be violated, such that direction to remedy the problem is immediately clear to the user. 
An *active default* works like this, for example: * Make the default for `combine_text_under_n_chars: int | None = None` such that the code can detect when it has not been specified. * When not specified, set its value to `max_characters`, the same as its current (static) default. This particular change would avoid the behavior in the motivating example above. Another alternative for this argument is simply: ```python combine_text_under_n_chars = min(max_characters, combine_text_under_n_chars) ``` ### Fix 1. Add constraint-specific error messages. 2. Use "active" defaults for `combine_text_under_n_chars` and `new_after_n_chars`. 3. Improve docstring to describe active defaults, and explain other argument behaviors, in particular identifying suppression options like `combine_text_under_n_chars = 0` to disable chunk combining.
657 lines
22 KiB
Python
657 lines
22 KiB
Python
# pyright: reportPrivateUsage=false
|
|
|
|
from typing import List
|
|
|
|
import pytest
|
|
|
|
from unstructured.chunking.title import (
|
|
_split_elements_by_title_and_table,
|
|
chunk_by_title,
|
|
)
|
|
from unstructured.documents.coordinates import CoordinateSystem
|
|
from unstructured.documents.elements import (
|
|
CheckBox,
|
|
CompositeElement,
|
|
CoordinatesMetadata,
|
|
Element,
|
|
ElementMetadata,
|
|
RegexMetadata,
|
|
Table,
|
|
Text,
|
|
Title,
|
|
)
|
|
from unstructured.partition.html import partition_html
|
|
|
|
# == chunk_by_title() validation behaviors =======================================================
|
|
|
|
|
|
@pytest.mark.parametrize("max_characters", [0, -1, -42])
def test_it_rejects_max_characters_not_greater_than_zero(max_characters: int):
    """A zero or negative `max_characters` raises `ValueError` that names the argument."""
    expected_message = f"'max_characters' argument must be > 0, got {max_characters}"
    lone_text: List[Element] = [Text("Lorem ipsum dolor.")]

    with pytest.raises(ValueError, match=expected_message):
        chunk_by_title(lone_text, max_characters=max_characters)
|
|
|
|
|
|
def test_it_does_not_complain_when_specifying_max_characters_by_itself():
    """`max_characters` may be the only option a caller provides.

    An unspecified `combine_text_under_n_chars` takes its value from `max_characters` rather than
    from a fixed default, so it cannot exceed `max_characters` and trigger an exception.
    """
    lone_text: List[Element] = [Text("Lorem ipsum dolor.")]

    try:
        chunk_by_title(lone_text, max_characters=50)
    except ValueError:
        # -- report a readable test failure rather than surfacing the raw exception --
        pytest.fail("did not accept `max_characters` as option by itself")
|
|
|
|
|
|
@pytest.mark.parametrize("n_chars", [-1, -42])
def test_it_rejects_combine_text_under_n_chars_for_n_less_than_zero(n_chars: int):
    """A negative `combine_text_under_n_chars` raises `ValueError` that names the argument."""
    expected_message = f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}"
    lone_text: List[Element] = [Text("Lorem ipsum dolor.")]

    with pytest.raises(ValueError, match=expected_message):
        chunk_by_title(lone_text, combine_text_under_n_chars=n_chars)
|
|
|
|
|
|
def test_it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining():
    """A caller disables chunk-combining by passing `combine_text_under_n_chars=0`."""
    lone_text: List[Element] = [Text("Lorem ipsum dolor.")]

    actual = chunk_by_title(lone_text, max_characters=50, combine_text_under_n_chars=0)

    expected = [CompositeElement("Lorem ipsum dolor.")]
    assert actual == expected
|
|
|
|
|
|
def test_it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself():
    """`combine_text_under_n_chars` may be the only option a caller provides."""
    lone_text: List[Element] = [Text("Lorem ipsum dolor.")]

    try:
        chunk_by_title(lone_text, combine_text_under_n_chars=50)
    except ValueError:
        # -- report a readable test failure rather than surfacing the raw exception --
        pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
|
|
|
|
|
|
def test_it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars():
    """`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.

    So rather than raising an exception or warning, that value is expected to be capped at
    `max_characters`, which is the behavioral equivalent. This test only verifies that no
    `ValueError` is raised for such a combination.
    """
    elements: List[Element] = [Text("Lorem ipsum dolor.")]

    try:
        chunk_by_title(elements, max_characters=500, combine_text_under_n_chars=600)
    except ValueError:
        # -- name the option actually under test; the message used to cite `new_after_n_chars`,
        # -- which this test never passes, and so would have misdirected whoever read the failure
        pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`")
|
|
|
|
|
|
@pytest.mark.parametrize("n_chars", [-1, -42])
def test_it_rejects_new_after_n_chars_for_n_less_than_zero(n_chars: int):
    """A negative `new_after_n_chars` raises `ValueError` that names the argument."""
    expected_message = f"'new_after_n_chars' argument must be >= 0, got {n_chars}"
    lone_text: List[Element] = [Text("Lorem ipsum dolor.")]

    with pytest.raises(ValueError, match=expected_message):
        chunk_by_title(lone_text, new_after_n_chars=n_chars)
|
|
|
|
|
|
def test_it_does_not_complain_when_specifying_new_after_n_chars_by_itself():
    """`new_after_n_chars` may be the only option a caller provides.

    When the default for `combine_text_under_n_chars` would exceed `new_after_n_chars`, the
    `combine_text_under_n_chars` value is adjusted down to the `new_after_n_chars` value instead
    of an exception being raised.
    """
    lone_text: List[Element] = [Text("Lorem ipsum dolor.")]

    try:
        chunk_by_title(lone_text, new_after_n_chars=50)
    except ValueError:
        # -- report a readable test failure rather than surfacing the raw exception --
        pytest.fail("did not accept `new_after_n_chars` as option by itself")
|
|
|
|
|
|
def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk():
    """`new_after_n_chars=0` gives every element a section of its own.

    Each element therefore lands in its own chunk, although long chunks are still split.
    """
    words = ("Lorem", "ipsum", "dolor")
    one_element_per_word: List[Element] = [Text(word) for word in words]

    actual = chunk_by_title(one_element_per_word, max_characters=50, new_after_n_chars=0)

    # -- one chunk per input element, in the original order --
    expected = [CompositeElement(word) for word in words]
    assert actual == expected
|
|
|
|
|
|
def test_it_silently_accepts_new_after_n_chars_greater_than_maxchars():
    """A `new_after_n_chars` larger than `max_characters` is accepted without complaint.

    Such a value has no effect on chunking, so instead of an exception or warning it is simply
    capped at `max_characters`, the behavioral equivalent.
    """
    lone_text: List[Element] = [Text("Lorem ipsum dolor.")]

    try:
        chunk_by_title(lone_text, max_characters=500, new_after_n_chars=600)
    except ValueError:
        # -- report a readable test failure rather than surfacing the raw exception --
        pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
def test_it_splits_a_large_section_into_multiple_chunks():
    """A section longer than `max_characters` comes back as several chunks within that limit."""
    long_paragraph = (
        "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
        " porta volutpat."
    )
    section: List[Element] = [Title("Introduction"), Text(long_paragraph)]

    actual = chunk_by_title(section, max_characters=50)

    expected = [
        CompositeElement("Introduction"),
        CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing "),
        CompositeElement("elit. In rhoncus ipsum sed lectus porta volutpat."),
    ]
    assert actual == expected
|
|
|
|
|
|
def test_split_elements_by_title_and_table():
    """Sectioner starts a new section at each `Title`, and isolates `Table` and `CheckBox`."""
    document: List[Element] = [
        Title("A Great Day"),
        Text("Today is a great day."),
        Text("It is sunny outside."),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    actual_sections = _split_elements_by_title_and_table(
        document,
        multipage_sections=True,
        combine_text_under_n_chars=0,
        new_after_n_chars=500,
        max_characters=500,
    )

    # -- expected sections are built from fresh, equal-valued elements --
    great_day = [
        Title("A Great Day"),
        Text("Today is a great day."),
        Text("It is sunny outside."),
    ]
    table_only = [Table("<table></table>")]
    okay_day = [
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
    ]
    bad_day = [
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
    ]
    checkbox_only = [CheckBox()]
    assert actual_sections == [great_day, table_only, okay_day, bad_day, checkbox_only]
|
|
|
|
|
|
def test_chunk_by_title():
    """Each title-led section becomes one chunk whose metadata is consolidated from its elements.

    `Table` and `CheckBox` elements pass through as chunks of their own. Emphasized-text entries
    are concatenated, and regex-match offsets are shifted to their position in the chunk text.
    """
    bad_day_regex_meta = ElementMetadata(
        regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
    )
    document: List[Element] = [
        Title("A Great Day", metadata=ElementMetadata(emphasized_text_contents=["Day"])),
        Text("Today is a great day.", metadata=ElementMetadata(emphasized_text_contents=["day"])),
        Text("It is sunny outside."),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day.", metadata=bad_day_regex_meta),
        Text("It is storming outside."),
        CheckBox(),
    ]

    actual = chunk_by_title(document, combine_text_under_n_chars=0)

    expected = [
        CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
        Table("<table></table>"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."),
        CheckBox(),
    ]
    assert actual == expected

    great_day_chunk = actual[0]
    assert great_day_chunk.metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])

    bad_day_chunk = actual[3]
    assert bad_day_chunk.metadata == ElementMetadata(
        regex_metadata={"a": [RegexMetadata(text="A", start=11, end=12)]},
    )
|
|
|
|
|
|
def test_chunk_by_title_respects_section_change():
    """A change in `metadata.section` between elements starts a new chunk."""
    bad_day_regex_meta = ElementMetadata(
        regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
    )
    document: List[Element] = [
        Title("A Great Day", metadata=ElementMetadata(section="first")),
        Text("Today is a great day.", metadata=ElementMetadata(section="second")),
        Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day.", metadata=bad_day_regex_meta),
        Text("It is storming outside."),
        CheckBox(),
    ]

    actual = chunk_by_title(document, combine_text_under_n_chars=0)

    expected = [
        # -- title in section "first" is separated from the text in section "second" --
        CompositeElement("A Great Day"),
        CompositeElement("Today is a great day.\n\nIt is sunny outside."),
        Table("<table></table>"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."),
        CheckBox(),
    ]
    assert actual == expected
|
|
|
|
|
|
def test_chunk_by_title_separates_by_page_number():
    """With `multipage_sections=False`, a page-number change starts a new chunk."""
    bad_day_regex_meta = ElementMetadata(
        regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
    )
    document: List[Element] = [
        Title("A Great Day", metadata=ElementMetadata(page_number=1)),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day.", metadata=bad_day_regex_meta),
        Text("It is storming outside."),
        CheckBox(),
    ]

    actual = chunk_by_title(document, multipage_sections=False, combine_text_under_n_chars=0)

    expected = [
        # -- page-1 title is separated from the page-2 text that follows it --
        CompositeElement("A Great Day"),
        CompositeElement("Today is a great day.\n\nIt is sunny outside."),
        Table("<table></table>"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."),
        CheckBox(),
    ]
    assert actual == expected
|
|
|
|
|
|
def test_chunk_by_title_does_not_break_on_regex_metadata_change():
    """Differing regex-metadata between elements does not split a section.

    A regex match inside an element is not a semantic boundary, so elements whose
    `regex_metadata` differ are still expected to land in the same chunk.
    """
    title_meta = ElementMetadata(
        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
    )
    first_para_meta = ElementMetadata(
        regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]},
    )
    second_para_meta = ElementMetadata(
        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
    )
    section: List[Element] = [
        Title("Lorem Ipsum", metadata=title_meta),
        Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.", metadata=first_para_meta),
        Text("In rhoncus ipsum sed lectus porta volutpat.", metadata=second_para_meta),
    ]

    actual = chunk_by_title(section)

    expected_text = (
        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
        " ipsum sed lectus porta volutpat."
    )
    assert actual == [CompositeElement(expected_text)]
|
|
|
|
|
|
def test_chunk_by_title_consolidates_and_adjusts_offsets_of_regex_metadata():
    """A chunk's `regex_metadata` is the union of the regex-metadata of its elements.

    Each match's `start` and `end` offsets are shifted so they locate the match within the
    concatenated chunk text rather than within the original element text.
    """
    title_meta = ElementMetadata(
        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
    )
    first_para_meta = ElementMetadata(
        regex_metadata={
            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
        },
    )
    second_para_meta = ElementMetadata(
        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
    )
    section: List[Element] = [
        Title("Lorem Ipsum", metadata=title_meta),
        Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.", metadata=first_para_meta),
        Text("In rhoncus ipsum sed lectus porta volutpat.", metadata=second_para_meta),
    ]

    actual = chunk_by_title(section)

    assert len(actual) == 1
    only_chunk = actual[0]
    expected_text = (
        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
        " ipsum sed lectus porta volutpat."
    )
    assert only_chunk == CompositeElement(expected_text)

    expected_regex_metadata = {
        "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
        "ipsum": [
            RegexMetadata(text="Ipsum", start=6, end=11),
            RegexMetadata(text="ipsum", start=19, end=24),
            RegexMetadata(text="ipsum", start=81, end=86),
        ],
    }
    assert only_chunk.metadata.regex_metadata == expected_regex_metadata
|
|
|
|
|
|
def test_chunk_by_title_groups_across_pages():
    """With `multipage_sections=True`, a page-number change does not start a new chunk."""
    bad_day_regex_meta = ElementMetadata(
        regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
    )
    document: List[Element] = [
        Title("A Great Day", metadata=ElementMetadata(page_number=1)),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("<table></table>"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day.", metadata=bad_day_regex_meta),
        Text("It is storming outside."),
        CheckBox(),
    ]

    actual = chunk_by_title(document, multipage_sections=True, combine_text_under_n_chars=0)

    expected = [
        # -- page-1 title and page-2 text stay together in a single chunk --
        CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
        Table("<table></table>"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."),
        CheckBox(),
    ]
    assert actual == expected
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_html():
    """Partitioning with `chunking_strategy="by_title"` equals chunking the plain partition."""
    html_path = "example-docs/example-10k-1p.html"

    chunked_by_partitioner = partition_html(html_path, chunking_strategy="by_title")
    unchunked = partition_html(html_path)
    chunked_directly = chunk_by_title(unchunked)

    # -- chunking must actually change the output, and both routes must agree --
    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_directly
|
|
|
|
|
|
def test_add_chunking_strategy_respects_max_characters():
    """No chunk exceeds `max_characters`, whether chunked by the partitioner or directly."""
    html_path = "example-docs/example-10k-1p.html"

    chunked_by_partitioner = partition_html(
        html_path,
        chunking_strategy="by_title",
        combine_text_under_n_chars=0,
        new_after_n_chars=50,
        max_characters=100,
    )
    unchunked = partition_html(html_path)
    chunked_directly = chunk_by_title(
        unchunked,
        combine_text_under_n_chars=0,
        new_after_n_chars=50,
        max_characters=100,
    )

    # -- directly-chunked results are checked first, then the partitioner's --
    for batch in (chunked_directly, chunked_by_partitioner):
        for item in batch:
            assert isinstance(item, Text)
            assert len(item.text) <= 100

    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_directly
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_html_respects_multipage():
    """`multipage_sections` passed to `partition_html()` reaches the chunker.

    For each setting, chunking via the partitioner must equal chunking the plain partition
    directly, and the two settings must yield a different number of chunks for this document.
    """
    html_path = "example-docs/example-10k-1p.html"

    def chunk_via_partitioner(multipage: bool):
        """Partition and chunk in a single `partition_html()` call."""
        return partition_html(
            html_path,
            chunking_strategy="by_title",
            multipage_sections=multipage,
            combine_text_under_n_chars=0,
            new_after_n_chars=300,
            max_characters=400,
        )

    def chunk_directly(partitioned: List[Element], multipage: bool):
        """Chunk already-partitioned elements using the same options."""
        return chunk_by_title(
            partitioned,
            multipage_sections=multipage,
            combine_text_under_n_chars=0,
            new_after_n_chars=300,
            max_characters=400,
        )

    via_partitioner_single_page = chunk_via_partitioner(False)
    via_partitioner_multipage = chunk_via_partitioner(True)
    unchunked = partition_html(html_path)
    direct_single_page = chunk_directly(unchunked, False)
    direct_multipage = chunk_directly(unchunked, True)

    assert via_partitioner_single_page == direct_single_page
    assert via_partitioner_multipage == direct_multipage
    assert len(via_partitioner_multipage) != len(via_partitioner_single_page)
|
|
|
|
|
|
def test_chunk_by_title_drops_detection_class_prob():
    """Elements carrying `detection_class_prob` metadata chunk to the expected text.

    NOTE(review): only the string form of each chunk is asserted here; the chunk metadata itself
    is not inspected by this test.
    """
    element_specs = (
        (Title, "A Great Day", 0.5),
        (Text, "Today is a great day.", 0.62),
        (Text, "It is sunny outside.", 0.73),
        (Title, "An Okay Day", 0.84),
        (Text, "Today is an okay day.", 0.95),
    )
    document: List[Element] = [
        element_cls(text, metadata=ElementMetadata(detection_class_prob=prob))
        for element_cls, text, prob in element_specs
    ]

    actual = chunk_by_title(document, combine_text_under_n_chars=0)

    expected_first = CompositeElement(
        "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
    )
    assert str(actual[0]) == str(expected_first)
    expected_second = CompositeElement("An Okay Day\n\nToday is an okay day.")
    assert str(actual[1]) == str(expected_second)
|
|
|
|
|
|
def test_chunk_by_title_drops_extra_metadata():
    """Elements carrying coordinates metadata chunk to the expected text.

    NOTE(review): only the string form of each chunk is asserted here; the chunk metadata itself
    is not inspected by this test.
    """

    def located_at(low: float, high: float) -> ElementMetadata:
        """Metadata for a square spanning `low`..`high` in a `low` x `low` coordinate system."""
        return ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=(
                    (low, low),
                    (high, low),
                    (low, high),
                    (high, high),
                ),
                system=CoordinateSystem(width=low, height=low),
            ),
        )

    document: List[Element] = [
        Title("A Great Day", metadata=located_at(0.1, 0.2)),
        Text("Today is a great day.", metadata=located_at(0.2, 0.3)),
        Text("It is sunny outside.", metadata=located_at(0.3, 0.4)),
        # -- this title has the same coordinates as the element before it --
        Title("An Okay Day", metadata=located_at(0.3, 0.4)),
        Text("Today is an okay day.", metadata=located_at(0.4, 0.5)),
    ]

    actual = chunk_by_title(document, combine_text_under_n_chars=0)

    expected_first = CompositeElement(
        "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
    )
    assert str(actual[0]) == str(expected_first)

    expected_second = CompositeElement("An Okay Day\n\nToday is an okay day.")
    assert str(actual[1]) == str(expected_second)
|