mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-11 07:01:24 +00:00
rfctr: rename pre chunk (#2261)
The original name for the precursor to a chunk in `chunk_by_title()` was conflated with how these element subsequences happen to be bounded (by document section) in that strategy. I mistakenly picked that up as a universal concept, but in fact no notion of a section arises in `by_character` or the other chunking strategies. Fix this misconception by using the name `pre-chunk` for this concept throughout.
This commit is contained in:
parent
74d089d942
commit
cbeaed21ef
@ -1,4 +1,4 @@
|
||||
## 0.11.4-dev10
|
||||
## 0.11.4-dev11
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
@ -5,12 +5,12 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from unstructured.chunking.title import (
|
||||
_SectionCombiner,
|
||||
PreChunkCombiner,
|
||||
TablePreChunk,
|
||||
TextPreChunk,
|
||||
TextPreChunkAccumulator,
|
||||
TextPreChunkBuilder,
|
||||
_split_elements_by_title_and_table,
|
||||
_TableSection,
|
||||
_TextSection,
|
||||
_TextSectionAccumulator,
|
||||
_TextSectionBuilder,
|
||||
chunk_by_title,
|
||||
)
|
||||
from unstructured.documents.coordinates import CoordinateSystem
|
||||
@ -130,7 +130,7 @@ def test_it_does_not_complain_when_specifying_new_after_n_chars_by_itself():
|
||||
|
||||
|
||||
def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk():
|
||||
"""Specifying `new_after_n_chars=0` places each element into its own section.
|
||||
"""Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
|
||||
|
||||
This puts each element into its own chunk, although long chunks are still split.
|
||||
"""
|
||||
@ -166,7 +166,7 @@ def test_it_silently_accepts_new_after_n_chars_greater_than_maxchars():
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def test_it_splits_a_large_section_into_multiple_chunks():
|
||||
def test_it_splits_a_large_element_into_multiple_chunks():
|
||||
elements: List[Element] = [
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
@ -199,36 +199,36 @@ def test_split_elements_by_title_and_table():
|
||||
CheckBox(),
|
||||
]
|
||||
|
||||
sections = _split_elements_by_title_and_table(
|
||||
pre_chunks = _split_elements_by_title_and_table(
|
||||
elements,
|
||||
multipage_sections=True,
|
||||
new_after_n_chars=500,
|
||||
max_characters=500,
|
||||
)
|
||||
|
||||
section = next(sections)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("A Great Day"),
|
||||
Text("Today is a great day."),
|
||||
Text("It is sunny outside."),
|
||||
]
|
||||
# --
|
||||
section = next(sections)
|
||||
assert isinstance(section, _TableSection)
|
||||
assert section._table == Table("Heading\nCell text")
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TablePreChunk)
|
||||
assert pre_chunk._table == Table("Heading\nCell text")
|
||||
# ==
|
||||
section = next(sections)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("An Okay Day"),
|
||||
Text("Today is an okay day."),
|
||||
Text("It is rainy outside."),
|
||||
]
|
||||
# --
|
||||
section = next(sections)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("A Bad Day"),
|
||||
Text("Today is a bad day."),
|
||||
Text("It is storming outside."),
|
||||
@ -236,7 +236,7 @@ def test_split_elements_by_title_and_table():
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(sections)
|
||||
next(pre_chunks)
|
||||
|
||||
|
||||
def test_chunk_by_title():
|
||||
@ -351,9 +351,9 @@ def test_chunk_by_title_separates_by_page_number():
|
||||
|
||||
|
||||
def test_chunk_by_title_does_not_break_on_regex_metadata_change():
|
||||
"""Sectioner is insensitive to regex-metadata changes.
|
||||
"""PreChunker is insensitive to regex-metadata changes.
|
||||
|
||||
A regex-metadata match in an element does not signify a semantic boundary and a section should
|
||||
A regex-metadata match in an element does not signify a semantic boundary and a pre-chunk should
|
||||
not be split based on such a difference.
|
||||
"""
|
||||
elements: List[Element] = [
|
||||
@ -672,8 +672,8 @@ def test_chunk_by_title_drops_extra_metadata():
|
||||
assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
|
||||
|
||||
|
||||
def test_it_considers_separator_length_when_sectioning():
|
||||
"""Sectioner includes length of separators when computing remaining space."""
|
||||
def test_it_considers_separator_length_when_pre_chunking():
|
||||
"""PreChunker includes length of separators when computing remaining space."""
|
||||
elements: List[Element] = [
|
||||
Title("Chunking Priorities"), # 19 chars
|
||||
ListItem("Divide text into manageable chunks"), # 34 chars
|
||||
@ -693,11 +693,11 @@ def test_it_considers_separator_length_when_sectioning():
|
||||
]
|
||||
|
||||
|
||||
# == Sections ====================================================================================
|
||||
# == PreChunks ===================================================================================
|
||||
|
||||
|
||||
class Describe_TableSection:
|
||||
"""Unit-test suite for `unstructured.chunking.title._TableSection objects."""
|
||||
class DescribeTablePreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.title.TablePreChunk objects."""
|
||||
|
||||
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
|
||||
html_table = (
|
||||
@ -711,11 +711,11 @@ class Describe_TableSection:
|
||||
"</table>"
|
||||
)
|
||||
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
|
||||
section = _TableSection(
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table))
|
||||
)
|
||||
|
||||
chunk_iter = section.iter_chunks(maxlen=175)
|
||||
chunk_iter = pre_chunk.iter_chunks(maxlen=175)
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, Table)
|
||||
@ -756,11 +756,11 @@ class Describe_TableSection:
|
||||
"Nunc aliquam id enim nec molestie\n"
|
||||
"Vivamus quis nunc ipsum donec ac fermentum"
|
||||
)
|
||||
section = _TableSection(
|
||||
pre_chunk = TablePreChunk(
|
||||
Table(text_table, metadata=ElementMetadata(text_as_html=html_table))
|
||||
)
|
||||
|
||||
chunk_iter = section.iter_chunks(maxlen=100)
|
||||
chunk_iter = pre_chunk.iter_chunks(maxlen=100)
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
@ -810,30 +810,30 @@ class Describe_TableSection:
|
||||
next(chunk_iter)
|
||||
|
||||
|
||||
class Describe_TextSection:
|
||||
"""Unit-test suite for `unstructured.chunking.title._TextSection objects."""
|
||||
class DescribeTextPreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunk objects."""
|
||||
|
||||
def it_can_combine_itself_with_another_TextSection_instance(self):
|
||||
""".combine() produces a new section by appending the elements of `other_section`.
|
||||
def it_can_combine_itself_with_another_TextPreChunk_instance(self):
|
||||
""".combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.
|
||||
|
||||
Note that neither the original or other section are mutated.
|
||||
Note that neither the original nor the other pre-chunk is mutated.
|
||||
"""
|
||||
section = _TextSection(
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
]
|
||||
)
|
||||
other_section = _TextSection(
|
||||
other_pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text("Donec semper facilisis metus finibus malesuada."),
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
]
|
||||
)
|
||||
|
||||
new_section = section.combine(other_section)
|
||||
new_pre_chunk = pre_chunk.combine(other_pre_chunk)
|
||||
|
||||
assert new_section == _TextSection(
|
||||
assert new_pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
@ -841,13 +841,13 @@ class Describe_TextSection:
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
]
|
||||
)
|
||||
assert section == _TextSection(
|
||||
assert pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Text("In rhoncus ipsum sed lectus porta volutpat."),
|
||||
]
|
||||
)
|
||||
assert other_section == _TextSection(
|
||||
assert other_pre_chunk == TextPreChunk(
|
||||
[
|
||||
Text("Donec semper facilisis metus finibus malesuada."),
|
||||
Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
|
||||
@ -855,7 +855,7 @@ class Describe_TextSection:
|
||||
)
|
||||
|
||||
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
|
||||
section = _TextSection(
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
@ -865,19 +865,19 @@ class Describe_TextSection:
|
||||
]
|
||||
)
|
||||
|
||||
chunk_iter = section.iter_chunks(maxlen=200)
|
||||
chunk_iter = pre_chunk.iter_chunks(maxlen=200)
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement(
|
||||
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
|
||||
" In rhoncus ipsum sedlectus porta volutpat.",
|
||||
)
|
||||
assert chunk.metadata is section._consolidated_metadata
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
|
||||
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
|
||||
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
|
||||
# -- The sectioner will isolate that element in a section of its own.
|
||||
section = _TextSection(
|
||||
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||
@ -888,7 +888,7 @@ class Describe_TextSection:
|
||||
]
|
||||
)
|
||||
|
||||
chunk_iter = section.iter_chunks(maxlen=200)
|
||||
chunk_iter = pre_chunk.iter_chunks(maxlen=200)
|
||||
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement(
|
||||
@ -896,22 +896,22 @@ class Describe_TextSection:
|
||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut a"
|
||||
)
|
||||
assert chunk.metadata is section._consolidated_metadata
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement("liquip ex ea commodo consequat.")
|
||||
assert chunk.metadata is section._consolidated_metadata
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
|
||||
""".text_length is the size of chunk this section will produce (before any splitting)."""
|
||||
section = _TextSection([PageBreak(""), Text("foo"), Text("bar")])
|
||||
assert section.text_length == 8
|
||||
""".text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
|
||||
pre_chunk = TextPreChunk([PageBreak(""), Text("foo"), Text("bar")])
|
||||
assert pre_chunk.text_length == 8
|
||||
|
||||
def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
|
||||
section = _TextSection(
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title(
|
||||
"Lorem Ipsum",
|
||||
@ -934,7 +934,7 @@ class Describe_TextSection:
|
||||
]
|
||||
)
|
||||
|
||||
assert section._all_metadata_values == {
|
||||
assert pre_chunk._all_metadata_values == {
|
||||
# -- scalar values are accumulated in a list in element order --
|
||||
"category_depth": [0, 1],
|
||||
# -- all values are accumulated, not only unique ones --
|
||||
@ -963,7 +963,7 @@ class Describe_TextSection:
|
||||
)
|
||||
metadata_2.quotient = 1.74
|
||||
|
||||
section = _TextSection(
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum", metadata=metadata),
|
||||
Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
|
||||
@ -971,7 +971,7 @@ class Describe_TextSection:
|
||||
)
|
||||
|
||||
# -- ad-hoc fields "coefficient" and "quotient" do not appear --
|
||||
assert section._all_metadata_values == {
|
||||
assert pre_chunk._all_metadata_values == {
|
||||
"category_depth": [0, 1],
|
||||
"filename": ["foo.docx", "foo.docx"],
|
||||
"image_path": ["sprite.png"],
|
||||
@ -985,7 +985,7 @@ class Describe_TextSection:
|
||||
Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
|
||||
position in the chunk after element text has been concatenated.
|
||||
"""
|
||||
section = _TextSection(
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title(
|
||||
"Lorem Ipsum",
|
||||
@ -1011,7 +1011,7 @@ class Describe_TextSection:
|
||||
]
|
||||
)
|
||||
|
||||
regex_metadata = section._consolidated_regex_meta
|
||||
regex_metadata = pre_chunk._consolidated_regex_meta
|
||||
|
||||
assert regex_metadata == {
|
||||
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
|
||||
@ -1026,9 +1026,9 @@ class Describe_TextSection:
|
||||
"""._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
|
||||
|
||||
Only non-None fields should appear in the dict and each field value should be the
|
||||
consolidation of the values across the section elements.
|
||||
consolidation of the values across the pre_chunk elements.
|
||||
"""
|
||||
section = _TextSection(
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
PageBreak(""),
|
||||
Title(
|
||||
@ -1065,7 +1065,7 @@ class Describe_TextSection:
|
||||
]
|
||||
)
|
||||
|
||||
meta_kwargs = section._meta_kwargs
|
||||
meta_kwargs = pre_chunk._meta_kwargs
|
||||
|
||||
assert meta_kwargs == {
|
||||
"filename": "foo.docx",
|
||||
@ -1090,29 +1090,29 @@ class Describe_TextSection:
|
||||
([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
|
||||
],
|
||||
)
|
||||
def it_knows_the_concatenated_text_of_the_section(
|
||||
def it_knows_the_concatenated_text_of_the_pre_chunk(
|
||||
self, elements: List[Text], expected_value: str
|
||||
):
|
||||
"""._text is the "joined" text of the section elements.
|
||||
"""._text is the "joined" text of the pre-chunk elements.
|
||||
|
||||
The text-segment contributed by each element is separated from the next by a blank line
|
||||
("\n\n"). An element that contributes no text does not give rise to a separator.
|
||||
"""
|
||||
section = _TextSection(elements)
|
||||
assert section._text == expected_value
|
||||
pre_chunk = TextPreChunk(elements)
|
||||
assert pre_chunk._text == expected_value
|
||||
|
||||
|
||||
class Describe_TextSectionBuilder:
|
||||
"""Unit-test suite for `unstructured.chunking.title._TextSectionBuilder`."""
|
||||
class DescribeTextPreChunkBuilder:
|
||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`."""
|
||||
|
||||
def it_is_empty_on_construction(self):
|
||||
builder = _TextSectionBuilder(maxlen=50)
|
||||
builder = TextPreChunkBuilder(maxlen=50)
|
||||
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 50
|
||||
|
||||
def it_accumulates_elements_added_to_it(self):
|
||||
builder = _TextSectionBuilder(maxlen=150)
|
||||
builder = TextPreChunkBuilder(maxlen=150)
|
||||
|
||||
builder.add_element(Title("Introduction"))
|
||||
assert builder.text_length == 12
|
||||
@ -1127,8 +1127,8 @@ class Describe_TextSectionBuilder:
|
||||
assert builder.text_length == 112
|
||||
assert builder.remaining_space == 36
|
||||
|
||||
def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self):
|
||||
builder = _TextSectionBuilder(maxlen=150)
|
||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
builder = TextPreChunkBuilder(maxlen=150)
|
||||
builder.add_element(Title("Introduction"))
|
||||
builder.add_element(
|
||||
Text(
|
||||
@ -1137,10 +1137,10 @@ class Describe_TextSectionBuilder:
|
||||
),
|
||||
)
|
||||
|
||||
section = next(builder.flush())
|
||||
pre_chunk = next(builder.flush())
|
||||
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
@ -1150,17 +1150,17 @@ class Describe_TextSectionBuilder:
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 150
|
||||
|
||||
def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self):
|
||||
builder = _TextSectionBuilder(maxlen=150)
|
||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||
builder = TextPreChunkBuilder(maxlen=150)
|
||||
|
||||
sections = list(builder.flush())
|
||||
pre_chunks = list(builder.flush())
|
||||
|
||||
assert sections == []
|
||||
assert pre_chunks == []
|
||||
assert builder.text_length == 0
|
||||
assert builder.remaining_space == 150
|
||||
|
||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||
builder = _TextSectionBuilder(maxlen=50)
|
||||
builder = TextPreChunkBuilder(maxlen=50)
|
||||
builder.add_element(Text("abcde"))
|
||||
builder.add_element(Text("fghij"))
|
||||
|
||||
@ -1173,27 +1173,27 @@ class Describe_TextSectionBuilder:
|
||||
assert builder.remaining_space == 36
|
||||
|
||||
|
||||
# == SectionCombiner =============================================================================
|
||||
# == PreChunkCombiner =============================================================================
|
||||
|
||||
|
||||
class Describe_SectionCombiner:
|
||||
"""Unit-test suite for `unstructured.chunking.title._SectionCombiner`."""
|
||||
class DescribePreChunkCombiner:
|
||||
"""Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`."""
|
||||
|
||||
def it_combines_sequential_small_text_sections(self):
|
||||
sections = [
|
||||
_TextSection(
|
||||
def it_combines_sequential_small_text_pre_chunks(self):
|
||||
pre_chunks = [
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
]
|
||||
),
|
||||
_TextSection(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
]
|
||||
),
|
||||
_TextSection(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
@ -1201,13 +1201,13 @@ class Describe_SectionCombiner:
|
||||
),
|
||||
]
|
||||
|
||||
section_iter = _SectionCombiner(
|
||||
sections, maxlen=250, combine_text_under_n_chars=250
|
||||
).iter_combined_sections()
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, maxlen=250, combine_text_under_n_chars=250
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
@ -1216,18 +1216,18 @@ class Describe_SectionCombiner:
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
with pytest.raises(StopIteration):
|
||||
next(section_iter)
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def but_it_does_not_combine_table_sections(self):
|
||||
sections = [
|
||||
_TextSection(
|
||||
def but_it_does_not_combine_table_pre_chunks(self):
|
||||
pre_chunks = [
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
]
|
||||
),
|
||||
_TableSection(Table("Heading\nCell text")),
|
||||
_TextSection(
|
||||
TablePreChunk(Table("Heading\nCell text")),
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
@ -1235,47 +1235,47 @@ class Describe_SectionCombiner:
|
||||
),
|
||||
]
|
||||
|
||||
section_iter = _SectionCombiner(
|
||||
sections, maxlen=250, combine_text_under_n_chars=250
|
||||
).iter_combined_sections()
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, maxlen=250, combine_text_under_n_chars=250
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
]
|
||||
# --
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TableSection)
|
||||
assert section._table == Table("Heading\nCell text")
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TablePreChunk)
|
||||
assert pre_chunk._table == Table("Heading\nCell text")
|
||||
# --
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(section_iter)
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_respects_the_specified_combination_threshold(self):
|
||||
sections = [
|
||||
_TextSection( # 68
|
||||
pre_chunks = [
|
||||
TextPreChunk( # 68
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
]
|
||||
),
|
||||
_TextSection( # 71
|
||||
TextPreChunk( # 71
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
]
|
||||
),
|
||||
# -- len == 139
|
||||
_TextSection(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
@ -1283,45 +1283,45 @@ class Describe_SectionCombiner:
|
||||
),
|
||||
]
|
||||
|
||||
section_iter = _SectionCombiner(
|
||||
sections, maxlen=250, combine_text_under_n_chars=80
|
||||
).iter_combined_sections()
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, maxlen=250, combine_text_under_n_chars=80
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(section_iter)
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_respects_the_hard_maximum_window_length(self):
|
||||
sections = [
|
||||
_TextSection( # 68
|
||||
pre_chunks = [
|
||||
TextPreChunk( # 68
|
||||
[
|
||||
Title("Lorem Ipsum"), # 11
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."), # 55
|
||||
]
|
||||
),
|
||||
_TextSection( # 71
|
||||
TextPreChunk( # 71
|
||||
[
|
||||
Title("Mauris Nec"), # 10
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."), # 59
|
||||
]
|
||||
),
|
||||
# -- len == 139
|
||||
_TextSection(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"), # 8
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."), # 63
|
||||
@ -1330,35 +1330,35 @@ class Describe_SectionCombiner:
|
||||
# -- len == 214
|
||||
]
|
||||
|
||||
section_iter = _SectionCombiner(
|
||||
sections, maxlen=200, combine_text_under_n_chars=200
|
||||
).iter_combined_sections()
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, maxlen=200, combine_text_under_n_chars=200
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
# --
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(section_iter)
|
||||
next(pre_chunk_iter)
|
||||
|
||||
def it_accommodates_and_isolates_an_oversized_section(self):
|
||||
def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
|
||||
"""Such as occurs when a single element exceeds the window size."""
|
||||
|
||||
sections = [
|
||||
_TextSection([Title("Lorem Ipsum")]),
|
||||
_TextSection( # 179
|
||||
pre_chunks = [
|
||||
TextPreChunk([Title("Lorem Ipsum")]),
|
||||
TextPreChunk( # 179
|
||||
[
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit." # 55
|
||||
@ -1367,20 +1367,20 @@ class Describe_SectionCombiner:
|
||||
)
|
||||
]
|
||||
),
|
||||
_TextSection([Title("Vulputate Consequat")]),
|
||||
TextPreChunk([Title("Vulputate Consequat")]),
|
||||
]
|
||||
|
||||
section_iter = _SectionCombiner(
|
||||
sections, maxlen=150, combine_text_under_n_chars=150
|
||||
).iter_combined_sections()
|
||||
pre_chunk_iter = PreChunkCombiner(
|
||||
pre_chunks, maxlen=150, combine_text_under_n_chars=150
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [Title("Lorem Ipsum")]
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [Title("Lorem Ipsum")]
|
||||
# --
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit."
|
||||
" Mauris nec urna non augue vulputate consequat eget et nisi."
|
||||
@ -1388,28 +1388,28 @@ class Describe_SectionCombiner:
|
||||
)
|
||||
]
|
||||
# --
|
||||
section = next(section_iter)
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [Title("Vulputate Consequat")]
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [Title("Vulputate Consequat")]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(section_iter)
|
||||
next(pre_chunk_iter)
|
||||
|
||||
|
||||
class Describe_TextSectionAccumulator:
|
||||
"""Unit-test suite for `unstructured.chunking.title._TextSectionAccumulator`."""
|
||||
class DescribeTextPreChunkAccumulator:
|
||||
"""Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`."""
|
||||
|
||||
def it_is_empty_on_construction(self):
|
||||
accum = _TextSectionAccumulator(maxlen=100)
|
||||
accum = TextPreChunkAccumulator(maxlen=100)
|
||||
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 100
|
||||
|
||||
def it_accumulates_sections_added_to_it(self):
|
||||
accum = _TextSectionAccumulator(maxlen=500)
|
||||
def it_accumulates_pre_chunks_added_to_it(self):
|
||||
accum = TextPreChunkAccumulator(maxlen=500)
|
||||
|
||||
accum.add_section(
|
||||
_TextSection(
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
@ -1419,8 +1419,8 @@ class Describe_TextSectionAccumulator:
|
||||
assert accum.text_length == 68
|
||||
assert accum.remaining_space == 430
|
||||
|
||||
accum.add_section(
|
||||
_TextSection(
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
@ -1430,26 +1430,26 @@ class Describe_TextSectionAccumulator:
|
||||
assert accum.text_length == 141
|
||||
assert accum.remaining_space == 357
|
||||
|
||||
def it_generates_a_TextSection_when_flushed_and_resets_itself_to_empty(self):
|
||||
accum = _TextSectionAccumulator(maxlen=150)
|
||||
accum.add_section(
|
||||
_TextSection(
|
||||
def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
|
||||
accum = TextPreChunkAccumulator(maxlen=150)
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
]
|
||||
)
|
||||
)
|
||||
accum.add_section(
|
||||
_TextSection(
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Mauris Nec"),
|
||||
Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
|
||||
]
|
||||
)
|
||||
)
|
||||
accum.add_section(
|
||||
_TextSection(
|
||||
accum.add_pre_chunk(
|
||||
TextPreChunk(
|
||||
[
|
||||
Title("Sed Orci"),
|
||||
Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
|
||||
@ -1457,15 +1457,15 @@ class Describe_TextSectionAccumulator:
|
||||
)
|
||||
)
|
||||
|
||||
section_iter = accum.flush()
|
||||
pre_chunk_iter = accum.flush()
|
||||
|
||||
# -- iterator generates exactly one section --
|
||||
section = next(section_iter)
|
||||
# -- iterator generates exactly one pre_chunk --
|
||||
pre_chunk = next(pre_chunk_iter)
|
||||
with pytest.raises(StopIteration):
|
||||
next(section_iter)
|
||||
# -- and it is a _TextSection containing all the elements --
|
||||
assert isinstance(section, _TextSection)
|
||||
assert section._elements == [
|
||||
next(pre_chunk_iter)
|
||||
# -- and it is a TextPreChunk containing all the elements --
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
Title("Lorem Ipsum"),
|
||||
Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
|
||||
Title("Mauris Nec"),
|
||||
@ -1476,24 +1476,24 @@ class Describe_TextSectionAccumulator:
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 150
|
||||
|
||||
def but_it_does_not_generate_a_TextSection_on_flush_when_empty(self):
|
||||
accum = _TextSectionAccumulator(maxlen=150)
|
||||
def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
|
||||
accum = TextPreChunkAccumulator(maxlen=150)
|
||||
|
||||
sections = list(accum.flush())
|
||||
pre_chunks = list(accum.flush())
|
||||
|
||||
assert sections == []
|
||||
assert pre_chunks == []
|
||||
assert accum.text_length == 0
|
||||
assert accum.remaining_space == 150
|
||||
|
||||
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
|
||||
accum = _TextSectionAccumulator(maxlen=100)
|
||||
accum.add_section(_TextSection([Text("abcde")]))
|
||||
accum.add_section(_TextSection([Text("fghij")]))
|
||||
accum = TextPreChunkAccumulator(maxlen=100)
|
||||
accum.add_pre_chunk(TextPreChunk([Text("abcde")]))
|
||||
accum.add_pre_chunk(TextPreChunk([Text("fghij")]))
|
||||
|
||||
# -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
|
||||
# -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
|
||||
assert accum.text_length == 12
|
||||
# -- .remaining_space is reduced by the length (2) of the trailing separator which would
|
||||
# -- go between the current text and that of the next section if one was added.
|
||||
# -- go between the current text and that of the next pre-chunk if one was added.
|
||||
# -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
|
||||
assert accum.remaining_space == 86
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.11.4-dev10" # pragma: no cover
|
||||
__version__ = "0.11.4-dev11" # pragma: no cover
|
||||
|
||||
@ -25,7 +25,7 @@ from unstructured.documents.elements import (
|
||||
)
|
||||
from unstructured.utils import lazyproperty
|
||||
|
||||
_Section: TypeAlias = "_TableSection | _TextSection"
|
||||
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
|
||||
|
||||
# -- goes between text of each element when element-text is concatenated to form chunk --
|
||||
TEXT_SEPARATOR = "\n\n"
|
||||
@ -98,7 +98,7 @@ def chunk_by_title(
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
|
||||
sections = _SectionCombiner(
|
||||
pre_chunks = PreChunkCombiner(
|
||||
_split_elements_by_title_and_table(
|
||||
elements,
|
||||
multipage_sections=multipage_sections,
|
||||
@ -107,9 +107,9 @@ def chunk_by_title(
|
||||
),
|
||||
max_characters,
|
||||
combine_text_under_n_chars,
|
||||
).iter_combined_sections()
|
||||
).iter_combined_pre_chunks()
|
||||
|
||||
return [chunk for section in sections for chunk in section.iter_chunks(max_characters)]
|
||||
return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks(max_characters)]
|
||||
|
||||
|
||||
def _split_elements_by_title_and_table(
|
||||
@ -117,31 +117,31 @@ def _split_elements_by_title_and_table(
|
||||
multipage_sections: bool,
|
||||
new_after_n_chars: int,
|
||||
max_characters: int,
|
||||
) -> Iterator[_TextSection | _TableSection]:
|
||||
"""Implements "sectioner" responsibilities.
|
||||
) -> Iterator[TextPreChunk | TablePreChunk]:
|
||||
"""Implements "pre-chunker" responsibilities.
|
||||
|
||||
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
|
||||
chunk formed by the subsequent "chunker" process. The only exception occurs when a single
|
||||
element is too big to fit in the chunk window and the chunker splits it into two or more chunks
|
||||
divided mid-text. The sectioner never divides an element mid-text.
|
||||
divided mid-text. The pre-chunker never divides an element mid-text.
|
||||
|
||||
The sectioner's responsibilities are:
|
||||
The pre-chunker's responsibilities are:
|
||||
|
||||
* **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
|
||||
either side of those boundaries into different sections. In this case, the primary
|
||||
either side of those boundaries into different pre-chunks. In this case, the primary
|
||||
indicator of a semantic boundary is a `Title` element. A page-break (change in
|
||||
page-number) is also a semantic boundary when `multipage_sections` is `False`.
|
||||
|
||||
* **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
|
||||
into sections as big as possible without exceeding the chunk window size.
|
||||
into pre-chunks as big as possible without exceeding the chunk window size.
|
||||
|
||||
* **Minimize chunks that must be split mid-text.** Precompute the text length of each
|
||||
section and only produce a section that exceeds the chunk window size when there is a
|
||||
pre-chunk and only produce a pre-chunk that exceeds the chunk window size when there is a
|
||||
single element with text longer than that window.
|
||||
|
||||
A Table or Checkbox element is placed into a section by itself.
|
||||
A Table or Checkbox element is placed into a pre-chunk by itself.
|
||||
"""
|
||||
section_builder = _TextSectionBuilder(max_characters)
|
||||
pre_chunk_builder = TextPreChunkBuilder(max_characters)
|
||||
|
||||
prior_element = None
|
||||
|
||||
@ -152,31 +152,31 @@ def _split_elements_by_title_and_table(
|
||||
else False
|
||||
)
|
||||
|
||||
# -- start new section when necessary --
|
||||
# -- start new pre_chunk when necessary --
|
||||
if (
|
||||
# -- Title and Table both start a new section --
|
||||
# -- Title and Table both start a new pre_chunk --
|
||||
isinstance(element, (Title, Table))
|
||||
# -- adding this element would exceed hard-maxlen for section --
|
||||
or section_builder.remaining_space < len(str(element))
|
||||
# -- section already meets or exceeds soft-maxlen --
|
||||
or section_builder.text_length >= new_after_n_chars
|
||||
# -- adding this element would exceed hard-maxlen for pre_chunk --
|
||||
or pre_chunk_builder.remaining_space < len(str(element))
|
||||
# -- pre_chunk already meets or exceeds soft-maxlen --
|
||||
or pre_chunk_builder.text_length >= new_after_n_chars
|
||||
# -- a semantic boundary is indicated by metadata change since prior element --
|
||||
or metadata_differs
|
||||
):
|
||||
# -- complete any work-in-progress section --
|
||||
yield from section_builder.flush()
|
||||
# -- complete any work-in-progress pre_chunk --
|
||||
yield from pre_chunk_builder.flush()
|
||||
|
||||
# -- emit table and checkbox immediately since they are always isolated --
|
||||
if isinstance(element, Table):
|
||||
yield _TableSection(table=element)
|
||||
yield TablePreChunk(table=element)
|
||||
# -- but accumulate text elements for consolidation into a composite chunk --
|
||||
else:
|
||||
section_builder.add_element(element)
|
||||
pre_chunk_builder.add_element(element)
|
||||
|
||||
prior_element = element
|
||||
|
||||
# -- flush "tail" section, any partially-filled section after last element is processed --
|
||||
yield from section_builder.flush()
|
||||
# -- flush "tail" pre_chunk, any partially-filled pre_chunk after last element is processed --
|
||||
yield from pre_chunk_builder.flush()
|
||||
|
||||
|
||||
def _metadata_differs(
|
||||
@ -255,17 +255,17 @@ def add_chunking_strategy() -> Callable[[Callable[_P, List[Element]]], Callable[
|
||||
return decorator
|
||||
|
||||
|
||||
# == Sections ====================================================================================
|
||||
# == PreChunks ===================================================================================
|
||||
|
||||
|
||||
class _TableSection:
|
||||
"""A section composed of a single Table element."""
|
||||
class TablePreChunk:
|
||||
"""A pre-chunk composed of a single Table element."""
|
||||
|
||||
def __init__(self, table: Table) -> None:
|
||||
self._table = table
|
||||
|
||||
def iter_chunks(self, maxlen: int) -> Iterator[Table | TableChunk]:
|
||||
"""Split this section into one or more `Table` or `TableChunk` objects maxlen or smaller."""
|
||||
"""Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
|
||||
text = self._table.text
|
||||
html = self._table.metadata.text_as_html or ""
|
||||
|
||||
@ -296,7 +296,7 @@ class _TableSection:
|
||||
is_continuation = True
|
||||
|
||||
|
||||
class _TextSection:
|
||||
class TextPreChunk:
|
||||
"""A sequence of elements that belong to the same semantic unit within a document.
|
||||
|
||||
The name "section" derives from the idea of a document-section, a heading followed by the
|
||||
@ -310,16 +310,16 @@ class _TextSection:
|
||||
self._elements = list(elements)
|
||||
|
||||
def __eq__(self, other: Any) -> bool:
|
||||
if not isinstance(other, _TextSection):
|
||||
if not isinstance(other, TextPreChunk):
|
||||
return False
|
||||
return self._elements == other._elements
|
||||
|
||||
def combine(self, other_section: _TextSection) -> _TextSection:
|
||||
"""Return new `_TextSection` that combines this and `other_section`."""
|
||||
return _TextSection(self._elements + other_section._elements)
|
||||
def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
|
||||
"""Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
|
||||
return TextPreChunk(self._elements + other_pre_chunk._elements)
|
||||
|
||||
def iter_chunks(self, maxlen: int) -> Iterator[CompositeElement]:
|
||||
"""Split this section into one or more `CompositeElement` objects maxlen or smaller."""
|
||||
"""Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
|
||||
text = self._text
|
||||
text_len = len(text)
|
||||
start = 0
|
||||
@ -333,8 +333,8 @@ class _TextSection:
|
||||
|
||||
@lazyproperty
|
||||
def text_length(self) -> int:
|
||||
"""Length of concatenated text of this section, including separators."""
|
||||
# -- used by section-combiner to identify combination candidates --
|
||||
"""Length of concatenated text of this pre-chunk, including separators."""
|
||||
# -- used by pre-chunk-combiner to identify combination candidates --
|
||||
return len(self._text)
|
||||
|
||||
@lazyproperty
|
||||
@ -342,7 +342,7 @@ class _TextSection:
|
||||
"""Collection of all populated metadata values across elements.
|
||||
|
||||
The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
|
||||
at least one of the elements in this section. The value of that key is a list of all those
|
||||
at least one of the elements in this pre-chunk. The value of that key is a list of all those
|
||||
populated values, in element order, for example:
|
||||
|
||||
{
|
||||
@ -374,13 +374,13 @@ class _TextSection:
|
||||
|
||||
@lazyproperty
|
||||
def _consolidated_metadata(self) -> ElementMetadata:
|
||||
"""Metadata applicable to this section as a single chunk.
|
||||
"""Metadata applicable to this pre-chunk as a single chunk.
|
||||
|
||||
Formed by applying consolidation rules to all metadata fields across the elements of this
|
||||
section.
|
||||
pre-chunk.
|
||||
|
||||
For the sake of consistency, the same rules are applied (for example, for dropping values)
|
||||
to a single-element section too, even though metadata for such a section is already
|
||||
to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
|
||||
"consolidated".
|
||||
"""
|
||||
return ElementMetadata(**self._meta_kwargs)
|
||||
@ -460,26 +460,26 @@ class _TextSection:
|
||||
|
||||
@lazyproperty
|
||||
def _text(self) -> str:
|
||||
"""The concatenated text of all elements in this section.
|
||||
"""The concatenated text of all elements in this pre-chunk.
|
||||
|
||||
Each element-text is separated from the next by a blank line ("\n\n").
|
||||
"""
|
||||
return TEXT_SEPARATOR.join(e.text for e in self._elements if e.text)
|
||||
|
||||
|
||||
class _TextSectionBuilder:
|
||||
"""An element accumulator suitable for incrementally forming a section.
|
||||
class TextPreChunkBuilder:
|
||||
"""An element accumulator suitable for incrementally forming a pre-chunk.
|
||||
|
||||
Provides monitoring properties like `.remaining_space` and `.text_length` a sectioner can use
|
||||
Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
|
||||
to determine whether it should add the next element in the element stream.
|
||||
|
||||
`.flush()` is used to build a `TextSection` object from the accumulated elements. This method
|
||||
returns an iterator that generates zero-or-one `TextSection` object and is used like so:
|
||||
`.flush()` is used to build a `TextPreChunk` object from the accumulated elements. This method
|
||||
returns an iterator that generates zero-or-one `TextPreChunk` object and is used like so:
|
||||
|
||||
yield from builder.flush()
|
||||
|
||||
If no elements have been accumulated, no `TextSection` is generated. Flushing the builder clears
|
||||
the elements it contains so it is ready to build the next text-section.
|
||||
If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder
|
||||
clears the elements it contains so it is ready to build the next text-pre-chunk.
|
||||
"""
|
||||
|
||||
def __init__(self, maxlen: int) -> None:
|
||||
@ -502,22 +502,22 @@ class _TextSectionBuilder:
|
||||
self._text_segments.append(element.text)
|
||||
self._text_len += len(element.text)
|
||||
|
||||
def flush(self) -> Iterator[_TextSection]:
|
||||
"""Generate zero-or-one `Section` object and clear the accumulator.
|
||||
def flush(self) -> Iterator[TextPreChunk]:
|
||||
"""Generate zero-or-one `PreChunk` object and clear the accumulator.
|
||||
|
||||
Suitable for use to emit a Section when the maximum size has been reached or a semantic
|
||||
boundary has been reached. Also to clear out a terminal section at the end of an element
|
||||
Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
|
||||
boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
|
||||
stream.
|
||||
"""
|
||||
if not self._elements:
|
||||
return
|
||||
# -- clear builder before yield so we're not sensitive to the timing of how/when this
|
||||
# -- iterator is exhausted and can add elements for the next section immediately.
|
||||
# -- iterator is exhausted and can add elements for the next pre-chunk immediately.
|
||||
elements = self._elements[:]
|
||||
self._elements.clear()
|
||||
self._text_segments.clear()
|
||||
self._text_len = 0
|
||||
yield _TextSection(elements)
|
||||
yield TextPreChunk(elements)
|
||||
|
||||
@property
|
||||
def remaining_space(self) -> int:
|
||||
@ -528,9 +528,9 @@ class _TextSectionBuilder:
|
||||
|
||||
@property
|
||||
def text_length(self) -> int:
|
||||
"""Length of the text in this section.
|
||||
"""Length of the text in this pre-chunk.
|
||||
|
||||
This value represents the chunk-size that would result if this section was flushed in its
|
||||
This value represents the chunk-size that would result if this pre-chunk was flushed in its
|
||||
current state. In particular, it does not include the length of a trailing separator (since
|
||||
that would only appear if an additional element was added).
|
||||
|
||||
@ -544,104 +544,104 @@ class _TextSectionBuilder:
|
||||
return self._text_len + (separator_count * self._separator_len)
|
||||
|
||||
|
||||
# == SectionCombiner =============================================================================
|
||||
# == PreChunkCombiner ============================================================================
|
||||
|
||||
|
||||
class _SectionCombiner:
|
||||
"""Filters section stream to combine small sections where possible."""
|
||||
class PreChunkCombiner:
|
||||
"""Filters pre-chunk stream to combine small pre-chunks where possible."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
sections: Iterable[_Section],
|
||||
pre_chunks: Iterable[PreChunk],
|
||||
maxlen: int,
|
||||
combine_text_under_n_chars: int,
|
||||
):
|
||||
self._sections = sections
|
||||
self._pre_chunks = pre_chunks
|
||||
self._maxlen = maxlen
|
||||
self._combine_text_under_n_chars = combine_text_under_n_chars
|
||||
|
||||
def iter_combined_sections(self) -> Iterator[_Section]:
|
||||
"""Generate section objects, combining TextSection objects when they will fit in window."""
|
||||
accum = _TextSectionAccumulator(self._maxlen)
|
||||
def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
|
||||
"""Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
|
||||
accum = TextPreChunkAccumulator(self._maxlen)
|
||||
|
||||
for section in self._sections:
|
||||
# -- start new section under these conditions --
|
||||
for pre_chunk in self._pre_chunks:
|
||||
# -- start new pre-chunk under these conditions --
|
||||
if (
|
||||
# -- a table section is never combined --
|
||||
isinstance(section, _TableSection)
|
||||
# -- don't add another section once length has reached combination soft-max --
|
||||
# -- a table pre-chunk is never combined --
|
||||
isinstance(pre_chunk, TablePreChunk)
|
||||
# -- don't add another pre-chunk once length has reached combination soft-max --
|
||||
or accum.text_length >= self._combine_text_under_n_chars
|
||||
# -- combining would exceed hard-max --
|
||||
or accum.remaining_space < section.text_length
|
||||
or accum.remaining_space < pre_chunk.text_length
|
||||
):
|
||||
yield from accum.flush()
|
||||
|
||||
# -- a table section is never combined so don't accumulate --
|
||||
if isinstance(section, _TableSection):
|
||||
yield section
|
||||
# -- a table pre-chunk is never combined so don't accumulate --
|
||||
if isinstance(pre_chunk, TablePreChunk):
|
||||
yield pre_chunk
|
||||
else:
|
||||
accum.add_section(section)
|
||||
accum.add_pre_chunk(pre_chunk)
|
||||
|
||||
yield from accum.flush()
|
||||
|
||||
|
||||
class _TextSectionAccumulator:
|
||||
"""Accumulates, measures, and combines section objects.
|
||||
class TextPreChunkAccumulator:
|
||||
"""Accumulates, measures, and combines pre-chunk objects.
|
||||
|
||||
Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
|
||||
whether to add another section.
|
||||
whether to add another pre-chunk.
|
||||
|
||||
`.flush()` is used to combine the accumulated sections into a single `TextSection` object. This
|
||||
method returns an iterator that generates zero-or-one `TextSection` objects and is used like
|
||||
so:
|
||||
`.flush()` is used to combine the accumulated pre-chunks into a single `TextPreChunk` object.
|
||||
This method returns an iterator that generates zero-or-one `TextPreChunk` objects and is used
|
||||
like so:
|
||||
|
||||
yield from accum.flush()
|
||||
|
||||
If no sections have been accumulated, no `TextSection` is generated. Flushing the builder clears
|
||||
the sections it contains so it is ready to accept the next text-section.
|
||||
If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the builder
|
||||
clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
|
||||
"""
|
||||
|
||||
def __init__(self, maxlen: int) -> None:
|
||||
self._maxlen = maxlen
|
||||
self._sections: List[_TextSection] = []
|
||||
self._pre_chunks: List[TextPreChunk] = []
|
||||
|
||||
def add_section(self, section: _TextSection) -> None:
|
||||
"""Add a section to the accumulator for possible combination with next section."""
|
||||
self._sections.append(section)
|
||||
def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
|
||||
"""Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
|
||||
self._pre_chunks.append(pre_chunk)
|
||||
|
||||
def flush(self) -> Iterator[_TextSection]:
|
||||
"""Generate all accumulated sections as a single combined section."""
|
||||
sections = self._sections
|
||||
def flush(self) -> Iterator[TextPreChunk]:
|
||||
"""Generate all accumulated pre-chunks as a single combined pre-chunk."""
|
||||
pre_chunks = self._pre_chunks
|
||||
|
||||
# -- nothing to do if no sections have been accumulated --
|
||||
if not sections:
|
||||
# -- nothing to do if no pre-chunks have been accumulated --
|
||||
if not pre_chunks:
|
||||
return
|
||||
|
||||
# -- otherwise combine all accumulated sections into one --
|
||||
section = sections[0]
|
||||
for other_section in sections[1:]:
|
||||
section = section.combine(other_section)
|
||||
yield section
|
||||
# -- otherwise combine all accumulated pre-chunks into one --
|
||||
pre_chunk = pre_chunks[0]
|
||||
for other_pre_chunk in pre_chunks[1:]:
|
||||
pre_chunk = pre_chunk.combine(other_pre_chunk)
|
||||
yield pre_chunk
|
||||
|
||||
# -- and reset the accumulator (to empty) --
|
||||
sections.clear()
|
||||
pre_chunks.clear()
|
||||
|
||||
@property
|
||||
def remaining_space(self) -> int:
|
||||
"""Maximum size of section that can be added without exceeding maxlen."""
|
||||
"""Maximum size of pre-chunk that can be added without exceeding maxlen."""
|
||||
return (
|
||||
self._maxlen
|
||||
if not self._sections
|
||||
# -- an additional section will also incur an additional separator --
|
||||
if not self._pre_chunks
|
||||
# -- an additional pre-chunk will also incur an additional separator --
|
||||
else self._maxlen - self.text_length - len(TEXT_SEPARATOR)
|
||||
)
|
||||
|
||||
@property
|
||||
def text_length(self) -> int:
|
||||
"""Size of concatenated text in all sections in accumulator."""
|
||||
n = len(self._sections)
|
||||
"""Size of concatenated text in all pre-chunks in accumulator."""
|
||||
n = len(self._pre_chunks)
|
||||
return (
|
||||
0
|
||||
if n == 0
|
||||
else sum(s.text_length for s in self._sections) + len(TEXT_SEPARATOR) * (n - 1)
|
||||
else sum(s.text_length for s in self._pre_chunks) + len(TEXT_SEPARATOR) * (n - 1)
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user