feat(chunking): add .orig_elements behavior to chunking (#2656)

**Summary**
Add the actual behavior to populate `.metadata.orig_elements` during
chunking, when so instructed by the `include_orig_elements` option.

**Additional Context**
The underlying structures to support this, namely the
`.metadata.orig_elements` field and the `include_orig_elements` chunking
option, were added in the immediately preceding PRs. This PR adds the
behavior that actually populates that metadata field during chunking when
the option is set.
This commit is contained in:
Steve Canny 2024-03-18 12:27:39 -07:00 committed by GitHub
parent c02cfb89d3
commit 1af41d5f90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 271 additions and 55 deletions

View File

@ -6,6 +6,8 @@
### Features ### Features
* **Chunking populates `.metadata.orig_elements` for each chunk.** This behavior makes the text and metadata of the elements that were combined to form each chunk accessible from that chunk. This can be important, for example, to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. This option is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. This option defaults to `True` so original elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs but will be added in an upcoming PR to the other `unstructured` repositories. The original elements also do not yet serialize or deserialize; this will also be added in an upcoming PR.
### Fixes ### Fixes
* **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles. * **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.

View File

@ -352,7 +352,30 @@ class DescribeTablePreChunk:
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(chunk_iter) next(chunk_iter)
def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self): def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
table = Table("foo bar", metadata=ElementMetadata(text_as_html="<table>foo bar</table>"))
opts = ChunkingOptions(include_orig_elements=True)
pre_chunk = TablePreChunk(table, "", opts)
chunk_iter = pre_chunk.iter_chunks()
chunk = next(chunk_iter)
assert isinstance(chunk, Table)
assert chunk.metadata.orig_elements == [table]
assert chunk.metadata.text_as_html == "<table>foo bar</table>"
# --
with pytest.raises(StopIteration):
next(chunk_iter)
def but_not_when_instructed_not_to(self):
pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False))
chunk = next(pre_chunk.iter_chunks())
assert isinstance(chunk, Table)
assert chunk.metadata.orig_elements is None
def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
# fixed-overhead = 8+8+9+8+9+8 = 50 # fixed-overhead = 8+8+9+8+9+8 = 50
# per-row overhead = 27 # per-row overhead = 27
html_table = ( html_table = (
@ -398,6 +421,7 @@ class DescribeTablePreChunk:
"<tbody>\n" "<tbody>\n"
"<tr><td>Lo" "<tr><td>Lo"
) )
assert not chunk.metadata.is_continuation
# -- # --
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk) assert isinstance(chunk, TableChunk)
@ -408,6 +432,7 @@ class DescribeTablePreChunk:
"rem ipsum </td><td>A Link example</td></tr>\n" "rem ipsum </td><td>A Link example</td></tr>\n"
"<tr><td>Consectetur </td><td>adipiscing elit</td><" "<tr><td>Consectetur </td><td>adipiscing elit</td><"
) )
assert chunk.metadata.is_continuation
# -- note that text runs out but HTML continues because it's significantly longer. So two # -- note that text runs out but HTML continues because it's significantly longer. So two
# -- of these chunks have HTML but no text. # -- of these chunks have HTML but no text.
chunk = next(chunk_iter) chunk = next(chunk_iter)
@ -418,6 +443,7 @@ class DescribeTablePreChunk:
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n" "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
"<tr><td>Vivamus quis </td><td>" "<tr><td>Vivamus quis </td><td>"
) )
assert chunk.metadata.is_continuation
# -- # --
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk) assert isinstance(chunk, TableChunk)
@ -425,10 +451,34 @@ class DescribeTablePreChunk:
assert chunk.metadata.text_as_html == ( assert chunk.metadata.text_as_html == (
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>" "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
) )
assert chunk.metadata.is_continuation
# -- # --
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(chunk_iter) next(chunk_iter)
def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
"""Even though text and html are split, the orig_elements metadata is not."""
table = Table(
"Header Col 1 Header Col 2\nLorem ipsum dolor sit amet",
metadata=ElementMetadata(text_as_html="<table/>"),
)
opts = ChunkingOptions(max_characters=30, include_orig_elements=True)
pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)
chunk_iter = pre_chunk.iter_chunks()
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == "Header Col 1 Header Col 2"
assert chunk.metadata.orig_elements == [table]
assert not chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == "Lorem ipsum dolor sit amet"
assert chunk.metadata.orig_elements == [table]
assert chunk.metadata.is_continuation
@pytest.mark.parametrize( @pytest.mark.parametrize(
("text", "expected_value"), ("text", "expected_value"),
[ [
@ -469,6 +519,50 @@ class DescribeTablePreChunk:
) )
assert pre_chunk._text == expected_value assert pre_chunk._text == expected_value
def it_computes_metadata_for_each_chunk_to_help(self):
table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
metadata = pre_chunk._metadata
assert metadata.text_as_html == "<table/>"
# -- opts.include_orig_elements is True by default --
assert metadata.orig_elements == [table]
# -- it produces a new instance each time it is called so changing one chunk's metadata does
# -- not change that of any other chunk.
assert pre_chunk._metadata is not metadata
def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
pre_chunk = TablePreChunk(
Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>")),
overlap_prefix="",
opts=ChunkingOptions(include_orig_elements=False),
)
assert pre_chunk._metadata.orig_elements is None
def it_computes_the_original_elements_list_to_help(self):
table = Table(
"Lorem ipsum",
metadata=ElementMetadata(text_as_html="<table/>", orig_elements=[Table("Lorem Ipsum")]),
)
pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
orig_elements = pre_chunk._orig_elements
# -- a TablePreChunk always has exactly one original (Table) element --
assert len(orig_elements) == 1
orig_element = orig_elements[0]
# -- each item in orig_elements is a copy of the original element so we can mutate it
# -- without changing user's data.
assert orig_element == table
assert orig_element is not table
# -- it strips any .metadata.orig_elements from each element to prevent a recursive data
# -- structure
assert orig_element.metadata.orig_elements is None
# -- computation is only on first call, all chunks get exactly the same orig-elements --
assert pre_chunk._orig_elements is orig_elements
class DescribeTextPreChunk: class DescribeTextPreChunk:
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects.""" """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
@ -599,17 +693,15 @@ class DescribeTextPreChunk:
) )
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
pre_chunk = TextPreChunk( elements = [
[ Title("Introduction"),
Title("Introduction"), Text(
Text( "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" " lectus porta volutpat.",
" lectus porta volutpat.", ),
), ]
], opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
overlap_prefix="e feugiat efficitur.", pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)
opts=ChunkingOptions(max_characters=200),
)
chunk_iter = pre_chunk.iter_chunks() chunk_iter = pre_chunk.iter_chunks()
@ -619,25 +711,31 @@ class DescribeTextPreChunk:
" adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.", " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
) )
assert chunk.metadata is pre_chunk._consolidated_metadata assert chunk.metadata is pre_chunk._consolidated_metadata
assert chunk.metadata.orig_elements == elements
# --
with pytest.raises(StopIteration):
next(chunk_iter)
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self): def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window. # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
# -- The pre-chunker will isolate that element in a pre_chunk of its own. # -- The pre-chunker will isolate that element in a pre_chunk of its own.
pre_chunk = TextPreChunk( elements = [
[ Text(
Text( "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea" " commodo consequat."
" commodo consequat." )
), ]
], opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
overlap_prefix="", pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)
opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
)
chunk_iter = pre_chunk.iter_chunks() chunk_iter = pre_chunk.iter_chunks()
# -- Note that .metadata.orig_elements is the same single original element, "repeated" for
# -- each text-split chunk. This behavior emerges without explicit command as a consequence
# -- of using `._consolidated_metadata` (and `._continuation_metadata` which extends
# -- `._consolidated_metadata`) for each text-split chunk.
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert chunk == CompositeElement( assert chunk == CompositeElement(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
@ -645,10 +743,12 @@ class DescribeTextPreChunk:
" veniam, quis nostrud exercitation ullamco laboris nisi ut" " veniam, quis nostrud exercitation ullamco laboris nisi ut"
) )
assert chunk.metadata is pre_chunk._consolidated_metadata assert chunk.metadata is pre_chunk._consolidated_metadata
assert chunk.metadata.orig_elements == elements
# -- # --
chunk = next(chunk_iter) chunk = next(chunk_iter)
assert chunk == CompositeElement("aliquip ex ea commodo consequat.") assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
assert chunk.metadata is pre_chunk._continuation_metadata assert chunk.metadata is pre_chunk._continuation_metadata
assert chunk.metadata.orig_elements == elements
# -- # --
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(chunk_iter) next(chunk_iter)
@ -762,6 +862,23 @@ class DescribeTextPreChunk:
"parent_id": ["f87731e0"], "parent_id": ["f87731e0"],
} }
def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
opts = ChunkingOptions(include_orig_elements=True)
metadata = ElementMetadata(filename="foo.pdf")
element = Title("Lorem Ipsum", metadata=metadata)
element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)
pre_chunk = TextPreChunk([element, element_2], overlap_prefix="", opts=opts)
consolidated_metadata = pre_chunk._consolidated_metadata
# -- pre-chunk elements are included as metadata --
orig_elements = consolidated_metadata.orig_elements
assert orig_elements is not None
assert orig_elements == [element, element_2]
# -- and they are the exact instances, not copies --
assert orig_elements[0] is element
assert orig_elements[1] is element_2
def it_consolidates_regex_metadata_in_a_field_specific_way(self): def it_consolidates_regex_metadata_in_a_field_specific_way(self):
"""regex_metadata of chunk is combined regex_metadatas of its elements. """regex_metadata of chunk is combined regex_metadatas of its elements.
@ -868,6 +985,32 @@ class DescribeTextPreChunk:
}, },
} }
def it_computes_the_original_elements_list_to_help(self):
element = Title("Introduction")
element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
element_3 = CompositeElement(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
)
pre_chunk = TextPreChunk(
[element, element_2, element_3],
overlap_prefix="",
opts=ChunkingOptions(include_orig_elements=True),
)
orig_elements = pre_chunk._orig_elements
# -- all elements of pre-chunk are included --
assert orig_elements == [element, element_2, element_3]
# -- orig_elements that are chunks (having orig-elements of their own) are copied and the
# -- copy is stripped of its `.metadata.orig_elements` to prevent a recursive data
# -- structure that nests orig_elements within orig_elements.
assert orig_elements[0] is element
assert orig_elements[2] is not element_3
assert orig_elements[2].metadata.orig_elements is None
# -- computation is only on first call, all chunks get exactly the same orig-elements --
assert pre_chunk._orig_elements is orig_elements
@pytest.mark.parametrize( @pytest.mark.parametrize(
("elements", "overlap_prefix", "expected_value"), ("elements", "overlap_prefix", "expected_value"),
[ [

View File

@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them():
] ]
def test_it_includes_original_elements_as_metadata_when_requested():
element = Title("Introduction")
element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
element_3 = Text("In rhoncus ipsum sed lectus porta volutpat.")
chunks = chunk_elements(
[element, element_2, element_3], max_characters=70, include_orig_elements=True
)
assert len(chunks) == 2
chunk = chunks[0]
assert chunk == CompositeElement(
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
)
assert chunk.metadata.orig_elements == [element, element_2]
# --
chunk = chunks[1]
assert chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
assert chunk.metadata.orig_elements == [element_3]
# ------------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------------
# UNIT TESTS # UNIT TESTS
# ------------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------------

View File

@ -9,12 +9,7 @@ from typing import Any, Optional
import pytest import pytest
from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
from unstructured.chunking.base import ( from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
CHUNK_MULTI_PAGE_DEFAULT,
PreChunker,
TablePreChunk,
TextPreChunk,
)
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import ( from unstructured.documents.elements import (
@ -57,7 +52,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
] ]
def test_split_elements_by_title_and_table(): def test_it_splits_elements_by_title_and_table():
elements: list[Element] = [ elements: list[Element] = [
Title("A Great Day"), Title("A Great Day"),
Text("Today is a great day."), Text("Today is a great day."),
@ -72,39 +67,38 @@ def test_split_elements_by_title_and_table():
CheckBox(), CheckBox(),
] ]
pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new()) chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)
pre_chunk = next(pre_chunks) assert len(chunks) == 4
assert isinstance(pre_chunk, TextPreChunk) # --
assert pre_chunk._elements == [ chunk = chunks[0]
assert isinstance(chunk, CompositeElement)
assert chunk.metadata.orig_elements == [
Title("A Great Day"), Title("A Great Day"),
Text("Today is a great day."), Text("Today is a great day."),
Text("It is sunny outside."), Text("It is sunny outside."),
] ]
# -- # --
pre_chunk = next(pre_chunks) chunk = chunks[1]
assert isinstance(pre_chunk, TablePreChunk) assert isinstance(chunk, Table)
assert pre_chunk._table == Table("Heading\nCell text") assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
# == # ==
pre_chunk = next(pre_chunks) chunk = chunks[2]
assert isinstance(pre_chunk, TextPreChunk) assert isinstance(chunk, CompositeElement)
assert pre_chunk._elements == [ assert chunk.metadata.orig_elements == [
Title("An Okay Day"), Title("An Okay Day"),
Text("Today is an okay day."), Text("Today is an okay day."),
Text("It is rainy outside."), Text("It is rainy outside."),
] ]
# -- # --
pre_chunk = next(pre_chunks) chunk = chunks[3]
assert isinstance(pre_chunk, TextPreChunk) assert isinstance(chunk, CompositeElement)
assert pre_chunk._elements == [ assert chunk.metadata.orig_elements == [
Title("A Bad Day"), Title("A Bad Day"),
Text("Today is a bad day."), Text("Today is a bad day."),
Text("It is storming outside."), Text("It is storming outside."),
CheckBox(), CheckBox(),
] ]
# --
with pytest.raises(StopIteration):
next(pre_chunks)
def test_chunk_by_title(): def test_chunk_by_title():
@ -127,7 +121,7 @@ def test_chunk_by_title():
CheckBox(), CheckBox(),
] ]
chunks = chunk_by_title(elements, combine_text_under_n_chars=0) chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False)
assert chunks == [ assert chunks == [
CompositeElement( CompositeElement(

View File

@ -447,10 +447,10 @@ class TablePreChunk:
text_remainder = self._text text_remainder = self._text
html_remainder = self._table.metadata.text_as_html or "" html_remainder = self._table.metadata.text_as_html or ""
# -- only chunk a table when it's too big to swallow whole -- # -- only text-split a table when it's longer than the chunking window --
if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen: if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
# -- but the overlap-prefix must be added to its text -- # -- but the overlap-prefix must be added to its text --
yield Table(text=text_remainder, metadata=copy.deepcopy(self._table.metadata)) yield Table(text=text_remainder, metadata=self._metadata)
return return
split = self._opts.split split = self._opts.split
@ -459,19 +459,19 @@ class TablePreChunk:
while text_remainder or html_remainder: while text_remainder or html_remainder:
# -- split off the next chunk-worth of characters into a TableChunk -- # -- split off the next chunk-worth of characters into a TableChunk --
chunk_text, text_remainder = split(text_remainder) chunk_text, text_remainder = split(text_remainder)
table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata)) metadata = self._metadata
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
# -- HTML elements that *correspond* to the TextChunk.text fragment. # -- HTML elements that *correspond* to the TextChunk.text fragment.
if html_remainder: if html_remainder:
chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:] chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
table_chunk.metadata.text_as_html = chunk_html metadata.text_as_html = chunk_html
# -- mark second and later chunks as a continuation -- # -- mark second and later chunks as a continuation --
if is_continuation: if is_continuation:
table_chunk.metadata.is_continuation = True metadata.is_continuation = True
yield table_chunk yield TableChunk(text=chunk_text, metadata=metadata)
is_continuation = True is_continuation = True
@ -486,6 +486,37 @@ class TablePreChunk:
overlap = self._opts.inter_chunk_overlap overlap = self._opts.inter_chunk_overlap
return self._text[-overlap:].strip() if overlap else "" return self._text[-overlap:].strip() if overlap else ""
@property
def _metadata(self) -> ElementMetadata:
    """A fresh base `.metadata` object for one chunk formed from this pre-chunk.

    "Base" means the caller is expected to add chunk-specific fields afterward:
    `.metadata.text_as_html` differs for each text-split chunk and
    `.metadata.is_continuation` is set on second-and-later text-split chunks.

    This is a plain property rather than a lazyproperty on purpose: every access
    deep-copies the table's metadata, so each chunk gets its own instance and
    mutating one chunk's metadata cannot affect another chunk formed from this
    pre-chunk. When `include_orig_elements` is set on the chunking options, the
    copy's `.orig_elements` is set to this pre-chunk's `._orig_elements`.
    """
    base_metadata = copy.deepcopy(self._table.metadata)
    if not self._opts.include_orig_elements:
        return base_metadata
    base_metadata.orig_elements = self._orig_elements
    return base_metadata
@lazyproperty
def _orig_elements(self) -> list[Element]:
    """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.

    The result is a one-item list holding a deep copy of this pre-chunk's `Table`
    element, with the copy's own `.metadata.orig_elements` cleared. The source table
    may itself be a chunk (carrying orig-elements of its own) rather than a direct
    product of partitioning; clearing that field keeps orig-elements from nesting.
    """
    # -- work on a deep copy; the source `Table` belongs to the caller, who may have
    # -- downstream purposes for it, so it must not be mutated here.
    table_copy = copy.deepcopy(self._table)
    # -- drop any orig-elements the table carried so the structure is not recursive --
    table_copy.metadata.orig_elements = None
    return [table_copy]
@lazyproperty @lazyproperty
def _text(self) -> str: def _text(self) -> str:
"""The text for this chunk, including the overlap-prefix when present.""" """The text for this chunk, including the overlap-prefix when present."""
@ -615,7 +646,10 @@ class TextPreChunk:
to a single-element pre-chunk too, even though metadata for such a pre-chunk is already to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
"consolidated". "consolidated".
""" """
return ElementMetadata(**self._meta_kwargs) consolidated_metadata = ElementMetadata(**self._meta_kwargs)
if self._opts.include_orig_elements:
consolidated_metadata.orig_elements = self._orig_elements
return consolidated_metadata
@lazyproperty @lazyproperty
def _continuation_metadata(self) -> ElementMetadata: def _continuation_metadata(self) -> ElementMetadata:
@ -717,6 +751,25 @@ class TextPreChunk:
return dict(iter_kwarg_pairs()) return dict(iter_kwarg_pairs())
@lazyproperty
def _orig_elements(self) -> list[Element]:
    """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.

    Elements that have no `.metadata.orig_elements` of their own are included as the
    exact instances held by this pre-chunk. An element that does carry orig-elements
    (it is itself a chunk) is replaced by a copy whose `.metadata.orig_elements` is
    cleared, so orig-elements are never nested inside orig-elements.
    """

    def iter_orig_elements():
        for e in self._elements:
            if e.metadata.orig_elements is None:
                yield e
                continue

            # -- Deep-copy any element we're going to mutate because these elements don't
            # -- belong to us (the user may have downstream purposes for them). A shallow
            # -- `copy.copy()` is not enough here: by default the copy would share its
            # -- `.metadata` object with the original, so clearing `.orig_elements` below
            # -- would also clear it on the caller's element.
            orig_element = copy.deepcopy(e)
            # -- prevent recursive .orig_elements when element is a chunk (has orig-elements
            # -- of its own)
            orig_element.metadata.orig_elements = None
            yield orig_element

    return list(iter_orig_elements())
@lazyproperty @lazyproperty
def _text(self) -> str: def _text(self) -> str:
"""The concatenated text of all elements in this pre-chunk. """The concatenated text of all elements in this pre-chunk.

View File

@ -378,6 +378,9 @@ class ElementMetadata:
for field_name in self.DEBUG_FIELD_NAMES: for field_name in self.DEBUG_FIELD_NAMES:
meta_dict.pop(field_name, None) meta_dict.pop(field_name, None)
# -- remove `.orig_elements` for now as that won't serialize --
meta_dict.pop("orig_elements", None)
# -- don't serialize empty lists -- # -- don't serialize empty lists --
meta_dict: dict[str, Any] = { meta_dict: dict[str, Any] = {
field_name: value field_name: value