mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 12:19:36 +00:00
feat(chunking): add .orig_elements behavior to chunking (#2656)
**Summary** Add the actual behavior to populate `.metadata.orig_elements` during chunking, when so instructed by the `include_orig_elements` option. **Additional Context** The underlying structures to support this, namely the `.metadata.orig_elements` field and the `include_orig_elements` chunking option, were added in closely prior PRs. This PR adds the behavior to actually populate that metadata field during chunking when the option is set.
This commit is contained in:
parent
c02cfb89d3
commit
1af41d5f90
@ -6,6 +6,8 @@
|
||||
|
||||
### Features
|
||||
|
||||
* **Chunking populates `.metadata.orig_elements` for each chunk.** This behavior makes the text and metadata of the elements that were combined to form each chunk accessible from that chunk. This can be important, for example, to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. This behavior is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. The parameter defaults to `True`, so original elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs; support will be added to the other `unstructured` repositories in a closely following PR. The original elements also do not yet serialize or deserialize; that too will be added in a closely following PR.
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
|
||||
|
@ -352,7 +352,30 @@ class DescribeTablePreChunk:
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
|
||||
def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
    """An un-split table chunk carries the source `Table` in `.metadata.orig_elements`."""
    html = "<table>foo bar</table>"
    table = Table("foo bar", metadata=ElementMetadata(text_as_html=html))
    pre_chunk = TablePreChunk(table, "", ChunkingOptions(include_orig_elements=True))

    chunks = pre_chunk.iter_chunks()

    only_chunk = next(chunks)
    assert isinstance(only_chunk, Table)
    assert only_chunk.metadata.orig_elements == [table]
    assert only_chunk.metadata.text_as_html == "<table>foo bar</table>"
    # -- and no further chunks are produced --
    with pytest.raises(StopIteration):
        next(chunks)
|
||||
|
||||
def but_not_when_instructed_not_to(self):
    """`.metadata.orig_elements` is left unset when `include_orig_elements` is False."""
    opts = ChunkingOptions(include_orig_elements=False)
    chunk_iter = TablePreChunk(Table("foobar"), "", opts).iter_chunks()

    first_chunk = next(chunk_iter)

    assert isinstance(first_chunk, Table)
    assert first_chunk.metadata.orig_elements is None
|
||||
|
||||
def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
|
||||
# fixed-overhead = 8+8+9+8+9+8 = 50
|
||||
# per-row overhead = 27
|
||||
html_table = (
|
||||
@ -398,6 +421,7 @@ class DescribeTablePreChunk:
|
||||
"<tbody>\n"
|
||||
"<tr><td>Lo"
|
||||
)
|
||||
assert not chunk.metadata.is_continuation
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
@ -408,6 +432,7 @@ class DescribeTablePreChunk:
|
||||
"rem ipsum </td><td>A Link example</td></tr>\n"
|
||||
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
|
||||
)
|
||||
assert chunk.metadata.is_continuation
|
||||
# -- note that text runs out but HTML continues because it's significantly longer. So two
|
||||
# -- of these chunks have HTML but no text.
|
||||
chunk = next(chunk_iter)
|
||||
@ -418,6 +443,7 @@ class DescribeTablePreChunk:
|
||||
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
|
||||
"<tr><td>Vivamus quis </td><td>"
|
||||
)
|
||||
assert chunk.metadata.is_continuation
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert isinstance(chunk, TableChunk)
|
||||
@ -425,10 +451,34 @@ class DescribeTablePreChunk:
|
||||
assert chunk.metadata.text_as_html == (
|
||||
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
|
||||
)
|
||||
assert chunk.metadata.is_continuation
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
    """Text is split across chunks but every chunk gets the complete original `Table`."""
    table = Table(
        "Header Col 1 Header Col 2\nLorem ipsum dolor sit amet",
        metadata=ElementMetadata(text_as_html="<table/>"),
    )
    pre_chunk = TablePreChunk(
        table,
        overlap_prefix="",
        opts=ChunkingOptions(max_characters=30, include_orig_elements=True),
    )
    # -- (expected text, expected continuation flag) for each chunk, in order --
    expectations = (
        ("Header Col 1 Header Col 2", False),
        ("Lorem ipsum dolor sit amet", True),
    )

    chunk_iter = pre_chunk.iter_chunks()

    for expected_text, expect_continuation in expectations:
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == expected_text
        # -- orig_elements is the whole table regardless of which text fragment this is --
        assert chunk.metadata.orig_elements == [table]
        assert bool(chunk.metadata.is_continuation) is expect_continuation
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("text", "expected_value"),
|
||||
[
|
||||
@ -469,6 +519,50 @@ class DescribeTablePreChunk:
|
||||
)
|
||||
assert pre_chunk._text == expected_value
|
||||
|
||||
def it_computes_metadata_for_each_chunk_to_help(self):
    """`._metadata` is a per-call copy of the table metadata plus orig-elements."""
    source_table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
    default_opts = ChunkingOptions()
    pre_chunk = TablePreChunk(source_table, overlap_prefix="", opts=default_opts)

    first_metadata = pre_chunk._metadata

    assert first_metadata.text_as_html == "<table/>"
    # -- `include_orig_elements` defaults to True, so the table is attached --
    assert first_metadata.orig_elements == [source_table]
    # -- every access yields a distinct instance, so mutating the metadata of one chunk
    # -- cannot leak into the metadata of another chunk.
    assert pre_chunk._metadata is not first_metadata
|
||||
|
||||
def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
    """`._metadata.orig_elements` is not populated when the option is switched off."""
    table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
    opts = ChunkingOptions(include_orig_elements=False)

    pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)

    assert pre_chunk._metadata.orig_elements is None
|
||||
|
||||
def it_computes_the_original_elements_list_to_help(self):
    """`._orig_elements` is a one-item list holding a stripped copy of the table."""
    # -- a table that already carries orig-elements of its own, as a chunk would --
    table_metadata = ElementMetadata(text_as_html="<table/>", orig_elements=[Table("Lorem Ipsum")])
    table = Table("Lorem ipsum", metadata=table_metadata)
    pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())

    first_result = pre_chunk._orig_elements

    # -- a TablePreChunk has exactly one original element, its `Table` --
    assert len(first_result) == 1
    table_copy = first_result[0]
    # -- the item is an equal-but-distinct copy, so it can be mutated without touching the
    # -- caller's element.
    assert table_copy == table
    assert table_copy is not table
    # -- the copy's own `.metadata.orig_elements` is removed so orig-elements never nest --
    assert table_copy.metadata.orig_elements is None
    # -- the value is computed once; later accesses return the very same list --
    assert pre_chunk._orig_elements is first_result
|
||||
|
||||
|
||||
class DescribeTextPreChunk:
|
||||
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
|
||||
@ -599,17 +693,15 @@ class DescribeTextPreChunk:
|
||||
)
|
||||
|
||||
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
" lectus porta volutpat.",
|
||||
),
|
||||
],
|
||||
overlap_prefix="e feugiat efficitur.",
|
||||
opts=ChunkingOptions(max_characters=200),
|
||||
)
|
||||
elements = [
|
||||
Title("Introduction"),
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
|
||||
" lectus porta volutpat.",
|
||||
),
|
||||
]
|
||||
opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
|
||||
pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
@ -619,25 +711,31 @@ class DescribeTextPreChunk:
|
||||
" adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
|
||||
)
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
assert chunk.metadata.orig_elements == elements
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
|
||||
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
|
||||
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
|
||||
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
|
||||
pre_chunk = TextPreChunk(
|
||||
[
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
|
||||
" commodo consequat."
|
||||
),
|
||||
],
|
||||
overlap_prefix="",
|
||||
opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
|
||||
)
|
||||
elements = [
|
||||
Text(
|
||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
|
||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
|
||||
" commodo consequat."
|
||||
)
|
||||
]
|
||||
opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
|
||||
pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)
|
||||
|
||||
chunk_iter = pre_chunk.iter_chunks()
|
||||
|
||||
# -- Note that .metadata.orig_elements is the same single original element, "repeated" for
|
||||
# -- each text-split chunk. This behavior emerges without explicit command as a consequence
|
||||
# -- of using `._consolidated_metadata` (and `._continuation_metadata` which extends
|
||||
# -- `._consolidated_metadata`) for each text-split chunk.
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement(
|
||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
|
||||
@ -645,10 +743,12 @@ class DescribeTextPreChunk:
|
||||
" veniam, quis nostrud exercitation ullamco laboris nisi ut"
|
||||
)
|
||||
assert chunk.metadata is pre_chunk._consolidated_metadata
|
||||
assert chunk.metadata.orig_elements == elements
|
||||
# --
|
||||
chunk = next(chunk_iter)
|
||||
assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
|
||||
assert chunk.metadata is pre_chunk._continuation_metadata
|
||||
assert chunk.metadata.orig_elements == elements
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(chunk_iter)
|
||||
@ -762,6 +862,23 @@ class DescribeTextPreChunk:
|
||||
"parent_id": ["f87731e0"],
|
||||
}
|
||||
|
||||
def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
    """Consolidated metadata exposes the pre-chunk's own elements as `.orig_elements`."""
    shared_metadata = ElementMetadata(filename="foo.pdf")
    title = Title("Lorem Ipsum", metadata=shared_metadata)
    body = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=shared_metadata)
    pre_chunk = TextPreChunk(
        [title, body],
        overlap_prefix="",
        opts=ChunkingOptions(include_orig_elements=True),
    )

    orig_elements = pre_chunk._consolidated_metadata.orig_elements

    # -- the elements of the pre-chunk appear in the metadata --
    assert orig_elements is not None
    assert orig_elements == [title, body]
    # -- as the identical objects rather than copies --
    assert orig_elements[0] is title
    assert orig_elements[1] is body
|
||||
|
||||
def it_consolidates_regex_metadata_in_a_field_specific_way(self):
|
||||
"""regex_metadata of chunk is combined regex_metadatas of its elements.
|
||||
|
||||
@ -868,6 +985,32 @@ class DescribeTextPreChunk:
|
||||
},
|
||||
}
|
||||
|
||||
def it_computes_the_original_elements_list_to_help(self):
    """`._orig_elements` lists every element, stripping nested orig-elements from chunks."""
    title = Title("Introduction")
    paragraph = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    # -- an element that is itself a chunk, i.e. has orig-elements of its own --
    prior_chunk = CompositeElement(
        "In rhoncus ipsum sed lectus porta volutpat.",
        metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
    )
    opts = ChunkingOptions(include_orig_elements=True)
    pre_chunk = TextPreChunk([title, paragraph, prior_chunk], overlap_prefix="", opts=opts)

    first_result = pre_chunk._orig_elements

    # -- every element of the pre-chunk is present --
    assert first_result == [title, paragraph, prior_chunk]
    # -- an element without orig-elements is passed through as-is, whereas one that has them
    # -- is replaced by a copy whose `.metadata.orig_elements` is removed, so orig_elements is
    # -- never nested inside orig_elements.
    assert first_result[0] is title
    assert first_result[2] is not prior_chunk
    assert first_result[2].metadata.orig_elements is None
    # -- the value is computed once; later accesses return the very same list --
    assert pre_chunk._orig_elements is first_result
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("elements", "overlap_prefix", "expected_value"),
|
||||
[
|
||||
|
@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them():
|
||||
]
|
||||
|
||||
|
||||
def test_it_includes_original_elements_as_metadata_when_requested():
    """Each chunk lists exactly the elements that were combined to form it."""
    title = Title("Introduction")
    first_para = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    second_para = Text("In rhoncus ipsum sed lectus porta volutpat.")
    elements = [title, first_para, second_para]

    chunks = chunk_elements(elements, max_characters=70, include_orig_elements=True)

    assert len(chunks) == 2
    first_chunk, second_chunk = chunks
    # -- the title and first paragraph are combined into the first chunk --
    assert first_chunk == CompositeElement(
        "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
    )
    assert first_chunk.metadata.orig_elements == [title, first_para]
    # -- the second paragraph forms a chunk of its own --
    assert second_chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
    assert second_chunk.metadata.orig_elements == [second_para]
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
# UNIT TESTS
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
|
@ -9,12 +9,7 @@ from typing import Any, Optional
|
||||
import pytest
|
||||
|
||||
from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
|
||||
from unstructured.chunking.base import (
|
||||
CHUNK_MULTI_PAGE_DEFAULT,
|
||||
PreChunker,
|
||||
TablePreChunk,
|
||||
TextPreChunk,
|
||||
)
|
||||
from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
|
||||
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
|
||||
from unstructured.documents.coordinates import CoordinateSystem
|
||||
from unstructured.documents.elements import (
|
||||
@ -57,7 +52,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
|
||||
]
|
||||
|
||||
|
||||
def test_split_elements_by_title_and_table():
|
||||
def test_it_splits_elements_by_title_and_table():
|
||||
elements: list[Element] = [
|
||||
Title("A Great Day"),
|
||||
Text("Today is a great day."),
|
||||
@ -72,39 +67,38 @@ def test_split_elements_by_title_and_table():
|
||||
CheckBox(),
|
||||
]
|
||||
|
||||
pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
|
||||
chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)
|
||||
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
assert len(chunks) == 4
|
||||
# --
|
||||
chunk = chunks[0]
|
||||
assert isinstance(chunk, CompositeElement)
|
||||
assert chunk.metadata.orig_elements == [
|
||||
Title("A Great Day"),
|
||||
Text("Today is a great day."),
|
||||
Text("It is sunny outside."),
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TablePreChunk)
|
||||
assert pre_chunk._table == Table("Heading\nCell text")
|
||||
chunk = chunks[1]
|
||||
assert isinstance(chunk, Table)
|
||||
assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
|
||||
# ==
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
chunk = chunks[2]
|
||||
assert isinstance(chunk, CompositeElement)
|
||||
assert chunk.metadata.orig_elements == [
|
||||
Title("An Okay Day"),
|
||||
Text("Today is an okay day."),
|
||||
Text("It is rainy outside."),
|
||||
]
|
||||
# --
|
||||
pre_chunk = next(pre_chunks)
|
||||
assert isinstance(pre_chunk, TextPreChunk)
|
||||
assert pre_chunk._elements == [
|
||||
chunk = chunks[3]
|
||||
assert isinstance(chunk, CompositeElement)
|
||||
assert chunk.metadata.orig_elements == [
|
||||
Title("A Bad Day"),
|
||||
Text("Today is a bad day."),
|
||||
Text("It is storming outside."),
|
||||
CheckBox(),
|
||||
]
|
||||
# --
|
||||
with pytest.raises(StopIteration):
|
||||
next(pre_chunks)
|
||||
|
||||
|
||||
def test_chunk_by_title():
|
||||
@ -127,7 +121,7 @@ def test_chunk_by_title():
|
||||
CheckBox(),
|
||||
]
|
||||
|
||||
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
|
||||
chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False)
|
||||
|
||||
assert chunks == [
|
||||
CompositeElement(
|
||||
|
@ -447,10 +447,10 @@ class TablePreChunk:
|
||||
text_remainder = self._text
|
||||
html_remainder = self._table.metadata.text_as_html or ""
|
||||
|
||||
# -- only chunk a table when it's too big to swallow whole --
|
||||
# -- only text-split a table when it's longer than the chunking window --
|
||||
if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
|
||||
# -- but the overlap-prefix must be added to its text --
|
||||
yield Table(text=text_remainder, metadata=copy.deepcopy(self._table.metadata))
|
||||
yield Table(text=text_remainder, metadata=self._metadata)
|
||||
return
|
||||
|
||||
split = self._opts.split
|
||||
@ -459,19 +459,19 @@ class TablePreChunk:
|
||||
while text_remainder or html_remainder:
|
||||
# -- split off the next chunk-worth of characters into a TableChunk --
|
||||
chunk_text, text_remainder = split(text_remainder)
|
||||
table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))
|
||||
metadata = self._metadata
|
||||
|
||||
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
|
||||
# -- HTML elements that *correspond* to the TextChunk.text fragment.
|
||||
if html_remainder:
|
||||
chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
|
||||
table_chunk.metadata.text_as_html = chunk_html
|
||||
metadata.text_as_html = chunk_html
|
||||
|
||||
# -- mark second and later chunks as a continuation --
|
||||
if is_continuation:
|
||||
table_chunk.metadata.is_continuation = True
|
||||
metadata.is_continuation = True
|
||||
|
||||
yield table_chunk
|
||||
yield TableChunk(text=chunk_text, metadata=metadata)
|
||||
|
||||
is_continuation = True
|
||||
|
||||
@ -486,6 +486,37 @@ class TablePreChunk:
|
||||
overlap = self._opts.inter_chunk_overlap
|
||||
return self._text[-overlap:].strip() if overlap else ""
|
||||
|
||||
@property
def _metadata(self) -> ElementMetadata:
    """Starting-point `.metadata` for a chunk formed from this pre-chunk.

    The caller is expected to adjust the returned object per chunk: each text-split chunk
    gets its own `.text_as_html` fragment and second-and-later text-split chunks get
    `.is_continuation` set.

    A new deep copy of the table's metadata is built on every access so those per-chunk
    mutations never affect another chunk or the source `Table`. When
    `opts.include_orig_elements` is set, `.orig_elements` is assigned `self._orig_elements`.
    """
    base_metadata = copy.deepcopy(self._table.metadata)
    if not self._opts.include_orig_elements:
        return base_metadata
    base_metadata.orig_elements = self._orig_elements
    return base_metadata
|
||||
|
||||
@lazyproperty
def _orig_elements(self) -> list[Element]:
    """Single-item list used as `.metadata.orig_elements` for chunks of this pre-chunk.

    The item is a deep copy of the `Table` rather than the `Table` itself. The table may
    itself be a chunk (not a direct product of partitioning) and so carry a
    `.metadata.orig_elements` of its own; that value is removed from the copy.
    """
    # -- work on a copy: the `Table` belongs to the caller, who may still need it unchanged --
    table_copy = copy.deepcopy(self._table)
    # -- drop any orig-elements the table carries so orig_elements never nests recursively --
    table_copy.metadata.orig_elements = None
    return [table_copy]
|
||||
|
||||
@lazyproperty
|
||||
def _text(self) -> str:
|
||||
"""The text for this chunk, including the overlap-prefix when present."""
|
||||
@ -615,7 +646,10 @@ class TextPreChunk:
|
||||
to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
|
||||
"consolidated".
|
||||
"""
|
||||
return ElementMetadata(**self._meta_kwargs)
|
||||
consolidated_metadata = ElementMetadata(**self._meta_kwargs)
|
||||
if self._opts.include_orig_elements:
|
||||
consolidated_metadata.orig_elements = self._orig_elements
|
||||
return consolidated_metadata
|
||||
|
||||
@lazyproperty
|
||||
def _continuation_metadata(self) -> ElementMetadata:
|
||||
@ -717,6 +751,25 @@ class TextPreChunk:
|
||||
|
||||
return dict(iter_kwarg_pairs())
|
||||
|
||||
@lazyproperty
def _orig_elements(self) -> list[Element]:
    """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.

    Elements that have no `.metadata.orig_elements` of their own are included as-is (the
    same instances). An element that does have orig-elements (it is itself a chunk) is
    replaced by a deep copy with `.metadata.orig_elements` removed, which prevents a
    recursive structure of orig_elements nested within orig_elements.
    """
    orig_elements: list[Element] = []
    for e in self._elements:
        if e.metadata.orig_elements is None:
            orig_elements.append(e)
            continue
        # -- Make a copy of any element we're going to mutate because these elements don't
        # -- belong to us (the user may have downstream purposes for them). This must be a
        # -- *deep* copy, as in `TablePreChunk._orig_elements`: a shallow `copy.copy()`
        # -- shares the `.metadata` object with the original element, so clearing
        # -- `.orig_elements` on the copy would also clear it on the user's element.
        orig_element = copy.deepcopy(e)
        # -- prevent recursive .orig_elements when element is a chunk (has orig-elements of
        # -- its own)
        orig_element.metadata.orig_elements = None
        orig_elements.append(orig_element)
    return orig_elements
|
||||
|
||||
@lazyproperty
|
||||
def _text(self) -> str:
|
||||
"""The concatenated text of all elements in this pre-chunk.
|
||||
|
@ -378,6 +378,9 @@ class ElementMetadata:
|
||||
for field_name in self.DEBUG_FIELD_NAMES:
|
||||
meta_dict.pop(field_name, None)
|
||||
|
||||
# -- remove `.orig_elements` for now as that won't serialize --
|
||||
meta_dict.pop("orig_elements", None)
|
||||
|
||||
# -- don't serialize empty lists --
|
||||
meta_dict: dict[str, Any] = {
|
||||
field_name: value
|
||||
|
Loading…
x
Reference in New Issue
Block a user