feat(chunking): add .orig_elements behavior to chunking (#2656)

**Summary**
Add the actual behavior to populate `.metadata.orig_elements` during
chunking, when so instructed by the `include_orig_elements` option.

**Additional Context**
The underlying structures that support this, namely the
`.metadata.orig_elements` field and the `include_orig_elements` chunking
option, were added in the immediately preceding PRs. This PR adds the
behavior that actually populates that metadata field during chunking when
the option is set.
This commit is contained in:
Steve Canny 2024-03-18 12:27:39 -07:00 committed by GitHub
parent c02cfb89d3
commit 1af41d5f90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 271 additions and 55 deletions

View File

@ -6,6 +6,8 @@
### Features
* **Chunking populates `.metadata.orig_elements` for each chunk.** This behavior gives access to the text and metadata of the elements that were combined to form each chunk. This can be important, for example, to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. The behavior is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. This parameter defaults to `True`, so original elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs, but support will be added shortly in follow-up PRs to the other `unstructured` repositories. The original elements also do not serialize or deserialize yet; this will likewise be added in a follow-up PR.
### Fixes
* **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.

View File

@ -352,7 +352,30 @@ class DescribeTablePreChunk:
with pytest.raises(StopIteration):
next(chunk_iter)
def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
    """The single `Table` chunk carries the source table in `.metadata.orig_elements`."""
    html = "<table>foo bar</table>"
    table = Table("foo bar", metadata=ElementMetadata(text_as_html=html))
    pre_chunk = TablePreChunk(table, "", ChunkingOptions(include_orig_elements=True))

    chunks = pre_chunk.iter_chunks()

    # -- the first chunk is a `Table` with both the original element and the HTML attached --
    only_chunk = next(chunks)
    assert isinstance(only_chunk, Table)
    assert only_chunk.metadata.orig_elements == [table]
    assert only_chunk.metadata.text_as_html == "<table>foo bar</table>"
    # -- and no further chunks are produced --
    with pytest.raises(StopIteration):
        next(chunks)
def but_not_when_instructed_not_to(self):
    """`.metadata.orig_elements` is left unset when `include_orig_elements` is False."""
    opts = ChunkingOptions(include_orig_elements=False)
    chunks = TablePreChunk(Table("foobar"), "", opts).iter_chunks()

    first_chunk = next(chunks)

    assert isinstance(first_chunk, Table)
    assert first_chunk.metadata.orig_elements is None
def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
# fixed-overhead = 8+8+9+8+9+8 = 50
# per-row overhead = 27
html_table = (
@ -398,6 +421,7 @@ class DescribeTablePreChunk:
"<tbody>\n"
"<tr><td>Lo"
)
assert not chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
@ -408,6 +432,7 @@ class DescribeTablePreChunk:
"rem ipsum </td><td>A Link example</td></tr>\n"
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
)
assert chunk.metadata.is_continuation
# -- note that text runs out but HTML continues because it's significantly longer. So two
# -- of these chunks have HTML but no text.
chunk = next(chunk_iter)
@ -418,6 +443,7 @@ class DescribeTablePreChunk:
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
"<tr><td>Vivamus quis </td><td>"
)
assert chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
@ -425,10 +451,34 @@ class DescribeTablePreChunk:
assert chunk.metadata.text_as_html == (
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
)
assert chunk.metadata.is_continuation
# --
with pytest.raises(StopIteration):
next(chunk_iter)
def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
    """Even though text and html are split, the orig_elements metadata is not."""
    table = Table(
        "Header Col 1 Header Col 2\nLorem ipsum dolor sit amet",
        metadata=ElementMetadata(text_as_html="<table/>"),
    )
    pre_chunk = TablePreChunk(
        table,
        overlap_prefix="",
        opts=ChunkingOptions(max_characters=30, include_orig_elements=True),
    )
    chunk_iter = pre_chunk.iter_chunks()

    # -- (chunk text, whether the chunk is flagged as a continuation), in emission order --
    expected_chunks = [
        ("Header Col 1 Header Col 2", False),
        ("Lorem ipsum dolor sit amet", True),
    ]
    for expected_text, expect_continuation in expected_chunks:
        table_chunk = next(chunk_iter)
        assert isinstance(table_chunk, TableChunk)
        assert table_chunk.text == expected_text
        # -- every text-split chunk gets the whole original table --
        assert table_chunk.metadata.orig_elements == [table]
        assert bool(table_chunk.metadata.is_continuation) is expect_continuation
@pytest.mark.parametrize(
("text", "expected_value"),
[
@ -469,6 +519,50 @@ class DescribeTablePreChunk:
)
assert pre_chunk._text == expected_value
def it_computes_metadata_for_each_chunk_to_help(self):
    """`._metadata` carries the table's HTML and orig-elements and is new on every access."""
    table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
    pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())

    first_metadata = pre_chunk._metadata

    assert first_metadata.text_as_html == "<table/>"
    # -- opts.include_orig_elements is True by default --
    assert first_metadata.orig_elements == [table]
    # -- each access produces a distinct instance, so changing one chunk's metadata does not
    # -- change that of any other chunk.
    second_metadata = pre_chunk._metadata
    assert second_metadata is not first_metadata
def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
    """`._metadata.orig_elements` is unset when `include_orig_elements` is False."""
    table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
    opts = ChunkingOptions(include_orig_elements=False)

    pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)

    assert pre_chunk._metadata.orig_elements is None
def it_computes_the_original_elements_list_to_help(self):
    """`._orig_elements` is a one-item list holding a stripped copy of the table."""
    nested_table = Table("Lorem Ipsum")
    table = Table(
        "Lorem ipsum",
        metadata=ElementMetadata(text_as_html="<table/>", orig_elements=[nested_table]),
    )
    pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())

    computed = pre_chunk._orig_elements

    # -- a TablePreChunk always has exactly one original (Table) element --
    assert len(computed) == 1
    (table_copy,) = computed
    # -- that item is an equal-but-distinct copy of the original element, so it can be
    # -- mutated without changing the user's data.
    assert table_copy == table
    assert table_copy is not table
    # -- any .metadata.orig_elements on the copy is stripped to prevent a recursive data
    # -- structure
    assert table_copy.metadata.orig_elements is None
    # -- computed only on first access; later accesses return the very same list --
    assert pre_chunk._orig_elements is computed
class DescribeTextPreChunk:
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
@ -599,17 +693,15 @@ class DescribeTextPreChunk:
)
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
pre_chunk = TextPreChunk(
[
Title("Introduction"),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
" lectus porta volutpat.",
),
],
overlap_prefix="e feugiat efficitur.",
opts=ChunkingOptions(max_characters=200),
)
elements = [
Title("Introduction"),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
" lectus porta volutpat.",
),
]
opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)
chunk_iter = pre_chunk.iter_chunks()
@ -619,25 +711,31 @@ class DescribeTextPreChunk:
" adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
)
assert chunk.metadata is pre_chunk._consolidated_metadata
assert chunk.metadata.orig_elements == elements
# --
with pytest.raises(StopIteration):
next(chunk_iter)
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
pre_chunk = TextPreChunk(
[
Text(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
" commodo consequat."
),
],
overlap_prefix="",
opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
)
elements = [
Text(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
" commodo consequat."
)
]
opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)
chunk_iter = pre_chunk.iter_chunks()
# -- Note that .metadata.orig_elements is the same single original element, "repeated" for
# -- each text-split chunk. This behavior emerges without explicit command as a consequence
# -- of using `._consolidated_metadata` (and `._continuation_metadata`, which extends
# -- `._consolidated_metadata`) for each text-split chunk.
chunk = next(chunk_iter)
assert chunk == CompositeElement(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
@ -645,10 +743,12 @@ class DescribeTextPreChunk:
" veniam, quis nostrud exercitation ullamco laboris nisi ut"
)
assert chunk.metadata is pre_chunk._consolidated_metadata
assert chunk.metadata.orig_elements == elements
# --
chunk = next(chunk_iter)
assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
assert chunk.metadata is pre_chunk._continuation_metadata
assert chunk.metadata.orig_elements == elements
# --
with pytest.raises(StopIteration):
next(chunk_iter)
@ -762,6 +862,23 @@ class DescribeTextPreChunk:
"parent_id": ["f87731e0"],
}
def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
    """Consolidated metadata references the pre-chunk's own element instances."""
    shared_metadata = ElementMetadata(filename="foo.pdf")
    title = Title("Lorem Ipsum", metadata=shared_metadata)
    body = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=shared_metadata)
    pre_chunk = TextPreChunk(
        [title, body],
        overlap_prefix="",
        opts=ChunkingOptions(include_orig_elements=True),
    )

    orig_elements = pre_chunk._consolidated_metadata.orig_elements

    # -- pre-chunk elements are included as metadata --
    assert orig_elements is not None
    assert orig_elements == [title, body]
    # -- and they are the exact instances, not copies --
    first, second = orig_elements[0], orig_elements[1]
    assert first is title
    assert second is body
def it_consolidates_regex_metadata_in_a_field_specific_way(self):
"""regex_metadata of chunk is combined regex_metadatas of its elements.
@ -868,6 +985,32 @@ class DescribeTextPreChunk:
},
}
def it_computes_the_original_elements_list_to_help(self):
    """`._orig_elements` lists every element, stripping nested orig-elements from chunks."""
    title = Title("Introduction")
    paragraph = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    # -- an element that is itself a chunk, i.e. has orig-elements of its own --
    prior_chunk = CompositeElement(
        "In rhoncus ipsum sed lectus porta volutpat.",
        metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
    )
    opts = ChunkingOptions(include_orig_elements=True)
    pre_chunk = TextPreChunk([title, paragraph, prior_chunk], overlap_prefix="", opts=opts)

    computed = pre_chunk._orig_elements

    # -- all elements of pre-chunk are included --
    assert computed == [title, paragraph, prior_chunk]
    # -- an element without orig-elements of its own is passed through as the same instance --
    assert computed[0] is title
    # -- an element that is a chunk is copied and the copy is stripped of its
    # -- `.metadata.orig_elements` to prevent a recursive data structure that nests
    # -- orig_elements within orig_elements.
    stripped_copy = computed[2]
    assert stripped_copy is not prior_chunk
    assert stripped_copy.metadata.orig_elements is None
    # -- computed only on first access; later accesses return the very same list --
    assert pre_chunk._orig_elements is computed
@pytest.mark.parametrize(
("elements", "overlap_prefix", "expected_value"),
[

View File

@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them():
]
def test_it_includes_original_elements_as_metadata_when_requested():
    """Each chunk records in `.metadata.orig_elements` the elements it was formed from."""
    title = Title("Introduction")
    first_para = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    second_para = Text("In rhoncus ipsum sed lectus porta volutpat.")
    elements = [title, first_para, second_para]

    chunks = chunk_elements(elements, max_characters=70, include_orig_elements=True)

    assert len(chunks) == 2
    first_chunk, second_chunk = chunks
    # -- the title and first paragraph are combined into the first chunk --
    assert first_chunk == CompositeElement(
        "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
    )
    assert first_chunk.metadata.orig_elements == [title, first_para]
    # -- the second paragraph forms a chunk of its own --
    assert second_chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
    assert second_chunk.metadata.orig_elements == [second_para]
# ------------------------------------------------------------------------------------------------
# UNIT TESTS
# ------------------------------------------------------------------------------------------------

View File

@ -9,12 +9,7 @@ from typing import Any, Optional
import pytest
from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
from unstructured.chunking.base import (
CHUNK_MULTI_PAGE_DEFAULT,
PreChunker,
TablePreChunk,
TextPreChunk,
)
from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
@ -57,7 +52,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
]
def test_split_elements_by_title_and_table():
def test_it_splits_elements_by_title_and_table():
elements: list[Element] = [
Title("A Great Day"),
Text("Today is a great day."),
@ -72,39 +67,38 @@ def test_split_elements_by_title_and_table():
CheckBox(),
]
pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
assert len(chunks) == 4
# --
chunk = chunks[0]
assert isinstance(chunk, CompositeElement)
assert chunk.metadata.orig_elements == [
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
]
# --
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TablePreChunk)
assert pre_chunk._table == Table("Heading\nCell text")
chunk = chunks[1]
assert isinstance(chunk, Table)
assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
# ==
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
chunk = chunks[2]
assert isinstance(chunk, CompositeElement)
assert chunk.metadata.orig_elements == [
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
]
# --
pre_chunk = next(pre_chunks)
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._elements == [
chunk = chunks[3]
assert isinstance(chunk, CompositeElement)
assert chunk.metadata.orig_elements == [
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
CheckBox(),
]
# --
with pytest.raises(StopIteration):
next(pre_chunks)
def test_chunk_by_title():
@ -127,7 +121,7 @@ def test_chunk_by_title():
CheckBox(),
]
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False)
assert chunks == [
CompositeElement(

View File

@ -447,10 +447,10 @@ class TablePreChunk:
text_remainder = self._text
html_remainder = self._table.metadata.text_as_html or ""
# -- only chunk a table when it's too big to swallow whole --
# -- only text-split a table when it's longer than the chunking window --
if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
# -- but the overlap-prefix must be added to its text --
yield Table(text=text_remainder, metadata=copy.deepcopy(self._table.metadata))
yield Table(text=text_remainder, metadata=self._metadata)
return
split = self._opts.split
@ -459,19 +459,19 @@ class TablePreChunk:
while text_remainder or html_remainder:
# -- split off the next chunk-worth of characters into a TableChunk --
chunk_text, text_remainder = split(text_remainder)
table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))
metadata = self._metadata
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
# -- HTML elements that *correspond* to the TextChunk.text fragment.
if html_remainder:
chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
table_chunk.metadata.text_as_html = chunk_html
metadata.text_as_html = chunk_html
# -- mark second and later chunks as a continuation --
if is_continuation:
table_chunk.metadata.is_continuation = True
metadata.is_continuation = True
yield table_chunk
yield TableChunk(text=chunk_text, metadata=metadata)
is_continuation = True
@ -486,6 +486,37 @@ class TablePreChunk:
overlap = self._opts.inter_chunk_overlap
return self._text[-overlap:].strip() if overlap else ""
@property
def _metadata(self) -> ElementMetadata:
    """A fresh base `.metadata` value for one chunk formed from this pre-chunk.

    "Base" means the caller may add further fields depending on the chunk. In particular,
    `.metadata.text_as_html` differs for each text-split chunk and `.metadata.is_continuation`
    must be added for second-and-later text-split chunks.

    A new deep copy of the table's metadata is produced on every access because each chunk
    formed from this pre-chunk needs to mutate its metadata differently.
    """
    base_metadata = copy.deepcopy(self._table.metadata)
    # -- original elements are only attached when the chunking options ask for them --
    if not self._opts.include_orig_elements:
        return base_metadata
    base_metadata.orig_elements = self._orig_elements
    return base_metadata
@lazyproperty
def _orig_elements(self) -> list[Element]:
    """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.

    This is a one-item list containing a copy of the `Table` element rather than the element
    itself. Any `.metadata.orig_elements` value on that copy is removed, which matters when
    the table is itself a chunk and not a direct product of partitioning.
    """
    # -- Deep-copy first: the `Table` element doesn't belong to us (the user may have
    # -- downstream purposes for it), so only the copy is mutated.
    table_copy = copy.deepcopy(self._table)
    # -- prevent recursive .orig_elements when `Table` element is a chunk --
    table_copy.metadata.orig_elements = None
    return [table_copy]
@lazyproperty
def _text(self) -> str:
"""The text for this chunk, including the overlap-prefix when present."""
@ -615,7 +646,10 @@ class TextPreChunk:
to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
"consolidated".
"""
return ElementMetadata(**self._meta_kwargs)
consolidated_metadata = ElementMetadata(**self._meta_kwargs)
if self._opts.include_orig_elements:
consolidated_metadata.orig_elements = self._orig_elements
return consolidated_metadata
@lazyproperty
def _continuation_metadata(self) -> ElementMetadata:
@ -717,6 +751,25 @@ class TextPreChunk:
return dict(iter_kwarg_pairs())
@lazyproperty
def _orig_elements(self) -> list[Element]:
    """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.

    An element with no `.metadata.orig_elements` of its own is included as-is (the very same
    instance). An element that does carry `.metadata.orig_elements` (it is itself a chunk) is
    replaced by an independent copy with that field cleared, so orig-elements never nest
    within orig-elements and the caller's element is left untouched.
    """

    def iter_orig_elements():
        for e in self._elements:
            if e.metadata.orig_elements is None:
                yield e
                continue
            # -- Make an independent copy of any element we're going to mutate because these
            # -- elements don't belong to us (the user may have downstream purposes for them).
            # -- This must be a deep copy: a shallow `copy.copy()` would share the source
            # -- element's `.metadata` object (unless the element class customizes copying),
            # -- so clearing the field below would also clear it on the caller's element.
            orig_element = copy.deepcopy(e)
            # -- prevent recursive .orig_elements when element is a chunk (has orig-elements of
            # -- its own)
            orig_element.metadata.orig_elements = None
            yield orig_element

    return list(iter_orig_elements())
@lazyproperty
def _text(self) -> str:
"""The concatenated text of all elements in this pre-chunk.

View File

@ -378,6 +378,9 @@ class ElementMetadata:
for field_name in self.DEBUG_FIELD_NAMES:
meta_dict.pop(field_name, None)
# -- remove `.orig_elements` for now as that won't serialize --
meta_dict.pop("orig_elements", None)
# -- don't serialize empty lists --
meta_dict: dict[str, Any] = {
field_name: value