diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd4bf4177..a09b57c57 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@
### Features
+* **Chunking populates `.metadata.orig_elements` for each chunk.** This behavior makes the text and metadata of the elements that were combined to form each chunk accessible from that chunk. This can be important, for example, to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. This option is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. This option defaults to `True`, so original elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs but will be added in a closely following PR to the other `unstructured` repositories. The original elements also do not serialize or deserialize yet; this will likewise be added in a closely following PR.
+
### Fixes
* **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index ab16e74f4..dd9a68fa7 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -352,7 +352,30 @@ class DescribeTablePreChunk:
with pytest.raises(StopIteration):
next(chunk_iter)
- def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
+ def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
+ table = Table("foo bar", metadata=ElementMetadata(text_as_html="
"))
+ opts = ChunkingOptions(include_orig_elements=True)
+ pre_chunk = TablePreChunk(table, "", opts)
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, Table)
+ assert chunk.metadata.orig_elements == [table]
+ assert chunk.metadata.text_as_html == ""
+ # --
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def but_not_when_instructed_not_to(self):
+ pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False))
+
+ chunk = next(pre_chunk.iter_chunks())
+
+ assert isinstance(chunk, Table)
+ assert chunk.metadata.orig_elements is None
+
+ def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
# fixed-overhead = 8+8+9+8+9+8 = 50
# per-row overhead = 27
html_table = (
@@ -398,6 +421,7 @@ class DescribeTablePreChunk:
"\n"
"Lo"
)
+ assert not chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
@@ -408,6 +432,7 @@ class DescribeTablePreChunk:
"rem ipsum | A Link example |
\n"
"Consectetur | adipiscing elit | <"
)
+ assert chunk.metadata.is_continuation
# -- note that text runs out but HTML continues because it's significantly longer. So two
# -- of these chunks have HTML but no text.
chunk = next(chunk_iter)
@@ -418,6 +443,7 @@ class DescribeTablePreChunk:
"
Nunc aliquam | id enim nec molestie |
\n"
"Vivamus quis | "
)
+ assert chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
@@ -425,10 +451,34 @@ class DescribeTablePreChunk:
assert chunk.metadata.text_as_html == (
"nunc ipsum donec ac fermentum |
\n\n"
)
+ assert chunk.metadata.is_continuation
# --
with pytest.raises(StopIteration):
next(chunk_iter)
+ def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
+ """Even though text and html are split, the orig_elements metadata is not."""
+ table = Table(
+ "Header Col 1 Header Col 2\nLorem ipsum dolor sit amet",
+ metadata=ElementMetadata(text_as_html=""),
+ )
+ opts = ChunkingOptions(max_characters=30, include_orig_elements=True)
+ pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == "Header Col 1 Header Col 2"
+ assert chunk.metadata.orig_elements == [table]
+ assert not chunk.metadata.is_continuation
+ # --
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == "Lorem ipsum dolor sit amet"
+ assert chunk.metadata.orig_elements == [table]
+ assert chunk.metadata.is_continuation
+
@pytest.mark.parametrize(
("text", "expected_value"),
[
@@ -469,6 +519,50 @@ class DescribeTablePreChunk:
)
assert pre_chunk._text == expected_value
+ def it_computes_metadata_for_each_chunk_to_help(self):
+ table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html=""))
+ pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
+
+ metadata = pre_chunk._metadata
+
+ assert metadata.text_as_html == ""
+ # -- opts.include_orig_elements is True by default --
+ assert metadata.orig_elements == [table]
+ # -- it produces a new instance each time it is called so changing one chunk's metadata does
+ # -- not change that of any other chunk.
+ assert pre_chunk._metadata is not metadata
+
+ def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
+ pre_chunk = TablePreChunk(
+ Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="")),
+ overlap_prefix="",
+ opts=ChunkingOptions(include_orig_elements=False),
+ )
+
+ assert pre_chunk._metadata.orig_elements is None
+
+ def it_computes_the_original_elements_list_to_help(self):
+ table = Table(
+ "Lorem ipsum",
+ metadata=ElementMetadata(text_as_html="", orig_elements=[Table("Lorem Ipsum")]),
+ )
+ pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
+
+ orig_elements = pre_chunk._orig_elements
+
+ # -- a TablePreChunk always has exactly one original (Table) element --
+ assert len(orig_elements) == 1
+ orig_element = orig_elements[0]
+ # -- each item in orig_elements is a copy of the original element so we can mutate it
+ # -- without changing user's data.
+ assert orig_element == table
+ assert orig_element is not table
+ # -- it strips any .metadata.orig_elements from each element to prevent a recursive data
+ # -- structure
+ assert orig_element.metadata.orig_elements is None
+ # -- computation is only on first call, all chunks get exactly the same orig-elements --
+ assert pre_chunk._orig_elements is orig_elements
+
class DescribeTextPreChunk:
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
@@ -599,17 +693,15 @@ class DescribeTextPreChunk:
)
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
- pre_chunk = TextPreChunk(
- [
- Title("Introduction"),
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
- " lectus porta volutpat.",
- ),
- ],
- overlap_prefix="e feugiat efficitur.",
- opts=ChunkingOptions(max_characters=200),
- )
+ elements = [
+ Title("Introduction"),
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ " lectus porta volutpat.",
+ ),
+ ]
+ opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
+ pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)
chunk_iter = pre_chunk.iter_chunks()
@@ -619,25 +711,31 @@ class DescribeTextPreChunk:
" adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
)
assert chunk.metadata is pre_chunk._consolidated_metadata
+ assert chunk.metadata.orig_elements == elements
+ # --
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
- pre_chunk = TextPreChunk(
- [
- Text(
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
- " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
- " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
- " commodo consequat."
- ),
- ],
- overlap_prefix="",
- opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
- )
+ elements = [
+ Text(
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
+ " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
+ " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
+ " commodo consequat."
+ )
+ ]
+ opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
+ pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)
chunk_iter = pre_chunk.iter_chunks()
+ # -- Note that .metadata.orig_elements is the same single original element, "repeated" for
+ # -- each text-split chunk. This behavior emerges without explicit command as a consequence
+ # -- of using `._consolidated_metadata` (and `._continuation_metadata` which extends
+ # -- `._consolidated_metadata`) for each text-split chunk.
chunk = next(chunk_iter)
assert chunk == CompositeElement(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
@@ -645,10 +743,12 @@ class DescribeTextPreChunk:
" veniam, quis nostrud exercitation ullamco laboris nisi ut"
)
assert chunk.metadata is pre_chunk._consolidated_metadata
+ assert chunk.metadata.orig_elements == elements
# --
chunk = next(chunk_iter)
assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
assert chunk.metadata is pre_chunk._continuation_metadata
+ assert chunk.metadata.orig_elements == elements
# --
with pytest.raises(StopIteration):
next(chunk_iter)
@@ -762,6 +862,23 @@ class DescribeTextPreChunk:
"parent_id": ["f87731e0"],
}
+ def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
+ opts = ChunkingOptions(include_orig_elements=True)
+ metadata = ElementMetadata(filename="foo.pdf")
+ element = Title("Lorem Ipsum", metadata=metadata)
+ element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)
+ pre_chunk = TextPreChunk([element, element_2], overlap_prefix="", opts=opts)
+
+ consolidated_metadata = pre_chunk._consolidated_metadata
+
+ # -- pre-chunk elements are included as metadata --
+ orig_elements = consolidated_metadata.orig_elements
+ assert orig_elements is not None
+ assert orig_elements == [element, element_2]
+ # -- and they are the exact instances, not copies --
+ assert orig_elements[0] is element
+ assert orig_elements[1] is element_2
+
def it_consolidates_regex_metadata_in_a_field_specific_way(self):
"""regex_metadata of chunk is combined regex_metadatas of its elements.
@@ -868,6 +985,32 @@ class DescribeTextPreChunk:
},
}
+ def it_computes_the_original_elements_list_to_help(self):
+ element = Title("Introduction")
+ element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
+ element_3 = CompositeElement(
+ "In rhoncus ipsum sed lectus porta volutpat.",
+ metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
+ )
+ pre_chunk = TextPreChunk(
+ [element, element_2, element_3],
+ overlap_prefix="",
+ opts=ChunkingOptions(include_orig_elements=True),
+ )
+
+ orig_elements = pre_chunk._orig_elements
+
+ # -- all elements of pre-chunk are included --
+ assert orig_elements == [element, element_2, element_3]
+ # -- orig_elements that are chunks (having orig-elements of their own) are copied and the
+ # -- copy is stripped of its `.metadata.orig_elements` to prevent a recursive data
+ # -- structure that nests orig_elements within orig_elements.
+ assert orig_elements[0] is element
+ assert orig_elements[2] is not element_3
+ assert orig_elements[2].metadata.orig_elements is None
+ # -- computation is only on first call, all chunks get exactly the same orig-elements --
+ assert pre_chunk._orig_elements is orig_elements
+
@pytest.mark.parametrize(
("elements", "overlap_prefix", "expected_value"),
[
diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py
index b5f5adf97..85f807b6f 100644
--- a/test_unstructured/chunking/test_basic.py
+++ b/test_unstructured/chunking/test_basic.py
@@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them():
]
+def test_it_includes_original_elements_as_metadata_when_requested():
+ element = Title("Introduction")
+ element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
+ element_3 = Text("In rhoncus ipsum sed lectus porta volutpat.")
+
+ chunks = chunk_elements(
+ [element, element_2, element_3], max_characters=70, include_orig_elements=True
+ )
+
+ assert len(chunks) == 2
+ chunk = chunks[0]
+ assert chunk == CompositeElement(
+ "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
+ )
+ assert chunk.metadata.orig_elements == [element, element_2]
+ # --
+ chunk = chunks[1]
+ assert chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
+ assert chunk.metadata.orig_elements == [element_3]
+
+
# ------------------------------------------------------------------------------------------------
# UNIT TESTS
# ------------------------------------------------------------------------------------------------
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
index 923d10a89..369588d68 100644
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@@ -9,12 +9,7 @@ from typing import Any, Optional
import pytest
from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
-from unstructured.chunking.base import (
- CHUNK_MULTI_PAGE_DEFAULT,
- PreChunker,
- TablePreChunk,
- TextPreChunk,
-)
+from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
@@ -57,7 +52,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
]
-def test_split_elements_by_title_and_table():
+def test_it_splits_elements_by_title_and_table():
elements: list[Element] = [
Title("A Great Day"),
Text("Today is a great day."),
@@ -72,39 +67,38 @@ def test_split_elements_by_title_and_table():
CheckBox(),
]
- pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
+ chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)
- pre_chunk = next(pre_chunks)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
+ assert len(chunks) == 4
+ # --
+ chunk = chunks[0]
+ assert isinstance(chunk, CompositeElement)
+ assert chunk.metadata.orig_elements == [
Title("A Great Day"),
Text("Today is a great day."),
Text("It is sunny outside."),
]
# --
- pre_chunk = next(pre_chunks)
- assert isinstance(pre_chunk, TablePreChunk)
- assert pre_chunk._table == Table("Heading\nCell text")
+ chunk = chunks[1]
+ assert isinstance(chunk, Table)
+ assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
# ==
- pre_chunk = next(pre_chunks)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
+ chunk = chunks[2]
+ assert isinstance(chunk, CompositeElement)
+ assert chunk.metadata.orig_elements == [
Title("An Okay Day"),
Text("Today is an okay day."),
Text("It is rainy outside."),
]
# --
- pre_chunk = next(pre_chunks)
- assert isinstance(pre_chunk, TextPreChunk)
- assert pre_chunk._elements == [
+ chunk = chunks[3]
+ assert isinstance(chunk, CompositeElement)
+ assert chunk.metadata.orig_elements == [
Title("A Bad Day"),
Text("Today is a bad day."),
Text("It is storming outside."),
CheckBox(),
]
- # --
- with pytest.raises(StopIteration):
- next(pre_chunks)
def test_chunk_by_title():
@@ -127,7 +121,7 @@ def test_chunk_by_title():
CheckBox(),
]
- chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
+ chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False)
assert chunks == [
CompositeElement(
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
index f360f97bf..ec368f85d 100644
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@@ -447,10 +447,10 @@ class TablePreChunk:
text_remainder = self._text
html_remainder = self._table.metadata.text_as_html or ""
- # -- only chunk a table when it's too big to swallow whole --
+ # -- only text-split a table when it's longer than the chunking window --
if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
# -- but the overlap-prefix must be added to its text --
- yield Table(text=text_remainder, metadata=copy.deepcopy(self._table.metadata))
+ yield Table(text=text_remainder, metadata=self._metadata)
return
split = self._opts.split
@@ -459,19 +459,19 @@ class TablePreChunk:
while text_remainder or html_remainder:
# -- split off the next chunk-worth of characters into a TableChunk --
chunk_text, text_remainder = split(text_remainder)
- table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))
+ metadata = self._metadata
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
# -- HTML elements that *correspond* to the TextChunk.text fragment.
if html_remainder:
chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
- table_chunk.metadata.text_as_html = chunk_html
+ metadata.text_as_html = chunk_html
# -- mark second and later chunks as a continuation --
if is_continuation:
- table_chunk.metadata.is_continuation = True
+ metadata.is_continuation = True
- yield table_chunk
+ yield TableChunk(text=chunk_text, metadata=metadata)
is_continuation = True
@@ -486,6 +486,37 @@ class TablePreChunk:
overlap = self._opts.inter_chunk_overlap
return self._text[-overlap:].strip() if overlap else ""
+ @property
+ def _metadata(self) -> ElementMetadata:
+ """The base `.metadata` value for chunks formed from this pre-chunk.
+
+ The term "base" here means that other metadata fields will be added, depending on the chunk.
+ In particular, `.metadata.text_as_html` will be different for each text-split chunk and
+ `.metadata.is_continuation` must be added for second-and-later text-split chunks.
+
+ Note this is a fresh copy of the metadata on each call since it will need to be mutated
+ differently for each chunk formed from this pre-chunk.
+ """
+ metadata = copy.deepcopy(self._table.metadata)
+ if self._opts.include_orig_elements:
+ metadata.orig_elements = self._orig_elements
+ return metadata
+
+ @lazyproperty
+ def _orig_elements(self) -> list[Element]:
+ """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.
+
+ Note this is not just the `Table` element; it must be adjusted to strip out any
+ `.metadata.orig_elements` value it may have when it is itself a chunk and not a direct
+ product of partitioning.
+ """
+ # -- make a copy because we're going to mutate the `Table` element and it doesn't belong to
+ # -- us (the user may have downstream purposes for it).
+ orig_table = copy.deepcopy(self._table)
+ # -- prevent recursive .orig_elements when `Table` element is a chunk --
+ orig_table.metadata.orig_elements = None
+ return [orig_table]
+
@lazyproperty
def _text(self) -> str:
"""The text for this chunk, including the overlap-prefix when present."""
@@ -615,7 +646,10 @@ class TextPreChunk:
to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
"consolidated".
"""
- return ElementMetadata(**self._meta_kwargs)
+ consolidated_metadata = ElementMetadata(**self._meta_kwargs)
+ if self._opts.include_orig_elements:
+ consolidated_metadata.orig_elements = self._orig_elements
+ return consolidated_metadata
@lazyproperty
def _continuation_metadata(self) -> ElementMetadata:
@@ -717,6 +751,25 @@ class TextPreChunk:
return dict(iter_kwarg_pairs())
+ @lazyproperty
+ def _orig_elements(self) -> list[Element]:
+ """The `.metadata.orig_elements` value for chunks formed from this pre-chunk."""
+
+ def iter_orig_elements():
+ for e in self._elements:
+ if e.metadata.orig_elements is None:
+ yield e
+ continue
+ # -- make copy of any element we're going to mutate because these elements don't
+ # -- belong to us (the user may have downstream purposes for them).
+ orig_element = copy.copy(e)
+ # -- prevent recursive .orig_elements when element is a chunk (has orig-elements of
+ # -- its own)
+ orig_element.metadata.orig_elements = None
+ yield orig_element
+
+ return list(iter_orig_elements())
+
@lazyproperty
def _text(self) -> str:
"""The concatenated text of all elements in this pre-chunk.
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index 7cfc5c83f..1a9406be9 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -378,6 +378,9 @@ class ElementMetadata:
for field_name in self.DEBUG_FIELD_NAMES:
meta_dict.pop(field_name, None)
+ # -- remove `.orig_elements` for now as that won't serialize --
+ meta_dict.pop("orig_elements", None)
+
# -- don't serialize empty lists --
meta_dict: dict[str, Any] = {
field_name: value