feat(chunking): add .orig_elements behavior to chunking (#2656)

**Summary** Add the actual behavior to populate `.metadata.orig_elements` during chunking, when so instructed by the `include_orig_elements` option. **Additional Context** The underlying structures to support this, namely the `.metadata.orig_elements` field and the `include_orig_elements` chunking option, were added in closely prior PRs. This PR adds the behavior to actually populate that metadata field during chunking when the option is set.
2025-11-28 08:10:29 +00:00 · 2024-03-18 12:27:39 -07:00 · 2024-03-18 12:27:39 -07:00 · 1af41d5f90
commit 1af41d5f90
parent c02cfb89d3
6 changed files with 271 additions and 55 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,8 @@

 ### Features

+* **Chunking populates `.metadata.orig_elements` for each chunk.** This behavior allows the text and metadata of the elements that were combined to form each chunk to be accessed. This can be important, for example, to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. This option is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. This option defaults to `True` so original-elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs but will be added to the other `unstructured` repositories in a closely following PR. The original elements also do not yet serialize or deserialize; this will also be added in a closely following PR.
+
 ### Fixes

 * **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@ -352,7 +352,30 @@ class DescribeTablePreChunk:
        with pytest.raises(StopIteration):
            next(chunk_iter)

-    def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
+    def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
+        table = Table("foo bar", metadata=ElementMetadata(text_as_html="<table>foo bar</table>"))
+        opts = ChunkingOptions(include_orig_elements=True)
+        pre_chunk = TablePreChunk(table, "", opts)
+
+        chunk_iter = pre_chunk.iter_chunks()
+
+        chunk = next(chunk_iter)
+        assert isinstance(chunk, Table)
+        assert chunk.metadata.orig_elements == [table]
+        assert chunk.metadata.text_as_html == "<table>foo bar</table>"
+        # --
+        with pytest.raises(StopIteration):
+            next(chunk_iter)
+
+    def but_not_when_instructed_not_to(self):
+        pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False))
+
+        chunk = next(pre_chunk.iter_chunks())
+
+        assert isinstance(chunk, Table)
+        assert chunk.metadata.orig_elements is None
+
+    def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
        # fixed-overhead = 8+8+9+8+9+8 = 50
        # per-row overhead = 27
        html_table = (
@ -398,6 +421,7 @@ class DescribeTablePreChunk:
            "<tbody>\n"
            "<tr><td>Lo"
        )
+        assert not chunk.metadata.is_continuation
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
@ -408,6 +432,7 @@ class DescribeTablePreChunk:
            "rem ipsum    </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur    </td><td>adipiscing elit</td><"
        )
+        assert chunk.metadata.is_continuation
        # -- note that text runs out but HTML continues because it's significantly longer. So two
        # -- of these chunks have HTML but no text.
        chunk = next(chunk_iter)
@ -418,6 +443,7 @@ class DescribeTablePreChunk:
            "<tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis   </td><td>"
        )
+        assert chunk.metadata.is_continuation
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
@ -425,10 +451,34 @@ class DescribeTablePreChunk:
        assert chunk.metadata.text_as_html == (
            "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
        )
+        assert chunk.metadata.is_continuation
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)

+    def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
+        """Even though text and html are split, the orig_elements metadata is not."""
+        table = Table(
+            "Header Col 1   Header Col 2\nLorem ipsum   dolor sit amet",
+            metadata=ElementMetadata(text_as_html="<table/>"),
+        )
+        opts = ChunkingOptions(max_characters=30, include_orig_elements=True)
+        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)
+
+        chunk_iter = pre_chunk.iter_chunks()
+
+        chunk = next(chunk_iter)
+        assert isinstance(chunk, TableChunk)
+        assert chunk.text == "Header Col 1   Header Col 2"
+        assert chunk.metadata.orig_elements == [table]
+        assert not chunk.metadata.is_continuation
+        # --
+        chunk = next(chunk_iter)
+        assert isinstance(chunk, TableChunk)
+        assert chunk.text == "Lorem ipsum   dolor sit amet"
+        assert chunk.metadata.orig_elements == [table]
+        assert chunk.metadata.is_continuation
+
    @pytest.mark.parametrize(
        ("text", "expected_value"),
        [
@ -469,6 +519,50 @@ class DescribeTablePreChunk:
        )
        assert pre_chunk._text == expected_value

+    def it_computes_metadata_for_each_chunk_to_help(self):
+        table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
+        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
+
+        metadata = pre_chunk._metadata
+
+        assert metadata.text_as_html == "<table/>"
+        # -- opts.include_orig_elements is True by default --
+        assert metadata.orig_elements == [table]
+        # -- it produces a new instance each time it is called so changing one chunk's metadata does
+        # -- not change that of any other chunk.
+        assert pre_chunk._metadata is not metadata
+
+    def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
+        pre_chunk = TablePreChunk(
+            Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>")),
+            overlap_prefix="",
+            opts=ChunkingOptions(include_orig_elements=False),
+        )
+
+        assert pre_chunk._metadata.orig_elements is None
+
+    def it_computes_the_original_elements_list_to_help(self):
+        table = Table(
+            "Lorem ipsum",
+            metadata=ElementMetadata(text_as_html="<table/>", orig_elements=[Table("Lorem Ipsum")]),
+        )
+        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
+
+        orig_elements = pre_chunk._orig_elements
+
+        # -- a TablePreChunk always has exactly one original (Table) element --
+        assert len(orig_elements) == 1
+        orig_element = orig_elements[0]
+        # -- each item in orig_elements is a copy of the original element so we can mutate it
+        # -- without changing user's data.
+        assert orig_element == table
+        assert orig_element is not table
+        # -- it strips any .metadata.orig_elements from each element to prevent a recursive data
+        # -- structure
+        assert orig_element.metadata.orig_elements is None
+        # -- computation is only on first call, all chunks get exactly the same orig-elements --
+        assert pre_chunk._orig_elements is orig_elements
+

 class DescribeTextPreChunk:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
@ -599,17 +693,15 @@ class DescribeTextPreChunk:
        )

    def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
-        pre_chunk = TextPreChunk(
-            [
-                Title("Introduction"),
-                Text(
-                    "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
-                    " lectus porta volutpat.",
-                ),
-            ],
-            overlap_prefix="e feugiat efficitur.",
-            opts=ChunkingOptions(max_characters=200),
-        )
+        elements = [
+            Title("Introduction"),
+            Text(
+                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+                " lectus porta volutpat.",
+            ),
+        ]
+        opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
+        pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)

        chunk_iter = pre_chunk.iter_chunks()

@ -619,25 +711,31 @@ class DescribeTextPreChunk:
            " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata
+        assert chunk.metadata.orig_elements == elements
+        # --
+        with pytest.raises(StopIteration):
+            next(chunk_iter)

    def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
        # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
        # -- The pre-chunker will isolate that element in a pre_chunk of its own.
-        pre_chunk = TextPreChunk(
-            [
-                Text(
-                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
-                    " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
-                    " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
-                    " commodo consequat."
-                ),
-            ],
-            overlap_prefix="",
-            opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
-        )
+        elements = [
+            Text(
+                "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
+                " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
+                " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
+                " commodo consequat."
+            )
+        ]
+        opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
+        pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)

        chunk_iter = pre_chunk.iter_chunks()

+        # -- Note that .metadata.orig_elements is the same single original element, "repeated" for
+        # -- each text-split chunk. This behavior emerges without explicit command as a consequence
+        # -- of using `._consolidated_metadata` (and `._continuation_metadata` which extends
+        # -- `._consolidated_metadata`) for each text-split chunk.
        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
@ -645,10 +743,12 @@ class DescribeTextPreChunk:
            " veniam, quis nostrud exercitation ullamco laboris nisi ut"
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata
+        assert chunk.metadata.orig_elements == elements
        # --
        chunk = next(chunk_iter)
        assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
        assert chunk.metadata is pre_chunk._continuation_metadata
+        assert chunk.metadata.orig_elements == elements
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
@ -762,6 +862,23 @@ class DescribeTextPreChunk:
            "parent_id": ["f87731e0"],
        }

+    def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
+        opts = ChunkingOptions(include_orig_elements=True)
+        metadata = ElementMetadata(filename="foo.pdf")
+        element = Title("Lorem Ipsum", metadata=metadata)
+        element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)
+        pre_chunk = TextPreChunk([element, element_2], overlap_prefix="", opts=opts)
+
+        consolidated_metadata = pre_chunk._consolidated_metadata
+
+        # -- pre-chunk elements are included as metadata --
+        orig_elements = consolidated_metadata.orig_elements
+        assert orig_elements is not None
+        assert orig_elements == [element, element_2]
+        # -- and they are the exact instances, not copies --
+        assert orig_elements[0] is element
+        assert orig_elements[1] is element_2
+
    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
        """regex_metadata of chunk is combined regex_metadatas of its elements.

@ -868,6 +985,32 @@ class DescribeTextPreChunk:
            },
        }

+    def it_computes_the_original_elements_list_to_help(self):
+        element = Title("Introduction")
+        element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
+        element_3 = CompositeElement(
+            "In rhoncus ipsum sed lectus porta volutpat.",
+            metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
+        )
+        pre_chunk = TextPreChunk(
+            [element, element_2, element_3],
+            overlap_prefix="",
+            opts=ChunkingOptions(include_orig_elements=True),
+        )
+
+        orig_elements = pre_chunk._orig_elements
+
+        # -- all elements of pre-chunk are included --
+        assert orig_elements == [element, element_2, element_3]
+        # -- orig_elements that are chunks (having orig-elements of their own) are copied and the
+        # -- copy is stripped of its `.metadata.orig_elements` to prevent a recursive data
+        # -- structure that nests orig_elements within orig_elements.
+        assert orig_elements[0] is element
+        assert orig_elements[2] is not element_3
+        assert orig_elements[2].metadata.orig_elements is None
+        # -- computation is only on first call, all chunks get exactly the same orig-elements --
+        assert pre_chunk._orig_elements is orig_elements
+
    @pytest.mark.parametrize(
        ("elements", "overlap_prefix", "expected_value"),
        [
--- a/test_unstructured/chunking/test_basic.py
+++ b/test_unstructured/chunking/test_basic.py
@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them():
    ]


+def test_it_includes_original_elements_as_metadata_when_requested():
+    element = Title("Introduction")
+    element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
+    element_3 = Text("In rhoncus ipsum sed lectus porta volutpat.")
+
+    chunks = chunk_elements(
+        [element, element_2, element_3], max_characters=70, include_orig_elements=True
+    )
+
+    assert len(chunks) == 2
+    chunk = chunks[0]
+    assert chunk == CompositeElement(
+        "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
+    )
+    assert chunk.metadata.orig_elements == [element, element_2]
+    # --
+    chunk = chunks[1]
+    assert chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
+    assert chunk.metadata.orig_elements == [element_3]
+
+
 # ------------------------------------------------------------------------------------------------
 # UNIT TESTS
 # ------------------------------------------------------------------------------------------------
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@ -9,12 +9,7 @@ from typing import Any, Optional
 import pytest

 from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
-from unstructured.chunking.base import (
-    CHUNK_MULTI_PAGE_DEFAULT,
-    PreChunker,
-    TablePreChunk,
-    TextPreChunk,
-)
+from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
 from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
 from unstructured.documents.coordinates import CoordinateSystem
 from unstructured.documents.elements import (
@ -57,7 +52,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
    ]


-def test_split_elements_by_title_and_table():
+def test_it_splits_elements_by_title_and_table():
    elements: list[Element] = [
        Title("A Great Day"),
        Text("Today is a great day."),
@ -72,39 +67,38 @@ def test_split_elements_by_title_and_table():
        CheckBox(),
    ]

-    pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)

-    pre_chunk = next(pre_chunks)
-    assert isinstance(pre_chunk, TextPreChunk)
-    assert pre_chunk._elements == [
+    assert len(chunks) == 4
+    # --
+    chunk = chunks[0]
+    assert isinstance(chunk, CompositeElement)
+    assert chunk.metadata.orig_elements == [
        Title("A Great Day"),
        Text("Today is a great day."),
        Text("It is sunny outside."),
    ]
    # --
-    pre_chunk = next(pre_chunks)
-    assert isinstance(pre_chunk, TablePreChunk)
-    assert pre_chunk._table == Table("Heading\nCell text")
+    chunk = chunks[1]
+    assert isinstance(chunk, Table)
+    assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
    # ==
-    pre_chunk = next(pre_chunks)
-    assert isinstance(pre_chunk, TextPreChunk)
-    assert pre_chunk._elements == [
+    chunk = chunks[2]
+    assert isinstance(chunk, CompositeElement)
+    assert chunk.metadata.orig_elements == [
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
    ]
    # --
-    pre_chunk = next(pre_chunks)
-    assert isinstance(pre_chunk, TextPreChunk)
-    assert pre_chunk._elements == [
+    chunk = chunks[3]
+    assert isinstance(chunk, CompositeElement)
+    assert chunk.metadata.orig_elements == [
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]
-    # --
-    with pytest.raises(StopIteration):
-        next(pre_chunks)


 def test_chunk_by_title():
@ -127,7 +121,7 @@ def test_chunk_by_title():
        CheckBox(),
    ]

-    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False)

    assert chunks == [
        CompositeElement(
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@ -447,10 +447,10 @@ class TablePreChunk:
        text_remainder = self._text
        html_remainder = self._table.metadata.text_as_html or ""

-        # -- only chunk a table when it's too big to swallow whole --
+        # -- only text-split a table when it's longer than the chunking window --
        if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
            # -- but the overlap-prefix must be added to its text --
-            yield Table(text=text_remainder, metadata=copy.deepcopy(self._table.metadata))
+            yield Table(text=text_remainder, metadata=self._metadata)
            return

        split = self._opts.split
@ -459,19 +459,19 @@ class TablePreChunk:
        while text_remainder or html_remainder:
            # -- split off the next chunk-worth of characters into a TableChunk --
            chunk_text, text_remainder = split(text_remainder)
-            table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))
+            metadata = self._metadata

            # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
            # -- HTML elements that *correspond* to the TextChunk.text fragment.
            if html_remainder:
                chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
-                table_chunk.metadata.text_as_html = chunk_html
+                metadata.text_as_html = chunk_html

            # -- mark second and later chunks as a continuation --
            if is_continuation:
-                table_chunk.metadata.is_continuation = True
+                metadata.is_continuation = True

-            yield table_chunk
+            yield TableChunk(text=chunk_text, metadata=metadata)

            is_continuation = True

@ -486,6 +486,37 @@ class TablePreChunk:
        overlap = self._opts.inter_chunk_overlap
        return self._text[-overlap:].strip() if overlap else ""

+    @property
+    def _metadata(self) -> ElementMetadata:
+        """The base `.metadata` value for chunks formed from this pre-chunk.
+
+        The term "base" here means that other metadata fields will be added, depending on the chunk.
+        In particular, `.metadata.text_as_html` will be different for each text-split chunk and
+        `.metadata.is_continuation` must be added for second-and-later text-split chunks.
+
+        Note this is a fresh copy of the metadata on each call since it will need to be mutated
+        differently for each chunk formed from this pre-chunk.
+        """
+        metadata = copy.deepcopy(self._table.metadata)
+        if self._opts.include_orig_elements:
+            metadata.orig_elements = self._orig_elements
+        return metadata
+
+    @lazyproperty
+    def _orig_elements(self) -> list[Element]:
+        """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.
+
+        Note this is not just the `Table` element, it must be adjusted to strip out any
+        `.metadata.orig_elements` value it may have when it is itself a chunk and not a direct
+        product of partitioning.
+        """
+        # -- make a copy because we're going to mutate the `Table` element and it doesn't belong to
+        # -- us (the user may have downstream purposes for it).
+        orig_table = copy.deepcopy(self._table)
+        # -- prevent recursive .orig_elements when `Table` element is a chunk --
+        orig_table.metadata.orig_elements = None
+        return [orig_table]
+
    @lazyproperty
    def _text(self) -> str:
        """The text for this chunk, including the overlap-prefix when present."""
@ -615,7 +646,10 @@ class TextPreChunk:
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
-        return ElementMetadata(**self._meta_kwargs)
+        consolidated_metadata = ElementMetadata(**self._meta_kwargs)
+        if self._opts.include_orig_elements:
+            consolidated_metadata.orig_elements = self._orig_elements
+        return consolidated_metadata

    @lazyproperty
    def _continuation_metadata(self) -> ElementMetadata:
@ -717,6 +751,25 @@ class TextPreChunk:

        return dict(iter_kwarg_pairs())

+    @lazyproperty
+    def _orig_elements(self) -> list[Element]:
+        """The `.metadata.orig_elements` value for chunks formed from this pre-chunk."""
+
+        def iter_orig_elements():
+            for e in self._elements:
+                if e.metadata.orig_elements is None:
+                    yield e
+                    continue
+                # -- copy any element we're going to mutate; it belongs to the caller. This must be
+                # -- a deep copy: a shallow copy shares `.metadata` with `e`, so clearing the field
+                # -- below would also strip `.orig_elements` from the caller's element.
+                orig_element = copy.deepcopy(e)
+                # -- prevent recursive .orig_elements when element is a chunk --
+                orig_element.metadata.orig_elements = None
+                yield orig_element
+
+        return list(iter_orig_elements())
+
    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -378,6 +378,9 @@ class ElementMetadata:
        for field_name in self.DEBUG_FIELD_NAMES:
            meta_dict.pop(field_name, None)

+        # -- remove `.orig_elements` for now as that won't serialize --
+        meta_dict.pop("orig_elements", None)
+
        # -- don't serialize empty lists --
        meta_dict: dict[str, Any] = {
            field_name: value