feat(chunking): add .orig_elements behavior to chunking (#2656)

**Summary** Add the behavior that populates `.metadata.orig_elements` during chunking when instructed to by the `include_orig_elements` option. **Additional Context** The underlying structures that support this, namely the `.metadata.orig_elements` field and the `include_orig_elements` chunking option, were added in immediately preceding PRs. This PR adds the behavior that actually populates that metadata field during chunking when the option is set.
2025-12-07 12:33:11 +00:00 · 2024-03-18 12:27:39 -07:00 · 2024-03-18 12:27:39 -07:00 · 1af41d5f90
commit 1af41d5f90
parent c02cfb89d3
6 changed files with 271 additions and 55 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,8 @@
 ### Features
 * **Chunking populates `.metadata.orig_elements` for each chunk.** This behavior allows access to the text and metadata of the elements that were combined to form each chunk. This can be important, for example, to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. This option is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. This option defaults to `True` so original elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs but will be added to the other `unstructured` repositories in a closely following PR. The original elements also do not yet serialize or deserialize; this too will be added in a closely following PR.
 ### Fixes
 * **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@ -352,7 +352,30 @@ class DescribeTablePreChunk:
        with pytest.raises(StopIteration):
            next(chunk_iter)
-    def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
+    def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
        table = Table("foo bar", metadata=ElementMetadata(text_as_html="<table>foo bar</table>"))
        opts = ChunkingOptions(include_orig_elements=True)
        pre_chunk = TablePreChunk(table, "", opts)
        chunk_iter = pre_chunk.iter_chunks()
        chunk = next(chunk_iter)
        assert isinstance(chunk, Table)
        assert chunk.metadata.orig_elements == [table]
        assert chunk.metadata.text_as_html == "<table>foo bar</table>"
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
    def but_not_when_instructed_not_to(self):
        """The chunk carries no original elements when `include_orig_elements` is False."""
        opts = ChunkingOptions(include_orig_elements=False)
        chunks = TablePreChunk(Table("foobar"), "", opts).iter_chunks()
        first_chunk = next(chunks)
        assert isinstance(first_chunk, Table)
        assert first_chunk.metadata.orig_elements is None
    def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
        # fixed-overhead = 8+8+9+8+9+8 = 50
        # per-row overhead = 27
        html_table = (
@ -398,6 +421,7 @@ class DescribeTablePreChunk:
            "<tbody>\n"
            "<tr><td>Lo"
        )
        assert not chunk.metadata.is_continuation
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
@ -408,6 +432,7 @@ class DescribeTablePreChunk:
            "rem ipsum    </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur    </td><td>adipiscing elit</td><"
        )
        assert chunk.metadata.is_continuation
        # -- note that text runs out but HTML continues because it's significantly longer. So two
        # -- of these chunks have HTML but no text.
        chunk = next(chunk_iter)
@ -418,6 +443,7 @@ class DescribeTablePreChunk:
            "<tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis   </td><td>"
        )
        assert chunk.metadata.is_continuation
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
@ -425,10 +451,34 @@ class DescribeTablePreChunk:
        assert chunk.metadata.text_as_html == (
            "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
        )
        assert chunk.metadata.is_continuation
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
    def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
        """Text and HTML are split across chunks but the orig_elements metadata is not."""
        table = Table(
            "Header Col 1   Header Col 2\nLorem ipsum   dolor sit amet",
            metadata=ElementMetadata(text_as_html="<table/>"),
        )
        pre_chunk = TablePreChunk(
            table,
            overlap_prefix="",
            opts=ChunkingOptions(max_characters=30, include_orig_elements=True),
        )
        chunks = pre_chunk.iter_chunks()
        # -- (expected text, whether the chunk is marked as a continuation), in chunk order --
        expectations = (
            ("Header Col 1   Header Col 2", False),
            ("Lorem ipsum   dolor sit amet", True),
        )
        for expected_text, expected_continuation in expectations:
            chunk = next(chunks)
            assert isinstance(chunk, TableChunk)
            assert chunk.text == expected_text
            # -- every chunk refers to the complete, unsplit table --
            assert chunk.metadata.orig_elements == [table]
            assert bool(chunk.metadata.is_continuation) is expected_continuation
    @pytest.mark.parametrize(
        ("text", "expected_value"),
        [
@ -469,6 +519,50 @@ class DescribeTablePreChunk:
        )
        assert pre_chunk._text == expected_value
    def it_computes_metadata_for_each_chunk_to_help(self):
        """`._metadata` carries the table's HTML and orig-elements, fresh on each access."""
        table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
        first_metadata = pre_chunk._metadata
        assert first_metadata.text_as_html == "<table/>"
        # -- `include_orig_elements` is left at its default here, which includes the table --
        assert first_metadata.orig_elements == [table]
        # -- each access builds a distinct instance, so mutating one chunk's metadata cannot
        # -- change that of any other chunk.
        second_metadata = pre_chunk._metadata
        assert second_metadata is not first_metadata
    def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
        """`._metadata` has no orig-elements when `include_orig_elements` is False."""
        table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
        opts = ChunkingOptions(include_orig_elements=False)
        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)
        metadata = pre_chunk._metadata
        assert metadata.orig_elements is None
    def it_computes_the_original_elements_list_to_help(self):
        """`._orig_elements` is a one-item list holding a stripped copy of the table."""
        table_metadata = ElementMetadata(
            text_as_html="<table/>", orig_elements=[Table("Lorem Ipsum")]
        )
        table = Table("Lorem ipsum", metadata=table_metadata)
        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
        computed = pre_chunk._orig_elements
        # -- a TablePreChunk always has exactly one original (Table) element --
        assert len(computed) == 1
        (table_copy,) = computed
        # -- the item is an equal-but-distinct copy, so it can be mutated without changing the
        # -- user's data.
        assert table_copy == table
        assert table_copy is not table
        # -- the copy has its own `.metadata.orig_elements` removed to prevent a recursive data
        # -- structure
        assert table_copy.metadata.orig_elements is None
        # -- the list is computed once; later accesses return the very same object --
        assert pre_chunk._orig_elements is computed
 class DescribeTextPreChunk:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
@ -599,17 +693,15 @@ class DescribeTextPreChunk:
        )
    def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
-        pre_chunk = TextPreChunk(
+        elements = [
-            [
+            Title("Introduction"),
-                Title("Introduction"),
+            Text(
-                Text(
+                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
-                    "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+                " lectus porta volutpat.",
-                    " lectus porta volutpat.",
+            ),
-                ),
+        ]
-            ],
+        opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
-            overlap_prefix="e feugiat efficitur.",
+        pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)
            opts=ChunkingOptions(max_characters=200),
        )
        chunk_iter = pre_chunk.iter_chunks()
@ -619,25 +711,31 @@ class DescribeTextPreChunk:
            " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata
        assert chunk.metadata.orig_elements == elements
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
    def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
        # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
        # -- The pre-chunker will isolate that element in a pre_chunk of its own.
-        pre_chunk = TextPreChunk(
+        elements = [
-            [
+            Text(
-                Text(
+                "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
-                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
+                " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
-                    " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
+                " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
-                    " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
+                " commodo consequat."
-                    " commodo consequat."
+            )
-                ),
+        ]
-            ],
+        opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
-            overlap_prefix="",
+        pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)
            opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
        )
        chunk_iter = pre_chunk.iter_chunks()
        # -- Note that `.metadata.orig_elements` is the same single original element, "repeated"
        # -- for each text-split chunk. This behavior emerges without explicit command as a
        # -- consequence of using `._consolidated_metadata` (and `._continuation_metadata`, which
        # -- extends `._consolidated_metadata`) for each text-split chunk.
        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
@ -645,10 +743,12 @@ class DescribeTextPreChunk:
            " veniam, quis nostrud exercitation ullamco laboris nisi ut"
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata
        assert chunk.metadata.orig_elements == elements
        # --
        chunk = next(chunk_iter)
        assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
        assert chunk.metadata is pre_chunk._continuation_metadata
        assert chunk.metadata.orig_elements == elements
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
@ -762,6 +862,23 @@ class DescribeTextPreChunk:
            "parent_id": ["f87731e0"],
        }
    def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
        """Consolidated metadata holds the pre-chunk's own element instances."""
        shared_metadata = ElementMetadata(filename="foo.pdf")
        title = Title("Lorem Ipsum", metadata=shared_metadata)
        text = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=shared_metadata)
        pre_chunk = TextPreChunk(
            [title, text], overlap_prefix="", opts=ChunkingOptions(include_orig_elements=True)
        )
        orig_elements = pre_chunk._consolidated_metadata.orig_elements
        # -- pre-chunk elements are included as metadata --
        assert orig_elements is not None
        assert orig_elements == [title, text]
        # -- and they are the exact instances, not copies --
        for actual, expected in zip(orig_elements, (title, text)):
            assert actual is expected
    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
        """regex_metadata of chunk is combined regex_metadatas of its elements.
@ -868,6 +985,32 @@ class DescribeTextPreChunk:
            },
        }
    def it_computes_the_original_elements_list_to_help(self):
        """`._orig_elements` lists every element, stripping orig-elements from prior chunks."""
        title = Title("Introduction")
        text = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
        # -- an element that is itself a chunk: it already carries orig-elements of its own --
        prior_chunk = CompositeElement(
            "In rhoncus ipsum sed lectus porta volutpat.",
            metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
        )
        opts = ChunkingOptions(include_orig_elements=True)
        pre_chunk = TextPreChunk([title, text, prior_chunk], overlap_prefix="", opts=opts)
        computed = pre_chunk._orig_elements
        # -- all elements of pre-chunk are included --
        assert computed == [title, text, prior_chunk]
        # -- an element without orig-elements is passed through as the same instance --
        assert computed[0] is title
        # -- an element that has orig-elements is replaced by a copy whose
        # -- `.metadata.orig_elements` is removed, so orig_elements never nest.
        assert computed[2] is not prior_chunk
        assert computed[2].metadata.orig_elements is None
        # -- the list is computed once; later accesses return the very same object --
        assert pre_chunk._orig_elements is computed
    @pytest.mark.parametrize(
        ("elements", "overlap_prefix", "expected_value"),
        [
--- a/test_unstructured/chunking/test_basic.py
+++ b/test_unstructured/chunking/test_basic.py
@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them():
    ]
 def test_it_includes_original_elements_as_metadata_when_requested():
    """Each chunk's `.metadata.orig_elements` lists the elements combined to form it."""
    title = Title("Introduction")
    first_text = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    second_text = Text("In rhoncus ipsum sed lectus porta volutpat.")
    elements = [title, first_text, second_text]
    chunks = chunk_elements(elements, max_characters=70, include_orig_elements=True)
    assert len(chunks) == 2
    first_chunk, second_chunk = chunks[0], chunks[1]
    # -- the title and the first text fit together in the 70-character window --
    assert first_chunk == CompositeElement(
        "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
    )
    assert first_chunk.metadata.orig_elements == [title, first_text]
    # -- the remaining text forms a chunk of its own --
    assert second_chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
    assert second_chunk.metadata.orig_elements == [second_text]
 # ------------------------------------------------------------------------------------------------
 # UNIT TESTS
 # ------------------------------------------------------------------------------------------------
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@ -9,12 +9,7 @@ from typing import Any, Optional
 import pytest
 from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
-from unstructured.chunking.base import (
+from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
    CHUNK_MULTI_PAGE_DEFAULT,
    PreChunker,
    TablePreChunk,
    TextPreChunk,
 )
 from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
 from unstructured.documents.coordinates import CoordinateSystem
 from unstructured.documents.elements import (
@ -57,7 +52,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
    ]
-def test_split_elements_by_title_and_table():
+def test_it_splits_elements_by_title_and_table():
    elements: list[Element] = [
        Title("A Great Day"),
        Text("Today is a great day."),
@ -72,39 +67,38 @@ def test_split_elements_by_title_and_table():
        CheckBox(),
    ]
-    pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)
-    pre_chunk = next(pre_chunks)
+    assert len(chunks) == 4
-    assert isinstance(pre_chunk, TextPreChunk)
+    # --
-    assert pre_chunk._elements == [
+    chunk = chunks[0]
    assert isinstance(chunk, CompositeElement)
    assert chunk.metadata.orig_elements == [
        Title("A Great Day"),
        Text("Today is a great day."),
        Text("It is sunny outside."),
    ]
    # --
-    pre_chunk = next(pre_chunks)
+    chunk = chunks[1]
-    assert isinstance(pre_chunk, TablePreChunk)
+    assert isinstance(chunk, Table)
-    assert pre_chunk._table == Table("Heading\nCell text")
+    assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
    # ==
-    pre_chunk = next(pre_chunks)
+    chunk = chunks[2]
-    assert isinstance(pre_chunk, TextPreChunk)
+    assert isinstance(chunk, CompositeElement)
-    assert pre_chunk._elements == [
+    assert chunk.metadata.orig_elements == [
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
    ]
    # --
-    pre_chunk = next(pre_chunks)
+    chunk = chunks[3]
-    assert isinstance(pre_chunk, TextPreChunk)
+    assert isinstance(chunk, CompositeElement)
-    assert pre_chunk._elements == [
+    assert chunk.metadata.orig_elements == [
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]
    # --
    with pytest.raises(StopIteration):
        next(pre_chunks)
 def test_chunk_by_title():
@ -127,7 +121,7 @@ def test_chunk_by_title():
        CheckBox(),
    ]
-    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False)
    assert chunks == [
        CompositeElement(
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@ -447,10 +447,10 @@ class TablePreChunk:
        text_remainder = self._text
        html_remainder = self._table.metadata.text_as_html or ""
-        # -- only chunk a table when it's too big to swallow whole --
+        # -- only text-split a table when it's longer than the chunking window --
        if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
            # -- but the overlap-prefix must be added to its text --
-            yield Table(text=text_remainder, metadata=copy.deepcopy(self._table.metadata))
+            yield Table(text=text_remainder, metadata=self._metadata)
            return
        split = self._opts.split
@ -459,19 +459,19 @@ class TablePreChunk:
        while text_remainder or html_remainder:
            # -- split off the next chunk-worth of characters into a TableChunk --
            chunk_text, text_remainder = split(text_remainder)
-            table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))
+            metadata = self._metadata
            # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
            # -- HTML elements that *correspond* to the TextChunk.text fragment.
            if html_remainder:
                chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
-                table_chunk.metadata.text_as_html = chunk_html
+                metadata.text_as_html = chunk_html
            # -- mark second and later chunks as a continuation --
            if is_continuation:
-                table_chunk.metadata.is_continuation = True
+                metadata.is_continuation = True
-            yield table_chunk
+            yield TableChunk(text=chunk_text, metadata=metadata)
            is_continuation = True
@ -486,6 +486,37 @@ class TablePreChunk:
        overlap = self._opts.inter_chunk_overlap
        return self._text[-overlap:].strip() if overlap else ""
    @property
    def _metadata(self) -> ElementMetadata:
        """The base `.metadata` value for chunks formed from this pre-chunk.

        The term "base" here means that other metadata fields will be added, depending on the
        chunk. In particular, `.metadata.text_as_html` will be different for each text-split chunk
        and `.metadata.is_continuation` must be added for second-and-later text-split chunks.

        `.orig_elements` is set to this pre-chunk's original-elements list when the
        `include_orig_elements` option is set and is cleared otherwise.

        Note this is a fresh copy of the metadata on each call since it will need to be mutated
        differently for each chunk formed from this pre-chunk.
        """
        metadata = copy.deepcopy(self._table.metadata)
        # -- The deep-copy carries over any `.orig_elements` the table already had (the table can
        # -- itself be a chunk), so always overwrite the field: the option alone decides whether
        # -- the chunk gets original elements, never stale ones inherited from a prior chunking.
        metadata.orig_elements = self._orig_elements if self._opts.include_orig_elements else None
        return metadata
    @lazyproperty
    def _orig_elements(self) -> list[Element]:
        """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.

        This is a one-item list holding a deep copy of the `Table` element, not the element
        itself. The copy has its own `.metadata.orig_elements` removed, which matters when the
        table is itself a chunk rather than a direct product of partitioning.
        """
        # -- work on a deep copy: the `Table` belongs to the caller, who may have downstream
        # -- purposes for it, and the next line mutates its metadata.
        table_copy = copy.deepcopy(self._table)
        # -- a table that is itself a chunk must not nest orig_elements within orig_elements --
        table_copy.metadata.orig_elements = None
        return [table_copy]
    @lazyproperty
    def _text(self) -> str:
        """The text for this chunk, including the overlap-prefix when present."""
@ -615,7 +646,10 @@ class TextPreChunk:
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
-        return ElementMetadata(**self._meta_kwargs)
+        consolidated_metadata = ElementMetadata(**self._meta_kwargs)
        if self._opts.include_orig_elements:
            consolidated_metadata.orig_elements = self._orig_elements
        return consolidated_metadata
    @lazyproperty
    def _continuation_metadata(self) -> ElementMetadata:
@ -717,6 +751,25 @@ class TextPreChunk:
        return dict(iter_kwarg_pairs())
    @lazyproperty
    def _orig_elements(self) -> list[Element]:
        """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.

        Elements that have no `.metadata.orig_elements` are included as-is (the same instances).
        An element that does have orig-elements of its own (it is itself a chunk) is replaced by
        a copy whose `.metadata.orig_elements` is removed, which prevents a recursive data
        structure. The caller's element and its metadata are left unmodified.
        """

        def without_orig_elements(element):
            """Return a copy of `element` whose metadata has no `.orig_elements`."""
            # -- These elements don't belong to us (the user may have downstream purposes for
            # -- them), so nothing reachable from the original may be mutated. A shallow copy of
            # -- the element alone is not enough because it would share the `.metadata` object
            # -- with the original; the metadata gets its own shallow copy before the change.
            stripped = copy.copy(element)
            stripped.metadata = copy.copy(element.metadata)
            stripped.metadata.orig_elements = None
            return stripped

        return [
            e if e.metadata.orig_elements is None else without_orig_elements(e)
            for e in self._elements
        ]
    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -378,6 +378,9 @@ class ElementMetadata:
        for field_name in self.DEBUG_FIELD_NAMES:
            meta_dict.pop(field_name, None)
        # -- remove `.orig_elements` for now as that won't serialize --
        meta_dict.pop("orig_elements", None)
        # -- don't serialize empty lists --
        meta_dict: dict[str, Any] = {
            field_name: value