From 1af41d5f90e7b7ac0cfe588f3aa6bd545ff13f70 Mon Sep 17 00:00:00 2001
From: Steve Canny <stcanny@gmail.com>
Date: Mon, 18 Mar 2024 12:27:39 -0700
Subject: [PATCH] feat(chunking): add .orig_elements behavior to chunking
 (#2656)

**Summary**
Add the actual behavior to populate `.metadata.orig_elements` during
chunking, when so instructed by the `include_orig_elements` option.

**Additional Context**
The underlying structures to support this, namely the
`.metadata.orig_elements` field and the `include_orig_elements` chunking
option, were added in immediately preceding PRs. This PR adds the behavior
to actually populate that metadata field during chunking when the option
is set.
---
 CHANGELOG.md                             |   2 +
 test_unstructured/chunking/test_base.py  | 191 ++++++++++++++++++++---
 test_unstructured/chunking/test_basic.py |  21 +++
 test_unstructured/chunking/test_title.py |  42 +++--
 unstructured/chunking/base.py            |  67 +++++++-
 unstructured/documents/elements.py       |   3 +
 6 files changed, 271 insertions(+), 55 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd4bf4177..a09b57c57 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@
 
 ### Features
 
+* **Chunking populates `.metadata.orig_elements` for each chunk.** This gives access to the text and metadata of the original elements that were combined to form each chunk. This can be important, for example, to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. This behavior is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. The parameter defaults to `True`, so original elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs but will be added in an upcoming PR to the other `unstructured` repositories. The original elements also do not serialize or deserialize yet; that will likewise be added in an upcoming PR.
+
 ### Fixes
 
 * **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index ab16e74f4..dd9a68fa7 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -352,7 +352,30 @@ class DescribeTablePreChunk:
         with pytest.raises(StopIteration):
             next(chunk_iter)
 
-    def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
+    def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
+        table = Table("foo bar", metadata=ElementMetadata(text_as_html="<table>foo bar</table>"))
+        opts = ChunkingOptions(include_orig_elements=True)
+        pre_chunk = TablePreChunk(table, "", opts)
+
+        chunk_iter = pre_chunk.iter_chunks()
+
+        chunk = next(chunk_iter)
+        assert isinstance(chunk, Table)
+        assert chunk.metadata.orig_elements == [table]
+        assert chunk.metadata.text_as_html == "<table>foo bar</table>"
+        # --
+        with pytest.raises(StopIteration):
+            next(chunk_iter)
+
+    def but_not_when_instructed_not_to(self):
+        pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False))
+
+        chunk = next(pre_chunk.iter_chunks())
+
+        assert isinstance(chunk, Table)
+        assert chunk.metadata.orig_elements is None
+
+    def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
         # fixed-overhead = 8+8+9+8+9+8 = 50
         # per-row overhead = 27
         html_table = (
@@ -398,6 +421,7 @@ class DescribeTablePreChunk:
             "<tbody>\n"
             "<tr><td>Lo"
         )
+        assert not chunk.metadata.is_continuation
         # --
         chunk = next(chunk_iter)
         assert isinstance(chunk, TableChunk)
@@ -408,6 +432,7 @@ class DescribeTablePreChunk:
             "rem ipsum    </td><td>A Link example</td></tr>\n"
             "<tr><td>Consectetur    </td><td>adipiscing elit</td><"
         )
+        assert chunk.metadata.is_continuation
         # -- note that text runs out but HTML continues because it's significantly longer. So two
         # -- of these chunks have HTML but no text.
         chunk = next(chunk_iter)
@@ -418,6 +443,7 @@ class DescribeTablePreChunk:
             "<tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>\n"
             "<tr><td>Vivamus quis   </td><td>"
         )
+        assert chunk.metadata.is_continuation
         # --
         chunk = next(chunk_iter)
         assert isinstance(chunk, TableChunk)
@@ -425,10 +451,34 @@ class DescribeTablePreChunk:
         assert chunk.metadata.text_as_html == (
             "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
         )
+        assert chunk.metadata.is_continuation
         # --
         with pytest.raises(StopIteration):
             next(chunk_iter)
 
+    def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
+        """Even though text and html are split, the orig_elements metadata is not."""
+        table = Table(
+            "Header Col 1   Header Col 2\nLorem ipsum   dolor sit amet",
+            metadata=ElementMetadata(text_as_html="<table/>"),
+        )
+        opts = ChunkingOptions(max_characters=30, include_orig_elements=True)
+        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)
+
+        chunk_iter = pre_chunk.iter_chunks()
+
+        chunk = next(chunk_iter)
+        assert isinstance(chunk, TableChunk)
+        assert chunk.text == "Header Col 1   Header Col 2"
+        assert chunk.metadata.orig_elements == [table]
+        assert not chunk.metadata.is_continuation
+        # --
+        chunk = next(chunk_iter)
+        assert isinstance(chunk, TableChunk)
+        assert chunk.text == "Lorem ipsum   dolor sit amet"
+        assert chunk.metadata.orig_elements == [table]
+        assert chunk.metadata.is_continuation
+
     @pytest.mark.parametrize(
         ("text", "expected_value"),
         [
@@ -469,6 +519,50 @@ class DescribeTablePreChunk:
         )
         assert pre_chunk._text == expected_value
 
+    def it_computes_metadata_for_each_chunk_to_help(self):
+        table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
+        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
+
+        metadata = pre_chunk._metadata
+
+        assert metadata.text_as_html == "<table/>"
+        # -- opts.include_orig_elements is True by default --
+        assert metadata.orig_elements == [table]
+        # -- it produces a new instance each time it is called so changing one chunk's metadata does
+        # -- not change that of any other chunk.
+        assert pre_chunk._metadata is not metadata
+
+    def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
+        pre_chunk = TablePreChunk(
+            Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>")),
+            overlap_prefix="",
+            opts=ChunkingOptions(include_orig_elements=False),
+        )
+
+        assert pre_chunk._metadata.orig_elements is None
+
+    def it_computes_the_original_elements_list_to_help(self):
+        table = Table(
+            "Lorem ipsum",
+            metadata=ElementMetadata(text_as_html="<table/>", orig_elements=[Table("Lorem Ipsum")]),
+        )
+        pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
+
+        orig_elements = pre_chunk._orig_elements
+
+        # -- a TablePreChunk always has exactly one original (Table) element --
+        assert len(orig_elements) == 1
+        orig_element = orig_elements[0]
+        # -- each item in orig_elements is a copy of the original element so we can mutate it
+        # -- without changing user's data.
+        assert orig_element == table
+        assert orig_element is not table
+        # -- it strips any .metadata.orig_elements from each element to prevent a recursive data
+        # -- structure
+        assert orig_element.metadata.orig_elements is None
+        # -- computation is only on first call, all chunks get exactly the same orig-elements --
+        assert pre_chunk._orig_elements is orig_elements
+
 
 class DescribeTextPreChunk:
     """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
@@ -599,17 +693,15 @@ class DescribeTextPreChunk:
         )
 
     def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
-        pre_chunk = TextPreChunk(
-            [
-                Title("Introduction"),
-                Text(
-                    "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
-                    " lectus porta volutpat.",
-                ),
-            ],
-            overlap_prefix="e feugiat efficitur.",
-            opts=ChunkingOptions(max_characters=200),
-        )
+        elements = [
+            Title("Introduction"),
+            Text(
+                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+                " lectus porta volutpat.",
+            ),
+        ]
+        opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
+        pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)
 
         chunk_iter = pre_chunk.iter_chunks()
 
@@ -619,25 +711,31 @@ class DescribeTextPreChunk:
             " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
         )
         assert chunk.metadata is pre_chunk._consolidated_metadata
+        assert chunk.metadata.orig_elements == elements
+        # --
+        with pytest.raises(StopIteration):
+            next(chunk_iter)
 
     def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
         # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
         # -- The pre-chunker will isolate that element in a pre_chunk of its own.
-        pre_chunk = TextPreChunk(
-            [
-                Text(
-                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
-                    " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
-                    " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
-                    " commodo consequat."
-                ),
-            ],
-            overlap_prefix="",
-            opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
-        )
+        elements = [
+            Text(
+                "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
+                " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
+                " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
+                " commodo consequat."
+            )
+        ]
+        opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
+        pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)
 
         chunk_iter = pre_chunk.iter_chunks()
 
+        # -- Note that .metadata.orig_elements is the same single original element, "repeated" for
+        # -- each text-split chunk. This behavior emerges without explicit command as a consequence
+        # -- of using `._consolidated_metadata` (and `._continuation_metadata` which extends
+        # -- `._consolidated_metadata`) for each text-split chunk.
         chunk = next(chunk_iter)
         assert chunk == CompositeElement(
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
@@ -645,10 +743,12 @@ class DescribeTextPreChunk:
             " veniam, quis nostrud exercitation ullamco laboris nisi ut"
         )
         assert chunk.metadata is pre_chunk._consolidated_metadata
+        assert chunk.metadata.orig_elements == elements
         # --
         chunk = next(chunk_iter)
         assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
         assert chunk.metadata is pre_chunk._continuation_metadata
+        assert chunk.metadata.orig_elements == elements
         # --
         with pytest.raises(StopIteration):
             next(chunk_iter)
@@ -762,6 +862,23 @@ class DescribeTextPreChunk:
             "parent_id": ["f87731e0"],
         }
 
+    def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
+        opts = ChunkingOptions(include_orig_elements=True)
+        metadata = ElementMetadata(filename="foo.pdf")
+        element = Title("Lorem Ipsum", metadata=metadata)
+        element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)
+        pre_chunk = TextPreChunk([element, element_2], overlap_prefix="", opts=opts)
+
+        consolidated_metadata = pre_chunk._consolidated_metadata
+
+        # -- pre-chunk elements are included as metadata --
+        orig_elements = consolidated_metadata.orig_elements
+        assert orig_elements is not None
+        assert orig_elements == [element, element_2]
+        # -- and they are the exact instances, not copies --
+        assert orig_elements[0] is element
+        assert orig_elements[1] is element_2
+
     def it_consolidates_regex_metadata_in_a_field_specific_way(self):
         """regex_metadata of chunk is combined regex_metadatas of its elements.
 
@@ -868,6 +985,32 @@ class DescribeTextPreChunk:
             },
         }
 
+    def it_computes_the_original_elements_list_to_help(self):
+        element = Title("Introduction")
+        element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
+        element_3 = CompositeElement(
+            "In rhoncus ipsum sed lectus porta volutpat.",
+            metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
+        )
+        pre_chunk = TextPreChunk(
+            [element, element_2, element_3],
+            overlap_prefix="",
+            opts=ChunkingOptions(include_orig_elements=True),
+        )
+
+        orig_elements = pre_chunk._orig_elements
+
+        # -- all elements of pre-chunk are included --
+        assert orig_elements == [element, element_2, element_3]
+        # -- orig_elements that are chunks (having orig-elements of their own) are copied and the
+        # -- copy is stripped of its `.metadata.orig_elements` to prevent a recursive data
+        # -- structure that nests orig_elements within orig_elements.
+        assert orig_elements[0] is element
+        assert orig_elements[2] is not element_3
+        assert orig_elements[2].metadata.orig_elements is None
+        # -- computation is only on first call, all chunks get exactly the same orig-elements --
+        assert pre_chunk._orig_elements is orig_elements
+
     @pytest.mark.parametrize(
         ("elements", "overlap_prefix", "expected_value"),
         [
diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py
index b5f5adf97..85f807b6f 100644
--- a/test_unstructured/chunking/test_basic.py
+++ b/test_unstructured/chunking/test_basic.py
@@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them():
     ]
 
 
+def test_it_includes_original_elements_as_metadata_when_requested():
+    element = Title("Introduction")
+    element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
+    element_3 = Text("In rhoncus ipsum sed lectus porta volutpat.")
+
+    chunks = chunk_elements(
+        [element, element_2, element_3], max_characters=70, include_orig_elements=True
+    )
+
+    assert len(chunks) == 2
+    chunk = chunks[0]
+    assert chunk == CompositeElement(
+        "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
+    )
+    assert chunk.metadata.orig_elements == [element, element_2]
+    # --
+    chunk = chunks[1]
+    assert chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
+    assert chunk.metadata.orig_elements == [element_3]
+
+
 # ------------------------------------------------------------------------------------------------
 # UNIT TESTS
 # ------------------------------------------------------------------------------------------------
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
index 923d10a89..369588d68 100644
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@@ -9,12 +9,7 @@ from typing import Any, Optional
 import pytest
 
 from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
-from unstructured.chunking.base import (
-    CHUNK_MULTI_PAGE_DEFAULT,
-    PreChunker,
-    TablePreChunk,
-    TextPreChunk,
-)
+from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
 from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
 from unstructured.documents.coordinates import CoordinateSystem
 from unstructured.documents.elements import (
@@ -57,7 +52,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():
     ]
 
 
-def test_split_elements_by_title_and_table():
+def test_it_splits_elements_by_title_and_table():
     elements: list[Element] = [
         Title("A Great Day"),
         Text("Today is a great day."),
@@ -72,39 +67,38 @@ def test_split_elements_by_title_and_table():
         CheckBox(),
     ]
 
-    pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)
 
-    pre_chunk = next(pre_chunks)
-    assert isinstance(pre_chunk, TextPreChunk)
-    assert pre_chunk._elements == [
+    assert len(chunks) == 4
+    # --
+    chunk = chunks[0]
+    assert isinstance(chunk, CompositeElement)
+    assert chunk.metadata.orig_elements == [
         Title("A Great Day"),
         Text("Today is a great day."),
         Text("It is sunny outside."),
     ]
     # --
-    pre_chunk = next(pre_chunks)
-    assert isinstance(pre_chunk, TablePreChunk)
-    assert pre_chunk._table == Table("Heading\nCell text")
+    chunk = chunks[1]
+    assert isinstance(chunk, Table)
+    assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
     # ==
-    pre_chunk = next(pre_chunks)
-    assert isinstance(pre_chunk, TextPreChunk)
-    assert pre_chunk._elements == [
+    chunk = chunks[2]
+    assert isinstance(chunk, CompositeElement)
+    assert chunk.metadata.orig_elements == [
         Title("An Okay Day"),
         Text("Today is an okay day."),
         Text("It is rainy outside."),
     ]
     # --
-    pre_chunk = next(pre_chunks)
-    assert isinstance(pre_chunk, TextPreChunk)
-    assert pre_chunk._elements == [
+    chunk = chunks[3]
+    assert isinstance(chunk, CompositeElement)
+    assert chunk.metadata.orig_elements == [
         Title("A Bad Day"),
         Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
-    # --
-    with pytest.raises(StopIteration):
-        next(pre_chunks)
 
 
 def test_chunk_by_title():
@@ -127,7 +121,7 @@ def test_chunk_by_title():
         CheckBox(),
     ]
 
-    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False)
 
     assert chunks == [
         CompositeElement(
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
index f360f97bf..ec368f85d 100644
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@@ -447,10 +447,10 @@ class TablePreChunk:
         text_remainder = self._text
         html_remainder = self._table.metadata.text_as_html or ""
 
-        # -- only chunk a table when it's too big to swallow whole --
+        # -- only text-split a table when it's longer than the chunking window --
         if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
             # -- but the overlap-prefix must be added to its text --
-            yield Table(text=text_remainder, metadata=copy.deepcopy(self._table.metadata))
+            yield Table(text=text_remainder, metadata=self._metadata)
             return
 
         split = self._opts.split
@@ -459,19 +459,19 @@ class TablePreChunk:
         while text_remainder or html_remainder:
             # -- split off the next chunk-worth of characters into a TableChunk --
             chunk_text, text_remainder = split(text_remainder)
-            table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))
+            metadata = self._metadata
 
             # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
             # -- HTML elements that *correspond* to the TextChunk.text fragment.
             if html_remainder:
                 chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
-                table_chunk.metadata.text_as_html = chunk_html
+                metadata.text_as_html = chunk_html
 
             # -- mark second and later chunks as a continuation --
             if is_continuation:
-                table_chunk.metadata.is_continuation = True
+                metadata.is_continuation = True
 
-            yield table_chunk
+            yield TableChunk(text=chunk_text, metadata=metadata)
 
             is_continuation = True
 
@@ -486,6 +486,37 @@ class TablePreChunk:
         overlap = self._opts.inter_chunk_overlap
         return self._text[-overlap:].strip() if overlap else ""
 
+    @property
+    def _metadata(self) -> ElementMetadata:
+        """The base `.metadata` value for chunks formed from this pre-chunk.
+
+        The term "base" here means that other metadata fields will be added, depending on the chunk.
+        In particular, `.metadata.text_as_html` will be different for each text-split chunk and
+        `.metadata.is_continuation` must be added for second-and-later text-split chunks.
+
+        Note this is a fresh copy of the metadata on each call since it will need to be mutated
+        differently for each chunk formed from this pre-chunk.
+        """
+        metadata = copy.deepcopy(self._table.metadata)
+        if self._opts.include_orig_elements:
+            metadata.orig_elements = self._orig_elements
+        return metadata
+
+    @lazyproperty
+    def _orig_elements(self) -> list[Element]:
+        """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.
+
+        A single-item list holding a deep copy of the `Table` element. This is not the `Table`
+        element itself because any `.metadata.orig_elements` value it may carry (when it is
+        itself a chunk rather than a direct product of partitioning) must be stripped out.
+        """
+        # -- make a copy because we're going to mutate the `Table` element and it doesn't belong to
+        # -- us (the user may have downstream purposes for it).
+        orig_table = copy.deepcopy(self._table)
+        # -- prevent recursive .orig_elements when `Table` element is a chunk --
+        orig_table.metadata.orig_elements = None
+        return [orig_table]
+
     @lazyproperty
     def _text(self) -> str:
         """The text for this chunk, including the overlap-prefix when present."""
@@ -615,7 +646,10 @@ class TextPreChunk:
         to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
         "consolidated".
         """
-        return ElementMetadata(**self._meta_kwargs)
+        consolidated_metadata = ElementMetadata(**self._meta_kwargs)
+        if self._opts.include_orig_elements:
+            consolidated_metadata.orig_elements = self._orig_elements
+        return consolidated_metadata
 
     @lazyproperty
     def _continuation_metadata(self) -> ElementMetadata:
@@ -717,6 +751,25 @@ class TextPreChunk:
 
         return dict(iter_kwarg_pairs())
 
+    @lazyproperty
+    def _orig_elements(self) -> list[Element]:
+        """The `.metadata.orig_elements` value for chunks formed from this pre-chunk."""
+
+        def iter_orig_elements():
+            for e in self._elements:
+                if e.metadata.orig_elements is None:
+                    yield e
+                    continue
+                # -- deep-copy any element we're going to mutate because these elements don't
+                # -- belong to us; a shallow copy would share `.metadata` with the original.
+                orig_element = copy.deepcopy(e)
+                # -- prevent recursive .orig_elements when element is a chunk (has orig-elements of
+                # -- its own)
+                orig_element.metadata.orig_elements = None
+                yield orig_element
+
+        return list(iter_orig_elements())
+
     @lazyproperty
     def _text(self) -> str:
         """The concatenated text of all elements in this pre-chunk.
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index 7cfc5c83f..1a9406be9 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -378,6 +378,9 @@ class ElementMetadata:
         for field_name in self.DEBUG_FIELD_NAMES:
             meta_dict.pop(field_name, None)
 
+        # -- remove `.orig_elements` for now as that won't serialize --
+        meta_dict.pop("orig_elements", None)
+
         # -- don't serialize empty lists --
         meta_dict: dict[str, Any] = {
             field_name: value