feat(chunk): split tables on even row boundaries (#3504)

**Summary**
Use more sophisticated algorithm for splitting oversized `Table`
elements into `TableChunk` elements during chunking to ensure element
text and HTML are "synchronized" and HTML is always parseable.

**Additional Context**
Table splitting now has the following characteristics:
- `TableChunk.metadata.text_as_html` is always a parseable HTML
`<table>` subtree.
- `TableChunk.text` is always the text in the HTML version of the table
fragment in `.metadata.text_as_html`. Text and HTML are "synchronized".
- The table is divided at a whole-row boundary whenever possible.
- A row is broken at an even-cell boundary when a single row is larger
than the chunking window.
- A cell is broken at an even-word boundary when a single cell is larger
than the chunking window.
- `.text_as_html` is "minified", removing all extraneous whitespace and
unneeded elements or attributes. This maximizes the semantic "density"
of each chunk.
This commit is contained in:
Steve Canny 2024-08-19 11:56:53 -07:00 committed by GitHub
parent 99f72d65ba
commit a861ed8fe7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 1003 additions and 140 deletions

View File

@ -1,4 +1,4 @@
## 0.15.6-dev0
## 0.15.6-dev1
### Enhancements
@ -7,6 +7,7 @@
### Fixes
* **Update CI for `ingest-test-fixture-update-pr` to resolve NLTK model download errors.**
* **Synchronized text and html on `TableChunk` splits.** When a `Table` element is divided during chunking to fit the chunking window, `TableChunk.text` corresponds exactly with the table text in `TableChunk.metadata.text_as_html`, `.text_as_html` is always parseable HTML, and the table is split on even row boundaries whenever possible.
## 0.15.5

View File

@ -7,6 +7,7 @@ from __future__ import annotations
from typing import Any, Sequence
import pytest
from lxml.html import fragment_fromstring
from unstructured.chunking.base import (
ChunkingOptions,
@ -16,10 +17,14 @@ from unstructured.chunking.base import (
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
_CellAccumulator,
_RowAccumulator,
_TableSplitter,
_TextSplitter,
is_on_next_page,
is_title,
)
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
from unstructured.documents.elements import (
CheckBox,
CompositeElement,
@ -341,16 +346,21 @@ class DescribePreChunkBuilder:
builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
pre_chunk = list(builder.flush())[0]
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._text == "Lorem ipsum dolor sit amet consectetur adipiscing elit."
builder.add_element(Table("In rhoncus ipsum sed lectus porta volutpat."))
pre_chunk = list(builder.flush())[0]
assert pre_chunk._text == "dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat."
assert isinstance(pre_chunk, TablePreChunk)
assert pre_chunk._text_with_overlap == (
"dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat."
)
builder.add_element(Text("Donec semper facilisis metus finibus."))
pre_chunk = list(builder.flush())[0]
assert isinstance(pre_chunk, TextPreChunk)
assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
@ -386,7 +396,7 @@ class DescribeTablePreChunk:
"</tbody>\n"
"</table>"
)
text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
text_table = "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
pre_chunk = TablePreChunk(
Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
overlap_prefix="ctus porta volutpat.",
@ -401,18 +411,27 @@ class DescribeTablePreChunk:
"ctus porta volutpat.\nHeader Col 1 Header Col 2\nLorem ipsum adipiscing"
)
assert chunk.metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
"</tbody>\n"
"<table>"
"<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
"<tr><td>Lorem ipsum</td><td>adipiscing</td></tr>"
"</table>"
)
with pytest.raises(StopIteration):
next(chunk_iter)
def but_not_when_the_table_is_empty_or_contains_only_whitespace(self):
html_table = "<table><tr><td/><td> \t \n </td></tr></table>"
pre_chunk = TablePreChunk(
Table(" \t \n ", metadata=ElementMetadata(text_as_html=html_table)),
overlap_prefix="volutpat.",
opts=ChunkingOptions(max_characters=175),
)
chunk_iter = pre_chunk.iter_chunks()
with pytest.raises(StopIteration):
next(chunk_iter)
def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
table = Table("foo bar", metadata=ElementMetadata(text_as_html="<table>foo bar</table>"))
opts = ChunkingOptions(include_orig_elements=True)
@ -437,21 +456,18 @@ class DescribeTablePreChunk:
assert chunk.metadata.orig_elements is None
def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
# fixed-overhead = 8+8+9+8+9+8 = 50
# per-row overhead = 27
html_table = (
"<table>\n" # 8
"<thead>\n" # 8
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n" # 9
"<tbody>\n" # 8
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
"<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
"<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
"</tbody>\n" # 9
"</table>" # 8
)
html_table = """\
<table>
<thead>
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
</thead>
<tbody>
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
<tr><td>Consectetur </td><td>adipiscing elit</td></tr>
<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>
</tbody>
</table>
"""
text_table = (
"Header Col 1 Header Col 2\n"
"Lorem ipsum dolor sit amet\n"
@ -469,48 +485,33 @@ class DescribeTablePreChunk:
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == (
"Header Col 1 Header Col 2\n"
"Lorem ipsum dolor sit amet\n"
"Consectetur adipiscing elit"
)
assert chunk.text == "Header Col 1 Header Col 2"
assert chunk.metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lo"
"<table><tr><td>Header Col 1</td><td>Header Col 2</td></tr></table>"
)
assert not chunk.metadata.is_continuation
assert chunk.metadata.is_continuation is None
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == (
"Nunc aliquam id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
)
assert chunk.text == "Lorem ipsum A Link example"
assert chunk.metadata.text_as_html == (
"rem ipsum </td><td>A Link example</td></tr>\n"
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
)
assert chunk.metadata.is_continuation
# -- note that text runs out but HTML continues because it's significantly longer. So two
# -- of these chunks have HTML but no text.
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == ""
assert chunk.metadata.text_as_html == (
"/tr>\n"
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
"<tr><td>Vivamus quis </td><td>"
"<table><tr><td>Lorem ipsum</td><td>A Link example</td></tr></table>"
)
assert chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == ""
assert chunk.text == "Consectetur adipiscing elit"
assert chunk.metadata.text_as_html == (
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
"<table><tr><td>Consectetur</td><td>adipiscing elit</td></tr></table>"
)
assert chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
assert chunk.text == "Nunc aliquam id enim nec molestie"
assert chunk.metadata.text_as_html == (
"<table><tr><td>Nunc aliquam</td><td>id enim nec molestie</td></tr></table>"
)
assert chunk.metadata.is_continuation
# --
@ -545,8 +546,8 @@ class DescribeTablePreChunk:
[
# -- normally it splits exactly on overlap size |------- 20 -------|
("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."),
# -- but it strips leading and trailing whitespace when the tail includes it --
("In rhoncus ipsum sed lectus porta volutpat. ", "porta volutpat."),
# -- but it strips leading whitespace when the tail includes it --
("In rhoncus ipsum sed lectus porta volutpat.", "porta volutpat."),
],
)
def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap(
@ -578,7 +579,7 @@ class DescribeTablePreChunk:
pre_chunk = TablePreChunk(
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
)
assert pre_chunk._text == expected_value
assert pre_chunk._text_with_overlap == expected_value
def it_computes_metadata_for_each_chunk_to_help(self):
table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
@ -659,6 +660,10 @@ class DescribeTextPreChunk:
assert (pre_chunk == other_pre_chunk) is expected_value
def and_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunk(self):
pre_chunk = TextPreChunk([], overlap_prefix="", opts=ChunkingOptions())
assert pre_chunk != 42
@pytest.mark.parametrize(
("max_characters", "combine_text_under_n_chars", "expected_value"),
[
@ -833,6 +838,19 @@ class DescribeTextPreChunk:
assert [c.metadata.is_continuation for c in chunk_iter] == [None, True, True]
def but_it_generates_no_chunks_when_the_pre_chunk_contains_no_text(self):
metadata = ElementMetadata()
pre_chunk = TextPreChunk(
[PageBreak("", metadata=metadata)],
overlap_prefix="",
opts=ChunkingOptions(),
)
chunk_iter = pre_chunk.iter_chunks()
with pytest.raises(StopIteration):
next(chunk_iter)
@pytest.mark.parametrize(
("text", "expected_value"),
[
@ -1098,6 +1116,168 @@ class DescribeTextPreChunk:
# ================================================================================================
class Describe_TableSplitter:
"""Unit-test suite for `unstructured.chunking.base._TableSplitter`."""
def it_splits_an_HTML_table_on_even_rows_when_possible(self):
opts = ChunkingOptions(max_characters=(150))
html_table = HtmlTable.from_html_text(
"""
<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley
Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>
"""
)
assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
(
"Stanley Cups Team Location Stanley Cups",
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"</table>",
),
(
"Blues STL 1 Flyers PHI 2",
"<table>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"</table>",
),
(
"Maple Leafs TOR 13",
"<table>" "<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>" "</table>",
),
]
def and_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possible(self):
opts = ChunkingOptions(max_characters=(100))
html_table = HtmlTable.from_html_text(
"""
<html><body><table>
<tr>
<td>Lorem ipsum dolor sit amet.</td>
<td> Consectetur adipiscing elit. </td>
<td>
Laboris nisi ut
aliquip ex ea commodo.
</td>
</tr>
<tr>
<td>Duis</td>
<td>Dolor</td>
</tr>
<tr>
<td>Duis</td>
<td>Cillum</td>
</tr>
</table></body></html>
"""
)
assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
(
"Lorem ipsum dolor sit amet. Consectetur adipiscing elit.",
"<table><tr>"
"<td>Lorem ipsum dolor sit amet.</td>"
"<td>Consectetur adipiscing elit.</td>"
"</tr></table>",
),
(
"Laboris nisi ut aliquip ex ea commodo.",
"<table><tr><td>Laboris nisi ut aliquip ex ea commodo.</td></tr></table>",
),
(
"Duis Dolor Duis Cillum",
"<table>"
"<tr><td>Duis</td><td>Dolor</td></tr>"
"<tr><td>Duis</td><td>Cillum</td></tr>"
"</table>",
),
]
def and_it_splits_an_oversized_cell_on_an_even_word_boundary(self):
opts = ChunkingOptions(max_characters=(100))
html_table = HtmlTable.from_html_text(
"""
<table>
<thead>
<tr>
<td>
Lorem ipsum dolor sit amet,
consectetur adipiscing elit.
Sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua.
</td>
<td> Ut enim ad minim veniam. </td>
<td> Quis nostrud exercitation ullamco. </td>
</tr>
</thead>
<tbody>
<tr><td>Duis aute irure dolor</td></tr>
<tr><td>In reprehenderit voluptate.</td></tr>
</tbody>
</table>
"""
)
assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do",
"<table>"
"<tr><td>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do</td></tr>"
"</table>",
),
(
"eiusmod tempor incididunt ut labore et dolore magna aliqua.",
"<table>"
"<tr><td>eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr>"
"</table>",
),
(
"Ut enim ad minim veniam. Quis nostrud exercitation ullamco.",
"<table><tr>"
"<td>Ut enim ad minim veniam.</td>"
"<td>Quis nostrud exercitation ullamco.</td>"
"</tr></table>",
),
(
"Duis aute irure dolor In reprehenderit voluptate.",
"<table>"
"<tr><td>Duis aute irure dolor</td></tr>"
"<tr><td>In reprehenderit voluptate.</td></tr>"
"</table>",
),
]
class Describe_TextSplitter:
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
@ -1199,6 +1379,192 @@ class Describe_TextSplitter:
assert remainder == "ipiscing. In rhoncus ipsum sed lectus."
class Describe_CellAccumulator:
"""Unit-test suite for `unstructured.chunking.base._CellAccumulator`."""
def it_is_empty_on_construction(self):
accum = _CellAccumulator(maxlen=100)
assert accum._cells == []
def it_accumulates_elements_added_to_it(self):
td = fragment_fromstring("<td>foobar</td>")
cell = HtmlCell(td)
accum = _CellAccumulator(maxlen=100)
accum.add_cell(cell)
assert accum._cells == [cell]
@pytest.mark.parametrize(
("cell_html", "expected_value"),
[
("<td/>", True),
("<td>Lorem Ipsum.</td>", True),
("<td>Lorem Ipsum dolor sit.</td>", True),
("<td>Lorem Ipsum dolor sit amet.</td>", False),
],
)
def it_will_fit_a_cell_with_text_shorter_than_maxlen_minus_33_when_empty(
self, cell_html: str, expected_value: bool
):
"""Cell text must be 22-chars or shorter to fit in 55-char window.
`<table><tr><td>...</td></tr></table>` overhead is 33 characters.
"""
accum = _CellAccumulator(maxlen=55)
cell = HtmlCell(fragment_fromstring(cell_html))
assert accum.will_fit(cell) is expected_value
@pytest.mark.parametrize(
("cell_html", "expected_value"),
[
("<td/>", True), # -- 0 --
("<td>Lorem Ipsum.</td>", True), # -- 12 --
("<td>Lorem Ipsum amet.</td>", True), # -- 17 --
("<td>Lorem Ipsum dolor.</td>", False), # -- 18 --
("<td>Lorem Ipsum dolor sit amet.</td>", False), # -- 27 --
],
)
def and_it_will_fit_a_cell_with_text_shorter_than_remaining_space_minus_9_when_not_empty(
self, cell_html: str, expected_value: bool
):
"""Cell text must be 9-chars shorter than remaining space to fit with accumulated cells.
`<td>...</td>` overhead is 9 characters.
"""
accum = _CellAccumulator(maxlen=85)
accum.add_cell(HtmlCell(fragment_fromstring("<td>abcdefghijklmnopqrstuvwxyz</td>")))
# -- remaining space is 85 - 26 - 33 = 26; max new cell text len is 17 --
cell = HtmlCell(fragment_fromstring(cell_html))
assert accum.will_fit(cell) is expected_value
def it_generates_a_TextAndHtml_pair_and_resets_itself_to_empty_when_flushed(self):
accum = _CellAccumulator(maxlen=100)
accum.add_cell(HtmlCell(fragment_fromstring("<td>abcde fghij klmno</td>")))
text, html = next(accum.flush())
assert text == "abcde fghij klmno"
assert html == "<table><tr><td>abcde fghij klmno</td></tr></table>"
assert accum._cells == []
def and_the_HTML_contains_as_many_cells_as_were_accumulated(self):
accum = _CellAccumulator(maxlen=100)
accum.add_cell(HtmlCell(fragment_fromstring("<td>abcde fghij klmno</td>")))
accum.add_cell(HtmlCell(fragment_fromstring("<td>pqrst uvwxy z</td>")))
text, html = next(accum.flush())
assert text == "abcde fghij klmno pqrst uvwxy z"
assert html == "<table><tr><td>abcde fghij klmno</td><td>pqrst uvwxy z</td></tr></table>"
assert accum._cells == []
def but_it_does_not_generate_a_TextAndHtml_pair_when_empty(self):
accum = _CellAccumulator(maxlen=100)
with pytest.raises(StopIteration):
next(accum.flush())
class Describe_RowAccumulator:
"""Unit-test suite for `unstructured.chunking.base._RowAccumulator`."""
def it_is_empty_on_construction(self):
accum = _RowAccumulator(maxlen=100)
assert accum._rows == []
def it_accumulates_rows_added_to_it(self):
accum = _RowAccumulator(maxlen=100)
row = HtmlRow(fragment_fromstring("<tr><td>foo</td><td>bar</td></tr>"))
accum.add_row(row)
assert accum._rows == [row]
@pytest.mark.parametrize(
("row_html", "expected_value"),
[
("<tr/>", True), # -- 5 --
("<tr><td/></tr>", True), # -- 14 --
("<tr><td>Lorem Ipsum.</td></tr>", True), # -- 30 --
("<tr><td>Lorem Ipsum dolor sit.</td></tr>", True), # -- 40 --
("<tr><td>Lorem</td><td>Sit amet</td></tr>", True), # -- 40 --
("<tr><td>Lorem Ipsum dolor sit amet.</td></tr>", False), # -- 45 --
("<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>", False), # -- 48 --
],
)
def it_will_fit_a_row_with_HTML_shorter_than_maxlen_minus_15_when_empty(
self, row_html: str, expected_value: bool
):
"""Row HTML must be 40-chars or shorter to fit in 55-char chunking window.
`<table>...</table>` overhead is 15 characters.
"""
accum = _RowAccumulator(maxlen=55)
row = HtmlRow(fragment_fromstring(row_html))
assert accum.will_fit(row) is expected_value
@pytest.mark.parametrize(
("row_html", "expected_value"),
[
("<tr/>", True), # -- 5 --
("<tr><td/></tr>", True), # -- 14 --
("<tr><td>Lorem Ipsum dolor sit</td></tr>", True), # -- 39 --
("<tr><td>Lorem Ipsum dolor sit.</td></tr>", True), # -- 40 --
("<tr><td>Lorem</td><td>Sit amet</td></tr>", True), # -- 40 --
("<tr><td>Lorem</td><td>Sit amet.</td></tr>", False), # -- 41 --
("<tr><td>Lorem Ipsum</td><td>Dolor sit.</td></tr>", False), # -- 48 --
],
)
def and_it_will_fit_a_row_with_HTML_shorter_than_remaining_space_when_not_empty(
self, row_html: str, expected_value: bool
):
"""There is no overhead beyond row HTML for additional rows."""
accum = _RowAccumulator(maxlen=99)
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>abcdefghijklmnopqrstuvwxyz</td></tr>")))
# -- remaining space is 99 - 15 - 44 = 40; max new row HTML len is 40 --
row = HtmlRow(fragment_fromstring(row_html))
assert accum.will_fit(row) is expected_value
def it_generates_a_TextAndHtml_pair_and_resets_itself_to_empty_when_flushed(self):
accum = _RowAccumulator(maxlen=100)
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>abcde fghij klmno</td></tr>")))
text, html = next(accum.flush())
assert text == "abcde fghij klmno"
assert html == "<table><tr><td>abcde fghij klmno</td></tr></table>"
assert accum._rows == []
def and_the_HTML_contains_as_many_rows_as_were_accumulated(self):
accum = _RowAccumulator(maxlen=100)
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>abcde fghij klmno</td></tr>")))
accum.add_row(HtmlRow(fragment_fromstring("<tr><td>pqrst uvwxy z</td></tr>")))
text, html = next(accum.flush())
assert text == "abcde fghij klmno pqrst uvwxy z"
assert html == (
"<table>"
"<tr><td>abcde fghij klmno</td></tr>"
"<tr><td>pqrst uvwxy z</td></tr>"
"</table>"
)
assert accum._rows == []
def but_it_does_not_generate_a_TextAndHtml_pair_when_empty(self):
accum = _RowAccumulator(maxlen=100)
with pytest.raises(StopIteration):
next(accum.flush())
# ================================================================================================
# PRE-CHUNK COMBINER
# ================================================================================================

View File

@ -1,8 +1,18 @@
# pyright: reportPrivateUsage=false
"""Unit-test suite for the `unstructured.common.html_table` module."""
from __future__ import annotations
from unstructured.common.html_table import htmlify_matrix_of_cell_texts
import pytest
from lxml.html import fragment_fromstring
from unstructured.common.html_table import (
HtmlCell,
HtmlRow,
HtmlTable,
htmlify_matrix_of_cell_texts,
)
class Describe_htmlify_matrix_of_cell_texts:
@ -11,8 +21,8 @@ class Describe_htmlify_matrix_of_cell_texts:
def test_htmlify_matrix_handles_empty_cells(self):
assert htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
"<table>"
"<tr><td>cell1</td><td></td><td>cell3</td></tr>"
"<tr><td></td><td>cell5</td><td></td></tr>"
"<tr><td>cell1</td><td/><td>cell3</td></tr>"
"<tr><td/><td>cell5</td><td/></tr>"
"</table>"
)
@ -31,3 +41,163 @@ class Describe_htmlify_matrix_of_cell_texts:
def test_htmlify_matrix_handles_empty_matrix(self):
assert htmlify_matrix_of_cell_texts([]) == ""
class DescribeHtmlTable:
"""Unit-test suite for `unstructured.common.html_table.HtmlTable`."""
def it_can_construct_from_html_text(self):
html_table = HtmlTable.from_html_text("<table><tr><td>foobar</td></tr></table>")
assert isinstance(html_table, HtmlTable)
assert html_table._table.tag == "table"
@pytest.mark.parametrize(
"html_text",
[
"<table><tr><td>foobar</td></tr></table>",
"<body><table><tr><td>foobar</td></tr></table></body>",
"<html><body><table><tr><td>foobar</td></tr></table></body></html>",
],
)
def it_can_find_a_table_wrapped_in_an_html_or_body_element(self, html_text: str):
html_table = HtmlTable.from_html_text(html_text)
assert isinstance(html_table, HtmlTable)
assert html_table._table.tag == "table"
def but_it_raises_when_no_table_element_is_present_in_the_html(self):
with pytest.raises(ValueError, match="`html_text` contains no `<table>` element"):
HtmlTable.from_html_text("<html><body><tr><td>foobar</td></tr></body></html>")
def it_removes_any_attributes_present_on_the_table_element(self):
html_table = HtmlTable.from_html_text(
'<table border="1" class="foobar"><tr><td>foobar</td></tr></table>',
)
assert html_table.html == "<table><tr><td>foobar</td></tr></table>"
@pytest.mark.parametrize(
"html_text",
[
"<table><thead><tr><td>foobar</td></tr></thead></table>",
"<table><thead><tr><td>foobar</td></tr></thead><tbody></tbody></table>",
"<table><tbody><tr><td>foobar</td></tr></tbody><tfoot></tfoot></table>",
],
)
def it_removes_any_thead_tbody_or_tfoot_elements_present_within_the_table_element(
self, html_text: str
):
html_table = HtmlTable.from_html_text(html_text)
assert html_table.html == "<table><tr><td>foobar</td></tr></table>"
def it_changes_any_th_elements_to_td_elements_for_cell_element_uniformity(self):
html_table = HtmlTable.from_html_text(
"<table>"
" <tr><th>a</th><th/><th>b</th></tr>"
" <tr><td/><td>c</td><td/></tr>"
"</table>"
)
assert html_table.html == (
"<table><tr><td>a</td><td/><td>b</td></tr><tr><td/><td>c</td><td/></tr></table>"
)
def it_removes_any_extra_whitespace_between_elements_and_normalizes_whitespace_in_text(self):
html_table = HtmlTable.from_html_text(
"\n <table>\n <tr>\n <td>\tabc def\nghi </td>\n </tr>\n</table>\n ",
)
assert html_table.html == "<table><tr><td>abc def ghi</td></tr></table>"
def it_can_serialize_the_table_element_to_str_html_text(self):
table = fragment_fromstring("<table><tr><td>foobar</td></tr></table>")
html_table = HtmlTable(table)
assert html_table.html == "<table><tr><td>foobar</td></tr></table>"
def it_can_iterate_the_rows_in_the_table(self):
html_table = HtmlTable.from_html_text(
"<table>"
" <tr><td>abc</td><td>def</td><td>ghi</td></tr>"
" <tr><td>jkl</td><td>mno</td><td>pqr</td></tr>"
" <tr><td>stu</td><td>vwx</td><td>yz</td></tr>"
"</table>"
)
row_iter = html_table.iter_rows()
row = next(row_iter)
assert isinstance(row, HtmlRow)
assert row.html == "<tr><td>abc</td><td>def</td><td>ghi</td></tr>"
# --
row = next(row_iter)
assert isinstance(row, HtmlRow)
assert row.html == "<tr><td>jkl</td><td>mno</td><td>pqr</td></tr>"
# --
row = next(row_iter)
assert isinstance(row, HtmlRow)
assert row.html == "<tr><td>stu</td><td>vwx</td><td>yz</td></tr>"
# --
with pytest.raises(StopIteration):
next(row_iter)
def it_provides_access_to_the_clear_concatenated_text_of_the_table(self):
html_table = HtmlTable.from_html_text(
"<table>"
" <tr><th> a\n b c </th><th/><th>def</th></tr>"
" <tr><td>gh \ti</td><td/><td>\n jk l </td></tr>"
" <tr><td/><td> m n op\n</td><td/></tr>"
"</table>"
)
assert html_table.text == "a b c def gh i jk l m n op"
class DescribeHtmlRow:
"""Unit-test suite for `unstructured.common.html_table.HtmlRow`."""
def it_can_serialize_the_row_to_html(self):
assert HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td><td/></tr>")).html == (
"<tr><td>a</td><td>b</td><td/></tr>"
)
def it_can_iterate_the_cells_in_the_row(self):
row = HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td><td/></tr>"))
cell_iter = row.iter_cells()
cell = next(cell_iter)
assert isinstance(cell, HtmlCell)
assert cell.html == "<td>a</td>"
# --
cell = next(cell_iter)
assert isinstance(cell, HtmlCell)
assert cell.html == "<td>b</td>"
# --
cell = next(cell_iter)
assert isinstance(cell, HtmlCell)
assert cell.html == "<td/>"
# --
with pytest.raises(StopIteration):
next(cell_iter)
def it_can_iterate_the_texts_of_the_cells_in_the_row(self):
row = HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td><td/></tr>"))
text_iter = row.iter_cell_texts()
assert next(text_iter) == "a"
assert next(text_iter) == "b"
with pytest.raises(StopIteration):
next(text_iter)
class DescribeHtmlCell:
"""Unit-test suite for `unstructured.common.html_table.HtmlCell`."""
def it_can_serialize_the_cell_to_html(self):
assert HtmlCell(fragment_fromstring("<td>a b c</td>")).html == "<td>a b c</td>"
@pytest.mark.parametrize(
("cell_html", "expected_value"),
[("<td> Lorem ipsum </td>", "Lorem ipsum"), ("<td/>", "")],
)
def it_knows_the_text_in_the_cell(self, cell_html: str, expected_value: str):
assert HtmlCell(fragment_fromstring(cell_html)).text == expected_value

View File

@ -504,7 +504,7 @@ def test_partition_html_accommodates_tds_with_child_elements():
)
assert element.metadata.text_as_html == (
"<table>"
"<tr><td></td><td></td></tr>"
"<tr><td/><td/></tr>"
"<tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES"
" EXCHANGE ACT OF 1934</td></tr>"
"</table>"

View File

@ -110,7 +110,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
# to give it that time process the writes. Will timeout after checking for a minute.
num_of_vectors_remote=0
attempt=1
sleep_amount=8
sleep_amount=30
while [ "$num_of_vectors_remote" -eq 0 ] && [ "$attempt" -lt 4 ]; do
echo "attempt $attempt: sleeping $sleep_amount seconds to let index finish catching up after writes"
sleep $sleep_amount

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -16,7 +16,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Driver</td><td></td></tr><tr><td>Approver</td><td></td></tr><tr><td>Contributors</td><td></td></tr><tr><td>Informed</td><td></td></tr><tr><td>Objective</td><td></td></tr><tr><td>Due date</td><td></td></tr><tr><td>Key outcomes</td><td></td></tr><tr><td>Status</td><td>NOT STARTED / IN PROGRESS / COMPLETE</td></tr></table>"
"text_as_html": "<table><tr><td>Driver</td><td/></tr><tr><td>Approver</td><td/></tr><tr><td>Contributors</td><td/></tr><tr><td>Informed</td><td/></tr><tr><td>Objective</td><td/></tr><tr><td>Due date</td><td/></tr><tr><td>Key outcomes</td><td/></tr><tr><td>Status</td><td>NOT STARTED / IN PROGRESS / COMPLETE</td></tr></table>"
},
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE",
"type": "Table"
@ -80,7 +80,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Must have:</td><td></td></tr><tr><td>Nice to have:</td><td></td></tr><tr><td>Not in scope:</td><td></td></tr></table>"
"text_as_html": "<table><tr><td>Must have:</td><td/></tr><tr><td>Nice to have:</td><td/></tr><tr><td>Not in scope:</td><td/></tr></table>"
},
"text": "Must have: Nice to have: Not in scope:",
"type": "Table"
@ -312,7 +312,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
"text_as_html": "<table><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><tr><td/><td/><td/><td/></tr><tr><td/><td/><td/><td/></tr><tr><td/><td/><td/><td/></tr></table>"
},
"text": "Milestone Owner Deadline Status",
"type": "Table"

View File

@ -100,7 +100,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><tr><td></td><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
"text_as_html": "<table><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><tr><td/><td/><td/><td/></tr><tr><td/><td/><td/><td/></tr></table>"
},
"text": "Time Item Presenter Notes",
"type": "Table"

View File

@ -118,7 +118,7 @@
"languages": [
"eng"
],
"text_as_html": "<table><tr><td>Notes</td><td></td></tr><tr><td>Important Links</td><td></td></tr></table>"
"text_as_html": "<table><tr><td>Notes</td><td/></tr><tr><td>Important Links</td><td/></tr></table>"
},
"text": "Notes Important Links",
"type": "Table"

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -4,7 +4,7 @@
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
"metadata": {
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"text_as_html": "<table><tr><td/><td/><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>",
"languages": [
"eng"
],

View File

@ -1,3 +0,0 @@
metric average sample_sd population_sd count
cct-accuracy 0.811 0.239 0.232 17
cct-%missing 0.024 0.032 0.031 17
1 metric average sample_sd population_sd count
2 cct-accuracy 0.811 0.239 0.232 17
3 cct-%missing 0.024 0.032 0.031 17

View File

@ -1,18 +0,0 @@
filename doctype connector cct-accuracy cct-%missing
fake-text.txt txt Sharepoint 1.0 0.0
ideas-page.html html Sharepoint 0.93 0.033
stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.005
IRS-form-1987.pdf pdf azure 0.794 0.135
spring-weather.html html azure 0.0 0.018
example-10k.html html local 0.754 0.027
fake-html-cp1252.html html local 0.659 0.0
ideas-page.html html local 0.93 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
handbook-1p.docx docx local-single-file-basic-chunking 0.858 0.029
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.95 0.029
2023-Jan-economic-outlook.pdf pdf s3 0.84 0.044
page-with-formula.pdf pdf s3 0.971 0.021
recalibrating-risk-report.pdf pdf s3 0.968 0.008
1 filename doctype connector cct-accuracy cct-%missing
2 fake-text.txt txt Sharepoint 1.0 0.0
3 ideas-page.html html Sharepoint 0.93 0.033
4 stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
5 Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.005
6 IRS-form-1987.pdf pdf azure 0.794 0.135
7 spring-weather.html html azure 0.0 0.018
8 example-10k.html html local 0.754 0.027
9 fake-html-cp1252.html html local 0.659 0.0
10 ideas-page.html html local 0.93 0.033
11 UDHR_first_article_all.txt txt local-single-file 0.995 0.0
12 handbook-1p.docx docx local-single-file-basic-chunking 0.858 0.029
13 fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
14 layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
15 layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.95 0.029
16 2023-Jan-economic-outlook.pdf pdf s3 0.84 0.044
17 page-with-formula.pdf pdf s3 0.971 0.021
18 recalibrating-risk-report.pdf pdf s3 0.968 0.008

View File

@ -1 +1 @@
__version__ = "0.15.6-dev0" # pragma: no cover
__version__ = "0.15.6-dev1" # pragma: no cover

View File

@ -9,6 +9,7 @@ from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
import regex
from typing_extensions import Self, TypeAlias
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
from unstructured.documents.elements import (
CompositeElement,
ConsolidationStrategy,
@ -46,6 +47,8 @@ BoundaryPredicate: TypeAlias = Callable[[Element], bool]
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
"""The kind of object produced by a pre-chunker."""
TextAndHtml: TypeAlias = tuple[str, str]
# ================================================================================================
# CHUNKING OPTIONS
@ -441,37 +444,31 @@ class TablePreChunk:
def iter_chunks(self) -> Iterator[Table | TableChunk]:
"""Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
maxlen = self._opts.hard_max
text_remainder = self._text
html_remainder = self._table.metadata.text_as_html or ""
# -- only text-split a table when it's longer than the chunking window --
if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
# -- but the overlap-prefix must be added to its text --
yield Table(text=text_remainder, metadata=self._metadata)
# -- A table with no non-whitespace text produces no chunks --
if not self._table_text:
return
split = self._opts.split
is_continuation = False
while text_remainder or html_remainder:
# -- split off the next chunk-worth of characters into a TableChunk --
chunk_text, text_remainder = split(text_remainder)
# -- only text-split a table when it's longer than the chunking window --
maxlen = self._opts.hard_max
if len(self._text_with_overlap) <= maxlen and len(self._html) <= maxlen:
# -- use the compactified html for .text_as_html, even though we're not splitting --
metadata = self._metadata
metadata.text_as_html = self._html or None
# -- note the overlap-prefix is prepended to its text --
yield Table(text=self._text_with_overlap, metadata=metadata)
return
# -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
# -- HTML elements that *correspond* to the TextChunk.text fragment.
if html_remainder:
chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
metadata.text_as_html = chunk_html
# -- When there's no HTML, split it like a normal element. Also fall back to text-only
# -- chunks when `max_characters` is less than 50. `.text_as_html` metadata is impractical
# -- for a chunking window that small because the 33 characters of HTML overhead for each
# -- chunk (`<table><tr><td>...</td></tr></table>`) would produce a very large number of
# -- very small chunks.
if not self._html or self._opts.hard_max < 50:
yield from self._iter_text_only_table_chunks()
return
# -- mark second and later chunks as a continuation --
if is_continuation:
metadata.is_continuation = True
yield TableChunk(text=chunk_text, metadata=metadata)
is_continuation = True
# -- otherwise, form splits with "synchronized" text and html --
yield from self._iter_text_and_html_table_chunks()
@lazyproperty
def overlap_tail(self) -> str:
@ -482,18 +479,80 @@ class TablePreChunk:
trailing whitespace.
"""
overlap = self._opts.inter_chunk_overlap
return self._text[-overlap:].strip() if overlap else ""
return self._text_with_overlap[-overlap:].strip() if overlap else ""
@lazyproperty
def _html(self) -> str:
    """The minified text-as-HTML for this table, or "" when none was captured.

    Table structure is missing when `infer_table_structure` was set `False` in the
    partitioning call, in which case this value is the empty string.
    """
    html_table = self._html_table
    return html_table.html if html_table is not None else ""
@lazyproperty
def _html_table(self) -> HtmlTable | None:
    """Parsed `lxml` proxy for this table's HTML.

    `None` when the `Table` element has no usable `.metadata.text_as_html`.
    """
    raw_html = self._table.metadata.text_as_html
    if raw_html is None:
        return None
    stripped = raw_html.strip()
    if not stripped:  # pragma: no cover
        return None
    return HtmlTable.from_html_text(stripped)
def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]:
    """Split this table into chunks whose HTML corresponds exactly to their text.

    Each chunk's `.metadata.text_as_html` is an independently-parseable `<table>` HTML
    fragment.
    """
    html_table = self._html_table
    if html_table is None:  # pragma: no cover
        raise ValueError("this method is undefined for a table having no .text_as_html")
    for idx, (text, html) in enumerate(_TableSplitter.iter_subtables(html_table, self._opts)):
        metadata = self._metadata
        metadata.text_as_html = html
        # -- only second-and-later chunks are marked as continuations --
        metadata.is_continuation = True if idx else None
        yield TableChunk(text=text, metadata=metadata)
def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
    """Split an oversized table that has no text-as-html into text-only chunks."""
    split = self._opts.split
    remainder = self._text_with_overlap
    chunk_count = 0
    while remainder:
        # -- peel off the next window-sized text fragment --
        fragment, remainder = split(remainder)
        metadata = self._metadata
        # -- only second-and-later chunks are marked as continuations --
        metadata.is_continuation = True if chunk_count else None
        chunk_count += 1
        yield TableChunk(text=fragment, metadata=metadata)
@property
def _metadata(self) -> ElementMetadata:
"""The base `.metadata` value for chunks formed from this pre-chunk.
The term "base" here means that other metadata fields will be added, depending on the chunk.
In particular, `.metadata.text_as_html` will be different for each text-split chunk and
`.metadata.is_continuation` must be added for second-and-later text-split chunks.
The term "base" here means that other metadata fields will be added, depending on the
chunk. In particular, `.metadata.text_as_html` will be different for each text-split chunk
and `.metadata.is_continuation` must be added for second-and-later text-split chunks.
Note this is a fresh copy of the metadata on each call since it will need to be mutated
differently for each chunk formed from from this pre-chunk.
differently for each chunk formed from this pre-chunk.
"""
CS = ConsolidationStrategy
metadata = copy.deepcopy(self._table.metadata)
@ -528,10 +587,15 @@ class TablePreChunk:
return [orig_table]
@lazyproperty
def _text(self) -> str:
def _table_text(self) -> str:
"""The text in this table, not including any overlap-prefix or extra whitespace."""
return " ".join(self._table.text.split())
@lazyproperty
def _text_with_overlap(self) -> str:
"""The text for this chunk, including the overlap-prefix when present."""
overlap_prefix = self._overlap_prefix
table_text = self._table.text
table_text = self._table.text.strip()
# -- use row-separator between overlap and table-text --
return overlap_prefix + "\n" + table_text if overlap_prefix else table_text
@ -795,6 +859,82 @@ class TextPreChunk:
# ================================================================================================
class _TableSplitter:
    """Generates (text, html) pairs, one per chunk, for an oversized `<table>` element.

    Splits fall on whole-row boundaries whenever possible. A row that is by itself too big
    for the chunking window is split on an even cell boundary, and a cell that is by itself
    too big is divided by text-splitting on an even word boundary.

    Every `html` value produced is an independently-parseable `<table>` HTML subtree.
    """

    def __init__(self, table_element: HtmlTable, opts: ChunkingOptions):
        self._table_element = table_element
        self._opts = opts

    @classmethod
    def iter_subtables(
        cls, table_element: HtmlTable, opts: ChunkingOptions
    ) -> Iterator[TextAndHtml]:
        """Generate a (text, html) pair for each chunk-sized split of `table_element`.

        Each split falls on an even row boundary when possible, degrading to even-cell and
        even-word boundaries for an oversized row or cell respectively.
        """
        return cls(table_element, opts)._iter_subtables()

    def _iter_subtables(self) -> Iterator[TextAndHtml]:
        """Generate (text, html) pairs each holding as many whole rows as fit the window."""
        accum = _RowAccumulator(maxlen=self._opts.hard_max)
        for row in self._table_element.iter_rows():
            # -- common case: this row fits alongside those already accumulated --
            if accum.will_fit(row):
                accum.add_row(row)
                continue
            # -- otherwise close out any in-progress chunk to make room --
            yield from accum.flush()
            if accum.will_fit(row):
                accum.add_row(row)
            else:
                # -- this row alone exceeds the window; split it cell-wise --
                yield from self._iter_row_splits(row)
        yield from accum.flush()

    def _iter_row_splits(self, row: HtmlRow) -> Iterator[TextAndHtml]:
        """Split an oversized `row` into (text, html) pairs of as many cells as fit."""
        accum = _CellAccumulator(maxlen=self._opts.hard_max)
        for cell in row.iter_cells():
            # -- common case: this cell fits alongside those already accumulated --
            if accum.will_fit(cell):
                accum.add_cell(cell)
                continue
            # -- otherwise close out any in-progress chunk to make room --
            yield from accum.flush()
            if accum.will_fit(cell):
                accum.add_cell(cell)
            else:
                # -- this cell alone exceeds the window; fall back to text-splitting --
                yield from self._iter_cell_splits(cell)
        yield from accum.flush()

    def _iter_cell_splits(self, cell: HtmlCell) -> Iterator[TextAndHtml]:
        """Text-split a single oversized `cell` into single-cell `<table>` fragments."""
        # -- 33 == len("<table><tr><td></td></tr></table>"), HTML overhead beyond the text --
        split = _TextSplitter(ChunkingOptions(max_characters=(self._opts.hard_max - 33)))
        fragment, remainder = split(cell.text)
        while True:
            yield fragment, f"<table><tr><td>{fragment}</td></tr></table>"
            # -- an oversized cell leaves a remainder; keep splitting until exhausted --
            if not remainder:
                break
            fragment, remainder = split(remainder)
class _TextSplitter:
"""Provides a text-splitting function configured on construction.
@ -911,6 +1051,97 @@ class _TextSplitter:
return fragment, overlapped_remainder
class _CellAccumulator:
    """Collects `<td>` cells for a sub-row chunk until the chunking window is filled.

    Calling `.flush()` emits whatever has accumulated as a (text, html) pair in which
    `html` is a single-row `<table>` fragment containing those cells.
    """

    def __init__(self, maxlen: int):
        self._maxlen = maxlen
        self._cells: list[HtmlCell] = []

    def add_cell(self, cell: HtmlCell) -> None:
        """Accumulate `cell`; caller must first verify it fits via `.will_fit()`."""
        self._cells.append(cell)

    def flush(self) -> Iterator[TextAndHtml]:
        """Emit zero-or-one (text, html) pairs for the cells accumulated so far."""
        if not self._cells:
            return
        text = " ".join(self._iter_cell_texts())
        cells_html = "".join(cell.html for cell in self._cells)
        self._cells.clear()
        yield text, f"<table><tr>{cells_html}</tr></table>"

    def will_fit(self, cell: HtmlCell) -> bool:
        """True when `cell` fits in the space the accumulated cells leave open."""
        return len(cell.html) <= self._remaining_space

    def _iter_cell_texts(self) -> Iterator[str]:
        """Text of each accumulated cell, skipping empty or whitespace-only cells."""
        for cell in self._cells:
            text = cell.text
            if text:
                yield text

    @property
    def _remaining_space(self) -> int:
        """Character budget left once accumulated cells are wrapped in table HTML."""
        # -- 24 == len("<table><tr></tr></table>"), overhead beyond the `<td>` fragments --
        used = sum(len(cell.html) for cell in self._cells)
        return self._maxlen - 24 - used
class _RowAccumulator:
    """Collects whole `<tr>` rows for a sub-table chunk until the chunking window is filled.

    Calling `.flush()` emits whatever has accumulated as a (text, html) pair in which
    `html` is a parseable `<table>` subtree containing those rows.
    """

    def __init__(self, maxlen: int):
        self._maxlen = maxlen
        self._rows: list[HtmlRow] = []

    def add_row(self, row: HtmlRow) -> None:
        """Accumulate `row`; caller must first verify it fits via `.will_fit()`."""
        self._rows.append(row)

    def flush(self) -> Iterator[TextAndHtml]:
        """Emit zero-or-one (text, html) pairs for the rows accumulated so far."""
        if not self._rows:
            return
        text = " ".join(self._iter_cell_texts())
        rows_html = "".join(row.html for row in self._rows)
        self._rows.clear()
        yield text, f"<table>{rows_html}</table>"

    def will_fit(self, row: HtmlRow) -> bool:
        """True when `row` fits in the space the accumulated rows leave open."""
        return len(row.html) <= self._remaining_space

    def _iter_cell_texts(self) -> Iterator[str]:
        """Text of each cell in the accumulated rows, skipping empty cells."""
        for row in self._rows:
            yield from row.iter_cell_texts()

    @property
    def _remaining_space(self) -> int:
        """Character budget left once accumulated rows are wrapped in table HTML."""
        # -- 15 == len("<table></table>"), overhead beyond the `<tr>` fragments --
        used = sum(len(row.html) for row in self._rows)
        return self._maxlen - 15 - used
# ================================================================================================
# PRE-CHUNK COMBINER
# ================================================================================================

View File

@ -6,7 +6,15 @@ Used during partitioning as well as chunking.
from __future__ import annotations
import html
from typing import Iterator, Sequence
from typing import TYPE_CHECKING, Iterator, Sequence, cast
from lxml import etree
from lxml.html import fragment_fromstring
from unstructured.utils import lazyproperty
if TYPE_CHECKING:
from lxml.html import HtmlElement
def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
@ -33,7 +41,121 @@ def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
s = html.escape(s)
# -- substitute <br/> elements for line-feeds in the text --
s = "<br/>".join(s.split("\n"))
# -- strip leading and trailing whitespace, wrap it up and go --
yield f"<td>{s.strip()}</td>"
# -- normalize whitespace in cell --
cell_text = " ".join(s.split())
# -- emit void `<td/>` when cell text is empty string --
yield f"<td>{cell_text}</td>" if cell_text else "<td/>"
return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""
class HtmlTable:
    """A `<table>` element."""

    def __init__(self, table: HtmlElement):
        self._table = table

    @classmethod
    def from_html_text(cls, html_text: str) -> HtmlTable:
        """Construct from an HTML fragment containing a `<table>`, normalized and compacted.

        Raises `ValueError` when `html_text` contains no `<table>` element.
        """
        # -- root is always a `<table>` element so far but let's be robust --
        root = fragment_fromstring(html_text)
        matches = root.xpath("//table")
        if not matches:
            raise ValueError("`html_text` contains no `<table>` element")
        table = matches[0]

        # -- unwrap `<thead>`, `<tbody>`, and `<tfoot>` noise elements, promoting their
        # -- children in place when any are present
        for noise in table.xpath(".//thead | .//tbody | .//tfoot"):
            noise.drop_tag()

        # -- normalize and compactify the HTML --
        for elm in table.iter():
            # -- strip all attributes, like border="1", class="dataframe" added by
            # -- pandas.DataFrame.to_html(), style="text-align: right;", etc.
            elm.attrib.clear()
            # -- rewrite `<th>` as `<td>` so all cells share one tag --
            if elm.tag == "th":
                elm.tag = "td"
            # -- collapse runs of whitespace in element text; removes indent whitespace
            # -- before nested elements and squeezes inter-word whitespace to one space
            if elm.text:
                elm.text = " ".join(elm.text.split())
            # -- drop tails entirely, those are newline + indent if anything --
            if elm.tail:
                elm.tail = None

        return cls(table)

    @lazyproperty
    def html(self) -> str:
        """The HTML-fragment for this `<table>` element, all on one line.

        Like: `<table><tr><td>foo</td></tr><tr><td>bar</td></tr></table>`

        The HTML contains no human-readability whitespace, attributes, or `<thead>`,
        `<tbody>`, or `<tfoot>` tags. It is made as compact as possible to maximize the
        semantic content in a given space. This is particularly important for chunking.
        """
        return etree.tostring(self._table, encoding=str)

    def iter_rows(self) -> Iterator[HtmlRow]:
        """Generate an `HtmlRow` proxy for each direct `<tr>` child of this table."""
        for tr in cast("list[HtmlElement]", self._table.xpath("./tr")):
            yield HtmlRow(tr)

    @lazyproperty
    def text(self) -> str:
        """The clean, concatenated, text for this table."""
        # -- blank cells introduce extra whitespace, so normalize after accumulating --
        raw = " ".join(self._table.itertext())
        return " ".join(raw.split())
class HtmlRow:
    """A `<tr>` element."""

    def __init__(self, tr: HtmlElement):
        self._tr = tr

    @lazyproperty
    def html(self) -> str:
        """Like "<tr><td>foo</td><td>bar</td></tr>"."""
        return etree.tostring(self._tr, encoding=str)

    def iter_cells(self) -> Iterator[HtmlCell]:
        """Generate an `HtmlCell` proxy for each child element of this row."""
        yield from (HtmlCell(td) for td in self._tr)

    def iter_cell_texts(self) -> Iterator[str]:
        """Generate contents of each cell of this row as a separate string.

        Only the direct leading text of each `<td>` is considered. A cell that is empty
        or contains only whitespace does not generate a string.
        """
        for td in self._tr:
            text = td.text
            if text is None:
                continue
            stripped = text.strip()
            if stripped:
                yield stripped
class HtmlCell:
    """A `<td>` element."""

    def __init__(self, td: HtmlElement):
        self._td = td

    @lazyproperty
    def html(self) -> str:
        """Like "<td>foo bar baz</td>"."""
        # -- emit a void `<td/>` when the cell has no text --
        return "<td/>" if not self.text else etree.tostring(self._td, encoding=str)

    @lazyproperty
    def text(self) -> str:
        """Text inside `<td>` element, empty string when no text."""
        raw = self._td.text
        return "" if raw is None else raw.strip()

View File

@ -4,13 +4,7 @@ from __future__ import annotations
import os
import re
import sys
from typing import List, Optional
if sys.version_info < (3, 8):
from typing_extensions import Final # pragma: nocover
else:
from typing import Final
from typing import Final, List, Optional
from unstructured.cleaners.core import remove_punctuation
from unstructured.logger import trace_logger