rfctr(chunking): extract general-purpose objects to base (#2281)

Many of the classes defined in `unstructured.chunking.title` are
applicable to any chunking strategy and will shortly be used for the
"by-character" chunking strategy as well.

Move these and their tests to `unstructured.chunking.base`.

Along the way, rename `TextPreChunkBuilder` to `PreChunkBuilder` because
it will be generalized in a subsequent PR to also take `Table` elements
such that inter-pre-chunk overlap can be implemented.

Otherwise, no logic changes, just moves.
Authored by Steve Canny on 2023-12-16 09:28:15 -08:00; committed via GitHub.
parent a7c3f5f570
commit 36e81c3367
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 1297 additions and 1267 deletions

View File

@ -1,4 +1,4 @@
## 0.11.5-dev1
## 0.11.5-dev2
### Enhancements

View File

@ -1,14 +1,35 @@
# pyright: reportPrivateUsage=false
"""Unit-test suite for the `unstructured.chunking.base` module."""
from __future__ import annotations
from typing import List
import pytest
from unstructured.chunking.base import ChunkingOptions
from unstructured.chunking.base import (
ChunkingOptions,
PreChunkBuilder,
PreChunkCombiner,
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
)
from unstructured.documents.elements import (
CompositeElement,
ElementMetadata,
PageBreak,
RegexMetadata,
Table,
TableChunk,
Text,
Title,
)
class DescribeChunkingOptions:
"""Unit-test suite for `unstructured.chunking.model.ChunkingOptions objects."""
"""Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
@pytest.mark.parametrize("max_characters", [0, -1, -42])
def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
@ -111,3 +132,847 @@ class DescribeChunkingOptions:
def it_knows_the_text_separator_string(self):
assert ChunkingOptions.new().text_separator == "\n\n"
# ================================================================================================
# PRE-CHUNK SUBTYPES
# ================================================================================================
class DescribeTablePreChunk:
    """Unit-test suite for `unstructured.chunking.base.TablePreChunk objects."""

    def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
        html_table = (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
            "</tbody>\n"
            "</table>"
        )
        text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            opts=ChunkingOptions.new(max_characters=175),
        )

        chunk_iter = pre_chunk.iter_chunks()

        # -- the whole table fits in one window, so a single `Table` chunk results --
        chunk = next(chunk_iter)
        assert isinstance(chunk, Table)
        assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
        assert chunk.metadata.text_as_html == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
            "</tbody>\n"
            "</table>"
        )
        with pytest.raises(StopIteration):
            next(chunk_iter)

    def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
        # fixed-overhead = 8+8+9+8+9+8 = 50
        # per-row overhead = 27
        html_table = (
            "<table>\n"  # 8
            "<thead>\n"  # 8
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"  # 9
            "<tbody>\n"  # 8
            "<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
            "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
            "</tbody>\n"  # 9
            "</table>"  # 8
        )
        text_table = (
            "Header Col 1 Header Col 2\n"
            "Lorem ipsum dolor sit amet\n"
            "Consectetur adipiscing elit\n"
            "Nunc aliquam id enim nec molestie\n"
            "Vivamus quis nunc ipsum donec ac fermentum"
        )
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            opts=ChunkingOptions.new(max_characters=100),
        )

        chunk_iter = pre_chunk.iter_chunks()

        # -- both text and HTML are split at the 100-char window boundary --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == (
            "Header Col 1 Header Col 2\n"
            "Lorem ipsum dolor sit amet\n"
            "Consectetur adipiscing elit\n"
            "Nunc aliqua"
        )
        assert chunk.metadata.text_as_html == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lo"
        )
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert (
            chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
        )
        assert chunk.metadata.text_as_html == (
            "rem ipsum </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur </td><td>adipiscing elit</td><"
        )
        # -- note that text runs out but HTML continues because it's significantly longer. So two
        # -- of these chunks have HTML but no text.
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == ""
        assert chunk.metadata.text_as_html == (
            "/tr>\n"
            "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis </td><td>"
        )
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == ""
        assert chunk.metadata.text_as_html == (
            "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
        )
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
class DescribeTextPreChunk:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunk objects."""

    def it_can_combine_itself_with_another_TextPreChunk_instance(self):
        """.combine() produces a new pre-chunk by appending the elements of `other_pre_chunk`.

        Note that neither the original nor the other pre_chunk is mutated.
        """
        opts = ChunkingOptions.new()
        pre_chunk = TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
            ],
            opts=opts,
        )
        other_pre_chunk = TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )

        new_pre_chunk = pre_chunk.combine(other_pre_chunk)

        # -- the result holds the elements of both operands, in order --
        assert new_pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )
        # -- and neither operand was changed --
        assert pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
            ],
            opts=opts,
        )
        assert other_pre_chunk == TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )

    def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
        pre_chunk = TextPreChunk(
            [
                Title("Introduction"),
                Text(
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                    "lectus porta volutpat.",
                ),
            ],
            opts=ChunkingOptions.new(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
            " In rhoncus ipsum sedlectus porta volutpat.",
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata

    def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
        # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
        # -- The pre-chunker will isolate that element in a pre_chunk of its own.
        pre_chunk = TextPreChunk(
            [
                Text(
                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
                    " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
                    " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
                    " commodo consequat."
                ),
            ],
            opts=ChunkingOptions.new(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
            " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
            " veniam, quis nostrud exercitation ullamco laboris nisi ut a"
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata
        # --
        chunk = next(chunk_iter)
        assert chunk == CompositeElement("liquip ex ea commodo consequat.")
        assert chunk.metadata is pre_chunk._consolidated_metadata
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)

    def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
        """.text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
        pre_chunk = TextPreChunk(
            [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
        )
        # -- "foo" + "\n\n" separator + "bar" == 8; the PageBreak contributes nothing --
        assert pre_chunk.text_length == 8

    def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
        pre_chunk = TextPreChunk(
            [
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        category_depth=0,
                        filename="foo.docx",
                        languages=["lat"],
                        parent_id="f87731e0",
                    ),
                ),
                Text(
                    "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
                    metadata=ElementMetadata(
                        category_depth=1,
                        filename="foo.docx",
                        image_path="sprite.png",
                        languages=["lat", "eng"],
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        assert pre_chunk._all_metadata_values == {
            # -- scalar values are accumulated in a list in element order --
            "category_depth": [0, 1],
            # -- all values are accumulated, not only unique ones --
            "filename": ["foo.docx", "foo.docx"],
            # -- list-type fields produce a list of lists --
            "languages": [["lat"], ["lat", "eng"]],
            # -- fields that only appear in some elements are captured --
            "image_path": ["sprite.png"],
            "parent_id": ["f87731e0"],
            # -- A `None` value never appears, neither does a field-name with an empty list --
        }

    def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
        metadata = ElementMetadata(
            category_depth=0,
            filename="foo.docx",
            languages=["lat"],
            parent_id="f87731e0",
        )
        metadata.coefficient = 0.62
        metadata_2 = ElementMetadata(
            category_depth=1,
            filename="foo.docx",
            image_path="sprite.png",
            languages=["lat", "eng"],
        )
        metadata_2.quotient = 1.74
        pre_chunk = TextPreChunk(
            [
                Title("Lorem Ipsum", metadata=metadata),
                Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
            ],
            opts=ChunkingOptions.new(),
        )

        # -- ad-hoc fields "coefficient" and "quotient" do not appear --
        assert pre_chunk._all_metadata_values == {
            "category_depth": [0, 1],
            "filename": ["foo.docx", "foo.docx"],
            "image_path": ["sprite.png"],
            "languages": [["lat"], ["lat", "eng"]],
            "parent_id": ["f87731e0"],
        }

    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
        """regex_metadata of chunk is combined regex_metadatas of its elements.

        Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
        position in the chunk after element text has been concatenated.
        """
        pre_chunk = TextPreChunk(
            [
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                    ),
                ),
                Text(
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
                    metadata=ElementMetadata(
                        regex_metadata={
                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
                        },
                    ),
                ),
                Text(
                    "In rhoncus ipsum sed lectus porta volutpat.",
                    metadata=ElementMetadata(
                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        regex_metadata = pre_chunk._consolidated_regex_meta

        assert regex_metadata == {
            "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
            "ipsum": [
                RegexMetadata(text="Ipsum", start=6, end=11),
                RegexMetadata(text="ipsum", start=19, end=24),
                RegexMetadata(text="ipsum", start=81, end=86),
            ],
        }

    def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
        """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.

        Only non-None fields should appear in the dict and each field value should be the
        consolidation of the values across the pre_chunk elements.
        """
        pre_chunk = TextPreChunk(
            [
                PageBreak(""),
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        filename="foo.docx",
                        # -- category_depth has DROP strategy so doesn't appear in result --
                        category_depth=0,
                        emphasized_text_contents=["Lorem", "Ipsum"],
                        emphasized_text_tags=["b", "i"],
                        languages=["lat"],
                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                    ),
                ),
                Text(
                    "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
                    metadata=ElementMetadata(
                        # -- filename change doesn't happen IRL but demonstrates FIRST strategy --
                        filename="bar.docx",
                        # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
                        # -- appears twice in consolidated-meta (as it should) and length matches
                        # -- that of emphasized_text_tags both before and after consolidation.
                        emphasized_text_contents=["Lorem", "ipsum"],
                        emphasized_text_tags=["i", "b"],
                        # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
                        languages=["eng", "lat"],
                        # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
                        regex_metadata={
                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
                        },
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        meta_kwargs = pre_chunk._meta_kwargs

        assert meta_kwargs == {
            "filename": "foo.docx",
            "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
            "emphasized_text_tags": ["b", "i", "i", "b"],
            "languages": ["lat", "eng"],
            "regex_metadata": {
                "ipsum": [
                    RegexMetadata(text="Ipsum", start=6, end=11),
                    RegexMetadata(text="ipsum", start=19, end=24),
                ],
                "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
            },
        }

    @pytest.mark.parametrize(
        ("elements", "expected_value"),
        [
            ([Text("foo"), Text("bar")], "foo\n\nbar"),
            ([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
            ([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
            ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
        ],
    )
    def it_knows_the_concatenated_text_of_the_pre_chunk(
        self, elements: List[Text], expected_value: str
    ):
        """._text is the "joined" text of the pre-chunk elements.

        The text-segment contributed by each element is separated from the next by a blank line
        ("\n\n"). An element that contributes no text does not give rise to a separator.
        """
        pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
        assert pre_chunk._text == expected_value
# ================================================================================================
# PRE-CHUNKING ACCUMULATORS
# ================================================================================================
class DescribePreChunkBuilder:
    """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""

    def it_is_empty_on_construction(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))

        assert builder.text_length == 0
        assert builder.remaining_space == 50

    def it_accumulates_elements_added_to_it(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        builder.add_element(Title("Introduction"))
        assert builder.text_length == 12
        assert builder.remaining_space == 136

        builder.add_element(
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        )
        assert builder.text_length == 112
        assert builder.remaining_space == 36

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
        builder.add_element(Title("Introduction"))
        builder.add_element(
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        )

        pre_chunk = next(builder.flush())

        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Introduction"),
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        ]
        # -- flushing resets the builder to its empty state --
        assert builder.text_length == 0
        assert builder.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(builder.flush())

        assert pre_chunks == []
        assert builder.text_length == 0
        assert builder.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
        builder.add_element(Text("abcde"))
        builder.add_element(Text("fghij"))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
        assert builder.text_length == 12
        # -- .remaining_space is reduced by the length (2) of the trailing separator which would go
        # -- between the current text and that of the next element if one was added.
        # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
        assert builder.remaining_space == 36
class DescribePreChunkCombiner:
    """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""

    def it_combines_sequential_small_text_pre_chunks(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- all three pre-chunks are under the threshold so they merge into one --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def but_it_does_not_combine_table_pre_chunks(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            ),
            TablePreChunk(Table("Heading\nCell text"), opts=opts),
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            ),
        ]

        # -- reuse `opts` rather than constructing a duplicate ChunkingOptions, consistent
        # -- with the other tests in this class (the option values were identical)
        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- the table pre-chunk interrupts combination: text before, table, text after --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TablePreChunk)
        assert pre_chunk._table == Table("Heading\nCell text")
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_respects_the_specified_combination_threshold(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- the first two combine (each under 80 chars); the third starts a new pre-chunk
        # -- because the accumulated 139 chars exceeds combine_text_under_n_chars --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_respects_the_hard_maximum_window_length(self):
        opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
            # -- len == 214
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- adding the third pre-chunk would exceed max_characters (214 > 200), so it is
        # -- emitted separately even though it is under combine_text_under_n_chars --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
        """Such as occurs when a single element exceeds the window size."""
        opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
        pre_chunks = [
            TextPreChunk([Title("Lorem Ipsum")], opts=opts),
            TextPreChunk(  # 179
                [
                    Text(
                        "Lorem ipsum dolor sit amet consectetur adipiscing elit."  # 55
                        " Mauris nec urna non augue vulputate consequat eget et nisi."  # 60
                        " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."  # 64
                    )
                ],
                opts=opts,
            ),
            TextPreChunk([Title("Vulputate Consequat")], opts=opts),
        ]

        # -- reuse `opts` rather than constructing a duplicate ChunkingOptions, consistent
        # -- with the other tests in this class (the option values were identical)
        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- the oversized middle pre-chunk is emitted on its own, never combined --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [Title("Lorem Ipsum")]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit."
                " Mauris nec urna non augue vulputate consequat eget et nisi."
                " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
            )
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [Title("Vulputate Consequat")]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
class DescribeTextPreChunkAccumulator:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""

    def it_is_empty_on_construction(self):
        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))

        assert accum.text_length == 0
        assert accum.remaining_space == 100

    def it_accumulates_pre_chunks_added_to_it(self):
        opts = ChunkingOptions.new(max_characters=500)
        accum = TextPreChunkAccumulator(opts=opts)

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            )
        )
        assert accum.text_length == 68
        assert accum.remaining_space == 430

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            )
        )
        assert accum.text_length == 141
        assert accum.remaining_space == 357

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        opts = ChunkingOptions.new(max_characters=150)
        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            )
        )
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            )
        )
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Sed Orci"),
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
                ],
                opts=opts,
            )
        )

        pre_chunk_iter = accum.flush()

        # -- iterator generates exactly one pre_chunk --
        pre_chunk = next(pre_chunk_iter)
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
        # -- and it is a TextPreChunk containing all the elements --
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
        ]
        # -- flushing resets the accumulator to its empty state --
        assert accum.text_length == 0
        assert accum.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(accum.flush())

        assert pre_chunks == []
        assert accum.text_length == 0
        assert accum.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        opts = ChunkingOptions.new(max_characters=100)
        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
        accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
        assert accum.text_length == 12
        # -- .remaining_space is reduced by the length (2) of the trailing separator which would
        # -- go between the current text and that of the next pre-chunk if one was added.
        # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
        assert accum.remaining_space == 86

View File

@ -4,16 +4,8 @@ from typing import List
import pytest
from unstructured.chunking.base import ChunkingOptions
from unstructured.chunking.title import (
PreChunkCombiner,
TablePreChunk,
TextPreChunk,
TextPreChunkAccumulator,
TextPreChunkBuilder,
_split_elements_by_title_and_table,
chunk_by_title,
)
from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
from unstructured.chunking.title import _split_elements_by_title_and_table, chunk_by_title
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
CheckBox,
@ -22,10 +14,8 @@ from unstructured.documents.elements import (
Element,
ElementMetadata,
ListItem,
PageBreak,
RegexMetadata,
Table,
TableChunk,
Text,
Title,
)
@ -552,843 +542,3 @@ def test_it_considers_separator_length_when_pre_chunking():
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
# == PreChunks ===================================================================================
class DescribeTablePreChunk:
    """Unit-test suite for `unstructured.chunking.title.TablePreChunk objects."""

    def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
        html_table = (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
            "</tbody>\n"
            "</table>"
        )
        text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            opts=ChunkingOptions.new(max_characters=175),
        )

        chunk_iter = pre_chunk.iter_chunks()

        # -- the whole table fits in one window, so a single `Table` chunk results --
        chunk = next(chunk_iter)
        assert isinstance(chunk, Table)
        assert chunk.text == "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
        assert chunk.metadata.text_as_html == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lorem ipsum </td><td>adipiscing </td></tr>\n"
            "</tbody>\n"
            "</table>"
        )
        with pytest.raises(StopIteration):
            next(chunk_iter)

    def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
        # fixed-overhead = 8+8+9+8+9+8 = 50
        # per-row overhead = 27
        html_table = (
            "<table>\n"  # 8
            "<thead>\n"  # 8
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"  # 9
            "<tbody>\n"  # 8
            "<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
            "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis </td><td>nunc ipsum donec ac fermentum</td></tr>\n"
            "</tbody>\n"  # 9
            "</table>"  # 8
        )
        text_table = (
            "Header Col 1 Header Col 2\n"
            "Lorem ipsum dolor sit amet\n"
            "Consectetur adipiscing elit\n"
            "Nunc aliquam id enim nec molestie\n"
            "Vivamus quis nunc ipsum donec ac fermentum"
        )
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            opts=ChunkingOptions.new(max_characters=100),
        )

        chunk_iter = pre_chunk.iter_chunks()

        # -- both text and HTML are split at the 100-char window boundary --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == (
            "Header Col 1 Header Col 2\n"
            "Lorem ipsum dolor sit amet\n"
            "Consectetur adipiscing elit\n"
            "Nunc aliqua"
        )
        assert chunk.metadata.text_as_html == (
            "<table>\n"
            "<thead>\n"
            "<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
            "</thead>\n"
            "<tbody>\n"
            "<tr><td>Lo"
        )
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert (
            chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
        )
        assert chunk.metadata.text_as_html == (
            "rem ipsum </td><td>A Link example</td></tr>\n"
            "<tr><td>Consectetur </td><td>adipiscing elit</td><"
        )
        # -- note that text runs out but HTML continues because it's significantly longer. So two
        # -- of these chunks have HTML but no text.
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == ""
        assert chunk.metadata.text_as_html == (
            "/tr>\n"
            "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
            "<tr><td>Vivamus quis </td><td>"
        )
        # --
        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
        assert chunk.text == ""
        assert chunk.metadata.text_as_html == (
            "nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
        )
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)
class DescribeTextPreChunk:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""

    def it_can_combine_itself_with_another_TextPreChunk_instance(self):
        """.combine() produces a new pre-chunk by appending the elements of `other_pre-chunk`.

        Note that neither the original or other pre_chunk are mutated.
        """
        opts = ChunkingOptions.new()
        pre_chunk = TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
            ],
            opts=opts,
        )
        other_pre_chunk = TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )

        new_pre_chunk = pre_chunk.combine(other_pre_chunk)

        # -- the new pre-chunk contains the elements of both operands, in order --
        assert new_pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )
        # -- neither operand pre-chunk was mutated by the combine --
        assert pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
            ],
            opts=opts,
        )
        assert other_pre_chunk == TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
            ],
            opts=opts,
        )

    def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
        pre_chunk = TextPreChunk(
            [
                Title("Introduction"),
                Text(
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                    "lectus porta volutpat.",
                ),
            ],
            opts=ChunkingOptions.new(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
            " In rhoncus ipsum sedlectus porta volutpat.",
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata

    def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
        # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
        # -- The pre-chunker will isolate that element in a pre_chunk of its own.
        pre_chunk = TextPreChunk(
            [
                Text(
                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
                    " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
                    " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
                    " commodo consequat."
                ),
            ],
            opts=ChunkingOptions.new(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
            " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
            " veniam, quis nostrud exercitation ullamco laboris nisi ut a"
        )
        assert chunk.metadata is pre_chunk._consolidated_metadata
        # --
        chunk = next(chunk_iter)
        assert chunk == CompositeElement("liquip ex ea commodo consequat.")
        assert chunk.metadata is pre_chunk._consolidated_metadata
        # --
        with pytest.raises(StopIteration):
            next(chunk_iter)

    def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
        """.text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
        pre_chunk = TextPreChunk(
            [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
        )
        # -- "foo" (3) + separator (2) + "bar" (3); the empty PageBreak contributes nothing --
        assert pre_chunk.text_length == 8

    def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
        pre_chunk = TextPreChunk(
            [
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        category_depth=0,
                        filename="foo.docx",
                        languages=["lat"],
                        parent_id="f87731e0",
                    ),
                ),
                Text(
                    "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
                    metadata=ElementMetadata(
                        category_depth=1,
                        filename="foo.docx",
                        image_path="sprite.png",
                        languages=["lat", "eng"],
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        assert pre_chunk._all_metadata_values == {
            # -- scalar values are accumulated in a list in element order --
            "category_depth": [0, 1],
            # -- all values are accumulated, not only unique ones --
            "filename": ["foo.docx", "foo.docx"],
            # -- list-type fields produce a list of lists --
            "languages": [["lat"], ["lat", "eng"]],
            # -- fields that only appear in some elements are captured --
            "image_path": ["sprite.png"],
            "parent_id": ["f87731e0"],
            # -- A `None` value never appears, neither does a field-name with an empty list --
        }

    def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
        metadata = ElementMetadata(
            category_depth=0,
            filename="foo.docx",
            languages=["lat"],
            parent_id="f87731e0",
        )
        metadata.coefficient = 0.62
        metadata_2 = ElementMetadata(
            category_depth=1,
            filename="foo.docx",
            image_path="sprite.png",
            languages=["lat", "eng"],
        )
        metadata_2.quotient = 1.74

        pre_chunk = TextPreChunk(
            [
                Title("Lorem Ipsum", metadata=metadata),
                Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
            ],
            opts=ChunkingOptions.new(),
        )

        # -- ad-hoc fields "coefficient" and "quotient" do not appear --
        assert pre_chunk._all_metadata_values == {
            "category_depth": [0, 1],
            "filename": ["foo.docx", "foo.docx"],
            "image_path": ["sprite.png"],
            "languages": [["lat"], ["lat", "eng"]],
            "parent_id": ["f87731e0"],
        }

    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
        """regex_metadata of chunk is combined regex_metadatas of its elements.

        Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
        position in the chunk after element text has been concatenated.
        """
        pre_chunk = TextPreChunk(
            [
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                    ),
                ),
                Text(
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
                    metadata=ElementMetadata(
                        regex_metadata={
                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
                        },
                    ),
                ),
                Text(
                    "In rhoncus ipsum sed lectus porta volutpat.",
                    metadata=ElementMetadata(
                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        regex_metadata = pre_chunk._consolidated_regex_meta

        # -- offsets are shifted by the length of preceding element text plus separators --
        assert regex_metadata == {
            "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
            "ipsum": [
                RegexMetadata(text="Ipsum", start=6, end=11),
                RegexMetadata(text="ipsum", start=19, end=24),
                RegexMetadata(text="ipsum", start=81, end=86),
            ],
        }

    def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
        """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.

        Only non-None fields should appear in the dict and each field value should be the
        consolidation of the values across the pre_chunk elements.
        """
        pre_chunk = TextPreChunk(
            [
                PageBreak(""),
                Title(
                    "Lorem Ipsum",
                    metadata=ElementMetadata(
                        filename="foo.docx",
                        # -- category_depth has DROP strategy so doesn't appear in result --
                        category_depth=0,
                        emphasized_text_contents=["Lorem", "Ipsum"],
                        emphasized_text_tags=["b", "i"],
                        languages=["lat"],
                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                    ),
                ),
                Text(
                    "'Lorem ipsum dolor' means 'Thank you very much' in Latin.",
                    metadata=ElementMetadata(
                        # -- filename change doesn't happen IRL but demonstrates FIRST strategy --
                        filename="bar.docx",
                        # -- emphasized_text_contents has LIST_CONCATENATE strategy, so "Lorem"
                        # -- appears twice in consolidated-meta (as it should) and length matches
                        # -- that of emphasized_text_tags both before and after consolidation.
                        emphasized_text_contents=["Lorem", "ipsum"],
                        emphasized_text_tags=["i", "b"],
                        # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
                        languages=["eng", "lat"],
                        # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
                        regex_metadata={
                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
                        },
                    ),
                ),
            ],
            opts=ChunkingOptions.new(),
        )

        meta_kwargs = pre_chunk._meta_kwargs

        assert meta_kwargs == {
            "filename": "foo.docx",
            "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
            "emphasized_text_tags": ["b", "i", "i", "b"],
            "languages": ["lat", "eng"],
            "regex_metadata": {
                "ipsum": [
                    RegexMetadata(text="Ipsum", start=6, end=11),
                    RegexMetadata(text="ipsum", start=19, end=24),
                ],
                "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
            },
        }

    @pytest.mark.parametrize(
        ("elements", "expected_value"),
        [
            ([Text("foo"), Text("bar")], "foo\n\nbar"),
            ([Text("foo"), PageBreak(""), Text("bar")], "foo\n\nbar"),
            ([PageBreak(""), Text("foo"), Text("bar")], "foo\n\nbar"),
            ([Text("foo"), Text("bar"), PageBreak("")], "foo\n\nbar"),
        ],
    )
    def it_knows_the_concatenated_text_of_the_pre_chunk(
        self, elements: List[Text], expected_value: str
    ):
        """._text is the "joined" text of the pre-chunk elements.

        The text-segment contributed by each element is separated from the next by a blank line
        ("\n\n"). An element that contributes no text does not give rise to a separator.
        """
        pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
        assert pre_chunk._text == expected_value
class DescribePreChunkBuilder:
    """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`.

    The class under test was renamed from `TextPreChunkBuilder` to `PreChunkBuilder` when it
    moved to the `base` module; this module imports only `PreChunkBuilder`, so the tests must
    reference it by that name.
    """

    def it_is_empty_on_construction(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))

        assert builder.text_length == 0
        assert builder.remaining_space == 50

    def it_accumulates_elements_added_to_it(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        builder.add_element(Title("Introduction"))
        assert builder.text_length == 12
        assert builder.remaining_space == 136

        builder.add_element(
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        )
        assert builder.text_length == 112
        assert builder.remaining_space == 36

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
        builder.add_element(Title("Introduction"))
        builder.add_element(
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        )

        pre_chunk = next(builder.flush())

        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Introduction"),
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                "lectus porta volutpat.",
            ),
        ]
        # -- flushing resets the builder to empty so it can start the next pre-chunk --
        assert builder.text_length == 0
        assert builder.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(builder.flush())

        assert pre_chunks == []
        assert builder.text_length == 0
        assert builder.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
        builder.add_element(Text("abcde"))
        builder.add_element(Text("fghij"))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
        assert builder.text_length == 12
        # -- .remaining_space is reduced by the length (2) of the trailing separator which would go
        # -- between the current text and that of the next element if one was added.
        # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
        assert builder.remaining_space == 36
# == PreChunkCombiner =============================================================================


class DescribePreChunkCombiner:
    """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""

    def it_combines_sequential_small_text_pre_chunks(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- all three small pre-chunks fit under the combination threshold; one combined
        # -- pre-chunk results --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def but_it_does_not_combine_table_pre_chunks(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            ),
            TablePreChunk(Table("Heading\nCell text"), opts=opts),
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(
            pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        ).iter_combined_pre_chunks()

        # -- the table pre-chunk interrupts combination; three pre-chunks pass through --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TablePreChunk)
        assert pre_chunk._table == Table("Heading\nCell text")
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_respects_the_specified_combination_threshold(self):
        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- the first two pre-chunks combine (the second is added while the accumulator is
        # -- still under the 80-char threshold); the third starts a new pre-chunk --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_respects_the_hard_maximum_window_length(self):
        opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
                ],
                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
                ],
                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
                ],
                opts=opts,
            ),
            # -- len == 214
        ]

        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        # -- adding the third pre-chunk would exceed the 200-char hard-max, so combination
        # -- stops after the first two even though the soft threshold was not reached --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),
        ]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)

    def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
        """Such as occurs when a single element exceeds the window size."""
        opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
        pre_chunks = [
            TextPreChunk([Title("Lorem Ipsum")], opts=opts),
            TextPreChunk(  # 179
                [
                    Text(
                        "Lorem ipsum dolor sit amet consectetur adipiscing elit."  # 55
                        " Mauris nec urna non augue vulputate consequat eget et nisi."  # 60
                        " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."  # 64
                    )
                ],
                opts=opts,
            ),
            TextPreChunk([Title("Vulputate Consequat")], opts=opts),
        ]

        pre_chunk_iter = PreChunkCombiner(
            pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
        ).iter_combined_pre_chunks()

        # -- the oversized pre-chunk is isolated; its neighbors are not combined with it --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [Title("Lorem Ipsum")]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Text(
                "Lorem ipsum dolor sit amet consectetur adipiscing elit."
                " Mauris nec urna non augue vulputate consequat eget et nisi."
                " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."
            )
        ]
        # --
        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [Title("Vulputate Consequat")]
        # --
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
class DescribeTextPreChunkAccumulator:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""

    def it_is_empty_on_construction(self):
        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))

        assert accum.text_length == 0
        assert accum.remaining_space == 100

    def it_accumulates_pre_chunks_added_to_it(self):
        opts = ChunkingOptions.new(max_characters=500)
        accum = TextPreChunkAccumulator(opts=opts)

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            )
        )
        assert accum.text_length == 68
        assert accum.remaining_space == 430

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            )
        )
        assert accum.text_length == 141
        assert accum.remaining_space == 357

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
        opts = ChunkingOptions.new(max_characters=150)
        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                ],
                opts=opts,
            )
        )
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
                ],
                opts=opts,
            )
        )
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Sed Orci"),
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
                ],
                opts=opts,
            )
        )

        pre_chunk_iter = accum.flush()

        # -- iterator generates exactly one pre_chunk --
        pre_chunk = next(pre_chunk_iter)
        with pytest.raises(StopIteration):
            next(pre_chunk_iter)
        # -- and it is a TextPreChunk containing all the elements --
        assert isinstance(pre_chunk, TextPreChunk)
        assert pre_chunk._elements == [
            Title("Lorem Ipsum"),
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
            Title("Mauris Nec"),
            Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
            Title("Sed Orci"),
            Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
        ]
        # -- flushing resets the accumulator to empty --
        assert accum.text_length == 0
        assert accum.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(accum.flush())

        assert pre_chunks == []
        assert accum.text_length == 0
        assert accum.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
        opts = ChunkingOptions.new(max_characters=100)
        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
        accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
        assert accum.text_length == 12
        # -- .remaining_space is reduced by the length (2) of the trailing separator which would
        # -- go between the current text and that of the next pre-chunk if one was added.
        # -- So 100 - 12 - 2 = 86 here, not 100 - 12 = 88
        assert accum.remaining_space == 86

View File

@ -1 +1 @@
__version__ = "0.11.5-dev1" # pragma: no cover
__version__ = "0.11.5-dev2" # pragma: no cover

View File

@ -2,12 +2,25 @@
from __future__ import annotations
from typing import Optional
import collections
import copy
from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
from typing_extensions import Self
from typing_extensions import Self, TypeAlias
from unstructured.documents.elements import (
CompositeElement,
ConsolidationStrategy,
Element,
ElementMetadata,
RegexMetadata,
Table,
TableChunk,
)
from unstructured.utils import lazyproperty
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
class ChunkingOptions:
"""Specifies parameters of optional chunking behaviors."""
@ -150,3 +163,404 @@ class ChunkingOptions:
# loop (I think).
if self._overlap >= max_characters:
raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
# ================================================================================================
# PRE-CHUNK SUB-TYPES
# ================================================================================================
class TablePreChunk:
    """A pre-chunk composed of a single Table element."""

    def __init__(self, table: Table, opts: ChunkingOptions) -> None:
        self._table = table
        self._opts = opts

    def iter_chunks(self) -> Iterator[Table | TableChunk]:
        """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
        window = self._opts.hard_max
        remaining_text = self._table.text
        remaining_html = self._table.metadata.text_as_html or ""

        # -- a table small enough to fit the window (both text and HTML) is emitted whole --
        if len(remaining_text) <= window and len(remaining_html) <= window:
            yield self._table
            return

        chunk_count = 0
        while remaining_text or remaining_html:
            # -- peel off the next window-length of text into its own TableChunk --
            chunk = TableChunk(
                text=remaining_text[:window], metadata=copy.deepcopy(self._table.metadata)
            )
            remaining_text = remaining_text[window:]
            # -- Attach up to a window-length of the HTML to the chunk. Note no attempt is made
            # -- to add only the HTML fragment that *corresponds* to the chunk's text fragment.
            if remaining_html:
                chunk.metadata.text_as_html = remaining_html[:window]
                remaining_html = remaining_html[window:]
            # -- every chunk after the first is marked as a continuation --
            if chunk_count:
                chunk.metadata.is_continuation = True
            yield chunk
            chunk_count += 1
class TextPreChunk:
    """A sequence of elements that belong to the same semantic unit within a document.

    The name "section" derives from the idea of a document-section, a heading followed by the
    paragraphs "under" that heading. That structure is not found in all documents and actual section
    content can vary, but that's the concept.

    This object is purposely immutable.
    """

    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
        self._elements = list(elements)
        self._opts = opts

    def __eq__(self, other: Any) -> bool:
        # -- NOTE(review): equality compares elements only; `opts` is deliberately ignored, so
        # -- two pre-chunks with the same elements but different options compare equal.
        if not isinstance(other, TextPreChunk):
            return False
        return self._elements == other._elements

    def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
        """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
        return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)

    def iter_chunks(self) -> Iterator[CompositeElement]:
        """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
        text = self._text
        text_len = len(text)
        maxlen = self._opts.hard_max
        start = 0
        remaining = text_len

        # -- each chunk gets the same consolidated metadata object (shared, not copied) --
        while remaining > 0:
            end = min(start + maxlen, text_len)
            yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
            start = end
            remaining = text_len - end

    @lazyproperty
    def text_length(self) -> int:
        """Length of concatenated text of this pre-chunk, including separators."""
        # -- used by pre-chunk-combiner to identify combination candidates --
        return len(self._text)

    @lazyproperty
    def _all_metadata_values(self) -> Dict[str, List[Any]]:
        """Collection of all populated metadata values across elements.

        The resulting dict has one key for each `ElementMetadata` field that had a non-None value in
        at least one of the elements in this pre-chunk. The value of that key is a list of all those
        populated values, in element order, for example:

            {
                "filename": ["sample.docx", "sample.docx"],
                "languages": [["lat"], ["lat", "eng"]]
                ...
            }

        This preprocessing step provides the input for a specified consolidation strategy that will
        resolve the list of values for each field to a single consolidated value.
        """

        def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
            """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
            return (
                (field_name, value)
                for field_name, value in metadata.known_fields.items()
                if value is not None
            )

        field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)

        # -- collect all non-None field values in a list for each field, in element-order --
        for e in self._elements:
            for field_name, value in iter_populated_fields(e.metadata):
                field_values[field_name].append(value)

        return dict(field_values)

    @lazyproperty
    def _consolidated_metadata(self) -> ElementMetadata:
        """Metadata applicable to this pre-chunk as a single chunk.

        Formed by applying consolidation rules to all metadata fields across the elements of this
        pre-chunk.

        For the sake of consistency, the same rules are applied (for example, for dropping values)
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
        return ElementMetadata(**self._meta_kwargs)

    @lazyproperty
    def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.

        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
        offsets of each regex match are also adjusted for their new positions.
        """
        chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
        separator_len = len(self._opts.text_separator)
        running_text_len = 0
        start_offset = 0

        for element in self._elements:
            text_len = len(element.text)
            # -- skip empty elements like `PageBreak("")` --
            if not text_len:
                continue
            # -- account for blank line between "squashed" elements, but not before first element --
            running_text_len += separator_len if running_text_len else 0
            start_offset = running_text_len
            running_text_len += text_len

            if not element.metadata.regex_metadata:
                continue

            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
            # -- deep-copy so the source element's metadata is not mutated --
            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
            for regex_name, matches in element_regex_metadata.items():
                for m in matches:
                    m["start"] += start_offset
                    m["end"] += start_offset
                chunk_matches = chunk_regex_metadata.get(regex_name, [])
                chunk_matches.extend(matches)
                chunk_regex_metadata[regex_name] = chunk_matches

        return chunk_regex_metadata

    @lazyproperty
    def _meta_kwargs(self) -> Dict[str, Any]:
        """The consolidated metadata values as a dict suitable for constructing ElementMetadata.

        This is where consolidation strategies are actually applied. The output is suitable for use
        in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
        """
        CS = ConsolidationStrategy
        field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
            """Generate (field-name, value) pairs for each field in consolidated metadata."""
            for field_name, values in self._all_metadata_values.items():
                strategy = field_consolidation_strategies.get(field_name)
                if strategy is CS.FIRST:
                    yield field_name, values[0]
                # -- concatenate lists from each element that had one, in order --
                elif strategy is CS.LIST_CONCATENATE:
                    yield field_name, sum(values, cast(List[Any], []))
                # -- union lists from each element, preserving order of appearance --
                elif strategy is CS.LIST_UNIQUE:
                    # -- Python 3.7+ maintains dict insertion order --
                    ordered_unique_keys = {key: None for val_list in values for key in val_list}
                    yield field_name, list(ordered_unique_keys.keys())
                elif strategy is CS.REGEX:
                    yield field_name, self._consolidated_regex_meta
                elif strategy is CS.DROP:
                    continue
                else:
                    # -- not likely to hit this since we have a test in `text_elements.py` that
                    # -- ensures every ElementMetadata fields has an assigned strategy.
                    raise NotImplementedError(
                        f"metadata field {repr(field_name)} has no defined consolidation strategy"
                    )

        return dict(iter_kwarg_pairs())

    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.

        Each element-text is separated from the next by a blank line ("\n\n").
        """
        text_separator = self._opts.text_separator
        return text_separator.join(e.text for e in self._elements if e.text)
# ================================================================================================
# PRE-CHUNKING ACCUMULATORS
# ------------------------------------------------------------------------------------------------
# Accumulators encapsulate the work of grouping elements and later pre-chunks to form the larger
# pre-chunk and combined-pre-chunk items central to unstructured chunking.
# ================================================================================================
class PreChunkBuilder:
    """An element accumulator suitable for incrementally forming a pre-chunk.

    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can use
    to determine whether it should add the next element in the element stream.

    `.flush()` is used to build a PreChunk object from the accumulated elements. This method
    returns an iterator that generates zero-or-one `TextPreChunk` or `TablePreChunk` object and is
    used like so:

        yield from builder.flush()

    If no elements have been accumulated, no `PreChunk` instance is generated. Flushing the builder
    clears the elements it contains so it is ready to build the next pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._separator_len = len(opts.text_separator)
        self._elements: List[Element] = []
        # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
        self._text_segments: List[str] = []
        # -- combined length of text-segments, not including separators --
        self._text_len: int = 0

    def add_element(self, element: Element) -> None:
        """Add `element` to this section."""
        self._elements.append(element)
        if element.text:
            self._text_segments.append(element.text)
            self._text_len += len(element.text)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate zero-or-one `PreChunk` object and clear the accumulator.

        Suitable for use to emit a PreChunk when the maximum size has been reached or a semantic
        boundary has been reached. Also to clear out a terminal pre-chunk at the end of an element
        stream.
        """
        if not self._elements:
            return
        # -- clear builder before yield so we're not sensitive to the timing of how/when this
        # -- iterator is exhausted and can add elements for the next pre-chunk immediately.
        elements = self._elements[:]
        self._elements.clear()
        self._text_segments.clear()
        self._text_len = 0
        yield TextPreChunk(elements, self._opts)

    @property
    def remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- include length of trailing separator that will go before next element text --
        separators_len = self._separator_len * len(self._text_segments)
        return self._opts.hard_max - self._text_len - separators_len

    @property
    def text_length(self) -> int:
        """Length of the text in this pre-chunk.

        This value represents the chunk-size that would result if this pre-chunk was flushed in its
        current state. In particular, it does not include the length of a trailing separator (since
        that would only appear if an additional element was added).

        Not suitable for judging remaining space, use `.remaining_space` for that value.
        """
        # -- number of text separators present in joined text of elements. This includes only
        # -- separators *between* text segments, not one at the end. Note there are zero separators
        # -- for both 0 and 1 text-segments.
        n = len(self._text_segments)
        separator_count = n - 1 if n else 0
        return self._text_len + (separator_count * self._separator_len)
class PreChunkCombiner:
    """Filters pre-chunk stream to combine small pre-chunks where possible."""

    def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
        self._pre_chunks = pre_chunks
        self._opts = opts

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
        soft_max = self._opts.combine_text_under_n_chars
        accum = TextPreChunkAccumulator(self._opts)

        for pre_chunk in self._pre_chunks:
            # -- a table pre-chunk is never combined; emit anything accumulated so far and pass
            # -- the table pre-chunk through unchanged --
            if isinstance(pre_chunk, TablePreChunk):
                yield from accum.flush()
                yield pre_chunk
                continue

            # -- emit the accumulated pre-chunk when it has already reached the combination
            # -- soft-max or when adding this pre-chunk would exceed the hard-max window --
            if accum.text_length >= soft_max or accum.remaining_space < pre_chunk.text_length:
                yield from accum.flush()

            accum.add_pre_chunk(pre_chunk)

        # -- emit any pre-chunk remaining in the accumulator at end of stream --
        yield from accum.flush()
class TextPreChunkAccumulator:
    """Accumulates, measures, and combines pre-chunk objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another pre-chunk.

    `.flush()` combines the accumulated pre-chunks into a single `TextPreChunk` object. It
    returns an iterator that generates zero-or-one `TextPreChunk` objects and is used like so:

        yield from accum.flush()

    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing the
    accumulator clears the pre-chunks it contains so it is ready to accept the next
    text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
        """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
        self._pre_chunks.append(pre_chunk)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate all accumulated pre-chunks as a single combined pre-chunk."""
        accumulated = self._pre_chunks
        # -- nothing to generate when the accumulator is empty --
        if not accumulated:
            return
        # -- fold the accumulated pre-chunks left-to-right into a single pre-chunk --
        combined = accumulated[0]
        for successor in accumulated[1:]:
            combined = combined.combine(successor)
        yield combined
        # -- reset the accumulator (to empty) --
        accumulated.clear()

    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
        maxlen = self._opts.hard_max
        if not self._pre_chunks:
            return maxlen
        # -- an additional pre-chunk will also incur an additional separator --
        return maxlen - self.text_length - len(self._opts.text_separator)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator."""
        count = len(self._pre_chunks)
        if count == 0:
            return 0
        separators_len = len(self._opts.text_separator) * (count - 1)
        return sum(pc.text_length for pc in self._pre_chunks) + separators_len

View File

@ -5,26 +5,20 @@ Main entry point is the `@add_chunking_strategy()` decorator.
from __future__ import annotations
import collections
import copy
from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple, cast
from typing import Iterator, List, Optional
from typing_extensions import TypeAlias
from unstructured.chunking.base import ChunkingOptions
from unstructured.chunking.base import (
ChunkingOptions,
PreChunk,
PreChunkBuilder,
PreChunkCombiner,
TablePreChunk,
)
from unstructured.documents.elements import (
CompositeElement,
ConsolidationStrategy,
Element,
ElementMetadata,
RegexMetadata,
Table,
TableChunk,
Title,
)
from unstructured.utils import lazyproperty
PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"
def chunk_by_title(
@ -78,7 +72,7 @@ def chunk_by_title(
def _split_elements_by_title_and_table(
elements: List[Element], opts: ChunkingOptions
) -> Iterator[TextPreChunk | TablePreChunk]:
) -> Iterator[PreChunk]:
"""Implements "pre-chunker" responsibilities.
A _section_ can be thought of as a "pre-chunk", generally determining the size and contents of a
@ -102,7 +96,7 @@ def _split_elements_by_title_and_table(
A Table or Checkbox element is placed into a pre-chunk by itself.
"""
pre_chunk_builder = TextPreChunkBuilder(opts)
pre_chunk_builder = PreChunkBuilder(opts)
prior_element = None
@ -156,396 +150,3 @@ def _metadata_differs(
if ignore_page_numbers:
return False
return metadata1.page_number != metadata2.page_number
# == PreChunks ===================================================================================
class TablePreChunk:
    """A pre-chunk composed of a single Table element."""

    def __init__(self, table: Table, opts: ChunkingOptions) -> None:
        self._table = table
        self._opts = opts

    def iter_chunks(self) -> Iterator[Table | TableChunk]:
        """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
        remaining_text = self._table.text
        remaining_html = self._table.metadata.text_as_html or ""
        maxlen = self._opts.hard_max

        # -- a table that fits the window in both text and HTML form is emitted whole --
        if len(remaining_text) <= maxlen and len(remaining_html) <= maxlen:
            yield self._table
            return

        emitted_first = False
        while remaining_text or remaining_html:
            # -- peel off up to maxchars of text for the next TableChunk --
            chunk = TableChunk(
                text=remaining_text[:maxlen], metadata=copy.deepcopy(self._table.metadata)
            )
            remaining_text = remaining_text[maxlen:]
            # -- attach maxchars of the HTML to the chunk. Note no attempt is made to add only
            # -- the HTML elements that *correspond* to the TableChunk.text fragment --
            if remaining_html:
                chunk.metadata.text_as_html = remaining_html[:maxlen]
                remaining_html = remaining_html[maxlen:]
            # -- every chunk after the first is flagged as a continuation --
            if emitted_first:
                chunk.metadata.is_continuation = True
            yield chunk
            emitted_first = True
class TextPreChunk:
    """A sequence of elements that belong to the same semantic unit within a document.

    The name "section" derives from the idea of a document-section, a heading followed by the
    paragraphs "under" that heading. That structure is not found in all documents and actual
    section content can vary, but that's the concept.

    This object is purposely immutable.
    """

    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
        # -- take a snapshot copy so later mutation of the caller's sequence cannot affect us --
        self._elements = list(elements)
        self._opts = opts

    def __eq__(self, other: Any) -> bool:
        # -- equality considers only the member elements; chunking options are not compared --
        if not isinstance(other, TextPreChunk):
            return False
        return self._elements == other._elements

    def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
        """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
        return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)

    def iter_chunks(self) -> Iterator[CompositeElement]:
        """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
        text = self._text
        text_len = len(text)
        maxlen = self._opts.hard_max
        start = 0
        remaining = text_len

        # -- each pass emits one chunk of at most `maxlen` characters; every chunk carries the
        # -- same consolidated metadata --
        while remaining > 0:
            end = min(start + maxlen, text_len)
            yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
            start = end
            remaining = text_len - end

    @lazyproperty
    def text_length(self) -> int:
        """Length of concatenated text of this pre-chunk, including separators."""
        # -- used by pre-chunk-combiner to identify combination candidates --
        return len(self._text)

    @lazyproperty
    def _all_metadata_values(self) -> Dict[str, List[Any]]:
        """Collection of all populated metadata values across elements.

        The resulting dict has one key for each `ElementMetadata` field that had a non-None value
        in at least one of the elements in this pre-chunk. The value of that key is a list of all
        those populated values, in element order, for example:

            {
                "filename": ["sample.docx", "sample.docx"],
                "languages": [["lat"], ["lat", "eng"]]
                ...
            }

        This preprocessing step provides the input for a specified consolidation strategy that
        will resolve the list of values for each field to a single consolidated value.
        """

        def iter_populated_fields(metadata: ElementMetadata) -> Iterator[Tuple[str, Any]]:
            """(field_name, value) pair for each non-None field in single `ElementMetadata`."""
            return (
                (field_name, value)
                for field_name, value in metadata.known_fields.items()
                if value is not None
            )

        field_values: DefaultDict[str, List[Any]] = collections.defaultdict(list)

        # -- collect all non-None field values in a list for each field, in element-order --
        for e in self._elements:
            for field_name, value in iter_populated_fields(e.metadata):
                field_values[field_name].append(value)

        return dict(field_values)

    @lazyproperty
    def _consolidated_metadata(self) -> ElementMetadata:
        """Metadata applicable to this pre-chunk as a single chunk.

        Formed by applying consolidation rules to all metadata fields across the elements of this
        pre-chunk.

        For the sake of consistency, the same rules are applied (for example, for dropping values)
        to a single-element pre-chunk too, even though metadata for such a pre-chunk is already
        "consolidated".
        """
        return ElementMetadata(**self._meta_kwargs)

    @lazyproperty
    def _consolidated_regex_meta(self) -> Dict[str, List[RegexMetadata]]:
        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.

        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
        offsets of each regex match are also adjusted for their new positions.
        """
        chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
        separator_len = len(self._opts.text_separator)
        # -- running_text_len tracks the offset of the current element's text within the joined
        # -- pre-chunk text, which is what the match offsets must be rebased against --
        running_text_len = 0
        start_offset = 0

        for element in self._elements:
            text_len = len(element.text)
            # -- skip empty elements like `PageBreak("")` --
            if not text_len:
                continue
            # -- account for blank line between "squashed" elements, but not before first element --
            running_text_len += separator_len if running_text_len else 0
            start_offset = running_text_len
            running_text_len += text_len

            if not element.metadata.regex_metadata:
                continue

            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
            # -- deep-copy so rebasing offsets does not mutate the source element's metadata --
            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
            for regex_name, matches in element_regex_metadata.items():
                for m in matches:
                    m["start"] += start_offset
                    m["end"] += start_offset
                chunk_matches = chunk_regex_metadata.get(regex_name, [])
                chunk_matches.extend(matches)
                chunk_regex_metadata[regex_name] = chunk_matches

        return chunk_regex_metadata

    @lazyproperty
    def _meta_kwargs(self) -> Dict[str, Any]:
        """The consolidated metadata values as a dict suitable for constructing ElementMetadata.

        This is where consolidation strategies are actually applied. The output is suitable for
        use in constructing an `ElementMetadata` object like `ElementMetadata(**self._meta_kwargs)`.
        """
        CS = ConsolidationStrategy
        field_consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        def iter_kwarg_pairs() -> Iterator[Tuple[str, Any]]:
            """Generate (field-name, value) pairs for each field in consolidated metadata."""
            for field_name, values in self._all_metadata_values.items():
                strategy = field_consolidation_strategies.get(field_name)
                if strategy is CS.FIRST:
                    yield field_name, values[0]
                # -- concatenate lists from each element that had one, in order --
                elif strategy is CS.LIST_CONCATENATE:
                    yield field_name, sum(values, cast(List[Any], []))
                # -- union lists from each element, preserving order of appearance --
                elif strategy is CS.LIST_UNIQUE:
                    # -- Python 3.7+ maintains dict insertion order --
                    ordered_unique_keys = {key: None for val_list in values for key in val_list}
                    yield field_name, list(ordered_unique_keys.keys())
                elif strategy is CS.REGEX:
                    yield field_name, self._consolidated_regex_meta
                elif strategy is CS.DROP:
                    continue
                else:
                    # -- not likely to hit this since we have a test in `text_elements.py` that
                    # -- ensures every ElementMetadata fields has an assigned strategy.
                    raise NotImplementedError(
                        f"metadata field {repr(field_name)} has no defined consolidation strategy"
                    )

        return dict(iter_kwarg_pairs())

    @lazyproperty
    def _text(self) -> str:
        """The concatenated text of all elements in this pre-chunk.

        Each element-text is separated from the next by a blank line ("\n\n").
        """
        text_separator = self._opts.text_separator
        return text_separator.join(e.text for e in self._elements if e.text)
class TextPreChunkBuilder:
    """An element accumulator suitable for incrementally forming a pre-chunk.

    Provides monitoring properties like `.remaining_space` and `.text_length` a pre-chunker can
    use to determine whether it should add the next element in the element stream.

    `.flush()` builds a `TextPreChunk` object from the accumulated elements. It returns an
    iterator that generates zero-or-one `TextPreChunk` object and is used like so:

        yield from builder.flush()

    If no elements have been accumulated, no `TextPreChunk` is generated. Flushing the builder
    clears the elements it contains so it is ready to build the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._separator_len = len(opts.text_separator)
        self._elements: List[Element] = []

        # -- only non-empty element text is tracked, e.g. PageBreak.text=="" is not included --
        self._text_segments: List[str] = []
        # -- running total of segment lengths, separators not included --
        self._text_len: int = 0

    def add_element(self, element: Element) -> None:
        """Add `element` to this section."""
        self._elements.append(element)
        element_text = element.text
        if element_text:
            self._text_segments.append(element_text)
            self._text_len += len(element_text)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate zero-or-one `TextPreChunk` object and clear the accumulator.

        Suitable for use to emit a pre-chunk when the maximum size has been reached or a semantic
        boundary has been reached. Also to clear out a terminal pre-chunk at the end of an
        element stream.
        """
        if not self._elements:
            return
        # -- snapshot and reset state before yielding so we're not sensitive to the timing of
        # -- how/when this iterator is exhausted and can accept elements for the next pre-chunk
        # -- immediately --
        flushed_elements = list(self._elements)
        self._elements.clear()
        self._text_segments.clear()
        self._text_len = 0
        yield TextPreChunk(flushed_elements, self._opts)

    @property
    def remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- reserve one separator per existing segment; each will precede the next element text --
        reserved = self._text_len + self._separator_len * len(self._text_segments)
        return self._opts.hard_max - reserved

    @property
    def text_length(self) -> int:
        """Length of the text in this pre-chunk.

        This value represents the chunk-size that would result if this pre-chunk was flushed in
        its current state. In particular, it does not include the length of a trailing separator
        (since that would only appear if an additional element was added).

        Not suitable for judging remaining space, use `.remaining_space` for that value.
        """
        # -- separators appear only *between* segments, so zero separators are counted for both
        # -- the empty and the single-segment case --
        segment_count = len(self._text_segments)
        separators = max(segment_count - 1, 0)
        return self._text_len + separators * self._separator_len
# == PreChunkCombiner ============================================================================
class PreChunkCombiner:
    """Filters pre-chunk stream to combine small pre-chunks where possible."""

    def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
        self._pre_chunks = pre_chunks
        self._opts = opts

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
        accum = TextPreChunkAccumulator(self._opts)
        combination_soft_max = self._opts.combine_text_under_n_chars

        for pre_chunk in self._pre_chunks:
            # -- a table pre-chunk never participates in combination; flush anything pending
            # -- and pass the table pre-chunk straight through --
            if isinstance(pre_chunk, TablePreChunk):
                yield from accum.flush()
                yield pre_chunk
                continue

            # -- flush when the accumulator has reached the combination soft-max, or when this
            # -- pre-chunk would push its contents past the hard window size --
            if (
                accum.text_length >= combination_soft_max
                or accum.remaining_space < pre_chunk.text_length
            ):
                yield from accum.flush()

            accum.add_pre_chunk(pre_chunk)

        # -- emit any text pre-chunk(s) still accumulated at end of stream --
        yield from accum.flush()
class TextPreChunkAccumulator:
    """Accumulates, measures, and combines pre-chunk objects.

    Provides monitoring properties `.remaining_space` and `.text_length` suitable for deciding
    whether to add another pre-chunk.

    `.flush()` combines the accumulated pre-chunks into a single `TextPreChunk` object and
    returns an iterator generating zero-or-one `TextPreChunk`, used like so:

        yield from accum.flush()

    If no pre-chunks have been accumulated, no `TextPreChunk` is generated. Flushing clears the
    contained pre-chunks so the accumulator is ready to accept the next text-pre-chunk.
    """

    def __init__(self, opts: ChunkingOptions) -> None:
        self._opts = opts
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
        """Add a pre-chunk to the accumulator for possible combination with next pre-chunk."""
        self._pre_chunks.append(pre_chunk)

    def flush(self) -> Iterator[TextPreChunk]:
        """Generate all accumulated pre-chunks as a single combined pre-chunk."""
        pending = self._pre_chunks
        # -- nothing to do when no pre-chunks have been accumulated --
        if not pending:
            return
        # -- merge all accumulated pre-chunks, left to right, into a single pre-chunk --
        merged = pending[0]
        for next_pre_chunk in pending[1:]:
            merged = merged.combine(next_pre_chunk)
        yield merged
        # -- and reset the accumulator (to empty) --
        pending.clear()

    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
        maxlen = self._opts.hard_max
        if not self._pre_chunks:
            return maxlen
        # -- an additional pre-chunk will also incur an additional separator --
        return maxlen - self.text_length - len(self._opts.text_separator)

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator."""
        pre_chunk_count = len(self._pre_chunks)
        if not pre_chunk_count:
            return 0
        separators_total = len(self._opts.text_separator) * (pre_chunk_count - 1)
        return sum(pc.text_length for pc in self._pre_chunks) + separators_total