rfctr: extract ChunkingOptions (#2266)

Chunking options for things like chunk-size are largely independent of chunking strategy. Further, validating the args and applying defaults based on call arguments is sophisticated, in order to make the chunker easy to use for the caller. These details distract from what the chunker is actually doing and would need to be repeated for every chunking strategy if left where they are. Extract these settings, and the rules governing chunking behavior based on options, into their own immutable object that can be passed to any component that is subject to optional behavior (pretty much all of them).
2025-12-04 11:10:22 +00:00 · 2023-12-15 11:51:02 -08:00 · 2023-12-15 11:51:02 -08:00 · 70cf141036
commit 70cf141036
parent 8ba1bedfca
7 changed files with 422 additions and 290 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,11 @@
+## 0.11.5-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.11.4

 ### Enhancements
--- a/test_unstructured/chunking/__init__.py
+++ b/test_unstructured/chunking/__init__.py
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@ -0,0 +1,113 @@
+"""Unit-test suite for the `unstructured.chunking.base` module."""
+
+from __future__ import annotations
+
+import pytest
+
+from unstructured.chunking.base import ChunkingOptions
+
+
+class DescribeChunkingOptions:
+    """Unit-test suite for `unstructured.chunking.base.ChunkingOptions` objects."""
+
+    @pytest.mark.parametrize("max_characters", [0, -1, -42])
+    def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
+        with pytest.raises(
+            ValueError,
+            match=f"'max_characters' argument must be > 0, got {max_characters}",
+        ):
+            ChunkingOptions.new(max_characters=max_characters)
+
+    def it_does_not_complain_when_specifying_max_characters_by_itself(self):
+        """Caller can specify `max_characters` arg without specifying any others.
+
+        In particular, when `combine_text_under_n_chars` is not specified it defaults to the value
+        of `max_characters`; it has no fixed default value that can be greater than `max_characters`
+        and trigger an exception.
+        """
+        try:
+            ChunkingOptions.new(max_characters=50)
+        except ValueError:
+            pytest.fail("did not accept `max_characters` as option by itself")
+
+    @pytest.mark.parametrize("n_chars", [-1, -42])
+    def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
+        with pytest.raises(
+            ValueError,
+            match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
+        ):
+            ChunkingOptions.new(combine_text_under_n_chars=n_chars)
+
+    def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
+        """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
+        opts = ChunkingOptions.new(combine_text_under_n_chars=0)
+        assert opts.combine_text_under_n_chars == 0
+
+    def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
+        """Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
+        try:
+            opts = ChunkingOptions.new(combine_text_under_n_chars=50)
+        except ValueError:
+            pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
+
+        assert opts.combine_text_under_n_chars == 50
+
+    def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self):
+        """`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.
+
+        So rather than raising an exception or warning, we just cap that value at `max_characters`
+        which is the behavioral equivalent.
+        """
+        try:
+            opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600)
+        except ValueError:
+            pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`")
+
+        assert opts.combine_text_under_n_chars == 500
+
+    @pytest.mark.parametrize("n_chars", [-1, -42])
+    def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
+        with pytest.raises(
+            ValueError,
+            match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
+        ):
+            ChunkingOptions.new(new_after_n_chars=n_chars)
+
+    def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
+        """Caller can specify `new_after_n_chars` arg without specifying any other options.
+
+        In particular, `combine_text_under_n_chars` value is adjusted down to the
+        `new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
+        value of `new_after_n_chars`.
+        """
+        try:
+            opts = ChunkingOptions.new(new_after_n_chars=200)
+        except ValueError:
+            pytest.fail("did not accept `new_after_n_chars` as option by itself")
+
+        assert opts.soft_max == 200
+        assert opts.combine_text_under_n_chars == 200
+
+    def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self):
+        """Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
+
+        This puts each element into its own chunk, although long chunks are still split.
+        """
+        opts = ChunkingOptions.new(new_after_n_chars=0)
+        assert opts.soft_max == 0
+
+    def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
+        """`new_after_n_chars` > `max_characters` doesn't affect chunking behavior.
+
+        So rather than raising an exception or warning, we just cap that value at `max_characters`
+        which is the behavioral equivalent.
+        """
+        try:
+            opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555)
+        except ValueError:
+            pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
+
+        assert opts.soft_max == 444
+
+    def it_knows_the_text_separator_string(self):
+        assert ChunkingOptions.new().text_separator == "\n\n"
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@ -4,6 +4,7 @@ from typing import List

 import pytest

+from unstructured.chunking.base import ChunkingOptions
 from unstructured.chunking.title import (
    PreChunkCombiner,
    TablePreChunk,
@ -30,141 +31,6 @@ from unstructured.documents.elements import (
 )
 from unstructured.partition.html import partition_html

-# == chunk_by_title() validation behaviors =======================================================
-
-
-@pytest.mark.parametrize("max_characters", [0, -1, -42])
-def test_it_rejects_max_characters_not_greater_than_zero(max_characters: int):
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    with pytest.raises(
-        ValueError,
-        match=f"'max_characters' argument must be > 0, got {max_characters}",
-    ):
-        chunk_by_title(elements, max_characters=max_characters)
-
-
-def test_it_does_not_complain_when_specifying_max_characters_by_itself():
-    """Caller can specify `max_characters` arg without specifying any others.
-
-    In particular, When `combine_text_under_n_chars` is not specified it defaults to the value of
-    `max_characters`; it has no fixed default value that can be greater than `max_characters` and
-    trigger an exception.
-    """
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    try:
-        chunk_by_title(elements, max_characters=50)
-    except ValueError:
-        pytest.fail("did not accept `max_characters` as option by itself")
-
-
-@pytest.mark.parametrize("n_chars", [-1, -42])
-def test_it_rejects_combine_text_under_n_chars_for_n_less_than_zero(n_chars: int):
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    with pytest.raises(
-        ValueError,
-        match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
-    ):
-        chunk_by_title(elements, combine_text_under_n_chars=n_chars)
-
-
-def test_it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining():
-    """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    chunks = chunk_by_title(elements, max_characters=50, combine_text_under_n_chars=0)
-
-    assert chunks == [CompositeElement("Lorem ipsum dolor.")]
-
-
-def test_it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself():
-    """Caller can specify `combine_text_under_n_chars` arg without specifying any other options."""
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    try:
-        chunk_by_title(elements, combine_text_under_n_chars=50)
-    except ValueError:
-        pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
-
-
-def test_it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars():
-    """`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.
-
-    So rather than raising an exception or warning, we just cap that value at `max_characters` which
-    is the behavioral equivalent.
-    """
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    try:
-        chunk_by_title(elements, max_characters=500, combine_text_under_n_chars=600)
-    except ValueError:
-        pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
-
-
-@pytest.mark.parametrize("n_chars", [-1, -42])
-def test_it_rejects_new_after_n_chars_for_n_less_than_zero(n_chars: int):
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    with pytest.raises(
-        ValueError,
-        match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
-    ):
-        chunk_by_title(elements, new_after_n_chars=n_chars)
-
-
-def test_it_does_not_complain_when_specifying_new_after_n_chars_by_itself():
-    """Caller can specify `new_after_n_chars` arg without specifying any other options.
-
-    In particular, `combine_text_under_n_chars` value is adjusted down to the `new_after_n_chars`
-    value when the default for `combine_text_under_n_chars` exceeds the value of
-    `new_after_n_chars`.
-    """
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    try:
-        chunk_by_title(elements, new_after_n_chars=50)
-    except ValueError:
-        pytest.fail("did not accept `new_after_n_chars` as option by itself")
-
-
-def test_it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk():
-    """Specifying `new_after_n_chars=0` places each element into its own pre-chunk.
-
-    This puts each element into its own chunk, although long chunks are still split.
-    """
-    elements: List[Element] = [
-        Text("Lorem"),
-        Text("ipsum"),
-        Text("dolor"),
-    ]
-
-    chunks = chunk_by_title(elements, max_characters=50, new_after_n_chars=0)
-
-    assert chunks == [
-        CompositeElement("Lorem"),
-        CompositeElement("ipsum"),
-        CompositeElement("dolor"),
-    ]
-
-
-def test_it_silently_accepts_new_after_n_chars_greater_than_maxchars():
-    """`new_after_n_chars` > `max_characters` doesn't affect chunking behavior.
-
-    So rather than raising an exception or warning, we just cap that value at `max_characters` which
-    is the behavioral equivalent.
-    """
-    elements: List[Element] = [Text("Lorem ipsum dolor.")]
-
-    try:
-        chunk_by_title(elements, max_characters=500, new_after_n_chars=600)
-    except ValueError:
-        pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")
-
-
-# ================================================================================================
-

 def test_it_splits_a_large_element_into_multiple_chunks():
    elements: List[Element] = [
@ -199,12 +65,7 @@ def test_split_elements_by_title_and_table():
        CheckBox(),
    ]

-    pre_chunks = _split_elements_by_title_and_table(
-        elements,
-        multipage_sections=True,
-        new_after_n_chars=500,
-        max_characters=500,
-    )
+    pre_chunks = _split_elements_by_title_and_table(elements, opts=ChunkingOptions.new())

    pre_chunk = next(pre_chunks)
    assert isinstance(pre_chunk, TextPreChunk)
@ -712,10 +573,11 @@ class DescribeTablePreChunk:
        )
        text_table = "Header Col 1  Header Col 2\n" "Lorem ipsum   adipiscing"
        pre_chunk = TablePreChunk(
-            Table(text_table, metadata=ElementMetadata(text_as_html=html_table))
+            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
+            opts=ChunkingOptions.new(max_characters=175),
        )

-        chunk_iter = pre_chunk.iter_chunks(maxlen=175)
+        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert isinstance(chunk, Table)
@ -757,10 +619,11 @@ class DescribeTablePreChunk:
            "Vivamus quis   nunc ipsum donec ac fermentum"
        )
        pre_chunk = TablePreChunk(
-            Table(text_table, metadata=ElementMetadata(text_as_html=html_table))
+            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
+            opts=ChunkingOptions.new(max_characters=100),
        )

-        chunk_iter = pre_chunk.iter_chunks(maxlen=100)
+        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert isinstance(chunk, TableChunk)
@ -818,17 +681,20 @@ class DescribeTextPreChunk:

        Note that neither the original or other pre_chunk are mutated.
        """
+        opts = ChunkingOptions.new()
        pre_chunk = TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
-            ]
+            ],
+            opts=opts,
        )
        other_pre_chunk = TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
-            ]
+            ],
+            opts=opts,
        )

        new_pre_chunk = pre_chunk.combine(other_pre_chunk)
@ -839,19 +705,22 @@ class DescribeTextPreChunk:
                Text("In rhoncus ipsum sed lectus porta volutpat."),
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
-            ]
+            ],
+            opts=opts,
        )
        assert pre_chunk == TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
                Text("In rhoncus ipsum sed lectus porta volutpat."),
-            ]
+            ],
+            opts=opts,
        )
        assert other_pre_chunk == TextPreChunk(
            [
                Text("Donec semper facilisis metus finibus malesuada."),
                Text("Vivamus magna nibh, blandit eu dui congue, feugiat efficitur velit."),
-            ]
+            ],
+            opts=opts,
        )

    def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
@ -862,10 +731,11 @@ class DescribeTextPreChunk:
                    "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
                    "lectus porta volutpat.",
                ),
-            ]
+            ],
+            opts=ChunkingOptions.new(max_characters=200),
        )

-        chunk_iter = pre_chunk.iter_chunks(maxlen=200)
+        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
@ -885,10 +755,11 @@ class DescribeTextPreChunk:
                    " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
                    " commodo consequat."
                ),
-            ]
+            ],
+            opts=ChunkingOptions.new(max_characters=200),
        )

-        chunk_iter = pre_chunk.iter_chunks(maxlen=200)
+        chunk_iter = pre_chunk.iter_chunks()

        chunk = next(chunk_iter)
        assert chunk == CompositeElement(
@ -907,7 +778,9 @@ class DescribeTextPreChunk:

    def it_knows_the_length_of_the_combined_text_of_its_elements_which_is_the_chunk_size(self):
        """.text_length is the size of chunk this pre-chunk will produce (before any splitting)."""
-        pre_chunk = TextPreChunk([PageBreak(""), Text("foo"), Text("bar")])
+        pre_chunk = TextPreChunk(
+            [PageBreak(""), Text("foo"), Text("bar")], opts=ChunkingOptions.new()
+        )
        assert pre_chunk.text_length == 8

    def it_extracts_all_populated_metadata_values_from_the_elements_to_help(self):
@ -931,7 +804,8 @@ class DescribeTextPreChunk:
                        languages=["lat", "eng"],
                    ),
                ),
-            ]
+            ],
+            opts=ChunkingOptions.new(),
        )

        assert pre_chunk._all_metadata_values == {
@ -967,7 +841,8 @@ class DescribeTextPreChunk:
            [
                Title("Lorem Ipsum", metadata=metadata),
                Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
-            ]
+            ],
+            opts=ChunkingOptions.new(),
        )

        # -- ad-hoc fields "coefficient" and "quotient" do not appear --
@ -1008,7 +883,8 @@ class DescribeTextPreChunk:
                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
                    ),
                ),
-            ]
+            ],
+            opts=ChunkingOptions.new(),
        )

        regex_metadata = pre_chunk._consolidated_regex_meta
@ -1062,7 +938,8 @@ class DescribeTextPreChunk:
                        },
                    ),
                ),
-            ]
+            ],
+            opts=ChunkingOptions.new(),
        )

        meta_kwargs = pre_chunk._meta_kwargs
@ -1098,7 +975,7 @@ class DescribeTextPreChunk:
        The text-segment contributed by each element is separated from the next by a blank line
        ("\n\n"). An element that contributes no text does not give rise to a separator.
        """
-        pre_chunk = TextPreChunk(elements)
+        pre_chunk = TextPreChunk(elements, opts=ChunkingOptions.new())
        assert pre_chunk._text == expected_value


@ -1106,13 +983,13 @@ class DescribeTextPreChunkBuilder:
    """Unit-test suite for `unstructured.chunking.title.TextPreChunkBuilder`."""

    def it_is_empty_on_construction(self):
-        builder = TextPreChunkBuilder(maxlen=50)
+        builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))

        assert builder.text_length == 0
        assert builder.remaining_space == 50

    def it_accumulates_elements_added_to_it(self):
-        builder = TextPreChunkBuilder(maxlen=150)
+        builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        builder.add_element(Title("Introduction"))
        assert builder.text_length == 12
@ -1128,7 +1005,7 @@ class DescribeTextPreChunkBuilder:
        assert builder.remaining_space == 36

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
-        builder = TextPreChunkBuilder(maxlen=150)
+        builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
        builder.add_element(Title("Introduction"))
        builder.add_element(
            Text(
@ -1151,7 +1028,7 @@ class DescribeTextPreChunkBuilder:
        assert builder.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
-        builder = TextPreChunkBuilder(maxlen=150)
+        builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(builder.flush())

@ -1160,7 +1037,7 @@ class DescribeTextPreChunkBuilder:
        assert builder.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
-        builder = TextPreChunkBuilder(maxlen=50)
+        builder = TextPreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
        builder.add_element(Text("abcde"))
        builder.add_element(Text("fghij"))

@ -1180,30 +1057,32 @@ class DescribePreChunkCombiner:
    """Unit-test suite for `unstructured.chunking.title.PreChunkCombiner`."""

    def it_combines_sequential_small_text_pre_chunks(self):
+        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
-                ]
+                ],
+                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
-                ]
+                ],
+                opts=opts,
            ),
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
-                ]
+                ],
+                opts=opts,
            ),
        ]

-        pre_chunk_iter = PreChunkCombiner(
-            pre_chunks, maxlen=250, combine_text_under_n_chars=250
-        ).iter_combined_pre_chunks()
+        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
@ -1219,24 +1098,27 @@ class DescribePreChunkCombiner:
            next(pre_chunk_iter)

    def but_it_does_not_combine_table_pre_chunks(self):
+        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
-                ]
+                ],
+                opts=opts,
            ),
-            TablePreChunk(Table("Heading\nCell text")),
+            TablePreChunk(Table("Heading\nCell text"), opts=opts),
            TextPreChunk(
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
-                ]
+                ],
+                opts=opts,
            ),
        ]

        pre_chunk_iter = PreChunkCombiner(
-            pre_chunks, maxlen=250, combine_text_under_n_chars=250
+            pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
        ).iter_combined_pre_chunks()

        pre_chunk = next(pre_chunk_iter)
@ -1261,31 +1143,33 @@ class DescribePreChunkCombiner:
            next(pre_chunk_iter)

    def it_respects_the_specified_combination_threshold(self):
+        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
-                ]
+                ],
+                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
-                ]
+                ],
+                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
-                ]
+                ],
+                opts=opts,
            ),
        ]

-        pre_chunk_iter = PreChunkCombiner(
-            pre_chunks, maxlen=250, combine_text_under_n_chars=80
-        ).iter_combined_pre_chunks()
+        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
@ -1307,32 +1191,34 @@ class DescribePreChunkCombiner:
            next(pre_chunk_iter)

    def it_respects_the_hard_maximum_window_length(self):
+        opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
        pre_chunks = [
            TextPreChunk(  # 68
                [
                    Title("Lorem Ipsum"),  # 11
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),  # 55
-                ]
+                ],
+                opts=opts,
            ),
            TextPreChunk(  # 71
                [
                    Title("Mauris Nec"),  # 10
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),  # 59
-                ]
+                ],
+                opts=opts,
            ),
            # -- len == 139
            TextPreChunk(
                [
                    Title("Sed Orci"),  # 8
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies."),  # 63
-                ]
+                ],
+                opts=opts,
            ),
            # -- len == 214
        ]

-        pre_chunk_iter = PreChunkCombiner(
-            pre_chunks, maxlen=200, combine_text_under_n_chars=200
-        ).iter_combined_pre_chunks()
+        pre_chunk_iter = PreChunkCombiner(pre_chunks, opts=opts).iter_combined_pre_chunks()

        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
@ -1355,9 +1241,9 @@ class DescribePreChunkCombiner:

    def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
        """Such as occurs when a single element exceeds the window size."""
-
+        opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
        pre_chunks = [
-            TextPreChunk([Title("Lorem Ipsum")]),
+            TextPreChunk([Title("Lorem Ipsum")], opts=opts),
            TextPreChunk(  # 179
                [
                    Text(
@ -1365,13 +1251,14 @@ class DescribePreChunkCombiner:
                        " Mauris nec urna non augue vulputate consequat eget et nisi."  # 60
                        " Sed orci quam, eleifend sit amet vehicula, elementum ultricies."  # 64
                    )
-                ]
+                ],
+                opts=opts,
            ),
-            TextPreChunk([Title("Vulputate Consequat")]),
+            TextPreChunk([Title("Vulputate Consequat")], opts=opts),
        ]

        pre_chunk_iter = PreChunkCombiner(
-            pre_chunks, maxlen=150, combine_text_under_n_chars=150
+            pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
        ).iter_combined_pre_chunks()

        pre_chunk = next(pre_chunk_iter)
@ -1400,20 +1287,22 @@ class DescribeTextPreChunkAccumulator:
    """Unit-test suite for `unstructured.chunking.title.TextPreChunkAccumulator`."""

    def it_is_empty_on_construction(self):
-        accum = TextPreChunkAccumulator(maxlen=100)
+        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=100))

        assert accum.text_length == 0
        assert accum.remaining_space == 100

    def it_accumulates_pre_chunks_added_to_it(self):
-        accum = TextPreChunkAccumulator(maxlen=500)
+        opts = ChunkingOptions.new(max_characters=500)
+        accum = TextPreChunkAccumulator(opts=opts)

        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
-                ]
+                ],
+                opts=opts,
            )
        )
        assert accum.text_length == 68
@ -1424,20 +1313,23 @@ class DescribeTextPreChunkAccumulator:
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
-                ]
+                ],
+                opts=opts,
            )
        )
        assert accum.text_length == 141
        assert accum.remaining_space == 357

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
-        accum = TextPreChunkAccumulator(maxlen=150)
+        opts = ChunkingOptions.new(max_characters=150)
+        accum = TextPreChunkAccumulator(opts=opts)
        accum.add_pre_chunk(
            TextPreChunk(
                [
                    Title("Lorem Ipsum"),
                    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
-                ]
+                ],
+                opts=opts,
            )
        )
        accum.add_pre_chunk(
@ -1445,7 +1337,8 @@ class DescribeTextPreChunkAccumulator:
                [
                    Title("Mauris Nec"),
                    Text("Mauris nec urna non augue vulputate consequat eget et nisi."),
-                ]
+                ],
+                opts=opts,
            )
        )
        accum.add_pre_chunk(
@ -1453,7 +1346,8 @@ class DescribeTextPreChunkAccumulator:
                [
                    Title("Sed Orci"),
                    Text("Sed orci quam, eleifend sit amet vehicula, elementum ultricies quam."),
-                ]
+                ],
+                opts=opts,
            )
        )

@ -1477,7 +1371,7 @@ class DescribeTextPreChunkAccumulator:
        assert accum.remaining_space == 150

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
-        accum = TextPreChunkAccumulator(maxlen=150)
+        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))

        pre_chunks = list(accum.flush())

@ -1486,9 +1380,10 @@ class DescribeTextPreChunkAccumulator:
        assert accum.remaining_space == 150

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
-        accum = TextPreChunkAccumulator(maxlen=100)
-        accum.add_pre_chunk(TextPreChunk([Text("abcde")]))
-        accum.add_pre_chunk(TextPreChunk([Text("fghij")]))
+        opts = ChunkingOptions.new(max_characters=100)
+        accum = TextPreChunkAccumulator(opts=opts)
+        accum.add_pre_chunk(TextPreChunk([Text("abcde")], opts=opts))
+        accum.add_pre_chunk(TextPreChunk([Text("fghij")], opts=opts))

        # -- .text_length includes a separator ("\n\n", len==2) between each text-segment,
        # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.11.4"  # pragma: no cover
+__version__ = "0.11.5-dev0"  # pragma: no cover
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@ -0,0 +1,152 @@
+"""Chunking objects not specific to a particular chunking strategy."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from typing_extensions import Self
+
+from unstructured.utils import lazyproperty
+
+
+class ChunkingOptions:
+    """Specifies parameters of optional chunking behaviors."""
+
+    def __init__(
+        self,
+        combine_text_under_n_chars: Optional[int] = None,
+        max_characters: int = 500,
+        multipage_sections: bool = True,
+        new_after_n_chars: Optional[int] = None,
+        overlap: int = 0,
+    ):
+        self._combine_text_under_n_chars_arg = combine_text_under_n_chars
+        self._max_characters = max_characters
+        self._multipage_sections = multipage_sections
+        self._new_after_n_chars_arg = new_after_n_chars
+        self._overlap = overlap
+
+    @classmethod
+    def new(
+        cls,
+        combine_text_under_n_chars: Optional[int] = None,
+        max_characters: int = 500,
+        multipage_sections: bool = True,
+        new_after_n_chars: Optional[int] = None,
+        overlap: int = 0,
+    ) -> Self:
+        """Construct validated instance.
+
+        Raises `ValueError` on invalid arguments like overlap >= max_characters.
+        """
+        self = cls(
+            combine_text_under_n_chars,
+            max_characters,
+            multipage_sections,
+            new_after_n_chars,
+            overlap,
+        )
+        self._validate()
+        return self
+
+    @lazyproperty
+    def combine_text_under_n_chars(self) -> int:
+        """Combine consecutive text pre-chunks if former is smaller than this and both will fit.
+
+        - Does not combine table chunks with text chunks even if they would both fit in the
+          chunking window.
+        - Does not combine text chunks if together they would exceed the chunking window.
+        - Defaults to `max_characters` when not specified.
+        - Is reduced to `new_after_n_chars` when it exceeds that value.
+        """
+        max_characters = self._max_characters
+        soft_max = self.soft_max
+        arg = self._combine_text_under_n_chars_arg
+
+        # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is
+        # -- capped at max-chars
+        combine_text_under_n_chars = max_characters if arg is None or arg > max_characters else arg
+
+        # -- `new_after_n_chars` takes precedence on conflict with `combine_text_under_n_chars` --
+        return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
+
+    @lazyproperty
+    def hard_max(self) -> int:
+        """The maximum size for a chunk.
+
+        A pre-chunk will only exceed this size when it contains exactly one element which by itself
+        exceeds this size. Such a pre-chunk is subject to mid-text splitting later in the chunking
+        process.
+        """
+        return self._max_characters
+
+    @lazyproperty
+    def multipage_sections(self) -> bool:
+        """When False, break pre-chunks on page-boundaries."""
+        return self._multipage_sections
+
+    @lazyproperty
+    def overlap(self) -> int:
+        """The number of characters to overlap text when splitting chunks mid-text.
+
+        The actual overlap will not exceed this number of characters but may be less as required to
+        respect splitting-character boundaries.
+        """
+        return self._overlap
+
+    @lazyproperty
+    def soft_max(self) -> int:
+        """A pre-chunk of this size or greater is considered full.
+
+        ??? Is a value of 0 valid? It would produce the behavior: "put each element into its own
+        chunk".
+        """
+        max_chars = self._max_characters
+        new_after_n_chars = self._new_after_n_chars_arg
+        return (
+            max_chars
+            if (new_after_n_chars is None or new_after_n_chars < 0 or new_after_n_chars > max_chars)
+            else new_after_n_chars
+        )
+
+    @lazyproperty
+    def text_separator(self) -> str:
+        """The string to insert between elements when concatenating their text for a chunk.
+
+        Right now this is just "\n\n" (a blank line in plain text), but having this here rather
+        than as a module-level constant provides a way for us to easily make it user-configurable
+        in future if we want to.
+        """
+        return "\n\n"
+
+    def _validate(self) -> None:
+        """Raise ValueError if requested option-set is invalid."""
+        max_characters = self._max_characters
+        # -- chunking window must have positive length --
+        if max_characters <= 0:
+            raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}")
+
+        # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
+        # -- but a negative value is not
+        combine_text_under_n_chars = self._combine_text_under_n_chars_arg
+        if combine_text_under_n_chars is not None and combine_text_under_n_chars < 0:
+            raise ValueError(
+                f"'combine_text_under_n_chars' argument must be >= 0,"
+                f" got {combine_text_under_n_chars}"
+            )
+
+        # -- a negative value for `new_after_n_chars` is assumed to
+        # -- be a mistake the caller will want to know about
+        new_after_n_chars = self._new_after_n_chars_arg
+        if new_after_n_chars is not None and new_after_n_chars < 0:
+            raise ValueError(
+                f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}"
+            )
+
+        # -- overlap must be less than max-chars or the chunk text will
+        # -- never be consumed
+        # TODO: consider a heuristic like never overlap more than half,
+        # otherwise there could be corner cases leading to an infinite
+        # loop (I think).
+        if self._overlap >= max_characters:
+            raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@ -11,6 +11,7 @@ from typing import Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, T

 from typing_extensions import TypeAlias

+from unstructured.chunking.base import ChunkingOptions
 from unstructured.documents.elements import (
    CompositeElement,
    ConsolidationStrategy,
@ -25,9 +26,6 @@ from unstructured.utils import lazyproperty

 PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"

-# -- goes between text of each element when element-text is concatenated to form chunk --
-TEXT_SEPARATOR = "\n\n"
-

 def chunk_by_title(
    elements: List[Element],
@ -64,57 +62,22 @@ def chunk_by_title(
        Chunks elements text and text_as_html (if present) into chunks of length
        n characters (hard max)
    """
-
-    # -- validation and arg pre-processing ---------------------------
-
-    # -- chunking window must have positive length --
-    if max_characters <= 0:
-        raise ValueError(f"'max_characters' argument must be > 0, got {max_characters}")
-
-    # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is
-    # -- capped at max-chars
-    if combine_text_under_n_chars is None or combine_text_under_n_chars > max_characters:
-        combine_text_under_n_chars = max_characters
-
-    # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
-    # -- but a negative value is not
-    if combine_text_under_n_chars < 0:
-        raise ValueError(
-            f"'combine_text_under_n_chars' argument must be >= 0, got {combine_text_under_n_chars}",
-        )
-
-    # -- same with `new_after_n_chars` --
-    if new_after_n_chars is None or new_after_n_chars > max_characters:
-        new_after_n_chars = max_characters
-
-    if new_after_n_chars < 0:
-        raise ValueError(f"'new_after_n_chars' argument must be >= 0, got {new_after_n_chars}")
-
-    # -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
-    if combine_text_under_n_chars > new_after_n_chars:
-        combine_text_under_n_chars = new_after_n_chars
-
-    # ----------------------------------------------------------------
+    opts = ChunkingOptions.new(
+        combine_text_under_n_chars=combine_text_under_n_chars,
+        max_characters=max_characters,
+        multipage_sections=multipage_sections,
+        new_after_n_chars=new_after_n_chars,
+    )

    pre_chunks = PreChunkCombiner(
-        _split_elements_by_title_and_table(
-            elements,
-            multipage_sections=multipage_sections,
-            new_after_n_chars=new_after_n_chars,
-            max_characters=max_characters,
-        ),
-        max_characters,
-        combine_text_under_n_chars,
+        _split_elements_by_title_and_table(elements, opts), opts=opts
    ).iter_combined_pre_chunks()

-    return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks(max_characters)]
+    return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]


 def _split_elements_by_title_and_table(
-    elements: List[Element],
-    multipage_sections: bool,
-    new_after_n_chars: int,
-    max_characters: int,
+    elements: List[Element], opts: ChunkingOptions
 ) -> Iterator[TextPreChunk | TablePreChunk]:
    """Implements "pre-chunker" responsibilities.

@ -139,13 +102,13 @@ def _split_elements_by_title_and_table(

    A Table or Checkbox element is placed into a pre-chunk by itself.
    """
-    pre_chunk_builder = TextPreChunkBuilder(max_characters)
+    pre_chunk_builder = TextPreChunkBuilder(opts)

    prior_element = None

    for element in elements:
        metadata_differs = (
-            _metadata_differs(element, prior_element, ignore_page_numbers=multipage_sections)
+            _metadata_differs(element, prior_element, ignore_page_numbers=opts.multipage_sections)
            if prior_element
            else False
        )
@ -157,7 +120,7 @@ def _split_elements_by_title_and_table(
            # -- adding this element would exceed hard-maxlen for pre_chunk --
            or pre_chunk_builder.remaining_space < len(str(element))
            # -- pre_chunk already meets or exceeds soft-maxlen --
-            or pre_chunk_builder.text_length >= new_after_n_chars
+            or pre_chunk_builder.text_length >= opts.soft_max
            # -- a semantic boundary is indicated by metadata change since prior element --
            or metadata_differs
        ):
@ -166,7 +129,7 @@ def _split_elements_by_title_and_table(

        # -- emit table and checkbox immediately since they are always isolated --
        if isinstance(element, Table):
-            yield TablePreChunk(table=element)
+            yield TablePreChunk(table=element, opts=opts)
        # -- but accumulate text elements for consolidation into a composite chunk --
        else:
            pre_chunk_builder.add_element(element)
@ -201,13 +164,15 @@ def _metadata_differs(
 class TablePreChunk:
    """A pre-chunk composed of a single Table element."""

-    def __init__(self, table: Table) -> None:
+    def __init__(self, table: Table, opts: ChunkingOptions) -> None:
        self._table = table
+        self._opts = opts

-    def iter_chunks(self, maxlen: int) -> Iterator[Table | TableChunk]:
+    def iter_chunks(self) -> Iterator[Table | TableChunk]:
        """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
        text = self._table.text
        html = self._table.metadata.text_as_html or ""
+        maxlen = self._opts.hard_max

        # -- only chunk a table when it's too big to swallow whole --
        if len(text) <= maxlen and len(html) <= maxlen:
@ -246,8 +211,9 @@ class TextPreChunk:
    This object is purposely immutable.
    """

-    def __init__(self, elements: Iterable[Element]) -> None:
+    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions) -> None:
        self._elements = list(elements)
+        self._opts = opts

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, TextPreChunk):
@ -256,12 +222,13 @@ class TextPreChunk:

    def combine(self, other_pre_chunk: TextPreChunk) -> TextPreChunk:
        """Return new `TextPreChunk` that combines this and `other_pre_chunk`."""
-        return TextPreChunk(self._elements + other_pre_chunk._elements)
+        return TextPreChunk(self._elements + other_pre_chunk._elements, opts=self._opts)

-    def iter_chunks(self, maxlen: int) -> Iterator[CompositeElement]:
+    def iter_chunks(self) -> Iterator[CompositeElement]:
        """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
        text = self._text
        text_len = len(text)
+        maxlen = self._opts.hard_max
        start = 0
        remaining = text_len

@ -333,6 +300,7 @@ class TextPreChunk:
        offsets of each regex match are also adjusted for their new positions.
        """
        chunk_regex_metadata: Dict[str, List[RegexMetadata]] = {}
+        separator_len = len(self._opts.text_separator)
        running_text_len = 0
        start_offset = 0

@ -342,7 +310,7 @@ class TextPreChunk:
            if not text_len:
                continue
            # -- account for blank line between "squashed" elements, but not before first element --
-            running_text_len += len(TEXT_SEPARATOR) if running_text_len else 0
+            running_text_len += separator_len if running_text_len else 0
            start_offset = running_text_len
            running_text_len += text_len

@ -404,7 +372,8 @@ class TextPreChunk:

        Each element-text is separated from the next by a blank line ("\n\n").
        """
-        return TEXT_SEPARATOR.join(e.text for e in self._elements if e.text)
+        text_separator = self._opts.text_separator
+        return text_separator.join(e.text for e in self._elements if e.text)


 class TextPreChunkBuilder:
@ -422,14 +391,11 @@ class TextPreChunkBuilder:
    clears the elements it contains so it is ready to build the next text-pre-chunk.
    """

-    def __init__(self, maxlen: int) -> None:
-        self._maxlen = maxlen
-        self._separator_len = len(TEXT_SEPARATOR)
+    def __init__(self, opts: ChunkingOptions) -> None:
+        self._opts = opts
+        self._separator_len = len(opts.text_separator)
        self._elements: List[Element] = []

-        # -- these mutable working values probably represent premature optimization but improve
-        # -- performance and I expect will be welcome when processing a million elements
-
        # -- only includes non-empty element text, e.g. PageBreak.text=="" is not included --
        self._text_segments: List[str] = []
        # -- combined length of text-segments, not including separators --
@ -457,14 +423,14 @@ class TextPreChunkBuilder:
        self._elements.clear()
        self._text_segments.clear()
        self._text_len = 0
-        yield TextPreChunk(elements)
+        yield TextPreChunk(elements, self._opts)

    @property
    def remaining_space(self) -> int:
        """Maximum text-length of an element that can be added without exceeding maxlen."""
        # -- include length of trailing separator that will go before next element text --
        separators_len = self._separator_len * len(self._text_segments)
-        return self._maxlen - self._text_len - separators_len
+        return self._opts.hard_max - self._text_len - separators_len

    @property
    def text_length(self) -> int:
@ -490,19 +456,14 @@ class TextPreChunkBuilder:
 class PreChunkCombiner:
    """Filters pre-chunk stream to combine small pre-chunks where possible."""

-    def __init__(
-        self,
-        pre_chunks: Iterable[PreChunk],
-        maxlen: int,
-        combine_text_under_n_chars: int,
-    ):
+    def __init__(self, pre_chunks: Iterable[PreChunk], opts: ChunkingOptions):
        self._pre_chunks = pre_chunks
-        self._maxlen = maxlen
-        self._combine_text_under_n_chars = combine_text_under_n_chars
+        self._opts = opts

    def iter_combined_pre_chunks(self) -> Iterator[PreChunk]:
        """Generate pre-chunk objects, combining TextPreChunk objects when they'll fit in window."""
-        accum = TextPreChunkAccumulator(self._maxlen)
+        accum = TextPreChunkAccumulator(self._opts)
+        combine_text_under_n_chars = self._opts.combine_text_under_n_chars

        for pre_chunk in self._pre_chunks:
            # -- start new pre-chunk under these conditions --
@ -510,7 +471,7 @@ class PreChunkCombiner:
                # -- a table pre-chunk is never combined --
                isinstance(pre_chunk, TablePreChunk)
                # -- don't add another pre-chunk once length has reached combination soft-max --
-                or accum.text_length >= self._combine_text_under_n_chars
+                or accum.text_length >= combine_text_under_n_chars
                # -- combining would exceed hard-max --
                or accum.remaining_space < pre_chunk.text_length
            ):
@ -541,8 +502,8 @@ class TextPreChunkAccumulator:
    clears the pre-chunks it contains so it is ready to accept the next text-pre-chunk.
    """

-    def __init__(self, maxlen: int) -> None:
-        self._maxlen = maxlen
+    def __init__(self, opts: ChunkingOptions) -> None:
+        self._opts = opts
        self._pre_chunks: List[TextPreChunk] = []

    def add_pre_chunk(self, pre_chunk: TextPreChunk) -> None:
@ -569,19 +530,22 @@ class TextPreChunkAccumulator:
    @property
    def remaining_space(self) -> int:
        """Maximum size of pre-chunk that can be added without exceeding maxlen."""
+        maxlen = self._opts.hard_max
        return (
-            self._maxlen
+            maxlen
            if not self._pre_chunks
            # -- an additional pre-chunk will also incur an additional separator --
-            else self._maxlen - self.text_length - len(TEXT_SEPARATOR)
+            else maxlen - self.text_length - len(self._opts.text_separator)
        )

    @property
    def text_length(self) -> int:
        """Size of concatenated text in all pre-chunks in accumulator."""
        n = len(self._pre_chunks)
-        return (
-            0
-            if n == 0
-            else sum(s.text_length for s in self._pre_chunks) + len(TEXT_SEPARATOR) * (n - 1)
-        )
+
+        if n == 0:
+            return 0
+
+        total_text_length = sum(s.text_length for s in self._pre_chunks)
+        total_separator_length = len(self._opts.text_separator) * (n - 1)
+        return total_text_length + total_separator_length