rfctr(chunking): extract strategy-specific chunking options (#2556)

**Summary** A pluggable chunking strategy needs its own local set of chunking options that subclasses a base-class in `unstructured`. Extract distinct `_ByTitleChunkingOptions` and `_BasicChunkingOptions` for the existing two chunking strategies and move their strategy-specific option setting and validation to the respective subclass. This was also a good opportunity for us to clean up a few odds and ends we'd been meaning to. Might be worth looking at the commits individually as they are cohesive incremental steps toward the goal.
2025-11-02 11:03:38 +00:00 · 2024-02-23 10:22:44 -08:00 · 2024-02-23 10:22:44 -08:00 · 51cf6bf716
commit 51cf6bf716
parent b4d9ad8130
6 changed files with 340 additions and 205 deletions
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@ -4,15 +4,15 @@

 from __future__ import annotations

-from typing import Sequence
+from typing import Optional, Sequence

 import pytest

 from unstructured.chunking.base import (
-    BasePreChunker,
    ChunkingOptions,
    PreChunkBuilder,
    PreChunkCombiner,
+    PreChunker,
    TablePreChunk,
    TextPreChunk,
    TextPreChunkAccumulator,
@ -48,7 +48,7 @@ class DescribeChunkingOptions:
            ValueError,
            match=f"'max_characters' argument must be > 0, got {max_characters}",
        ):
-            ChunkingOptions.new(max_characters=max_characters)
+            ChunkingOptions(max_characters=max_characters)._validate()

    def it_does_not_complain_when_specifying_max_characters_by_itself(self):
        """Caller can specify `max_characters` arg without specifying any others.
@ -58,44 +58,25 @@ class DescribeChunkingOptions:
        and trigger an exception.
        """
        try:
-            ChunkingOptions.new(max_characters=50)
+            ChunkingOptions(max_characters=50)._validate()
        except ValueError:
            pytest.fail("did not accept `max_characters` as option by itself")

-    @pytest.mark.parametrize("n_chars", [-1, -42])
-    def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
-        with pytest.raises(
-            ValueError,
-            match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
-        ):
-            ChunkingOptions.new(combine_text_under_n_chars=n_chars)
+    @pytest.mark.parametrize(
+        ("combine_text_under_n_chars", "expected_value"), [(None, 0), (42, 42)]
+    )
+    def it_accepts_combine_text_under_n_chars_in_constructor_but_defaults_to_no_combining(
+        self, combine_text_under_n_chars: Optional[int], expected_value: int
+    ):
+        """Subclasses can store `combine_text_under_n_chars` but must validate and enable it.

-    def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
-        """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
-        opts = ChunkingOptions.new(combine_text_under_n_chars=0)
-        assert opts.combine_text_under_n_chars == 0
-
-    def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
-        """Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
-        try:
-            opts = ChunkingOptions.new(combine_text_under_n_chars=50)
-        except ValueError:
-            pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
-
-        assert opts.combine_text_under_n_chars == 50
-
-    def it_silently_accepts_combine_text_under_n_chars_greater_than_maxchars(self):
-        """`combine_text_under_n_chars` > `max_characters` doesn't affect chunking behavior.
-
-        So rather than raising an exception or warning, we just cap that value at `max_characters`
-        which is the behavioral equivalent.
+        The `combine_text_under_n_chars` option is not used by all chunkers and its behavior can
+        differ between subtypes. It is present in and stored by the contructur but it defaults to
+        `0` (no pre-chunk combining) and must be overridden by subclasses to give it the desired
+        behavior.
        """
-        try:
-            opts = ChunkingOptions.new(max_characters=500, combine_text_under_n_chars=600)
-        except ValueError:
-            pytest.fail("did not accept `combine_text_under_n_chars` greater than `max_characters`")
-
-        assert opts.combine_text_under_n_chars == 500
+        opts = ChunkingOptions(combine_text_under_n_chars=combine_text_under_n_chars)
+        assert opts.combine_text_under_n_chars == expected_value

    @pytest.mark.parametrize("n_chars", [-1, -42])
    def it_rejects_new_after_n_chars_for_n_less_than_zero(self, n_chars: int):
@ -103,7 +84,7 @@ class DescribeChunkingOptions:
            ValueError,
            match=f"'new_after_n_chars' argument must be >= 0, got {n_chars}",
        ):
-            ChunkingOptions.new(new_after_n_chars=n_chars)
+            ChunkingOptions(new_after_n_chars=n_chars)._validate()

    def it_rejects_overlap_not_less_than_max_characters(self):
        with pytest.raises(
@ -113,26 +94,23 @@ class DescribeChunkingOptions:
            ChunkingOptions(max_characters=200, overlap=300)._validate()

    def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
-        """Caller can specify `new_after_n_chars` arg without specifying any other options.
-
-        In particular, `combine_text_under_n_chars` value is adjusted down to the
-        `new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
-        value of `new_after_n_chars`.
-        """
+        """Caller can specify `new_after_n_chars` arg without specifying any other options."""
+        opts = ChunkingOptions(new_after_n_chars=200)
        try:
-            opts = ChunkingOptions.new(new_after_n_chars=200)
+            opts._validate()
        except ValueError:
            pytest.fail("did not accept `new_after_n_chars` as option by itself")

        assert opts.soft_max == 200
-        assert opts.combine_text_under_n_chars == 200

    def it_accepts_0_for_new_after_n_chars_to_put_each_element_into_its_own_chunk(self):
        """Specifying `new_after_n_chars=0` places each element into its own pre-chunk.

        This puts each element into its own chunk, although long chunks are still split.
        """
-        opts = ChunkingOptions.new(new_after_n_chars=0)
+        opts = ChunkingOptions(new_after_n_chars=0)
+        opts._validate()
+
        assert opts.soft_max == 0

    def it_silently_accepts_new_after_n_chars_greater_than_maxchars(self):
@ -141,33 +119,32 @@ class DescribeChunkingOptions:
        So rather than raising an exception or warning, we just cap that value at `max_characters`
        which is the behavioral equivalent.
        """
+        opts = ChunkingOptions(max_characters=444, new_after_n_chars=555)
        try:
-            opts = ChunkingOptions.new(max_characters=444, new_after_n_chars=555)
+            opts._validate()
        except ValueError:
            pytest.fail("did not accept `new_after_n_chars` greater than `max_characters`")

        assert opts.soft_max == 444

    def it_knows_how_much_overlap_to_apply_to_split_chunks(self):
-        assert ChunkingOptions.new(overlap=10).overlap == 10
+        assert ChunkingOptions(overlap=10).overlap == 10

    def and_it_uses_the_same_value_for_inter_chunk_overlap_when_asked_to_overlap_all_chunks(self):
-        assert ChunkingOptions.new(overlap=10, overlap_all=True).inter_chunk_overlap == 10
+        assert ChunkingOptions(overlap=10, overlap_all=True).inter_chunk_overlap == 10

    def but_it_does_not_overlap_pre_chunks_by_default(self):
-        assert ChunkingOptions.new(overlap=10).inter_chunk_overlap == 0
+        assert ChunkingOptions(overlap=10).inter_chunk_overlap == 0

    def it_knows_the_text_separator_string(self):
-        assert ChunkingOptions.new().text_separator == "\n\n"
+        assert ChunkingOptions().text_separator == "\n\n"


 class Describe_TextSplitter:
    """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""

    def it_splits_on_a_preferred_separator_when_it_can(self):
-        opts = ChunkingOptions.new(
-            max_characters=50, text_splitting_separators=("\n", " "), overlap=10
-        )
+        opts = ChunkingOptions(max_characters=50, text_splitting_separators=("\n", " "), overlap=10)
        split = _TextSplitter(opts)
        text = (
            "Lorem ipsum dolor amet consectetur adipiscing.  \n  "
@ -189,9 +166,7 @@ class Describe_TextSplitter:
        assert remainder == ""

    def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
-        opts = ChunkingOptions.new(
-            max_characters=40, text_splitting_separators=("\n", " "), overlap=10
-        )
+        opts = ChunkingOptions(max_characters=40, text_splitting_separators=("\n", " "), overlap=10)
        split = _TextSplitter(opts)
        text = (
            "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
@ -211,9 +186,7 @@ class Describe_TextSplitter:
        assert remainder == ""

    def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
-        opts = ChunkingOptions.new(
-            max_characters=30, text_splitting_separators=("\n", " "), overlap=10
-        )
+        opts = ChunkingOptions(max_characters=30, text_splitting_separators=("\n", " "), overlap=10)
        split = _TextSplitter(opts)
        text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."

@ -237,7 +210,7 @@ class Describe_TextSplitter:
        ],
    )
    def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
-        opts = ChunkingOptions.new(max_characters=46, overlap=10)
+        opts = ChunkingOptions(max_characters=46, overlap=10)
        split = _TextSplitter(opts)

        s, remainder = split(text)
@ -246,7 +219,7 @@ class Describe_TextSplitter:
        assert remainder == ""

    def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
-        opts = ChunkingOptions.new(max_characters=38, overlap=10)
+        opts = ChunkingOptions(max_characters=38, overlap=10)
        split = _TextSplitter(opts)
        text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."

@ -257,9 +230,7 @@ class Describe_TextSplitter:

    @pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
    def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
-        opts = ChunkingOptions.new(
-            max_characters=50, text_splitting_separators=separators, overlap=10
-        )
+        opts = ChunkingOptions(max_characters=50, text_splitting_separators=separators, overlap=10)
        split = _TextSplitter(opts)
        text = "Lorem ipsum dolor amet consectetur adipiscing.   \n\n In rhoncus ipsum sed lectus."
        #       |-------------------------------------------------^  50-chars
@ -271,12 +242,12 @@ class Describe_TextSplitter:


 # ================================================================================================
-# BASE PRE-CHUNKER
+# PRE-CHUNKER
 # ================================================================================================


-class DescribeBasePreChunker:
-    """Unit-test suite for `unstructured.chunking.base.BasePreChunker` objects."""
+class DescribePreChunker:
+    """Unit-test suite for `unstructured.chunking.base.PreChunker` objects."""

    def it_gathers_elements_into_pre_chunks_respecting_the_specified_chunk_size(self):
        elements = [
@ -289,9 +260,9 @@ class DescribeBasePreChunker:
            CheckBox(),
        ]

-        opts = ChunkingOptions.new(max_characters=150, new_after_n_chars=65)
+        opts = ChunkingOptions(max_characters=150, new_after_n_chars=65)

-        pre_chunk_iter = BasePreChunker.iter_pre_chunks(elements, opts=opts)
+        pre_chunk_iter = PreChunker.iter_pre_chunks(elements, opts=opts)

        pre_chunk = next(pre_chunk_iter)
        assert isinstance(pre_chunk, TextPreChunk)
@ -344,7 +315,7 @@ class DescribeTablePreChunk:
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            overlap_prefix="ctus porta volutpat.",
-            opts=ChunkingOptions.new(max_characters=175),
+            opts=ChunkingOptions(max_characters=175),
        )

        chunk_iter = pre_chunk.iter_chunks()
@ -393,7 +364,7 @@ class DescribeTablePreChunk:
        pre_chunk = TablePreChunk(
            Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
            overlap_prefix="",
-            opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")),
+            opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
        )

        chunk_iter = pre_chunk.iter_chunks()
@ -457,7 +428,7 @@ class DescribeTablePreChunk:
        self, text: str, expected_value: str
    ):
        pre_chunk = TablePreChunk(
-            Table(text), overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True)
+            Table(text), overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
        )
        assert pre_chunk.overlap_tail == expected_value

@ -480,7 +451,7 @@ class DescribeTablePreChunk:
        self, text: str, overlap_prefix: str, expected_value: str
    ):
        pre_chunk = TablePreChunk(
-            Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
+            Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
        )
        assert pre_chunk._text == expected_value

@ -536,7 +507,7 @@ class DescribeTextPreChunk:
        self, max_characters: int, combine_text_under_n_chars: int, expected_value: bool
    ):
        """This allows `PreChunkCombiner` to operate without knowing `TextPreChunk` internals."""
-        opts = ChunkingOptions.new(
+        opts = ChunkingOptions(
            max_characters=max_characters,
            combine_text_under_n_chars=combine_text_under_n_chars,
            overlap=20,
@ -560,7 +531,7 @@ class DescribeTextPreChunk:

        Note that neither the original or other pre_chunk are mutated.
        """
-        opts = ChunkingOptions.new()
+        opts = ChunkingOptions()
        pre_chunk = TextPreChunk(
            [
                Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
@ -623,7 +594,7 @@ class DescribeTextPreChunk:
                ),
            ],
            overlap_prefix="e feugiat efficitur.",
-            opts=ChunkingOptions.new(max_characters=200),
+            opts=ChunkingOptions(max_characters=200),
        )

        chunk_iter = pre_chunk.iter_chunks()
@ -648,7 +619,7 @@ class DescribeTextPreChunk:
                ),
            ],
            overlap_prefix="",
-            opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")),
+            opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
        )

        chunk_iter = pre_chunk.iter_chunks()
@ -681,7 +652,7 @@ class DescribeTextPreChunk:
        self, text: str, expected_value: str
    ):
        pre_chunk = TextPreChunk(
-            [Text(text)], overlap_prefix="", opts=ChunkingOptions.new(overlap=20, overlap_all=True)
+            [Text(text)], overlap_prefix="", opts=ChunkingOptions(overlap=20, overlap_all=True)
        )
        assert pre_chunk.overlap_tail == expected_value

@ -708,7 +679,7 @@ class DescribeTextPreChunk:
                ),
            ],
            overlap_prefix="",
-            opts=ChunkingOptions.new(),
+            opts=ChunkingOptions(),
        )

        assert pre_chunk._all_metadata_values == {
@ -746,7 +717,7 @@ class DescribeTextPreChunk:
                Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata_2),
            ],
            overlap_prefix="",
-            opts=ChunkingOptions.new(),
+            opts=ChunkingOptions(),
        )

        # -- ad-hoc fields "coefficient" and "quotient" do not appear --
@ -789,7 +760,7 @@ class DescribeTextPreChunk:
                ),
            ],
            overlap_prefix="ficitur.",  # len == 8
-            opts=ChunkingOptions.new(),
+            opts=ChunkingOptions(),
        )

        regex_metadata = pre_chunk._consolidated_regex_meta
@ -845,7 +816,7 @@ class DescribeTextPreChunk:
                ),
            ],
            overlap_prefix="",
-            opts=ChunkingOptions.new(),
+            opts=ChunkingOptions(),
        )

        meta_kwargs = pre_chunk._meta_kwargs
@ -881,9 +852,7 @@ class DescribeTextPreChunk:
        The text-segment contributed by each element is separated from the next by a blank line
        ("\n\n"). An element that contributes no text does not give rise to a separator.
        """
-        pre_chunk = TextPreChunk(
-            elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions.new()
-        )
+        pre_chunk = TextPreChunk(elements, overlap_prefix=overlap_prefix, opts=ChunkingOptions())
        assert pre_chunk._text == expected_value


@ -896,13 +865,13 @@ class DescribePreChunkBuilder:
    """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""

    def it_is_empty_on_construction(self):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))

        assert builder._text_length == 0
        assert builder._remaining_space == 50

    def it_accumulates_elements_added_to_it(self):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))

        builder.add_element(Title("Introduction"))
        assert builder._text_length == 12
@ -919,7 +888,7 @@ class DescribePreChunkBuilder:

    @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
    def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        builder = PreChunkBuilder(opts=ChunkingOptions())
        assert builder.will_fit(element)

    @pytest.mark.parametrize(
@ -934,22 +903,20 @@ class DescribePreChunkBuilder:
    def but_not_when_it_already_contains_an_element_of_any_kind(
        self, existing_element: Element, next_element: Element
    ):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        builder = PreChunkBuilder(opts=ChunkingOptions())
        builder.add_element(existing_element)

        assert not builder.will_fit(next_element)

    @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
    def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new())
+        builder = PreChunkBuilder(opts=ChunkingOptions())
        builder.add_element(Table("Heading\nCell text"))

        assert not builder.will_fit(element)

    def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
-        builder = PreChunkBuilder(
-            opts=ChunkingOptions.new(max_characters=100, new_after_n_chars=50)
-        )
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
        builder.add_element(
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
        )
@ -957,7 +924,7 @@ class DescribePreChunkBuilder:
        assert not builder.will_fit(Text("In rhoncus ipsum."))

    def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
        builder.add_element(
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
        )
@ -968,7 +935,7 @@ class DescribePreChunkBuilder:
        )

    def but_it_will_fit_an_element_that_fits(self):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=100))
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
        builder.add_element(
            Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")  # 55-chars
        )
@ -977,7 +944,7 @@ class DescribePreChunkBuilder:
        assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat."))  # 43-chars

    def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
        builder.add_element(Title("Introduction"))
        builder.add_element(
            Text(
@ -1000,7 +967,7 @@ class DescribePreChunkBuilder:
        assert builder._remaining_space == 150

    def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
        builder.add_element(Table("Heading\nCell text"))

        pre_chunk = next(builder.flush())
@ -1016,7 +983,7 @@ class DescribePreChunkBuilder:
        assert pre_chunk._table == Table("Heading\nCell text")

    def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=150))
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))

        pre_chunks = list(builder.flush())

@ -1025,7 +992,7 @@ class DescribePreChunkBuilder:
        assert builder._remaining_space == 150

    def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
-        opts = ChunkingOptions.new(overlap=15, overlap_all=True)
+        opts = ChunkingOptions(overlap=15, overlap_all=True)
        builder = PreChunkBuilder(opts=opts)

        builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
@ -1044,7 +1011,7 @@ class DescribePreChunkBuilder:
        assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."

    def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
-        builder = PreChunkBuilder(opts=ChunkingOptions.new(max_characters=50))
+        builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
        builder.add_element(Text("abcde"))
        builder.add_element(Text("fghij"))

@ -1061,7 +1028,7 @@ class DescribePreChunkCombiner:
    """Unit-test suite for `unstructured.chunking.base.PreChunkCombiner`."""

    def it_combines_sequential_small_text_pre_chunks(self):
-        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
+        opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
@ -1105,7 +1072,7 @@ class DescribePreChunkCombiner:
            next(pre_chunk_iter)

    def but_it_does_not_combine_table_pre_chunks(self):
-        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
+        opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
        pre_chunks = [
            TextPreChunk(
                [
@ -1127,7 +1094,7 @@ class DescribePreChunkCombiner:
        ]

        pre_chunk_iter = PreChunkCombiner(
-            pre_chunks, ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=250)
+            pre_chunks, ChunkingOptions(max_characters=250, combine_text_under_n_chars=250)
        ).iter_combined_pre_chunks()

        pre_chunk = next(pre_chunk_iter)
@ -1152,7 +1119,7 @@ class DescribePreChunkCombiner:
            next(pre_chunk_iter)

    def it_respects_the_specified_combination_threshold(self):
-        opts = ChunkingOptions.new(max_characters=250, combine_text_under_n_chars=80)
+        opts = ChunkingOptions(max_characters=250, combine_text_under_n_chars=80)
        pre_chunks = [
            TextPreChunk(  # 68
                [
@ -1203,7 +1170,7 @@ class DescribePreChunkCombiner:
            next(pre_chunk_iter)

    def it_respects_the_hard_maximum_window_length(self):
-        opts = ChunkingOptions.new(max_characters=200, combine_text_under_n_chars=200)
+        opts = ChunkingOptions(max_characters=200, combine_text_under_n_chars=200)
        pre_chunks = [
            TextPreChunk(  # 68
                [
@ -1256,7 +1223,7 @@ class DescribePreChunkCombiner:

    def it_accommodates_and_isolates_an_oversized_pre_chunk(self):
        """Such as occurs when a single element exceeds the window size."""
-        opts = ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
+        opts = ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
        pre_chunks = [
            TextPreChunk([Title("Lorem Ipsum")], overlap_prefix="", opts=opts),
            TextPreChunk(  # 179
@ -1274,7 +1241,7 @@ class DescribePreChunkCombiner:
        ]

        pre_chunk_iter = PreChunkCombiner(
-            pre_chunks, ChunkingOptions.new(max_characters=150, combine_text_under_n_chars=150)
+            pre_chunks, ChunkingOptions(max_characters=150, combine_text_under_n_chars=150)
        ).iter_combined_pre_chunks()

        pre_chunk = next(pre_chunk_iter)
@ -1303,7 +1270,7 @@ class DescribeTextPreChunkAccumulator:
    """Unit-test suite for `unstructured.chunking.base.TextPreChunkAccumulator`."""

    def it_generates_a_combined_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
-        opts = ChunkingOptions.new()
+        opts = ChunkingOptions(combine_text_under_n_chars=500)
        accum = TextPreChunkAccumulator(opts=opts)

        pre_chunk = TextPreChunk(
@ -1362,7 +1329,7 @@ class DescribeTextPreChunkAccumulator:
            next(accum.flush())

    def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self):
-        accum = TextPreChunkAccumulator(opts=ChunkingOptions.new(max_characters=150))
+        accum = TextPreChunkAccumulator(opts=ChunkingOptions(max_characters=150))
        assert list(accum.flush()) == []


--- a/test_unstructured/chunking/test_basic.py
+++ b/test_unstructured/chunking/test_basic.py
@ -1,4 +1,4 @@
-"""Unit-test suite for the `unstructured.chunking.basic` module.
+"""Test suite for the `unstructured.chunking.basic` module.

 That module implements the baseline chunking strategy. The baseline strategy has all behaviors
 shared by all chunking strategies and no extra rules like perserve section or page boundaries.
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@ -1,13 +1,20 @@
 # pyright: reportPrivateUsage=false

-"""Unit-test suite for the `unstructured.chunking.title` module."""
+"""Test suite for the `unstructured.chunking.title` module."""

 from __future__ import annotations

+from typing import Optional
+
 import pytest

-from unstructured.chunking.base import ChunkingOptions, TablePreChunk, TextPreChunk
-from unstructured.chunking.title import _ByTitlePreChunker, chunk_by_title
+from unstructured.chunking.base import (
+    CHUNK_MULTI_PAGE_DEFAULT,
+    PreChunker,
+    TablePreChunk,
+    TextPreChunk,
+)
+from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
 from unstructured.documents.coordinates import CoordinateSystem
 from unstructured.documents.elements import (
    CheckBox,
@ -23,6 +30,13 @@ from unstructured.documents.elements import (
 )
 from unstructured.partition.html import partition_html

+# ================================================================================================
+# INTEGRATION-TESTS
+# ================================================================================================
+# These test `chunk_by_title()` as an integrated whole, calling `chunk_by_title()` and inspecting
+# the outputs.
+# ================================================================================================
+

 def test_it_splits_a_large_element_into_multiple_chunks():
    elements: list[Element] = [
@ -57,7 +71,7 @@ def test_split_elements_by_title_and_table():
        CheckBox(),
    ]

-    pre_chunks = _ByTitlePreChunker.iter_pre_chunks(elements, opts=ChunkingOptions.new())
+    pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new())

    pre_chunk = next(pre_chunks)
    assert isinstance(pre_chunk, TextPreChunk)
@ -544,3 +558,86 @@ def test_it_considers_separator_length_when_pre_chunking():
        ),
        CompositeElement("Minimize mid-text chunk-splitting"),
    ]
+
+
+# ================================================================================================
+# UNIT-TESTS
+# ================================================================================================
+# These test individual components in isolation so can exercise all edge cases while still
+# performing well.
+# ================================================================================================
+
+
+class Describe_ByTitleChunkingOptions:
+    """Unit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects."""
+
+    @pytest.mark.parametrize("n_chars", [-1, -42])
+    def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
+        with pytest.raises(
+            ValueError,
+            match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
+        ):
+            _ByTitleChunkingOptions.new(combine_text_under_n_chars=n_chars)
+
+    def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
+        """Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
+        opts = _ByTitleChunkingOptions(combine_text_under_n_chars=0)
+        assert opts.combine_text_under_n_chars == 0
+
+    def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
+        """Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
+        try:
+            opts = _ByTitleChunkingOptions(combine_text_under_n_chars=50)
+        except ValueError:
+            pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
+
+        assert opts.combine_text_under_n_chars == 50
+
+    @pytest.mark.parametrize(
+        ("combine_text_under_n_chars", "max_characters", "expected_hard_max"),
+        [(600, None, 500), (600, 450, 450)],
+    )
+    def it_rejects_combine_text_under_n_chars_greater_than_maxchars(
+        self, combine_text_under_n_chars: int, max_characters: Optional[int], expected_hard_max: int
+    ):
+        """`combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.
+
+        The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
+        `max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
+        larger number like 1500 then it can look like chunk-combining isn't working.
+        """
+        with pytest.raises(
+            ValueError,
+            match=(
+                "'combine_text_under_n_chars' argument must not exceed `max_characters` value,"
+                f" got {combine_text_under_n_chars} > {expected_hard_max}"
+            ),
+        ):
+            _ByTitleChunkingOptions.new(
+                max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars
+            )
+
+    def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
+        """Caller can specify `new_after_n_chars` arg without specifying any other options.
+
+        In particular, `combine_text_under_n_chars` value is adjusted down to the
+        `new_after_n_chars` value when the default for `combine_text_under_n_chars` exceeds the
+        value of `new_after_n_chars`.
+        """
+        try:
+            opts = _ByTitleChunkingOptions(new_after_n_chars=200)
+        except ValueError:
+            pytest.fail("did not accept `new_after_n_chars` as option by itself")
+
+        assert opts.soft_max == 200
+        assert opts.combine_text_under_n_chars == 200
+
+    @pytest.mark.parametrize(
+        ("multipage_sections", "expected_value"),
+        [(True, True), (False, False), (None, CHUNK_MULTI_PAGE_DEFAULT)],
+    )
+    def it_knows_whether_to_break_chunks_on_page_boundaries(
+        self, multipage_sections: bool, expected_value: bool
+    ):
+        opts = _ByTitleChunkingOptions(multipage_sections=multipage_sections)
+        assert opts.multipage_sections is expected_value
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@ -7,7 +7,7 @@ import copy
 from typing import Any, Callable, DefaultDict, Iterable, Iterator, Optional, Sequence, cast

 import regex
-from typing_extensions import Self, TypeAlias
+from typing_extensions import TypeAlias

 from unstructured.documents.elements import (
    CompositeElement,
@ -68,9 +68,6 @@ class ChunkingOptions:
        when not specified, which effectively disables this behavior. Specifying 0 for this
        argument causes each element to appear in a chunk by itself (although an element with text
        longer than `max_characters` will be still be split into two or more chunks).
-    multipage_sections
-        Indicates that page-boundaries should not be respected while chunking, i.e. elements
-        appearing on two different pages can appear in the same chunk.
    combine_text_under_n_chars
        Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
        relevant for a chunking strategy that specifies higher-level semantic boundaries to be
@ -101,9 +98,9 @@ class ChunkingOptions:

    def __init__(
        self,
+        *,
        combine_text_under_n_chars: Optional[int] = None,
        max_characters: Optional[int] = None,
-        multipage_sections: Optional[bool] = None,
        new_after_n_chars: Optional[int] = None,
        overlap: Optional[int] = None,
        overlap_all: Optional[bool] = None,
@ -111,59 +108,28 @@ class ChunkingOptions:
    ):
        self._combine_text_under_n_chars_arg = combine_text_under_n_chars
        self._max_characters_arg = max_characters
-        self._multipage_sections_arg = multipage_sections
        self._new_after_n_chars_arg = new_after_n_chars
        self._overlap_arg = overlap
        self._overlap_all_arg = overlap_all
        self._text_splitting_separators = text_splitting_separators

-    @classmethod
-    def new(
-        cls,
-        combine_text_under_n_chars: Optional[int] = None,
-        max_characters: Optional[int] = None,
-        multipage_sections: Optional[bool] = None,
-        new_after_n_chars: Optional[int] = None,
-        overlap: Optional[int] = None,
-        overlap_all: Optional[bool] = None,
-        text_splitting_separators: Sequence[str] = ("\n", " "),
-    ) -> Self:
-        """Construct validated instance.
+    @lazyproperty
+    def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
+        """The semantic-boundary detectors to be applied to break pre-chunks.

-        Raises `ValueError` on invalid arguments like overlap > max_chars.
+        Overridden by sub-typs to provide semantic-boundary isolation behaviors.
        """
-        self = cls(
-            combine_text_under_n_chars,
-            max_characters,
-            multipage_sections,
-            new_after_n_chars,
-            overlap,
-            overlap_all,
-            text_splitting_separators,
-        )
-        self._validate()
-        return self
+        return ()

    @lazyproperty
    def combine_text_under_n_chars(self) -> int:
-        """Combine consecutive text pre-chunks if former is smaller than this and both will fit.
+        """Combine two consecutive text pre-chunks if first is smaller than this and both will fit.

-        - Does not combine table chunks with text chunks even if they would both fit in the
-          chunking window.
-        - Does not combine text chunks if together they would exceed the chunking window.
-        - Defaults to `max_characters` when not specified.
-        - Is reduced to `new_after_n_chars` when it exceeds that value.
+        Default applied here is `0` which essentially disables chunk combining. Must be overridden
+        by subclass where combining behavior is supported.
        """
-        max_characters = self.hard_max
-        soft_max = self.soft_max
-        arg = self._combine_text_under_n_chars_arg
-
-        # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified and is
-        # -- capped at max-chars
-        combine_text_under_n_chars = max_characters if arg is None or arg > max_characters else arg
-
-        # -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
-        return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
+        arg_value = self._combine_text_under_n_chars_arg
+        return arg_value if arg_value is not None else 0

    @lazyproperty
    def hard_max(self) -> int:
@ -185,12 +151,6 @@ class ChunkingOptions:
        """
        return self.overlap if self._overlap_all_arg else 0

-    @lazyproperty
-    def multipage_sections(self) -> bool:
-        """When False, break pre-chunks on page-boundaries."""
-        arg_value = self._multipage_sections_arg
-        return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value)
-
    @lazyproperty
    def overlap(self) -> int:
        """The number of characters to overlap text when splitting chunks mid-text.
@ -256,28 +216,15 @@ class ChunkingOptions:
        if max_characters <= 0:
            raise ValueError(f"'max_characters' argument must be > 0," f" got {max_characters}")

-        # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
-        # -- but a negative value is not
-        combine_text_under_n_chars = self._combine_text_under_n_chars_arg
-        if combine_text_under_n_chars is not None and combine_text_under_n_chars < 0:
-            raise ValueError(
-                f"'combine_text_under_n_chars' argument must be >= 0,"
-                f" got {combine_text_under_n_chars}"
-            )
-
-        # -- a negative value for `new_after_n_chars` is assumed to
-        # -- be a mistake the caller will want to know about
+        # -- a negative value for `new_after_n_chars` is assumed to be a mistake the caller will
+        # -- want to know about
        new_after_n_chars = self._new_after_n_chars_arg
        if new_after_n_chars is not None and new_after_n_chars < 0:
            raise ValueError(
                f"'new_after_n_chars' argument must be >= 0," f" got {new_after_n_chars}"
            )

-        # -- overlap must be less than max-chars or the chunk text will
-        # -- never be consumed
-        # TODO: consider a heuristic like never overlap more than half,
-        # otherwise there could be corner cases leading to an infinite
-        # loop (I think).
+        # -- overlap must be less than max-chars or the chunk text will never be consumed --
        if self.overlap >= max_characters:
            raise ValueError(
                f"'overlap' argument must be less than `max_characters`,"
@ -402,12 +349,12 @@ class _TextSplitter:


 # ================================================================================================
-# BASE PRE-CHUNKER
+# PRE-CHUNKER
 # ================================================================================================


-class BasePreChunker:
-    """Base-class for per-strategy pre-chunkers.
+class PreChunker:
+    """Gathers sequential elements into pre-chunks as length constraints allow.

    The pre-chunker's responsibilities are:

@ -465,7 +412,7 @@ class BasePreChunker:
    @lazyproperty
    def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
        """The semantic-boundary detectors to be applied to break pre-chunks."""
-        return ()
+        return self._opts.boundary_predicates

    def _is_in_new_semantic_unit(self, element: Element) -> bool:
        """True when `element` begins a new semantic unit such as a section or page."""
--- a/unstructured/chunking/basic.py
+++ b/unstructured/chunking/basic.py
@ -17,7 +17,9 @@ from __future__ import annotations

 from typing import Iterable, Optional

-from unstructured.chunking.base import BasePreChunker, ChunkingOptions
+from typing_extensions import Self
+
+from unstructured.chunking.base import ChunkingOptions, PreChunker
 from unstructured.documents.elements import Element


@ -58,7 +60,7 @@ def chunk_elements(
        level of "pollution" of otherwise clean semantic chunk boundaries.
    """
    # -- raises ValueError on invalid parameters --
-    opts = ChunkingOptions.new(
+    opts = _BasicChunkingOptions.new(
        max_characters=max_characters,
        new_after_n_chars=new_after_n_chars,
        overlap=overlap,
@ -67,14 +69,32 @@ def chunk_elements(

    return [
        chunk
-        for pre_chunk in BasicPreChunker.iter_pre_chunks(elements, opts)
+        for pre_chunk in PreChunker.iter_pre_chunks(elements, opts)
        for chunk in pre_chunk.iter_chunks()
    ]


-class BasicPreChunker(BasePreChunker):
-    """Produces pre-chunks from a sequence of document-elements using the "basic" rule-set.
+class _BasicChunkingOptions(ChunkingOptions):
+    """Options for `basic` chunking."""

-    The "basic" rule-set is essentially "no-rules" other than `Table` is segregated into its own
-    pre-chunk.
-    """
+    @classmethod
+    def new(
+        cls,
+        *,
+        max_characters: Optional[int] = None,
+        new_after_n_chars: Optional[int] = None,
+        overlap: Optional[int] = None,
+        overlap_all: Optional[bool] = None,
+    ) -> Self:
+        """Construct validated instance.
+
+        Raises `ValueError` on invalid arguments like overlap > max_chars.
+        """
+        self = cls(
+            max_characters=max_characters,
+            new_after_n_chars=new_after_n_chars,
+            overlap=overlap,
+            overlap_all=overlap_all,
+        )
+        self._validate()
+        return self
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@ -7,11 +7,14 @@ from __future__ import annotations

 from typing import Iterable, Iterator, Optional

+from typing_extensions import Self
+
 from unstructured.chunking.base import (
-    BasePreChunker,
+    CHUNK_MULTI_PAGE_DEFAULT,
    BoundaryPredicate,
    ChunkingOptions,
    PreChunkCombiner,
+    PreChunker,
    is_in_next_section,
    is_on_next_page,
    is_title,
@ -22,6 +25,7 @@ from unstructured.utils import lazyproperty

 def chunk_by_title(
    elements: Iterable[Element],
+    *,
    max_characters: Optional[int] = None,
    multipage_sections: Optional[bool] = None,
    combine_text_under_n_chars: Optional[int] = None,
@ -65,7 +69,7 @@ def chunk_by_title(
        elements and not subject to text-splitting. Use this with caution as it entails a certain
        level of "pollution" of otherwise clean semantic chunk boundaries.
    """
-    opts = ChunkingOptions.new(
+    opts = _ByTitleChunkingOptions.new(
        combine_text_under_n_chars=combine_text_under_n_chars,
        max_characters=max_characters,
        multipage_sections=multipage_sections,
@ -75,27 +79,127 @@ def chunk_by_title(
    )

    pre_chunks = PreChunkCombiner(
-        _ByTitlePreChunker.iter_pre_chunks(elements, opts), opts=opts
+        PreChunker.iter_pre_chunks(elements, opts), opts=opts
    ).iter_combined_pre_chunks()

    return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()]


-class _ByTitlePreChunker(BasePreChunker):
-    """Pre-chunker for the "by_title" chunking strategy.
+class _ByTitleChunkingOptions(ChunkingOptions):
+    """Adds the by-title-specific chunking options to the base case.

-    The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a
-    new "section", hence the "by-title" designation.
+    `by_title`-specific options:
+
+    multipage_sections
+        Indicates that page-boundaries should not be respected while chunking, i.e. elements
+        appearing on two different pages can appear in the same chunk.
    """

+    def __init__(
+        self,
+        *,
+        max_characters: Optional[int] = None,
+        combine_text_under_n_chars: Optional[int] = None,
+        multipage_sections: Optional[bool] = None,
+        new_after_n_chars: Optional[int] = None,
+        overlap: Optional[int] = None,
+        overlap_all: Optional[bool] = None,
+    ):
+        super().__init__(
+            combine_text_under_n_chars=combine_text_under_n_chars,
+            max_characters=max_characters,
+            new_after_n_chars=new_after_n_chars,
+            overlap=overlap,
+            overlap_all=overlap_all,
+        )
+        self._multipage_sections_arg = multipage_sections
+
+    @classmethod
+    def new(
+        cls,
+        *,
+        max_characters: Optional[int] = None,
+        combine_text_under_n_chars: Optional[int] = None,
+        multipage_sections: Optional[bool] = None,
+        new_after_n_chars: Optional[int] = None,
+        overlap: Optional[int] = None,
+        overlap_all: Optional[bool] = None,
+    ) -> Self:
+        """Return instance or raises `ValueError` on invalid arguments like overlap > max_chars."""
+        self = cls(
+            max_characters=max_characters,
+            combine_text_under_n_chars=combine_text_under_n_chars,
+            multipage_sections=multipage_sections,
+            new_after_n_chars=new_after_n_chars,
+            overlap=overlap,
+            overlap_all=overlap_all,
+        )
+        self._validate()
+        return self
+
    @lazyproperty
-    def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
-        """The semantic-boundary detectors to be applied to break pre-chunks."""
+    def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
+        """The semantic-boundary detectors to be applied to break pre-chunks.
+
+        For the `by_title` strategy these are sections indicated by a title (section-heading), an
+        explicit section metadata item (only present for certain document types), and optionally
+        page boundaries.
+        """

        def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
            yield is_title
            yield is_in_next_section()
-            if not self._opts.multipage_sections:
+            if not self.multipage_sections:
                yield is_on_next_page()

        return tuple(iter_boundary_predicates())
+
+    @lazyproperty
+    def combine_text_under_n_chars(self) -> int:
+        """Combine consecutive text pre-chunks if former is smaller than this and both will fit.
+
+        - Does not combine table chunks with text chunks even if they would both fit in the
+          chunking window.
+        - Does not combine text chunks if together they would exceed the chunking window.
+        - Defaults to `max_characters` when not specified.
+        - Is reduced to `new_after_n_chars` when it exceeds that value.
+        """
+        max_characters = self.hard_max
+        soft_max = self.soft_max
+        arg_value = self._combine_text_under_n_chars_arg
+
+        # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified --
+        combine_text_under_n_chars = max_characters if arg_value is None else arg_value
+
+        # -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
+        return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
+
+    @lazyproperty
+    def multipage_sections(self) -> bool:
+        """When False, break pre-chunks on page-boundaries."""
+        arg_value = self._multipage_sections_arg
+        return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value)
+
+    def _validate(self) -> None:
+        """Raise ValueError if request option-set is invalid."""
+        # -- start with base-class validations --
+        super()._validate()
+
+        if (combine_text_under_n_chars_arg := self._combine_text_under_n_chars_arg) is not None:
+            # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
+            # -- but a negative value is not
+            if combine_text_under_n_chars_arg < 0:
+                raise ValueError(
+                    f"'combine_text_under_n_chars' argument must be >= 0,"
+                    f" got {combine_text_under_n_chars_arg}"
+                )
+
+            # -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
+            # -- users. The chunking behavior would be no different than when
+            # -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
+            # -- default (500) then it can look like chunk-combining isn't working.
+            if combine_text_under_n_chars_arg > self.hard_max:
+                raise ValueError(
+                    f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
+                    f" value, got {combine_text_under_n_chars_arg} > {self.hard_max}"
+                )