refactor text.py (#1872)

### Summary
Closes #1520 
Partial solution to #1521 

- Adds an abstraction layer between the user API and the partitioner
implementation (see the sketch after this list)
- Adds comments explaining paragraph chunking
- Makes edits to pass strict type-checking for both text.py and
test_text.py
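
The abstraction layer in the first bullet is a delegation pattern: the public `partition_text` keeps a stable, undecorated signature and forwards every argument to a private `_partition_text` that carries the `@process_metadata()`, `@add_metadata_with_filetype(FileType.TXT)`, and `@add_chunking_strategy()` decorators. A minimal, self-contained sketch of that pattern; the `fetch`/`_fetch`/`log_calls` names are hypothetical stand-ins, not part of this PR:

```python
from functools import wraps
from typing import Any, Callable, List


def log_calls(func: Callable[..., List[str]]) -> Callable[..., List[str]]:
    """Hypothetical stand-in for decorators such as @process_metadata()."""

    @wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> List[str]:
        print(f"calling {func.__name__}")
        return func(*args, **kwargs)

    return wrapper


def fetch(name: str, **kwargs: Any) -> List[str]:
    """Public API: a plain function with a stable, fully typed signature."""
    return _fetch(name=name, **kwargs)


@log_calls
def _fetch(name: str, **kwargs: Any) -> List[str]:
    """Internal API: carries the decorators and the actual implementation."""
    return [f"fetched {name}"]
```

Keeping the decorators on the internal function means strict type-checkers see the public signature directly rather than through decorator wrappers.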
Authored by John on 2023-11-01 17:44:55 -05:00, committed by GitHub (commit 2f553333bd, parent b92cab7fbd).
7 changed files with 275 additions and 208 deletions

#### CHANGELOG.md

@@ -1,4 +1,4 @@
-## 0.10.29-dev6
+## 0.10.29-dev7
 ### Enhancements

#### test_unstructured/partition/test_pptx.py

@@ -395,7 +395,7 @@ def test_partition_pptx_respects_detect_language_per_element():
 def test_partition_pptx_raises_TypeError_for_invalid_languages():
     with pytest.raises(TypeError):
         filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
-        partition_pptx(filename=filename, languages="eng")
+        partition_pptx(filename=filename, languages="eng")  # type: ignore
 # == DescribePptxPartitionerDownstreamBehaviors ==================================================
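
The added `# type: ignore` keeps a deliberately ill-typed call in this negative test: the argument must violate the `languages` annotation to trigger the `TypeError`, so the checker is silenced on that one line. A minimal sketch of the same pattern, with a hypothetical `takes_languages` standing in for `partition_pptx`:

```python
from typing import List

import pytest


def takes_languages(languages: List[str]) -> List[str]:
    """Hypothetical stand-in for a partitioner that validates `languages`."""
    if not isinstance(languages, list):
        raise TypeError("languages must be a list of language codes")
    return languages


def test_raises_TypeError_for_invalid_languages():
    with pytest.raises(TypeError):
        takes_languages(languages="eng")  # type: ignore
```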

#### test_unstructured/partition/test_text.py

@@ -1,23 +1,28 @@
+# pyright: reportPrivateUsage=false
 from __future__ import annotations
 import json
 import os
 import pathlib
+from typing import Optional, Sequence, Type, cast
 import pytest
+from pytest_mock import MockerFixture
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import group_broken_paragraphs
-from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
+from unstructured.documents.elements import Address, ListItem, NarrativeText, Text, Title
 from unstructured.partition.text import (
-    combine_paragraphs_less_than_min,
+    _combine_paragraphs_less_than_min,
+    _split_content_to_fit_max,
     partition_text,
-    split_content_to_fit_max,
 )
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
+EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
 EXPECTED_OUTPUT = [
     NarrativeText(text="This is a test document to use for unit tests."),
@@ -62,8 +67,8 @@ End.
         ("fake-text-utf-16-be.txt", "utf-16-be"),
     ],
 )
-def test_partition_text_from_filename(filename, encoding):
-    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+def test_partition_text_from_filename(filename: str, encoding: Optional[str]):
+    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
     elements = partition_text(filename=filename_path, encoding=encoding)
     assert len(elements) > 0
     assert elements == EXPECTED_OUTPUT
@@ -74,7 +79,7 @@ def test_partition_text_from_filename(filename, encoding):
 def test_partition_text_from_filename_with_metadata_filename():
-    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     elements = partition_text(
         filename=filename_path,
         encoding="utf-8",
@@ -89,8 +94,8 @@ def test_partition_text_from_filename_with_metadata_filename():
     "filename",
     ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
 )
-def test_partition_text_from_filename_default_encoding(filename):
-    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+def test_partition_text_from_filename_default_encoding(filename: str):
+    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
     elements = partition_text(filename=filename_path)
     assert len(elements) > 0
     assert elements == EXPECTED_OUTPUT
@@ -105,15 +110,19 @@ def test_partition_text_from_filename_default_encoding(filename):
         ("fake-text-utf-16-be.txt", "utf-16", UnicodeError),
     ],
 )
-def test_partition_text_from_filename_raises_econding_error(filename, encoding, error):
+def test_partition_text_from_filename_raises_econding_error(
+    filename: str,
+    encoding: Optional[str],
+    error: Type[BaseException],
+):
     with pytest.raises(error):
-        filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+        filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
         partition_text(filename=filename, encoding=encoding)
 def test_partition_text_from_file():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
-    with open(filename) as f:
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
+    with open(filename, "rb") as f:
         elements = partition_text(file=f)
     assert len(elements) > 0
     assert elements == EXPECTED_OUTPUT
@@ -122,8 +131,8 @@ def test_partition_text_from_file():
 def test_partition_text_from_file_with_metadata_filename():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
-    with open(filename) as f:
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
+    with open(filename, "rb") as f:
         elements = partition_text(file=f, metadata_filename="test")
     assert len(elements) > 0
     assert elements == EXPECTED_OUTPUT
@@ -135,9 +144,9 @@ def test_partition_text_from_file_with_metadata_filename():
     "filename",
     ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
 )
-def test_partition_text_from_file_default_encoding(filename):
-    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
-    with open(filename_path) as f:
+def test_partition_text_from_file_default_encoding(filename: str):
+    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
+    with open(filename_path, "rb") as f:
         elements = partition_text(file=f)
     assert len(elements) > 0
     assert elements == EXPECTED_OUTPUT
@@ -146,7 +155,7 @@ def test_partition_text_from_file_default_encoding(filename):
 def test_partition_text_from_bytes_file():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     with open(filename, "rb") as f:
         elements = partition_text(file=f)
     assert len(elements) > 0
@@ -159,8 +168,8 @@ def test_partition_text_from_bytes_file():
     "filename",
     ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
 )
-def test_partition_text_from_bytes_file_default_encoding(filename):
-    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+def test_partition_text_from_bytes_file_default_encoding(filename: str):
+    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
     with open(filename_path, "rb") as f:
         elements = partition_text(file=f)
     assert len(elements) > 0
@@ -176,7 +185,7 @@ def test_text_partition_element_metadata_user_provided_languages():
 def test_partition_text_from_text():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     with open(filename) as f:
         text = f.read()
     elements = partition_text(text=text)
@@ -196,7 +205,7 @@ def test_partition_text_raises_with_none_specified():
 def test_partition_text_raises_with_too_many_specified():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     with open(filename) as f:
         text = f.read()
@@ -245,16 +254,18 @@ def test_partition_text_extract_regex_metadata():
         assert element.metadata.filename is None
-def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt"):
-    elements = partition_text(filename=filename)
+def test_partition_text_splits_long_text():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
+    elements = cast(Sequence[Text], partition_text(filename=filename))
     assert len(elements) > 0
     assert elements[0].text.startswith("Iwan Roberts")
     assert elements[-1].text.endswith("External links")
-def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
-    elements = partition_text(filename=filename)
-    elements_max_part = partition_text(filename=filename, max_partition=500)
+def test_partition_text_splits_long_text_max_partition():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
+    elements = cast(Sequence[Text], partition_text(filename=filename))
+    elements_max_part = cast(Sequence[Text], partition_text(filename=filename, max_partition=500))
     # NOTE(klaijan) - I edited the operation here from < to <=
     # Please revert back if this does not make sense
     assert len(elements) <= len(elements_max_part)
@@ -265,9 +276,13 @@ def test_partition_text_splits_long_text_max_partition(filename="example-docs/no
     assert " ".join([el.text for el in elements]) == " ".join([el.text for el in elements_max_part])
-def test_partition_text_splits_max_min_partition(filename="example-docs/norwich-city.txt"):
-    elements = partition_text(filename=filename)
-    elements_max_part = partition_text(filename=filename, min_partition=1000, max_partition=1500)
+def test_partition_text_splits_max_min_partition():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
+    elements = cast(Sequence[Text], partition_text(filename=filename))
+    elements_max_part = cast(
+        Sequence[Text],
+        partition_text(filename=filename, min_partition=1000, max_partition=1500),
+    )
     for i, element in enumerate(elements_max_part):
         # NOTE(robinson) - the last element does not have a next element to merge with,
         # so it can be short
@@ -298,10 +313,13 @@ def test_partition_text_splits_max_min_partition(filename="example-docs/norwich-
 )
-def test_partition_text_min_max(filename="example-docs/norwich-city.txt"):
-    segments = partition_text(
-        text=SHORT_PARAGRAPHS,
-        min_partition=6,
+def test_partition_text_min_max():
+    segments = cast(
+        Sequence[Text],
+        partition_text(
+            text=SHORT_PARAGRAPHS,
+            min_partition=6,
+        ),
     )
     for i, segment in enumerate(segments):
         # NOTE(robinson) - the last element does not have a next element to merge with,
@@ -309,10 +327,13 @@ def test_partition_text_min_max(filename="example-docs/norwich-city.txt"):
         if i < len(segments) - 1:
             assert len(segment.text) >= 6
-    segments = partition_text(
-        text=SHORT_PARAGRAPHS,
-        max_partition=20,
-        min_partition=7,
+    segments = cast(
+        Sequence[Text],
+        partition_text(
+            text=SHORT_PARAGRAPHS,
+            max_partition=20,
+            min_partition=7,
+        ),
     )
     for i, segment in enumerate(segments):
         # NOTE(robinson) - the last element does not have a next element to merge with,
@@ -323,7 +344,7 @@ def test_partition_text_min_max(filename="example-docs/norwich-city.txt"):
 def test_split_content_to_fit_max():
-    segments = split_content_to_fit_max(
+    segments = _split_content_to_fit_max(
         content=MIN_MAX_TEXT,
         max_partition=75,
     )
@@ -337,7 +358,7 @@ def test_split_content_to_fit_max():
 def test_combine_paragraphs_less_than_min():
-    segments = combine_paragraphs_less_than_min(
+    segments = _combine_paragraphs_less_than_min(
         SHORT_PARAGRAPHS.split("\n\n"),
         max_partition=1500,
         min_partition=7,
@@ -347,7 +368,7 @@ def test_combine_paragraphs_less_than_min():
 def test_partition_text_doesnt_get_page_breaks():
     text = "--------------------"
-    elements = partition_text(text=text)
+    elements = cast(Sequence[Text], partition_text(text=text))
     assert len(elements) == 1
     assert elements[0].text == text
     assert not isinstance(elements[0], ListItem)
@@ -361,8 +382,8 @@ def test_partition_text_doesnt_get_page_breaks():
         ("fake-text-utf-16-be.txt", "utf-16-be"),
     ],
 )
-def test_partition_text_from_filename_exclude_metadata(filename, encoding):
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+def test_partition_text_from_filename_exclude_metadata(filename: str, encoding: Optional[str]):
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
     elements = partition_text(
         filename=filename,
         encoding=encoding,
@@ -373,17 +394,15 @@ def test_partition_text_from_filename_exclude_metadata(filename, encoding):
 def test_partition_text_from_file_exclude_metadata():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
-    with open(filename) as f:
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
+    with open(filename, "rb") as f:
         elements = partition_text(file=f, include_metadata=False)
     for i in range(len(elements)):
         assert elements[i].metadata.to_dict() == {}
-def test_partition_text_metadata_date(
-    mocker,
-    filename="example-docs/fake-text.txt",
-):
+def test_partition_text_metadata_date(mocker: MockerFixture):
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
@@ -398,10 +417,8 @@ def test_partition_text_metadata_date(
     assert elements[0].metadata.last_modified == mocked_last_modification_date
-def test_partition_text_with_custom_metadata_date(
-    mocker,
-    filename="example-docs/fake-text.txt",
-):
+def test_partition_text_with_custom_metadata_date(mocker: MockerFixture):
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     mocked_last_modification_date = "2029-07-05T09:24:28"
     expected_last_modification_date = "2020-07-05T09:24:28"
@@ -418,10 +435,8 @@ def test_partition_text_with_custom_metadata_date(
     assert elements[0].metadata.last_modified == expected_last_modification_date
-def test_partition_text_from_file_metadata_date(
-    mocker,
-    filename="example-docs/fake-text.txt",
-):
+def test_partition_text_from_file_metadata_date(mocker: MockerFixture):
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
@@ -437,10 +452,8 @@ def test_partition_text_from_file_metadata_date(
     assert elements[0].metadata.last_modified == mocked_last_modification_date
-def test_partition_text_from_file_with_custom_metadata_date(
-    mocker,
-    filename="example-docs/fake-text.txt",
-):
+def test_partition_text_from_file_with_custom_metadata_date(mocker: MockerFixture):
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     mocked_last_modification_date = "2029-07-05T09:24:28"
     expected_last_modification_date = "2020-07-05T09:24:28"
@@ -455,9 +468,8 @@ def test_partition_text_from_file_with_custom_metadata_date(
     assert elements[0].metadata.last_modified == expected_last_modification_date
-def test_partition_text_from_text_metadata_date(
-    filename="example-docs/fake-text.txt",
-):
+def test_partition_text_from_text_metadata_date():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     with open(filename) as f:
         text = f.read()
@@ -467,9 +479,8 @@ def test_partition_text_from_text_metadata_date(
     assert elements[0].metadata.last_modified is None
-def test_partition_text_from_text_with_custom_metadata_date(
-    filename="example-docs/fake-text.txt",
-):
+def test_partition_text_from_text_with_custom_metadata_date():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     expected_last_modification_date = "2020-07-05T09:24:28"
     with open(filename) as f:
@@ -487,8 +498,10 @@ def test_partition_text_with_unique_ids():
     json.dumps(elements[0].to_dict())
     elements = partition_text(text="hello there!", unique_element_ids=True)
-    assert len(elements[0].id) == 36
-    assert elements[0].id.count("-") == 4
+    id = elements[0].id
+    assert isinstance(id, str)  # included for type-narrowing
+    assert len(id) == 36
+    assert id.count("-") == 4
     # Test that the element is JSON serializable. This should run without an error
     json.dumps(elements[0].to_dict())
@@ -506,7 +519,8 @@ def test_partition_text_with_json(file_name: str, encoding: str | None):
     assert_round_trips_through_JSON(elements)
-def test_add_chunking_strategy_on_partition_text(filename="example-docs/norwich-city.txt"):
+def test_add_chunking_strategy_on_partition_text():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
     elements = partition_text(filename=filename)
     chunk_elements = partition_text(filename, chunking_strategy="by_title")
     chunks = chunk_by_title(elements)
@@ -515,32 +529,32 @@ def test_add_chunking_strategy_on_partition_text(filename="example-docs/norwich-
 def test_partition_text_element_metadata_has_languages():
-    filename = "example-docs/norwich-city.txt"
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
     elements = partition_text(filename=filename)
     assert elements[0].metadata.languages == ["eng"]
 def test_partition_text_respects_detect_language_per_element():
-    filename = "example-docs/language-docs/eng_spa_mult.txt"
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "language-docs", "eng_spa_mult.txt")
     elements = partition_text(filename=filename, detect_language_per_element=True)
     langs = [element.metadata.languages for element in elements]
     assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
 def test_partition_text_respects_languages_arg():
-    filename = "example-docs/norwich-city.txt"
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
     elements = partition_text(filename=filename, languages=["deu"])
     assert elements[0].metadata.languages == ["deu"]
 def test_partition_text_element_metadata_raises_TypeError():
     with pytest.raises(TypeError):
-        filename = "example-docs/norwich-city.txt"
-        partition_text(filename=filename, languages="eng")
+        filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
+        partition_text(filename=filename, languages="eng")  # type: ignore
 def test_partition_text_detects_more_than_3_languages():
-    filename = "example-docs/language-docs/UDHR_first_article_all.txt"
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "language-docs", "UDHR_first_article_all.txt")
     elements = partition_text(filename=filename, detect_language_per_element=True)
     langs = list(
         {element.metadata.languages[0] for element in elements if element.metadata.languages},
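
Many of the edits above wrap `partition_text` in `typing.cast` because it returns `List[Element]`, while the assertions read `Text`-only attributes such as `.text`. A minimal sketch (with stand-in classes, not the real `unstructured` types) of why the narrowing satisfies a strict checker:

```python
from typing import List, Sequence, cast


class Element:
    """Stand-in for unstructured.documents.elements.Element."""


class Text(Element):
    """Stand-in element subtype that actually carries text."""

    def __init__(self, text: str) -> None:
        self.text = text


def partition(raw: str) -> List[Element]:
    """Stand-in partitioner that returns the broad Element type."""
    return [Text(p) for p in raw.split("\n\n")]


# Without the cast, a strict checker rejects `.text` access on Element.
elements = cast(Sequence[Text], partition("one\n\ntwo"))
assert elements[0].text == "one"
```

Note that `cast` is a type-checker-only assertion with no runtime effect, which is why the unique-ids test above also adds an `isinstance` assert for genuine runtime narrowing.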

#### unstructured/__version__.py

@@ -1 +1 @@
-__version__ = "0.10.29-dev6"  # pragma: no cover
+__version__ = "0.10.29-dev7"  # pragma: no cover

#### unstructured/cleaners/core.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
 import quopri
 import re
 import sys
@@ -132,8 +134,8 @@ def group_bullet_paragraph(paragraph: str) -> list:
 def group_broken_paragraphs(
     text: str,
-    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
-    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
+    line_split: re.Pattern[str] = PARAGRAPH_PATTERN_RE,
+    paragraph_split: re.Pattern[str] = DOUBLE_PARAGRAPH_PATTERN_RE,
 ) -> str:
     """Groups paragraphs that have line breaks for visual/formatting purposes.
     For example:
@@ -174,7 +176,7 @@ def group_broken_paragraphs(
 def new_line_grouper(
     text: str,
-    paragraph_split: re.Pattern = LINE_BREAK_RE,
+    paragraph_split: re.Pattern[str] = LINE_BREAK_RE,
 ) -> str:
     """
     Concatenates text document that has one-line paragraph break pattern
@@ -221,7 +223,7 @@ def blank_line_grouper(
 def auto_paragraph_grouper(
     text: str,
-    line_split: re.Pattern = LINE_BREAK_RE,
+    line_split: re.Pattern[str] = LINE_BREAK_RE,
     max_line_count: int = 2000,
     threshold: float = 0.1,
 ) -> str:
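
The `re.Pattern[str]` annotations are what let these signatures pass strict checking: `re.Pattern` is generic over `str` versus `bytes`, and the bare form leaves the match type ambiguous. A small sketch of the parameterized form, assuming nothing beyond the standard library (`PARAGRAPH_RE` here is an illustrative name, not one from this file):

```python
from __future__ import annotations

import re
from typing import List

PARAGRAPH_RE = re.compile(r"\n\n+")


def split_paragraphs(text: str, splitter: re.Pattern[str] = PARAGRAPH_RE) -> List[str]:
    # `re.Pattern[str]` tells the checker that .split() consumes and
    # produces str, not bytes, so callers get precise types back.
    return splitter.split(text)


print(split_paragraphs("first\n\nsecond"))  # ['first', 'second']
```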

#### unstructured/partition/text.py

@@ -1,7 +1,7 @@
 import copy
 import re
 import textwrap
-from typing import IO, Callable, List, Optional, Tuple
+from typing import IO, Any, Callable, List, Optional, Tuple
 from unstructured.chunking.title import add_chunking_strategy
 from unstructured.cleaners.core import (
@@ -40,126 +40,6 @@ from unstructured.partition.text_type import (
 )
-def split_by_paragraph(
-    file_text: str,
-    min_partition: Optional[int] = 0,
-    max_partition: Optional[int] = 1500,
-) -> List[str]:
-    paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
-    split_paragraphs = []
-    for paragraph in paragraphs:
-        split_paragraphs.extend(
-            split_content_to_fit_max(
-                content=paragraph,
-                max_partition=max_partition,
-            ),
-        )
-    combined_paragraphs = combine_paragraphs_less_than_min(
-        split_paragraphs=split_paragraphs,
-        max_partition=max_partition,
-        min_partition=min_partition,
-    )
-    return combined_paragraphs
-def _split_in_half_at_breakpoint(
-    content: str,
-    breakpoint: str = " ",
-) -> List[str]:
-    """Splits a segment of content at the breakpoint closest to the middle"""
-    mid = len(content) // 2
-    for i in range(len(content) // 2):
-        if content[mid + i] == breakpoint:
-            mid += i
-            break
-        elif content[mid - i] == breakpoint:
-            mid += -i
-            break
-    return [content[:mid].rstrip(), content[mid:].lstrip()]
-def _split_content_size_n(content: str, n: int) -> List[str]:
-    """Splits a section of content into chunks that are at most
-    size n without breaking apart words."""
-    segments = []
-    if len(content) < n * 2:
-        segments = list(_split_in_half_at_breakpoint(content))
-    else:
-        segments = textwrap.wrap(content, width=n)
-    return segments
-def split_content_to_fit_max(
-    content: str,
-    max_partition: Optional[int] = 1500,
-) -> List[str]:
-    """Splits a paragraph or section of content so that all of the elements fit into the
-    max partition window."""
-    sentences = sent_tokenize(content)
-    chunks = []
-    tmp_chunk = ""
-    for sentence in sentences:
-        if max_partition is not None and len(sentence) > max_partition:
-            if tmp_chunk:
-                chunks.append(tmp_chunk)
-                tmp_chunk = ""
-            segments = _split_content_size_n(sentence, n=max_partition)
-            chunks.extend(segments[:-1])
-            tmp_chunk = segments[-1]
-        else:
-            if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
-                chunks.append(tmp_chunk)
-                tmp_chunk = sentence
-            else:
-                if not tmp_chunk:
-                    tmp_chunk = sentence
-                else:
-                    tmp_chunk += " " + sentence
-            tmp_chunk = tmp_chunk.strip()
-    if tmp_chunk:
-        chunks.append(tmp_chunk)
-    return chunks
-def combine_paragraphs_less_than_min(
-    split_paragraphs: List[str],
-    max_partition: Optional[int] = 1500,
-    min_partition: Optional[int] = 0,
-) -> List[str]:
-    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
-    min_partition = min_partition or 0
-    max_possible_partition = len(" ".join(split_paragraphs))
-    max_partition = max_partition or max_possible_partition
-    combined_paras = []
-    combined_idxs = []
-    for i, para in enumerate(split_paragraphs):
-        if i in combined_idxs:
-            continue
-        if len(para) >= min_partition:
-            combined_paras.append(para)
-        else:
-            combined_para = para
-            for j, next_para in enumerate(split_paragraphs[i + 1 :]):  # noqa
-                if len(combined_para) + len(next_para) + 1 <= max_partition:
-                    combined_idxs.append(i + j + 1)
-                    combined_para += " " + next_para
-                else:
-                    break
-            combined_paras.append(combined_para)
-    return combined_paras
-@process_metadata()
-@add_metadata_with_filetype(FileType.TXT)
-@add_chunking_strategy()
 def partition_text(
     filename: Optional[str] = None,
     file: Optional[IO[bytes]] = None,
@@ -175,7 +55,7 @@ def partition_text(
     chunking_strategy: Optional[str] = None,
     detect_language_per_element: bool = False,
     detection_origin: Optional[str] = "text",
-    **kwargs,
+    **kwargs: Any,
 ) -> List[Element]:
     """Partitions an .txt documents into its constituent paragraph elements.
     If paragraphs are below "min_partition" or above "max_partition" boundaries,
@@ -185,7 +65,7 @@ def partition_text(
     filename
         A string defining the target filename path.
     file
-        A file-like object using "r" mode --> open(filename, "r").
+        A file-like object using "rb" mode --> open(filename, "rb").
     text
         The string representation of the .txt document.
     encoding
@@ -210,6 +90,46 @@ def partition_text(
     metadata_last_modified
         The day of the last modification
     """
+    return _partition_text(
+        filename=filename,
+        file=file,
+        text=text,
+        encoding=encoding,
+        paragraph_grouper=paragraph_grouper,
+        metadata_filename=metadata_filename,
+        include_metadata=include_metadata,
+        languages=languages,
+        max_partition=max_partition,
+        min_partition=min_partition,
+        metadata_last_modified=metadata_last_modified,
+        chunking_strategy=chunking_strategy,
+        detect_language_per_element=detect_language_per_element,
+        detection_origin=detection_origin,
+        **kwargs,
+    )
+@process_metadata()
+@add_metadata_with_filetype(FileType.TXT)
+@add_chunking_strategy()
+def _partition_text(
+    filename: Optional[str] = None,
+    file: Optional[IO[bytes]] = None,
+    text: Optional[str] = None,
+    encoding: Optional[str] = None,
+    paragraph_grouper: Optional[Callable[[str], str]] = None,
+    metadata_filename: Optional[str] = None,
+    include_metadata: bool = True,
+    languages: Optional[List[str]] = ["auto"],
+    max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
+    metadata_last_modified: Optional[str] = None,
+    chunking_strategy: Optional[str] = None,
+    detect_language_per_element: bool = False,
+    detection_origin: Optional[str] = "text",
+    **kwargs: Any,
+) -> List[Element]:
+    """internal API for `partition_text`"""
     if text is not None and text.strip() == "" and not file and not filename:
         return []
@@ -222,6 +142,7 @@ def partition_text(
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
+    file_text = ""
     last_modification_date = None
     if filename is not None:
@@ -245,7 +166,7 @@ def partition_text(
     if min_partition is not None and len(file_text) < min_partition:
         raise ValueError("`min_partition` cannot be larger than the length of file contents.")
-    file_content = split_by_paragraph(
+    file_content = _split_by_paragraph(
         file_text,
         min_partition=min_partition,
         max_partition=max_partition,
@@ -323,3 +244,133 @@ def element_from_text(
         coordinates=coordinates,
         coordinate_system=coordinate_system,
     )
+def _combine_paragraphs_less_than_min(
+    split_paragraphs: List[str],
+    max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
+) -> List[str]:
+    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
+    min_partition = min_partition or 0
+    max_possible_partition = len(" ".join(split_paragraphs))
+    max_partition = max_partition or max_possible_partition
+    combined_paras: List[str] = []
+    combined_idxs: List[int] = []
+    for i, para in enumerate(split_paragraphs):
+        if i in combined_idxs:
+            continue
+        # Paragraphs have already been split to fit `max_partition`, so they can be safely added
+        # to the final list of chunks if they are also greater than `min_partition`
+        if len(para) >= min_partition:
+            combined_paras.append(para)
+        else:
+            combined_para = para
+            for j, next_para in enumerate(split_paragraphs[i + 1 :]):  # noqa
+                # Combine the current paragraph(s), e.g. `combined_para` with the next paragraph(s)
+                # as long as they don't exceed `max_partition`, and keep track of the indices
+                # that have been combined.
+                if len(combined_para) + len(next_para) + 1 <= max_partition:
+                    combined_idxs.append(i + j + 1)
+                    combined_para += " " + next_para
+                else:
+                    break
+            combined_paras.append(combined_para)
+    return combined_paras
+def _split_by_paragraph(
+    file_text: str,
+    min_partition: Optional[int] = 0,
+    max_partition: Optional[int] = 1500,
+) -> List[str]:
+    """Split text into paragraphs that fit within the `min_` and `max_partition` window."""
+    paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
+    split_paragraphs: List[str] = []
+    for paragraph in paragraphs:
+        split_paragraphs.extend(
+            _split_content_to_fit_max(
+                content=paragraph,
+                max_partition=max_partition,
+            ),
+        )
+    combined_paragraphs = _combine_paragraphs_less_than_min(
+        split_paragraphs=split_paragraphs,
+        max_partition=max_partition,
+        min_partition=min_partition,
+    )
+    return combined_paragraphs
+def _split_content_size_n(content: str, n: int) -> List[str]:
+    """Splits a section of content into chunks that are at most
+    size n without breaking apart words."""
+    segments = []
+    if len(content) < n * 2:
+        segments = list(_split_in_half_at_breakpoint(content))
+    else:
+        segments = textwrap.wrap(content, width=n)
+    return segments
+def _split_content_to_fit_max(
+    content: str,
+    max_partition: Optional[int] = 1500,
+) -> List[str]:
+    """Splits a paragraph or section of content so that all of the elements fit into the
+    max partition window."""
+    sentences = sent_tokenize(content)
+    chunks: List[str] = []
+    tmp_chunk = ""
+    # Initialize an empty string to collect sentence segments (`tmp_chunk`).
+    for sentence in sentences:
+        # If a single sentence is larger than `max_partition`, the sentence will be split by
+        # `_split_content_size_n` and the last segment of the original sentence will be used
+        # as the beginning of the next chunk.
+        if max_partition is not None and len(sentence) > max_partition:
+            if tmp_chunk:
+                chunks.append(tmp_chunk)
+                tmp_chunk = ""
+            segments = _split_content_size_n(sentence, n=max_partition)
+            chunks.extend(segments[:-1])
+            tmp_chunk = segments[-1]
+        else:
+            # If the current sentence is smaller than `max_partition`, but adding it to the
+            # current `tmp_chunk` would exceed `max_partition`, add the `tmp_chunk` to the
+            # final list of `chunks` and begin the next chunk with the current sentence.
+            if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
+                chunks.append(tmp_chunk)
+                tmp_chunk = sentence
+            else:
+                # Otherwise, the sentence can be added to `tmp_chunk`
+                if not tmp_chunk:
+                    tmp_chunk = sentence
+                else:
+                    tmp_chunk += " " + sentence
+            tmp_chunk = tmp_chunk.strip()
+    if tmp_chunk:
+        chunks.append(tmp_chunk)
+    return chunks
+def _split_in_half_at_breakpoint(
+    content: str,
+    breakpoint: str = " ",
+) -> List[str]:
+    """Splits a segment of content at the breakpoint closest to the middle"""
+    mid = len(content) // 2
+    for i in range(len(content) // 2):
+        if content[mid + i] == breakpoint:
+            mid += i
+            break
+        elif content[mid - i] == breakpoint:
+            mid += -i
+            break
+    return [content[:mid].rstrip(), content[mid:].lstrip()]
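
Read together, the relocated helpers form a three-step pipeline: `_split_by_paragraph` splits on `PARAGRAPH_PATTERN`, `_split_content_to_fit_max` packs sentences into chunks no larger than `max_partition`, and `_combine_paragraphs_less_than_min` merges chunks shorter than `min_partition`. A hedged usage sketch, assuming the private helpers remain importable (the updated tests import them the same way) and that the NLTK sentence tokenizer is available:

```python
from unstructured.partition.text import (
    _combine_paragraphs_less_than_min,
    _split_content_to_fit_max,
)

paragraphs = ["A short paragraph.", "Tiny.", "A sentence that runs a little longer than the rest."]

# Step 1: split any paragraph that exceeds max_partition into smaller segments.
split = [seg for p in paragraphs for seg in _split_content_to_fit_max(p, max_partition=40)]

# Step 2: merge segments shorter than min_partition into following segments,
# never letting a merged chunk exceed max_partition.
chunks = _combine_paragraphs_less_than_min(split, max_partition=40, min_partition=10)

assert all(len(chunk) <= 40 for chunk in chunks)
```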

#### unstructured/partition/text_type.py

@@ -311,6 +311,6 @@ def is_email_address(text: str) -> bool:
     return EMAIL_ADDRESS_PATTERN_RE.match(text.strip()) is not None
-def is_possible_numbered_list(text) -> bool:
+def is_possible_numbered_list(text: str) -> bool:
     """Checks to see if the text is a potential numbered list."""
     return NUMBERED_LIST_RE.match(text.strip()) is not None