Mirror of https://github.com/Unstructured-IO/unstructured.git
feat: add min_partition kwarg that combines elements below a specified threshold (#926)
* add min_partition
* functioning _split_content_to_fit_min_max
* create test and make tidy/check
* fix rebase issues
* fix type hinting, remove unused code, add tests
* various changes and refactoring of methods
* add test, refactor, change var names for debugging purposes
* update test
* make tidy/check
* give more descriptive var names and add comments
* update xml partition via partition_text and create test
* fix <pre> bug for test_partition_html_with_pre_tag
* make tidy
* refactor and fix tests
* make tidy/check
* ingest-test-fixtures-update
* change list comprehension to for loop
* fix error check
This commit is contained in:
parent d0329126ef
commit 676c50a6ec
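Before the diff itself, a minimal usage sketch of what this change enables (hypothetical inline text; exact element counts depend on the input):

from unstructured.partition.text import partition_text

# Hypothetical input: short paragraphs separated by blank lines.
text = "Hi.\n\nHello there.\n\nThis is a longer paragraph that stands on its own."

# Default behavior: each paragraph becomes its own element.
elements = partition_text(text=text)

# With min_partition, paragraphs shorter than the threshold are combined
# with their neighbors (without exceeding max_partition), so fewer,
# larger elements come back.
combined = partition_text(text=text, min_partition=10, max_partition=1500)
assert len(combined) <= len(elements)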
@@ -1,7 +1,7 @@
 ## 0.8.2-dev4

 ### Enhancements

+* Add min_partition kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max_partition so words are not split.
 * set the file's current position to the beginning after reading the file in `convert_to_bytes`
 * Add slide notes to pptx
@@ -44,6 +44,20 @@ def test_from_string(sample_document):
     assert type_tag.text.strip() == "10-K"
+
+
+def test_from_string_with_pre_tag():
+    sample_document = """
+<pre>
+<SEC-DOCUMENT>
+<TYPE>10-K
+<COMPANY>Proctor & Gamble
+</SEC-DOCUMENT>
+</pre>
+"""
+    xml_document = XMLDocument.from_string(sample_document)
+    type_tag = xml_document.document_tree.find(".//type")
+    assert type_tag.text.strip() == "10-K"


 def test_read_with_stylesheet():
     filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
     stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
     assert element.metadata.filename is None
+
+
+def test_partition_email_from_text_file_max():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with open(filename) as f:
+        elements = partition_email(file=f, content_source="text/plain", max_partition=20)
+    assert len(elements) == 6
+
+
+def test_partition_email_from_text_file_raises_value_error():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with pytest.raises(ValueError), open(filename) as f:
+        partition_email(file=f, content_source="text/plain", min_partition=1000)


 def test_partition_email_from_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
     with open(filename) as f:
@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():

 def test_partition_image_with_ocr_detects_korean_from_file():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
-
     with open(filename, "rb") as f:
         elements = image.partition_image(
             file=f,
@@ -5,7 +5,11 @@ import pytest

 from unstructured.cleaners.core import group_broken_paragraphs
 from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
-from unstructured.partition.text import partition_text
+from unstructured.partition.text import (
+    combine_paragraphs_less_than_min,
+    partition_text,
+    split_content_to_fit_max,
+)

 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -18,6 +22,31 @@ EXPECTED_OUTPUT = [
     ListItem(text="I love fuzzy blankets"),
 ]

+
+MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
+ because it is just being used as an example. Hi. Hello. Howdy. Hola.
+ The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+    "\n",
+    "",
+)
+
+SHORT_PARAGRAPHS = """This is a story.
+
+This is a story that doesn't matter because it is just being used as an example.
+
+Hi.
+
+Hello.
+
+Howdy.
+
+Hola.
+
+The example is simple and repetitive and long and somewhat boring, but it serves a purpose.
+
+End.
+"""
+
+
 @pytest.mark.parametrize(
     ("filename", "encoding"),
@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt"):
     assert elements[-1].text.endswith("External links")
+
+
+def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
+    elements = partition_text(filename=filename)
+    elements_max_part = partition_text(filename=filename, max_partition=500)
+    assert len(elements) < len(elements_max_part)
+
+
+def test_partition_text_min_max():
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        min_partition=6,
+    )
+    expected = [
+        "This is a story.",
+        "This is a story that doesn't matter because it is just being used as an example.",
+        "Hi. Hello.",
+        "Howdy.",
+        """Hola. The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+            "\n",
+            "",
+        ),
+    ]
+    for segment, test_segment in zip(segments, expected):
+        assert segment.text == test_segment
+
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        max_partition=20,
+        min_partition=7,
+    )
+    expected = [
+        "This is a story.",
+        "This is a story that",
+        "doesn't matter",
+        "because it is just",
+        "being used as an",
+        "example.",
+        "Hi. Hello.",
+        "Howdy. Hola.",
+        "The example is",
+        "simple and",
+        "repetitive and long",
+        "and somewhat boring,",
+        "but it serves a",
+        "purpose. End.",
+    ]
+    for segment, test_segment in zip(segments, expected):
+        assert segment.text == test_segment
+
+
+def test_split_content_to_fit_max():
+    segments = split_content_to_fit_max(
+        content=MIN_MAX_TEXT,
+        max_partition=75,
+    )
+    assert segments == [
+        "This is a story.",
+        "This is a story that doesn't matter because",
+        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
+        "The example is simple and repetitive and long",
+        "and somewhat boring, but it serves a purpose. End.",
+    ]
+
+
+def test_combine_paragraphs_less_than_min():
+    segments = combine_paragraphs_less_than_min(
+        SHORT_PARAGRAPHS.split("\n\n"),
+        max_partition=1500,
+        min_partition=7,
+    )
+    assert len(segments) < len(SHORT_PARAGRAPHS)
+
+
 def test_partition_text_doesnt_get_page_breaks():
     text = "--------------------"
     elements = partition_text(text=text)
@@ -7,7 +7,7 @@ from unstructured.file_utils.encoding import read_txt_file
 from unstructured.logger import logger
 from unstructured.partition.text import (
     element_from_text,
-    split_by_paragraph,
+    partition_text,
 )

 VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@@ -78,13 +78,16 @@ class XMLDocument(Document):
             # Please use bytes input or XML fragments without declaration.
             except ValueError:
                 document_tree = etree.fromstring(content.encode(), self.parser)

         if "<pre>" and "</pre>" in content:
             tree = etree.HTML(content)
             for element in tree.xpath("//pre"):
                 if not element.text:
                     continue
-                text_content = split_by_paragraph(element.text)
+                text_content = []
+                for element in partition_text(text=element.text, paragraph_grouper=False):
+                    text_content.append(element.text)
+
                 for text in text_content:
                     element = etree.Element("span")
                     element.text = str(element_from_text(text=text))
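The effect of the new branch above, sketched in isolation: paragraph_grouper=False is the sentinel the rewritten partition_text checks for, so line-oriented <pre> content skips group_broken_paragraphs instead of being re-wrapped (hypothetical inline sample):

from unstructured.partition.text import partition_text

# Hypothetical <pre> payload: line-oriented text where broken-paragraph
# grouping would merge lines that should stay separate.
pre_text = "<TYPE>10-K\n\n<COMPANY>Proctor & Gamble"

# paragraph_grouper=False skips group_broken_paragraphs entirely, so each
# blank-line-separated chunk is partitioned as-is.
for element in partition_text(text=pre_text, paragraph_grouper=False):
    print(element.text)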
@@ -52,7 +52,7 @@ from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
 from unstructured.partition.html import partition_html
-from unstructured.partition.text import partition_text, split_by_paragraph
+from unstructured.partition.text import partition_text

 VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
@@ -232,6 +232,7 @@ def partition_email(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an .eml documents into its constituent elements.
@@ -258,6 +259,9 @@ def partition_email(
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing the text/plain content.
     """
     if content_source not in VALID_CONTENT_SOURCES:
         raise ValueError(
@@ -270,7 +274,6 @@ def partition_email(

     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
-
     detected_encoding = "utf-8"
     if filename is not None:
         extracted_encoding, msg = parse_email(filename=filename)
@@ -342,12 +345,12 @@ def partition_email(
             continue

     elif content_source == "text/plain":
-        list_content = split_by_paragraph(content)
         elements = partition_text(
             text=content,
             encoding=encoding,
             max_partition=max_partition,
             metadata_filename=metadata_filename or filename,
+            min_partition=min_partition,
         )

     for idx, element in enumerate(elements):
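Mirroring the tests earlier in the diff, a usage sketch for the text/plain email path (fake-email.txt is the example document those tests use):

from unstructured.partition.email import partition_email

with open("example-docs/fake-email.txt") as f:
    elements = partition_email(
        file=f,
        content_source="text/plain",
        max_partition=500,
        min_partition=10,
    )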
@@ -22,6 +22,7 @@ def partition_msg(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@ def partition_msg(
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
@@ -57,7 +61,11 @@ def partition_msg(
     if "<html>" in text or "</div>" in text:
         elements = partition_html(text=text)
     else:
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )

     metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
     for element in elements:
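A usage sketch for the .msg path, which only reaches partition_text when the body is not HTML (example-docs/fake-email.msg is assumed here, matching the repo's other msg tests):

from unstructured.partition.msg import partition_msg

# Short fragments in a plain-text body are merged up to min_partition;
# anything over max_partition is split at sentence/word boundaries.
elements = partition_msg(
    filename="example-docs/fake-email.msg",
    max_partition=1500,
    min_partition=10,
)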
@@ -51,6 +51,7 @@ def partition_pdf(
     max_partition: Optional[int] = 1500,
     include_metadata: bool = True,
     metadata_filename: Optional[str] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@ def partition_pdf(
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied. Only applies to the "ocr_only" strategy.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
     return partition_pdf_or_image(
@@ -91,6 +95,7 @@ def partition_pdf(
         infer_table_structure=infer_table_structure,
         ocr_languages=ocr_languages,
         max_partition=max_partition,
+        min_partition=min_partition,
         **kwargs,
     )
@@ -116,6 +121,7 @@ def partition_pdf_or_image(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@ def partition_pdf_or_image(
             ocr_languages=ocr_languages,
             is_image=is_image,
             max_partition=max_partition,
+            min_partition=min_partition,
         )

     return layout_elements
@@ -391,6 +398,7 @@ def _partition_pdf_or_image_with_ocr(
     ocr_languages: str = "eng",
     is_image: bool = False,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
 ):
     """Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
     to an image prior to processing."""
@@ -402,7 +410,11 @@ def _partition_pdf_or_image_with_ocr(
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
         else:
             text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )
     else:
         elements = []
         page_number = 0
@@ -411,7 +423,11 @@ def _partition_pdf_or_image_with_ocr(
             metadata = ElementMetadata(filename=filename, page_number=page_number)
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")

-            _elements = partition_text(text=text, max_partition=max_partition)
+            _elements = partition_text(
+                text=text,
+                max_partition=max_partition,
+                min_partition=min_partition,
+            )
             for element in _elements:
                 element.metadata = metadata
                 elements.append(element)
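A hedged usage sketch for the OCR path: min_partition is simply threaded down to partition_text, so per the docstring above it only shapes output when the text comes from Tesseract (the "ocr_only" strategy). The file path is an example document assumed from the repo:

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/layout-parser-paper.pdf",
    strategy="ocr_only",  # min/max partitioning applies on this code path
    max_partition=1000,
    min_partition=50,
)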
@@ -1,4 +1,5 @@
 import re
+import textwrap
 from typing import IO, Callable, List, Optional, Tuple

 from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
@@ -26,52 +27,125 @@ from unstructured.partition.text_type import (
 )


-def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
-    paragraphs = re.split(PARAGRAPH_PATTERN, content)
-    if max_partition is None:
-        return paragraphs
-
-    split_paragraphs = []
-    for paragraph in paragraphs:
-        split_paragraphs.extend(
-            _split_to_fit_max_content(paragraph, max_partition=max_partition),
-        )
-    return split_paragraphs
+def _split_in_half_at_breakpoint(
+    content: str,
+    breakpoint: str = " ",
+) -> List[str]:
+    """Splits a segment of content at the breakpoint closest to the middle"""
+    mid = len(content) // 2
+    for i in range(len(content) // 2):
+        if content[mid + i] == breakpoint:
+            mid += i
+            break
+        elif content[mid - i] == breakpoint:
+            mid += -i
+            break
+
+    return [content[:mid].rstrip(), content[mid:].lstrip()]


 def _split_content_size_n(content: str, n: int) -> List[str]:
-    """Splits a string into chunks that are at most size n."""
+    """Splits a section of content into chunks that are at most
+    size n without breaking apart words."""
     segments = []
-    for i in range(0, len(content), n):
-        segment = content[i : i + n]  # noqa: E203
-        segments.append(segment)
+    if len(content) < n * 2:
+        segments = list(_split_in_half_at_breakpoint(content))
+    else:
+        segments = textwrap.wrap(content, width=n)
     return segments


-def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
-    """Splits a section of content so that all of the elements fit into the
+def split_content_to_fit_max(
+    content: str,
+    max_partition: Optional[int] = 1500,
+) -> List[str]:
+    """Splits a paragraph or section of content so that all of the elements fit into the
     max partition window."""
     sentences = sent_tokenize(content)
-    num_sentences = len(sentences)

     chunks = []
-    chunk = ""
-
-    for i, sentence in enumerate(sentences):
-        if len(sentence) > max_partition:
-            chunks.extend(_split_content_size_n(sentence, n=max_partition))
-
-        if len(chunk + " " + sentence) > max_partition:
-            chunks.append(chunk)
-            chunk = sentence
-        else:
-            chunk += " " + sentence
-
-        if i == num_sentences - 1:
-            chunks.append(chunk)
+    tmp_chunk = ""
+    for sentence in sentences:
+        if max_partition is not None and len(sentence) > max_partition:
+            if tmp_chunk:
+                chunks.append(tmp_chunk)
+                tmp_chunk = ""
+            segments = _split_content_size_n(sentence, n=max_partition)
+            chunks.extend(segments[:-1])
+            tmp_chunk = segments[-1]
+        else:
+            if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
+                chunks.append(tmp_chunk)
+                tmp_chunk = sentence
+            else:
+                if not tmp_chunk:
+                    tmp_chunk = sentence
+                else:
+                    tmp_chunk += " " + sentence
+    tmp_chunk = tmp_chunk.strip()
+    if tmp_chunk:
+        chunks.append(tmp_chunk)

     return chunks


+def combine_paragraphs_less_than_min(
+    split_paragraphs: List[str],
+    max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
+) -> List[str]:
+    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
+    if type(split_paragraphs) is not list:
+        raise ValueError("`split_paragraphs` is not a list")
+    file_content: List[str] = []
+    tmp_paragraph = ""
+    next_index = 0
+    for current_index, paragraph in enumerate(split_paragraphs):
+        if next_index > current_index:
+            continue  # Skip the current iteration if `next_index` is already updated
+        if min_partition is not None and len(paragraph) < min_partition:
+            # Combine paragraphs that are less than `min_partition`
+            # while not exceeding `max_partition`
+            tmp_paragraph += paragraph + "\n"
+
+            while len(tmp_paragraph.strip()) < min_partition:
+                if current_index + 1 == len(split_paragraphs):
+                    # If it's the last paragraph, append the paragraph
+                    # to the previous content
+                    file_content[-1] += " " + tmp_paragraph.rstrip()
+                    tmp_paragraph = ""
+                    break
+                for offset_index, para in enumerate(
+                    split_paragraphs[current_index + 1 :], start=1  # noqa
+                ):
+                    if (
+                        max_partition is not None
+                        and len(tmp_paragraph + "\n" + para) < max_partition
+                    ):
+                        tmp_paragraph += "\n" + para
+                        # Update `next_index` to skip already combined paragraphs
+                        next_index = offset_index + current_index + 1
+
+                        if len(tmp_paragraph.strip()) > min_partition:
+                            break  # Stop combining if the combined paragraphs
+                            # meet the `min_partition` requirement
+                    elif (
+                        max_partition is not None
+                        and len(tmp_paragraph) < min_partition
+                        and len(tmp_paragraph + "\n" + para) > max_partition
+                    ):
+                        raise ValueError(
+                            "`min_partition` and `max_partition` are defined too close together",
+                        )
+            if tmp_paragraph:
+                # Add the combined paragraph to the final result
+                file_content.append(
+                    tmp_paragraph.strip(),
+                )
+                tmp_paragraph = ""
+        else:
+            file_content.append(paragraph)
+    return file_content


 @process_metadata()
 @add_metadata_with_filetype(FileType.TXT)
 def partition_text(
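To see why the rewritten splitter no longer breaks words, here is a self-contained sketch of the midpoint-breakpoint idea (a renamed stand-in for _split_in_half_at_breakpoint, not the library function itself):

import textwrap
from typing import List


def split_in_half_at_space(content: str) -> List[str]:
    # Walk outward from the middle until a space is found, then split there.
    mid = len(content) // 2
    for i in range(len(content) // 2):
        if content[mid + i] == " ":
            mid += i
            break
        elif content[mid - i] == " ":
            mid -= i
            break
    return [content[:mid].rstrip(), content[mid:].lstrip()]


# Strings shorter than 2 * n are split near the middle at a space...
print(split_in_half_at_space("the quick brown fox jumps over the dog"))
# ['the quick brown fox', 'jumps over the dog']

# ...while longer strings go through textwrap.wrap, which also respects
# word boundaries when filling lines up to the given width.
print(textwrap.wrap("the quick brown fox jumps over the dog " * 5, width=25))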
@@ -83,9 +157,12 @@ def partition_text(
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
-    """Partitions an .txt documents into its constituent elements.
+    """Partitions a .txt document into its constituent paragraph elements.
+    If paragraphs are below "min_partition" or above "max_partition" boundaries,
+    they are combined or split.
     Parameters
     ----------
     filename
@@ -104,10 +181,19 @@ def partition_text(
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied.
+    min_partition
+        The minimum number of characters to include in a partition.
     """
     if text is not None and text.strip() == "" and not file and not filename:
         return []

+    if (
+        min_partition is not None
+        and max_partition is not None
+        and (min_partition > max_partition or min_partition < 0 or max_partition < 0)
+    ):
+        raise ValueError("Invalid values for min_partition and/or max_partition.")
+
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
@@ -120,12 +206,33 @@ def partition_text(
     elif text is not None:
         file_text = str(text)

-    if paragraph_grouper is not None:
+    if paragraph_grouper is False:
+        pass
+    elif paragraph_grouper is not None:
         file_text = paragraph_grouper(file_text)
     else:
         file_text = group_broken_paragraphs(file_text)

-    file_content = split_by_paragraph(file_text, max_partition=max_partition)
+    if min_partition is not None and len(file_text) < min_partition:
+        raise ValueError("`min_partition` cannot be larger than the length of file contents.")
+
+    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
+
+    paragraphs = combine_paragraphs_less_than_min(
+        split_paragraphs=split_paragraphs,
+        max_partition=max_partition,
+        min_partition=min_partition,
+    )
+
+    file_content = []
+
+    for paragraph in paragraphs:
+        file_content.extend(
+            split_content_to_fit_max(
+                content=paragraph,
+                max_partition=max_partition,
+            ),
+        )
+
     elements: List[Element] = []
     metadata = (
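Putting the pieces together, the new partition_text flow is: split on PARAGRAPH_PATTERN, combine paragraphs shorter than min_partition, then split anything longer than max_partition. A rough sketch of that order using the helpers the diff introduces (toy input; exact chunking depends on the sentence tokenizer):

import re

from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition.text import (
    combine_paragraphs_less_than_min,
    split_content_to_fit_max,
)

text = "Hi.\n\nHello.\n\nA noticeably longer paragraph that may need to be split."

# Step 1: regex split into paragraphs.
split_paragraphs = re.split(PARAGRAPH_PATTERN, text.strip())
# Step 2: merge paragraphs below min_partition, capped by max_partition.
paragraphs = combine_paragraphs_less_than_min(
    split_paragraphs=split_paragraphs,
    max_partition=60,
    min_partition=8,
)
# Step 3: split any paragraph that still exceeds max_partition.
chunks = []
for paragraph in paragraphs:
    chunks.extend(split_content_to_fit_max(content=paragraph, max_partition=60))
print(chunks)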
@@ -54,6 +54,7 @@ def partition_xml(
     include_metadata: bool = True,
     encoding: Optional[str] = None,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an XML document into its document elements.
@@ -77,6 +78,8 @@ def partition_xml(
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied.
+    min_partition
+        The minimum number of characters to include in a partition.
     """
     exactly_one(filename=filename, file=file)
@@ -97,6 +100,7 @@ def partition_xml(
         metadata_filename=metadata_filename,
         include_metadata=include_metadata,
         max_partition=max_partition,
+        min_partition=min_partition,
     )

     return elements
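Finally, a usage sketch for the XML path, which forwards min_partition through to partition_text (factbook.xml is the example document referenced in the tests above):

from unstructured.partition.xml import partition_xml

elements = partition_xml(
    filename="example-docs/factbook.xml",
    max_partition=1500,
    min_partition=10,
)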