feat: add min_partition kwarg that combines elements below a specified threshold (#926)

* add min_partition

* functioning _split_content_to_fit_min_max

* create test and make tidy/check

* fix rebase issues

* fix type hinting, remove unused code, add tests

* various changes and refactoring of methods

* add test, refactor, change var names for debugging purposes

* update test

* make tidy/check

* give more descriptive var names and add comments

* update xml partition via partition_text and create test

* fix <pre> bug for test_partition_html_with_pre_tag

* make tidy

* refactor and fix tests

* make tidy/check

* ingest-test-fixtures-update

* change list comprehension to for loop

* fix error check
Authored by John on 2023-07-24 10:57:24 -05:00; committed via GitHub
parent d0329126ef
commit 676c50a6ec
11 changed files with 314 additions and 45 deletions

View File

@@ -1,7 +1,7 @@
## 0.8.2-dev4
### Enhancements
* Add `min_partition` kwarg that combines elements below a specified threshold and modifies the splitting of strings longer than `max_partition` so that words are not split.
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx
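
A rough usage sketch of the new kwarg pair (the input text and parameter values here are invented for illustration): paragraphs shorter than min_partition are merged with their neighbors, and anything longer than max_partition is split between words.

from unstructured.partition.text import partition_text

text = "Hi.\n\nHello.\n\nThis is a longer paragraph that can stand on its own."
elements = partition_text(text=text, min_partition=10, max_partition=60)
for element in elements:
    print(repr(element.text))
# Expected shape: the two short greetings merge into one element,
# e.g. "Hi. Hello.", while the long paragraph stays intact.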

View File

@@ -44,6 +44,20 @@ def test_from_string(sample_document):
assert type_tag.text.strip() == "10-K"
def test_from_string_with_pre_tag():
sample_document = """
<pre>
<SEC-DOCUMENT>
<TYPE>10-K
<COMPANY>Proctor & Gamble
</SEC-DOCUMENT>
</pre>
"""
xml_document = XMLDocument.from_string(sample_document)
type_tag = xml_document.document_tree.find(".//type")
assert type_tag.text.strip() == "10-K"
def test_read_with_stylesheet():
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")

View File

@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
assert element.metadata.filename is None
def test_partition_email_from_text_file_max():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
with open(filename) as f:
elements = partition_email(file=f, content_source="text/plain", max_partition=20)
assert len(elements) == 6
def test_partition_email_from_text_file_raises_value_error():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
with pytest.raises(ValueError), open(filename) as f:
partition_email(file=f, content_source="text/plain", min_partition=1000)
def test_partition_email_from_text():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
with open(filename) as f:

View File

@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():
def test_partition_image_with_ocr_detects_korean_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
with open(filename, "rb") as f:
elements = image.partition_image(
file=f,

View File

@@ -5,7 +5,11 @@ import pytest
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import partition_text
from unstructured.partition.text import (
    combine_paragraphs_less_than_min,
    partition_text,
    split_content_to_fit_max,
)
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -18,6 +22,31 @@ EXPECTED_OUTPUT = [
ListItem(text="I love fuzzy blankets"),
]
MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
because it is just being used as an example. Hi. Hello. Howdy. Hola.
The example is simple and repetitive and long and somewhat boring,
but it serves a purpose. End.""".replace(
"\n",
"",
)
SHORT_PARAGRAPHS = """This is a story.

This is a story that doesn't matter because it is just being used as an example.

Hi.

Hello.

Howdy.

Hola.

The example is simple and repetitive and long and somewhat boring, but it serves a purpose.

End.
"""
@pytest.mark.parametrize(
("filename", "encoding"),
@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt
assert elements[-1].text.endswith("External links")
def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
elements = partition_text(filename=filename)
elements_max_part = partition_text(filename=filename, max_partition=500)
assert len(elements) < len(elements_max_part)
def test_partition_text_min_max():
segments = partition_text(
text=SHORT_PARAGRAPHS,
min_partition=6,
)
expected = [
"This is a story.",
"This is a story that doesn't matter because it is just being used as an example.",
"Hi. Hello.",
"Howdy.",
"""Hola. The example is simple and repetitive and long and somewhat boring,
but it serves a purpose. End.""".replace(
"\n",
"",
),
]
for segment, test_segment in zip(segments, expected):
assert segment.text == test_segment
segments = partition_text(
text=SHORT_PARAGRAPHS,
max_partition=20,
min_partition=7,
)
expected = [
"This is a story.",
"This is a story that",
"doesn't matter",
"because it is just",
"being used as an",
"example.",
"Hi. Hello.",
"Howdy. Hola.",
"The example is",
"simple and",
"repetitive and long",
"and somewhat boring,",
"but it serves a",
"purpose. End.",
]
for segment, test_segment in zip(segments, expected):
assert segment.text == test_segment
def test_split_content_to_fit_max():
segments = split_content_to_fit_max(
content=MIN_MAX_TEXT,
max_partition=75,
)
assert segments == [
"This is a story.",
"This is a story that doesn't matter because",
"it is just being used as an example. Hi. Hello. Howdy. Hola.",
"The example is simple and repetitive and long",
"and somewhat boring, but it serves a purpose. End.",
]
def test_combine_paragraphs_less_than_min():
segments = combine_paragraphs_less_than_min(
SHORT_PARAGRAPHS.split("\n\n"),
max_partition=1500,
min_partition=7,
)
assert len(segments) < len(SHORT_PARAGRAPHS)
def test_partition_text_doesnt_get_page_breaks():
text = "--------------------"
elements = partition_text(text=text)

View File

@@ -7,7 +7,7 @@ from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
    element_from_text,
    split_by_paragraph,
    partition_text,
)
VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@@ -78,13 +78,16 @@ class XMLDocument(Document):
            # Please use bytes input or XML fragments without declaration.
            except ValueError:
                document_tree = etree.fromstring(content.encode(), self.parser)

            if "<pre>" and "</pre>" in content:
                tree = etree.HTML(content)
                for element in tree.xpath("//pre"):
                    if not element.text:
                        continue
                    text_content = split_by_paragraph(element.text)
                    text_content = []
                    for element in partition_text(text=element.text, paragraph_grouper=False):
                        text_content.append(element.text)
                    for text in text_content:
                        element = etree.Element("span")
                        element.text = str(element_from_text(text=text))
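
In effect, text inside a <pre> tag is now partitioned with partition_text (with paragraph grouping disabled) instead of split_by_paragraph. A minimal standalone sketch of the same flow, with invented sample HTML:

from lxml import etree

from unstructured.partition.text import element_from_text, partition_text

content = "<html><body><pre>First line.\n\nSecond line.</pre></body></html>"
tree = etree.HTML(content)
for pre_tag in tree.xpath("//pre"):
    if not pre_tag.text:
        continue
    # paragraph_grouper=False skips regrouping of broken paragraphs
    for element in partition_text(text=pre_tag.text, paragraph_grouper=False):
        span = etree.Element("span")
        span.text = str(element_from_text(text=element.text))
        pre_tag.append(span)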

View File

@@ -52,7 +52,7 @@ from unstructured.file_utils.filetype import FileType, add_metadata_with_filetyp
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.html import partition_html
from unstructured.partition.text import partition_text, split_by_paragraph
from unstructured.partition.text import partition_text
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
@@ -232,6 +232,7 @@ def partition_email(
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions an .eml documents into its constituent elements.
@@ -258,6 +259,9 @@
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
min_partition
The minimum number of characters to include in a partition. Only applies if
processing the text/plain content.
"""
if content_source not in VALID_CONTENT_SOURCES:
raise ValueError(
@@ -270,7 +274,6 @@
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)
detected_encoding = "utf-8"
if filename is not None:
extracted_encoding, msg = parse_email(filename=filename)
@@ -342,12 +345,12 @@
                continue
        elif content_source == "text/plain":
            list_content = split_by_paragraph(content)
            elements = partition_text(
                text=content,
                encoding=encoding,
                max_partition=max_partition,
                metadata_filename=metadata_filename or filename,
                min_partition=min_partition,
            )

    for idx, element in enumerate(elements):
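
A hedged example of the new parameter on partition_email, reusing the fake-email.txt fixture from the tests above; min_partition is only honored for text/plain content:

from unstructured.partition.email import partition_email

with open("example-docs/fake-email.txt") as f:
    elements = partition_email(
        file=f,
        content_source="text/plain",
        max_partition=500,
        min_partition=10,
    )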

View File

@@ -22,6 +22,7 @@ def partition_msg(
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
min_partition
The minimum number of characters to include in a partition. Only applies if
processing text/plain content.
"""
exactly_one(filename=filename, file=file)
@@ -57,7 +61,11 @@
    if "<html>" in text or "</div>" in text:
        elements = partition_html(text=text)
    else:
        elements = partition_text(text=text, max_partition=max_partition)
        elements = partition_text(
            text=text,
            max_partition=max_partition,
            min_partition=min_partition,
        )

    metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
    for element in elements:
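
The same kwarg on partition_msg, sketched with a hypothetical .msg path; plain-text bodies are forwarded to partition_text, while HTML bodies still go through partition_html:

from unstructured.partition.msg import partition_msg

elements = partition_msg(
    filename="example-docs/fake-email.msg",  # hypothetical fixture path
    max_partition=1500,
    min_partition=10,
)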

View File

@@ -51,6 +51,7 @@ def partition_pdf(
max_partition: Optional[int] = 1500,
include_metadata: bool = True,
metadata_filename: Optional[str] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied. Only applies to the "ocr_only" strategy.
min_partition
The minimum number of characters to include in a partition. Only applies if
processing text/plain content.
"""
exactly_one(filename=filename, file=file)
return partition_pdf_or_image(
@@ -91,6 +95,7 @@
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
max_partition=max_partition,
min_partition=min_partition,
**kwargs,
)
@@ -116,6 +121,7 @@ def partition_pdf_or_image(
infer_table_structure: bool = False,
ocr_languages: str = "eng",
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@
ocr_languages=ocr_languages,
is_image=is_image,
max_partition=max_partition,
min_partition=min_partition,
)
return layout_elements
@@ -391,6 +398,7 @@ def _partition_pdf_or_image_with_ocr(
ocr_languages: str = "eng",
is_image: bool = False,
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
):
"""Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
to an image prior to processing."""
@@ -402,7 +410,11 @@
            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
        else:
            text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
        elements = partition_text(text=text, max_partition=max_partition)
        elements = partition_text(
            text=text,
            max_partition=max_partition,
            min_partition=min_partition,
        )
    else:
        elements = []
        page_number = 0
@@ -411,7 +423,11 @@
            metadata = ElementMetadata(filename=filename, page_number=page_number)
            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
            _elements = partition_text(text=text, max_partition=max_partition)
            _elements = partition_text(
                text=text,
                max_partition=max_partition,
                min_partition=min_partition,
            )
            for element in _elements:
                element.metadata = metadata
                elements.append(element)
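
A sketch of the threaded-through parameter on partition_pdf (the filename and the pre-existing strategy kwarg are assumptions here, not part of this diff); like max_partition, min_partition only takes effect when OCR text is re-partitioned via partition_text:

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/layout-parser-paper.pdf",  # illustrative path
    strategy="ocr_only",  # assumed existing kwarg that triggers the OCR path
    max_partition=1500,
    min_partition=15,
)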

View File

@@ -1,4 +1,5 @@
import re
import textwrap
from typing import IO, Callable, List, Optional, Tuple
from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
@@ -26,52 +27,125 @@ from unstructured.partition.text_type import (
)
def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
    paragraphs = re.split(PARAGRAPH_PATTERN, content)
    if max_partition is None:
        return paragraphs
def _split_in_half_at_breakpoint(
    content: str,
    breakpoint: str = " ",
) -> List[str]:
    """Splits a segment of content at the breakpoint closest to the middle"""
    mid = len(content) // 2
    for i in range(len(content) // 2):
        if content[mid + i] == breakpoint:
            mid += i
            break
        elif content[mid - i] == breakpoint:
            mid += -i
            break
    split_paragraphs = []
    for paragraph in paragraphs:
        split_paragraphs.extend(
            _split_to_fit_max_content(paragraph, max_partition=max_partition),
        )
    return split_paragraphs
    return [content[:mid].rstrip(), content[mid:].lstrip()]
def _split_content_size_n(content: str, n: int) -> List[str]:
    """Splits a string into chunks that are at most size n."""
    """Splits a section of content into chunks that are at most
    size n without breaking apart words."""
    segments = []
    for i in range(0, len(content), n):
        segment = content[i : i + n]  # noqa: E203
        segments.append(segment)
    if len(content) < n * 2:
        segments = list(_split_in_half_at_breakpoint(content))
    else:
        segments = textwrap.wrap(content, width=n)
    return segments
def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
    """Splits a section of content so that all of the elements fit into the
def split_content_to_fit_max(
    content: str,
    max_partition: Optional[int] = 1500,
) -> List[str]:
    """Splits a paragraph or section of content so that all of the elements fit into the
    max partition window."""
    sentences = sent_tokenize(content)
    num_sentences = len(sentences)
    chunks = []
    chunk = ""
    for i, sentence in enumerate(sentences):
        if len(sentence) > max_partition:
            chunks.extend(_split_content_size_n(sentence, n=max_partition))
        if len(chunk + " " + sentence) > max_partition:
            chunks.append(chunk)
            chunk = sentence
    tmp_chunk = ""
    for sentence in sentences:
        if max_partition is not None and len(sentence) > max_partition:
            if tmp_chunk:
                chunks.append(tmp_chunk)
                tmp_chunk = ""
            segments = _split_content_size_n(sentence, n=max_partition)
            chunks.extend(segments[:-1])
            tmp_chunk = segments[-1]
        else:
            chunk += " " + sentence
            if i == num_sentences - 1:
                chunks.append(chunk)
            if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
                chunks.append(tmp_chunk)
                tmp_chunk = sentence
            else:
                if not tmp_chunk:
                    tmp_chunk = sentence
                else:
                    tmp_chunk += " " + sentence
    tmp_chunk = tmp_chunk.strip()
    if tmp_chunk:
        chunks.append(tmp_chunk)
    return chunks
def combine_paragraphs_less_than_min(
    split_paragraphs: List[str],
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
) -> List[str]:
    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
    if type(split_paragraphs) is not list:
        raise ValueError("`split_paragraphs` is not a list")
    file_content: List[str] = []
    tmp_paragraph = ""
    next_index = 0
    for current_index, paragraph in enumerate(split_paragraphs):
        if next_index > current_index:
            continue  # Skip the current iteration if `next_index` is already updated
        if min_partition is not None and len(paragraph) < min_partition:
            # Combine paragraphs that are less than `min_partition`
            # while not exceeding `max_partition`
            tmp_paragraph += paragraph + "\n"
            while len(tmp_paragraph.strip()) < min_partition:
                if current_index + 1 == len(split_paragraphs):
                    # If it's the last paragraph, append the paragraph
                    # to the previous content
                    file_content[-1] += " " + tmp_paragraph.rstrip()
                    tmp_paragraph = ""
                    break
                for offset_index, para in enumerate(
                    split_paragraphs[current_index + 1 :], start=1  # noqa
                ):
                    if (
                        max_partition is not None
                        and len(tmp_paragraph + "\n" + para) < max_partition
                    ):
                        tmp_paragraph += "\n" + para
                        # Update `next_index` to skip already combined paragraphs
                        next_index = offset_index + current_index + 1
                        if len(tmp_paragraph.strip()) > min_partition:
                            break  # Stop combining if the combined paragraphs
                            # meet the `min_partition` requirement
                    elif (
                        max_partition is not None
                        and len(tmp_paragraph) < min_partition
                        and len(tmp_paragraph + "\n" + para) > max_partition
                    ):
                        raise ValueError(
                            "`min_partition` and `max_partition` are defined too close together",
                        )
            # Add the combined paragraph to the final result
            file_content.append(
                tmp_paragraph.strip(),
            )
            tmp_paragraph = ""
        else:
            file_content.append(paragraph)
    return file_content
@process_metadata()
@add_metadata_with_filetype(FileType.TXT)
def partition_text(
@@ -83,9 +157,12 @@ def partition_text(
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
    **kwargs,
) -> List[Element]:
    """Partitions a .txt document into its constituent elements.
    """Partitions a .txt document into its constituent paragraph elements.
    If paragraphs are below the `min_partition` or above the `max_partition`
    boundaries, they are combined or split.
Parameters
----------
filename
@@ -104,10 +181,19 @@
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied.
min_partition
The minimum number of characters to include in a partition.
"""
    if text is not None and text.strip() == "" and not file and not filename:
        return []

    if (
        min_partition is not None
        and max_partition is not None
        and (min_partition > max_partition or min_partition < 0 or max_partition < 0)
    ):
        raise ValueError("Invalid values for min_partition and/or max_partition.")

    # Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file, text=text)
@@ -120,12 +206,33 @@
    elif text is not None:
        file_text = str(text)

    if paragraph_grouper is not None:
    if paragraph_grouper is False:
        pass
    elif paragraph_grouper is not None:
        file_text = paragraph_grouper(file_text)
    else:
        file_text = group_broken_paragraphs(file_text)

    file_content = split_by_paragraph(file_text, max_partition=max_partition)
    if min_partition is not None and len(file_text) < min_partition:
        raise ValueError("`min_partition` cannot be larger than the length of file contents.")

    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
    paragraphs = combine_paragraphs_less_than_min(
        split_paragraphs=split_paragraphs,
        max_partition=max_partition,
        min_partition=min_partition,
    )
    file_content = []
    for paragraph in paragraphs:
        file_content.extend(
            split_content_to_fit_max(
                content=paragraph,
                max_partition=max_partition,
            ),
        )

    elements: List[Element] = []
    metadata = (
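
A hedged walk-through of the two helpers above on a made-up paragraph list: combine_paragraphs_less_than_min first merges paragraphs shorter than min_partition (never exceeding max_partition), then split_content_to_fit_max caps each merged paragraph at max_partition without breaking words:

from unstructured.partition.text import (
    combine_paragraphs_less_than_min,
    split_content_to_fit_max,
)

paragraphs = ["Hi.", "Hello.", "A somewhat longer closing paragraph."]
combined = combine_paragraphs_less_than_min(
    paragraphs,
    max_partition=100,
    min_partition=8,
)
# combined == ["Hi.\n\nHello.", "A somewhat longer closing paragraph."]

chunks = []
for paragraph in combined:
    chunks.extend(split_content_to_fit_max(content=paragraph, max_partition=20))
# chunks == ["Hi. Hello.", "A somewhat longer", "closing paragraph."]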

View File

@@ -54,6 +54,7 @@ def partition_xml(
include_metadata: bool = True,
encoding: Optional[str] = None,
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions an XML document into its document elements.
@@ -77,6 +78,8 @@
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied.
min_partition
The minimum number of characters to include in a partition.
"""
exactly_one(filename=filename, file=file)
@@ -97,6 +100,7 @@
metadata_filename=metadata_filename,
include_metadata=include_metadata,
max_partition=max_partition,
min_partition=min_partition,
)
return elements
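
And the pass-through on partition_xml, using the factbook.xml example document referenced in the tests above; min_partition simply travels alongside max_partition into partition_text:

from unstructured.partition.xml import partition_xml

elements = partition_xml(
    filename="example-docs/factbook.xml",
    max_partition=1500,
    min_partition=10,
)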