From 676c50a6ecbe42360f5c522b2c5dff929cf76a3a Mon Sep 17 00:00:00 2001
From: John <43506685+Coniferish@users.noreply.github.com>
Date: Mon, 24 Jul 2023 10:57:24 -0500
Subject: [PATCH] feat: add min_partition kwarg that combines elements below a
 specified threshold (#926)

* add min_partition

* functioning _split_content_to_fit_min_max

* create test and make tidy/check

* fix rebase issues

* fix type hinting, remove unused code, add tests

* various changes and refactoring of methods

* add test, refactor, change var names for debugging purposes

* update test

* make tidy/check

* give more descriptive var names and add comments

* update xml partition via partition_text and create test

* fix <pre> bug for test_partition_html_with_pre_tag

* make tidy

* refactor and fix tests

* make tidy/check

* ingest-test-fixtures-update

* change list comprehension to for loop

* fix error check
---
 CHANGELOG.md                              |   2 +-
 test_unstructured/documents/test_xml.py   |  14 ++
 test_unstructured/partition/test_email.py |  13 ++
 test_unstructured/partition/test_image.py |   1 -
 test_unstructured/partition/test_text.py  | 104 ++++++++++++-
 unstructured/documents/xml.py             |   9 +-
 unstructured/partition/email.py           |   9 +-
 unstructured/partition/msg.py             |  10 +-
 unstructured/partition/pdf.py             |  20 ++-
 unstructured/partition/text.py            | 173 +++++++++++++++++-----
 unstructured/partition/xml.py             |   4 +
 11 files changed, 314 insertions(+), 45 deletions(-)
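
Usage note (not part of the applied patch): a minimal sketch of how the new
min_partition kwarg is meant to be used together with max_partition, based on
the behavior exercised in test_partition_text_min_max further down. The sample
text and the chosen bounds here are illustrative only, not taken from the patch.

    from unstructured.partition.text import partition_text

    text = "Hi.\n\nHello.\n\nThis is a longer paragraph that stands on its own."

    # Paragraphs shorter than min_partition are merged with neighboring
    # paragraphs; anything longer than max_partition is split on sentence and
    # word boundaries so that words are never cut in half.
    elements = partition_text(text=text, min_partition=10, max_partition=40)
    for element in elements:
        print(len(element.text), element.text)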

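Implementation note (not part of the applied patch): the partition_text flow
added in unstructured/partition/text.py below reduces to the three steps
sketched here. This is a simplified restatement for review, not an exact copy
of the patched code; the sample paragraphs and bounds are made up.

    import re

    from unstructured.nlp.patterns import PARAGRAPH_PATTERN
    from unstructured.partition.text import (
        combine_paragraphs_less_than_min,
        split_content_to_fit_max,
    )

    file_text = "Hi.\n\nHello.\n\nA noticeably longer paragraph that may need to be split in two."

    # 1. Split the raw text into paragraphs.
    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())

    # 2. Merge paragraphs shorter than min_partition into their neighbors,
    #    without letting any merged paragraph grow past max_partition.
    paragraphs = combine_paragraphs_less_than_min(
        split_paragraphs,
        max_partition=60,
        min_partition=10,
    )

    # 3. Break paragraphs longer than max_partition at sentence and word
    #    boundaries so that no word is split across elements.
    chunks = []
    for paragraph in paragraphs:
        chunks.extend(split_content_to_fit_max(paragraph, max_partition=60))

    # partition_text performs these steps itself and also raises ValueError when
    # min_partition exceeds max_partition or the length of the input text.
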
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3a0514540..bf112773a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 ## 0.8.2-dev4
 
 ### Enhancements
-
+* Add `min_partition` kwarg that combines elements below a specified threshold and modifies the splitting of strings longer than `max_partition` so that words are not split.
 * set the file's current position to the beginning after reading the file in `convert_to_bytes`
 * Add slide notes to pptx
 
diff --git a/test_unstructured/documents/test_xml.py b/test_unstructured/documents/test_xml.py
index 9e3e0ae03..bb05bc7b4 100644
--- a/test_unstructured/documents/test_xml.py
+++ b/test_unstructured/documents/test_xml.py
@@ -44,6 +44,20 @@ def test_from_string(sample_document):
     assert type_tag.text.strip() == "10-K"
 
 
+def test_from_string_with_pre_tag():
+    sample_document = """
+    <pre>
+    <SEC-DOCUMENT>
+    <TYPE>10-K
+    <COMPANY>Proctor & Gamble
+    </SEC-DOCUMENT>
+    </pre>
+ """
+    xml_document = XMLDocument.from_string(sample_document)
+    type_tag = xml_document.document_tree.find(".//type")
+    assert type_tag.text.strip() == "10-K"
+
+
 def test_read_with_stylesheet():
     filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
     stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
index 71d8b41a3..7f3aea511 100644
--- a/test_unstructured/partition/test_email.py
+++ b/test_unstructured/partition/test_email.py
@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
         assert element.metadata.filename is None
 
 
+def test_partition_email_from_text_file_max():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with open(filename) as f:
+        elements = partition_email(file=f, content_source="text/plain", max_partition=20)
+    assert len(elements) == 6
+
+
+def test_partition_email_from_text_file_raises_value_error():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with pytest.raises(ValueError), open(filename) as f:
+        partition_email(file=f, content_source="text/plain", min_partition=1000)
+
+
 def test_partition_email_from_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
     with open(filename) as f:
diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py
index 10d529422..77a1f7996 100644
--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():
 
 def test_partition_image_with_ocr_detects_korean_from_file():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
-
     with open(filename, "rb") as f:
         elements = image.partition_image(
             file=f,
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
index 87887237a..de6ef82e6 100644
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@@ -5,7 +5,11 @@ import pytest
 
 from unstructured.cleaners.core import group_broken_paragraphs
 from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
-from unstructured.partition.text import partition_text
+from unstructured.partition.text import (
+    combine_paragraphs_less_than_min,
+    partition_text,
+    split_content_to_fit_max,
+)
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
 
@@ -18,6 +22,31 @@ EXPECTED_OUTPUT = [
     ListItem(text="I love fuzzy blankets"),
 ]
 
+MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
+ because it is just being used as an example. Hi. Hello. Howdy. Hola.
+ The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+    "\n",
+    "",
+)
+
+SHORT_PARAGRAPHS = """This is a story.
+
+This is a story that doesn't matter because it is just being used as an example.
+
+Hi.
+
+Hello.
+
+Howdy.
+
+Hola.
+
+The example is simple and repetitive and long and somewhat boring, but it serves a purpose.
+
+End.
+"""
+
 
 @pytest.mark.parametrize(
     ("filename", "encoding"),
@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt
     assert elements[-1].text.endswith("External links")
 
 
+def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
+    elements = partition_text(filename=filename)
+    elements_max_part = partition_text(filename=filename, max_partition=500)
+    assert len(elements) < len(elements_max_part)
+
+
+def test_partition_text_min_max():
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        min_partition=6,
+    )
+    expected = [
+        "This is a story.",
+        "This is a story that doesn't matter because it is just being used as an example.",
+        "Hi. Hello.",
+        "Howdy.",
+        """Hola. The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+            "\n",
+            "",
+        ),
+    ]
+    for segment, test_segment in zip(segments, expected):
+        assert segment.text == test_segment
+
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        max_partition=20,
+        min_partition=7,
+    )
+    expected = [
+        "This is a story.",
+        "This is a story that",
+        "doesn't matter",
+        "because it is just",
+        "being used as an",
+        "example.",
+        "Hi. Hello.",
+        "Howdy. Hola.",
+        "The example is",
+        "simple and",
+        "repetitive and long",
+        "and somewhat boring,",
+        "but it serves a",
+        "purpose. End.",
+    ]
+    for segment, test_segment in zip(segments, expected):
+        assert segment.text == test_segment
+
+
+def test_split_content_to_fit_max():
+    segments = split_content_to_fit_max(
+        content=MIN_MAX_TEXT,
+        max_partition=75,
+    )
+    assert segments == [
+        "This is a story.",
+        "This is a story that doesn't matter because",
+        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
+        "The example is simple and repetitive and long",
+        "and somewhat boring, but it serves a purpose. End.",
+    ]
+
+
+def test_combine_paragraphs_less_than_min():
+    segments = combine_paragraphs_less_than_min(
+        SHORT_PARAGRAPHS.split("\n\n"),
+        max_partition=1500,
+        min_partition=7,
+    )
+    assert len(segments) < len(SHORT_PARAGRAPHS)
+
+
 def test_partition_text_doesnt_get_page_breaks():
     text = "--------------------"
     elements = partition_text(text=text)
diff --git a/unstructured/documents/xml.py b/unstructured/documents/xml.py
index f7e08be39..0801d6391 100644
--- a/unstructured/documents/xml.py
+++ b/unstructured/documents/xml.py
@@ -7,7 +7,7 @@ from unstructured.file_utils.encoding import read_txt_file
 from unstructured.logger import logger
 from unstructured.partition.text import (
     element_from_text,
-    split_by_paragraph,
+    partition_text,
 )
 
 VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@@ -78,13 +78,16 @@ class XMLDocument(Document):
             # Please use bytes input or XML fragments without declaration.
             except ValueError:
                 document_tree = etree.fromstring(content.encode(), self.parser)
-
         if "<pre>" and "</pre>" in content:
             tree = etree.HTML(content)
             for element in tree.xpath("//pre"):
                 if not element.text:
                     continue
-                text_content = split_by_paragraph(element.text)
+
+                text_content = []
+                for element in partition_text(text=element.text, paragraph_grouper=False):
+                    text_content.append(element.text)
+
                 for text in text_content:
                     element = etree.Element("span")
                     element.text = str(element_from_text(text=text))
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
index 4f3a4d1ea..015f1a881 100644
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@@ -52,7 +52,7 @@ from unstructured.file_utils.filetype import FileType, add_metadata_with_filetyp
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
 from unstructured.partition.html import partition_html
-from unstructured.partition.text import partition_text, split_by_paragraph
+from unstructured.partition.text import partition_text
 
 VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
 
@@ -232,6 +232,7 @@ def partition_email(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an .eml documents into its constituent elements.
@@ -258,6 +259,9 @@
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing the text/plain content.
     """
     if content_source not in VALID_CONTENT_SOURCES:
         raise ValueError(
@@ -270,7 +274,6 @@ def partition_email(
 
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
-
     detected_encoding = "utf-8"
     if filename is not None:
         extracted_encoding, msg = parse_email(filename=filename)
@@ -342,12 +345,12 @@
             continue
 
     elif content_source == "text/plain":
-        list_content = split_by_paragraph(content)
         elements = partition_text(
             text=content,
             encoding=encoding,
             max_partition=max_partition,
             metadata_filename=metadata_filename or filename,
+            min_partition=min_partition,
         )
 
     for idx, element in enumerate(elements):
diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
index 18e15a4f8..280700c51 100644
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@@ -22,6 +22,7 @@ def partition_msg(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
 
@@ -57,7 +61,11 @@
     if "<html>" in text or "</html>" in text:
         elements = partition_html(text=text)
     else:
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )
 
     metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
     for element in elements:
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 703d829e9..d416c76b2 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -51,6 +51,7 @@ def partition_pdf(
     max_partition: Optional[int] = 1500,
     include_metadata: bool = True,
     metadata_filename: Optional[str] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied. Only applies to the "ocr_only" strategy.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
     return partition_pdf_or_image(
@@ -91,6 +95,7 @@
         infer_table_structure=infer_table_structure,
         ocr_languages=ocr_languages,
         max_partition=max_partition,
+        min_partition=min_partition,
         **kwargs,
     )
 
@@ -116,6 +121,7 @@ def partition_pdf_or_image(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@
             ocr_languages=ocr_languages,
             is_image=is_image,
             max_partition=max_partition,
+            min_partition=min_partition,
         )
 
         return layout_elements
@@ -391,6 +398,7 @@
     ocr_languages: str = "eng",
     is_image: bool = False,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
 ):
     """Partitions and image or PDF using Tesseract OCR.
     For PDFs, each page is converted to an image prior to processing."""
@@ -402,7 +410,11 @@
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
         else:
             text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )
     else:
         elements = []
         page_number = 0
@@ -411,7 +423,11 @@
             metadata = ElementMetadata(filename=filename, page_number=page_number)
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
 
-            _elements = partition_text(text=text, max_partition=max_partition)
+            _elements = partition_text(
+                text=text,
+                max_partition=max_partition,
+                min_partition=min_partition,
+            )
             for element in _elements:
                 element.metadata = metadata
                 elements.append(element)
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
index 8fa334e14..a6d799eba 100644
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@@ -1,4 +1,5 @@
 import re
+import textwrap
 from typing import IO, Callable, List, Optional, Tuple
 
 from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
@@ -26,52 +27,125 @@ from unstructured.partition.text_type import (
 )
 
 
-def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
-    paragraphs = re.split(PARAGRAPH_PATTERN, content)
-    if max_partition is None:
-        return paragraphs
+def _split_in_half_at_breakpoint(
+    content: str,
+    breakpoint: str = " ",
+) -> List[str]:
+    """Splits a segment of content at the breakpoint closest to the middle"""
+    mid = len(content) // 2
+    for i in range(len(content) // 2):
+        if content[mid + i] == breakpoint:
+            mid += i
+            break
+        elif content[mid - i] == breakpoint:
+            mid += -i
+            break
 
-    split_paragraphs = []
-    for paragraph in paragraphs:
-        split_paragraphs.extend(
-            _split_to_fit_max_content(paragraph, max_partition=max_partition),
-        )
-    return split_paragraphs
+    return [content[:mid].rstrip(), content[mid:].lstrip()]
 
 
 def _split_content_size_n(content: str, n: int) -> List[str]:
-    """Splits a string into chunks that are at most size n."""
+    """Splits a section of content into chunks that are at most
+    size n without breaking apart words."""
     segments = []
-    for i in range(0, len(content), n):
-        segment = content[i : i + n]  # noqa: E203
-        segments.append(segment)
+    if len(content) < n * 2:
+        segments = list(_split_in_half_at_breakpoint(content))
+    else:
+        segments = textwrap.wrap(content, width=n)
     return segments
 
 
-def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
-    """Splits a section of content so that all of the elements fit into the
+def split_content_to_fit_max(
+    content: str,
+    max_partition: Optional[int] = 1500,
+) -> List[str]:
+    """Splits a paragraph or section of content so that all of the elements fit into the
     max partition window."""
     sentences = sent_tokenize(content)
-    num_sentences = len(sentences)
-
     chunks = []
-    chunk = ""
-
-    for i, sentence in enumerate(sentences):
-        if len(sentence) > max_partition:
-            chunks.extend(_split_content_size_n(sentence, n=max_partition))
-
-        if len(chunk + " " + sentence) > max_partition:
-            chunks.append(chunk)
-            chunk = sentence
+    tmp_chunk = ""
+    for sentence in sentences:
+        if max_partition is not None and len(sentence) > max_partition:
+            if tmp_chunk:
+                chunks.append(tmp_chunk)
+                tmp_chunk = ""
+            segments = _split_content_size_n(sentence, n=max_partition)
+            chunks.extend(segments[:-1])
+            tmp_chunk = segments[-1]
         else:
-            chunk += " " + sentence
-            if i == num_sentences - 1:
-                chunks.append(chunk)
+            if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
+                chunks.append(tmp_chunk)
+                tmp_chunk = sentence
+            else:
+                if not tmp_chunk:
+                    tmp_chunk = sentence
+                else:
+                    tmp_chunk += " " + sentence
+    tmp_chunk = tmp_chunk.strip()
+    if tmp_chunk:
+        chunks.append(tmp_chunk)
     return chunks
 
 
+def combine_paragraphs_less_than_min(
+    split_paragraphs: List[str],
+    max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
+) -> List[str]:
+    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
+    if type(split_paragraphs) is not list:
+        raise ValueError("`split_paragraphs` is not a list")
+    file_content: List[str] = []
+    tmp_paragraph = ""
+    next_index = 0
+    for current_index, paragraph in enumerate(split_paragraphs):
+        if next_index > current_index:
+            continue  # Skip the current iteration if `next_index`` is already updated
+        if min_partition is not None and len(paragraph) < min_partition:
+            # Combine paragraphs that are less than `min_partition``
+            # while not exceeding `max_partition``
+            tmp_paragraph += paragraph + "\n"
+
+            while len(tmp_paragraph.strip()) < min_partition:
+                if current_index + 1 == len(split_paragraphs):
+                    # If it's the last paragraph, append the paragraph
+                    # to the previous content
+                    file_content[-1] += " " + tmp_paragraph.rstrip()
+                    tmp_paragraph = ""
+                    break
+                for offset_index, para in enumerate(
+                    split_paragraphs[current_index + 1 :], start=1  # noqa
+                ):
+                    if (
+                        max_partition is not None
+                        and len(tmp_paragraph + "\n" + para) < max_partition
+                    ):
+                        tmp_paragraph += "\n" + para
+                        # Update `next_index` to skip already combined paragraphs
+                        next_index = offset_index + current_index + 1
+
+                        if len(tmp_paragraph.strip()) > min_partition:
+                            break  # Stop combining if the combined paragraphs
+                            # meet the `min_partition`` requirement
+                    elif (
+                        max_partition is not None
+                        and len(tmp_paragraph) < min_partition
+                        and len(tmp_paragraph + "\n" + para) > max_partition
+                    ):
+                        raise ValueError(
+                            "`min_partition` and `max_partition` are defined too close together",
+                        )
+            # Add the combined paragraph to the final result
+            file_content.append(
+                tmp_paragraph.strip(),
+            )
+            tmp_paragraph = ""
+        else:
+            file_content.append(paragraph)
+    return file_content
+
+
 @process_metadata()
 @add_metadata_with_filetype(FileType.TXT)
 def partition_text(
@@ -83,9 +157,12 @@
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
-    """Partitions an .txt documents into its constituent elements.
+    """Partitions an .txt documents into its constituent paragraph elements.
+    If paragraphs are below "min_partition" or above "max_partition" boundaries,
+    they are combined or split.
     Parameters
     ----------
     filename
@@ -104,10 +181,19 @@
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied.
+    min_partition
+        The minimum number of characters to include in a partition.
     """
     if text is not None and text.strip() == "" and not file and not filename:
         return []
 
+    if (
+        min_partition is not None
+        and max_partition is not None
+        and (min_partition > max_partition or min_partition < 0 or max_partition < 0)
+    ):
+        raise ValueError("Invalid values for min_partition and/or max_partition.")
+
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
 
@@ -120,12 +206,33 @@
     elif text is not None:
         file_text = str(text)
 
-    if paragraph_grouper is not None:
+    if paragraph_grouper is False:
+        pass
+    elif paragraph_grouper is not None:
         file_text = paragraph_grouper(file_text)
     else:
         file_text = group_broken_paragraphs(file_text)
 
-    file_content = split_by_paragraph(file_text, max_partition=max_partition)
+    if min_partition is not None and len(file_text) < min_partition:
+        raise ValueError("`min_partition` cannot be larger than the length of file contents.")
+
+    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
+
+    paragraphs = combine_paragraphs_less_than_min(
+        split_paragraphs=split_paragraphs,
+        max_partition=max_partition,
+        min_partition=min_partition,
+    )
+
+    file_content = []
+
+    for paragraph in paragraphs:
+        file_content.extend(
+            split_content_to_fit_max(
+                content=paragraph,
+                max_partition=max_partition,
+            ),
+        )
 
     elements: List[Element] = []
     metadata = (
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
index 0ef609dfd..6b81f4af5 100644
--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@@ -54,6 +54,7 @@ def partition_xml(
     include_metadata: bool = True,
     encoding: Optional[str] = None,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an XML document into its document elements.
@@ -77,6 +78,8 @@
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied.
+    min_partition
+        The minimum number of characters to include in a partition.
     """
     exactly_one(filename=filename, file=file)
 
@@ -97,6 +100,7 @@
         metadata_filename=metadata_filename,
         include_metadata=include_metadata,
         max_partition=max_partition,
+        min_partition=min_partition,
     )
 
     return elements