From 676c50a6ecbe42360f5c522b2c5dff929cf76a3a Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Mon, 24 Jul 2023 10:57:24 -0500 Subject: [PATCH] feat: add min_partition kwarg that combines elements below a specified threshold (#926) * add min_partition * functioning _split_content_to_fit_min_max * create test and make tidy/check * fix rebase issues * fix type hinting, remove unused code, add tests * various changes and refactoring of methods * add test, refactor, change var names for debugging purposes * update test * make tidy/check * give more descriptive var names and add comments * update xml partition via partition_text and create test * fix
bug for test_partition_html_with_pre_tag
* make tidy
* refactor and fix tests
* make tidy/check
* ingest-test-fixtures-update
* change list comprehension to for loop
* fix error check
---
CHANGELOG.md | 2 +-
test_unstructured/documents/test_xml.py | 14 ++
test_unstructured/partition/test_email.py | 13 ++
test_unstructured/partition/test_image.py | 1 -
test_unstructured/partition/test_text.py | 104 ++++++++++++-
unstructured/documents/xml.py | 9 +-
unstructured/partition/email.py | 9 +-
unstructured/partition/msg.py | 10 +-
unstructured/partition/pdf.py | 20 ++-
unstructured/partition/text.py | 173 +++++++++++++++++-----
unstructured/partition/xml.py | 4 +
11 files changed, 314 insertions(+), 45 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3a0514540..bf112773a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
## 0.8.2-dev4
### Enhancements
-
+* Add min_partition kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max_partition so words are not split.
* set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx
diff --git a/test_unstructured/documents/test_xml.py b/test_unstructured/documents/test_xml.py
index 9e3e0ae03..bb05bc7b4 100644
--- a/test_unstructured/documents/test_xml.py
+++ b/test_unstructured/documents/test_xml.py
@@ -44,6 +44,20 @@ def test_from_string(sample_document):
assert type_tag.text.strip() == "10-K"
+def test_from_string_with_pre_tag():
+ sample_document = """
+
+
+ 10-K
+ Proctor & Gamble
+
+
+ """
+ xml_document = XMLDocument.from_string(sample_document)
+ type_tag = xml_document.document_tree.find(".//type")
+ assert type_tag.text.strip() == "10-K"
+
+
def test_read_with_stylesheet():
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
index 71d8b41a3..7f3aea511 100644
--- a/test_unstructured/partition/test_email.py
+++ b/test_unstructured/partition/test_email.py
@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
assert element.metadata.filename is None
+def test_partition_email_from_text_file_max():
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+ with open(filename) as f:
+ elements = partition_email(file=f, content_source="text/plain", max_partition=20)
+ assert len(elements) == 6
+
+
+def test_partition_email_from_text_file_raises_value_error():
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+ with pytest.raises(ValueError), open(filename) as f:
+ partition_email(file=f, content_source="text/plain", min_partition=1000)
+
+
def test_partition_email_from_text():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
with open(filename) as f:
diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py
index 10d529422..77a1f7996 100644
--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():
def test_partition_image_with_ocr_detects_korean_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
-
with open(filename, "rb") as f:
elements = image.partition_image(
file=f,
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
index 87887237a..de6ef82e6 100644
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@@ -5,7 +5,11 @@ import pytest
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
-from unstructured.partition.text import partition_text
+from unstructured.partition.text import (
+ combine_paragraphs_less_than_min,
+ partition_text,
+ split_content_to_fit_max,
+)
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -18,6 +22,31 @@ EXPECTED_OUTPUT = [
ListItem(text="I love fuzzy blankets"),
]
+MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
+ because it is just being used as an example. Hi. Hello. Howdy. Hola.
+ The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+ "\n",
+ "",
+)
+
+SHORT_PARAGRAPHS = """This is a story.
+
+This is a story that doesn't matter because it is just being used as an example.
+
+Hi.
+
+Hello.
+
+Howdy.
+
+Hola.
+
+The example is simple and repetitive and long and somewhat boring, but it serves a purpose.
+
+End.
+"""
+
@pytest.mark.parametrize(
("filename", "encoding"),
@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt
assert elements[-1].text.endswith("External links")
+def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
+ elements = partition_text(filename=filename)
+ elements_max_part = partition_text(filename=filename, max_partition=500)
+ assert len(elements) < len(elements_max_part)
+
+
+def test_partition_text_min_max():
+ segments = partition_text(
+ text=SHORT_PARAGRAPHS,
+ min_partition=6,
+ )
+ expected = [
+ "This is a story.",
+ "This is a story that doesn't matter because it is just being used as an example.",
+ "Hi. Hello.",
+ "Howdy.",
+ """Hola. The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+ "\n",
+ "",
+ ),
+ ]
+ for segment, test_segment in zip(segments, expected):
+ assert segment.text == test_segment
+
+ segments = partition_text(
+ text=SHORT_PARAGRAPHS,
+ max_partition=20,
+ min_partition=7,
+ )
+ expected = [
+ "This is a story.",
+ "This is a story that",
+ "doesn't matter",
+ "because it is just",
+ "being used as an",
+ "example.",
+ "Hi. Hello.",
+ "Howdy. Hola.",
+ "The example is",
+ "simple and",
+ "repetitive and long",
+ "and somewhat boring,",
+ "but it serves a",
+ "purpose. End.",
+ ]
+ for segment, test_segment in zip(segments, expected):
+ assert segment.text == test_segment
+
+
+def test_split_content_to_fit_max():
+ segments = split_content_to_fit_max(
+ content=MIN_MAX_TEXT,
+ max_partition=75,
+ )
+ assert segments == [
+ "This is a story.",
+ "This is a story that doesn't matter because",
+ "it is just being used as an example. Hi. Hello. Howdy. Hola.",
+ "The example is simple and repetitive and long",
+ "and somewhat boring, but it serves a purpose. End.",
+ ]
+
+
+def test_combine_paragraphs_less_than_min():
+ segments = combine_paragraphs_less_than_min(
+ SHORT_PARAGRAPHS.split("\n\n"),
+ max_partition=1500,
+ min_partition=7,
+ )
+ assert len(segments) < len(SHORT_PARAGRAPHS)
+
+
def test_partition_text_doesnt_get_page_breaks():
text = "--------------------"
elements = partition_text(text=text)
diff --git a/unstructured/documents/xml.py b/unstructured/documents/xml.py
index f7e08be39..0801d6391 100644
--- a/unstructured/documents/xml.py
+++ b/unstructured/documents/xml.py
@@ -7,7 +7,7 @@ from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
element_from_text,
- split_by_paragraph,
+ partition_text,
)
VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@@ -78,13 +78,16 @@ class XMLDocument(Document):
# Please use bytes input or XML fragments without declaration.
except ValueError:
document_tree = etree.fromstring(content.encode(), self.parser)
-
if "" and "
" in content:
tree = etree.HTML(content)
for element in tree.xpath("//pre"):
if not element.text:
continue
- text_content = split_by_paragraph(element.text)
+
+ text_content = []
+ for element in partition_text(text=element.text, paragraph_grouper=False):
+ text_content.append(element.text)
+
for text in text_content:
element = etree.Element("span")
element.text = str(element_from_text(text=text))
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
index 4f3a4d1ea..015f1a881 100644
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@@ -52,7 +52,7 @@ from unstructured.file_utils.filetype import FileType, add_metadata_with_filetyp
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.html import partition_html
-from unstructured.partition.text import partition_text, split_by_paragraph
+from unstructured.partition.text import partition_text
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
@@ -232,6 +232,7 @@ def partition_email(
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
+ min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions an .eml documents into its constituent elements.
@@ -258,6 +259,9 @@ def partition_email(
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
+ min_partition
+ The minimum number of characters to include in a partition. Only applies if
+ processing the text/plain content.
"""
if content_source not in VALID_CONTENT_SOURCES:
raise ValueError(
@@ -270,7 +274,6 @@ def partition_email(
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)
-
detected_encoding = "utf-8"
if filename is not None:
extracted_encoding, msg = parse_email(filename=filename)
@@ -342,12 +345,12 @@ def partition_email(
continue
elif content_source == "text/plain":
- list_content = split_by_paragraph(content)
elements = partition_text(
text=content,
encoding=encoding,
max_partition=max_partition,
metadata_filename=metadata_filename or filename,
+ min_partition=min_partition,
)
for idx, element in enumerate(elements):
diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
index 18e15a4f8..280700c51 100644
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@@ -22,6 +22,7 @@ def partition_msg(
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
+ min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@ def partition_msg(
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
+ min_partition
+ The minimum number of characters to include in a partition. Only applies if
+ processing text/plain content.
"""
exactly_one(filename=filename, file=file)
@@ -57,7 +61,11 @@ def partition_msg(
if "" in text or "" in text:
elements = partition_html(text=text)
else:
- elements = partition_text(text=text, max_partition=max_partition)
+ elements = partition_text(
+ text=text,
+ max_partition=max_partition,
+ min_partition=min_partition,
+ )
metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
for element in elements:
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 703d829e9..d416c76b2 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -51,6 +51,7 @@ def partition_pdf(
max_partition: Optional[int] = 1500,
include_metadata: bool = True,
metadata_filename: Optional[str] = None,
+ min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@ def partition_pdf(
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied. Only applies to the "ocr_only" strategy.
+ min_partition
+ The minimum number of characters to include in a partition. Only applies if
+ processing text/plain content.
"""
exactly_one(filename=filename, file=file)
return partition_pdf_or_image(
@@ -91,6 +95,7 @@ def partition_pdf(
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
max_partition=max_partition,
+ min_partition=min_partition,
**kwargs,
)
@@ -116,6 +121,7 @@ def partition_pdf_or_image(
infer_table_structure: bool = False,
ocr_languages: str = "eng",
max_partition: Optional[int] = 1500,
+ min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@ def partition_pdf_or_image(
ocr_languages=ocr_languages,
is_image=is_image,
max_partition=max_partition,
+ min_partition=min_partition,
)
return layout_elements
@@ -391,6 +398,7 @@ def _partition_pdf_or_image_with_ocr(
ocr_languages: str = "eng",
is_image: bool = False,
max_partition: Optional[int] = 1500,
+ min_partition: Optional[int] = 0,
):
"""Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
to an image prior to processing."""
@@ -402,7 +410,11 @@ def _partition_pdf_or_image_with_ocr(
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
else:
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
- elements = partition_text(text=text, max_partition=max_partition)
+ elements = partition_text(
+ text=text,
+ max_partition=max_partition,
+ min_partition=min_partition,
+ )
else:
elements = []
page_number = 0
@@ -411,7 +423,11 @@ def _partition_pdf_or_image_with_ocr(
metadata = ElementMetadata(filename=filename, page_number=page_number)
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
- _elements = partition_text(text=text, max_partition=max_partition)
+ _elements = partition_text(
+ text=text,
+ max_partition=max_partition,
+ min_partition=min_partition,
+ )
for element in _elements:
element.metadata = metadata
elements.append(element)
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
index 8fa334e14..a6d799eba 100644
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@@ -1,4 +1,5 @@
import re
+import textwrap
from typing import IO, Callable, List, Optional, Tuple
from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
@@ -26,52 +27,125 @@ from unstructured.partition.text_type import (
)
-def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
- paragraphs = re.split(PARAGRAPH_PATTERN, content)
- if max_partition is None:
- return paragraphs
+def _split_in_half_at_breakpoint(
+ content: str,
+ breakpoint: str = " ",
+) -> List[str]:
+ """Splits a segment of content at the breakpoint closest to the middle"""
+ mid = len(content) // 2
+ for i in range(len(content) // 2):
+ if content[mid + i] == breakpoint:
+ mid += i
+ break
+ elif content[mid - i] == breakpoint:
+ mid += -i
+ break
- split_paragraphs = []
- for paragraph in paragraphs:
- split_paragraphs.extend(
- _split_to_fit_max_content(paragraph, max_partition=max_partition),
- )
- return split_paragraphs
+ return [content[:mid].rstrip(), content[mid:].lstrip()]
def _split_content_size_n(content: str, n: int) -> List[str]:
- """Splits a string into chunks that are at most size n."""
+ """Splits a section of content into chunks that are at most
+ size n without breaking apart words."""
segments = []
- for i in range(0, len(content), n):
- segment = content[i : i + n] # noqa: E203
- segments.append(segment)
+ if len(content) < n * 2:
+ segments = list(_split_in_half_at_breakpoint(content))
+ else:
+ segments = textwrap.wrap(content, width=n)
return segments
-def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
- """Splits a section of content so that all of the elements fit into the
+def split_content_to_fit_max(
+ content: str,
+ max_partition: Optional[int] = 1500,
+) -> List[str]:
+ """Splits a paragraph or section of content so that all of the elements fit into the
max partition window."""
sentences = sent_tokenize(content)
- num_sentences = len(sentences)
-
chunks = []
- chunk = ""
-
- for i, sentence in enumerate(sentences):
- if len(sentence) > max_partition:
- chunks.extend(_split_content_size_n(sentence, n=max_partition))
-
- if len(chunk + " " + sentence) > max_partition:
- chunks.append(chunk)
- chunk = sentence
+ tmp_chunk = ""
+ for sentence in sentences:
+ if max_partition is not None and len(sentence) > max_partition:
+ if tmp_chunk:
+ chunks.append(tmp_chunk)
+ tmp_chunk = ""
+ segments = _split_content_size_n(sentence, n=max_partition)
+ chunks.extend(segments[:-1])
+ tmp_chunk = segments[-1]
else:
- chunk += " " + sentence
- if i == num_sentences - 1:
- chunks.append(chunk)
+ if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
+ chunks.append(tmp_chunk)
+ tmp_chunk = sentence
+ else:
+ if not tmp_chunk:
+ tmp_chunk = sentence
+ else:
+ tmp_chunk += " " + sentence
+ tmp_chunk = tmp_chunk.strip()
+ if tmp_chunk:
+ chunks.append(tmp_chunk)
return chunks
+def combine_paragraphs_less_than_min(
+ split_paragraphs: List[str],
+ max_partition: Optional[int] = 1500,
+ min_partition: Optional[int] = 0,
+) -> List[str]:
+ """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
+ if type(split_paragraphs) is not list:
+ raise ValueError("`split_paragraphs` is not a list")
+ file_content: List[str] = []
+ tmp_paragraph = ""
+ next_index = 0
+ for current_index, paragraph in enumerate(split_paragraphs):
+ if next_index > current_index:
+ continue # Skip the current iteration if `next_index`` is already updated
+ if min_partition is not None and len(paragraph) < min_partition:
+ # Combine paragraphs that are less than `min_partition``
+ # while not exceeding `max_partition``
+ tmp_paragraph += paragraph + "\n"
+
+ while len(tmp_paragraph.strip()) < min_partition:
+ if current_index + 1 == len(split_paragraphs):
+ # If it's the last paragraph, append the paragraph
+ # to the previous content
+ file_content[-1] += " " + tmp_paragraph.rstrip()
+ tmp_paragraph = ""
+ break
+ for offset_index, para in enumerate(
+ split_paragraphs[current_index + 1 :], start=1 # noqa
+ ):
+ if (
+ max_partition is not None
+ and len(tmp_paragraph + "\n" + para) < max_partition
+ ):
+ tmp_paragraph += "\n" + para
+ # Update `next_index` to skip already combined paragraphs
+ next_index = offset_index + current_index + 1
+
+ if len(tmp_paragraph.strip()) > min_partition:
+ break # Stop combining if the combined paragraphs
+ # meet the `min_partition`` requirement
+ elif (
+ max_partition is not None
+ and len(tmp_paragraph) < min_partition
+ and len(tmp_paragraph + "\n" + para) > max_partition
+ ):
+ raise ValueError(
+ "`min_partition` and `max_partition` are defined too close together",
+ )
+ # Add the combined paragraph to the final result
+ file_content.append(
+ tmp_paragraph.strip(),
+ )
+ tmp_paragraph = ""
+ else:
+ file_content.append(paragraph)
+ return file_content
+
+
@process_metadata()
@add_metadata_with_filetype(FileType.TXT)
def partition_text(
@@ -83,9 +157,12 @@ def partition_text(
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
max_partition: Optional[int] = 1500,
+ min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
- """Partitions an .txt documents into its constituent elements.
+ """Partitions an .txt documents into its constituent paragraph elements.
+ If paragraphs are below "min_partition" or above "max_partition" boundaries,
+ they are combined or split.
Parameters
----------
filename
@@ -104,10 +181,19 @@ def partition_text(
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied.
+ min_partition
+ The minimum number of characters to include in a partition.
"""
if text is not None and text.strip() == "" and not file and not filename:
return []
+ if (
+ min_partition is not None
+ and max_partition is not None
+ and (min_partition > max_partition or min_partition < 0 or max_partition < 0)
+ ):
+ raise ValueError("Invalid values for min_partition and/or max_partition.")
+
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)
@@ -120,12 +206,33 @@ def partition_text(
elif text is not None:
file_text = str(text)
- if paragraph_grouper is not None:
+ if paragraph_grouper is False:
+ pass
+ elif paragraph_grouper is not None:
file_text = paragraph_grouper(file_text)
else:
file_text = group_broken_paragraphs(file_text)
- file_content = split_by_paragraph(file_text, max_partition=max_partition)
+ if min_partition is not None and len(file_text) < min_partition:
+ raise ValueError("`min_partition` cannot be larger than the length of file contents.")
+
+ split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
+
+ paragraphs = combine_paragraphs_less_than_min(
+ split_paragraphs=split_paragraphs,
+ max_partition=max_partition,
+ min_partition=min_partition,
+ )
+
+ file_content = []
+
+ for paragraph in paragraphs:
+ file_content.extend(
+ split_content_to_fit_max(
+ content=paragraph,
+ max_partition=max_partition,
+ ),
+ )
elements: List[Element] = []
metadata = (
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
index 0ef609dfd..6b81f4af5 100644
--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@@ -54,6 +54,7 @@ def partition_xml(
include_metadata: bool = True,
encoding: Optional[str] = None,
max_partition: Optional[int] = 1500,
+ min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions an XML document into its document elements.
@@ -77,6 +78,8 @@ def partition_xml(
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied.
+ min_partition
+ The minimum number of characters to include in a partition.
"""
exactly_one(filename=filename, file=file)
@@ -97,6 +100,7 @@ def partition_xml(
metadata_filename=metadata_filename,
include_metadata=include_metadata,
max_partition=max_partition,
+ min_partition=min_partition,
)
return elements