From 676c50a6ecbe42360f5c522b2c5dff929cf76a3a Mon Sep 17 00:00:00 2001
From: John <43506685+Coniferish@users.noreply.github.com>
Date: Mon, 24 Jul 2023 10:57:24 -0500
Subject: [PATCH] feat: add min_partition kwarg that combines elements below a
 specified threshold (#926)

* add min_partition

* functioning _split_content_to_fit_min_max

* create test and make tidy/check

* fix rebase issues

* fix type hinting, remove unused code, add tests

* various changes and refactoring of methods

* add test, refactor, change var names for debugging purposes

* update test

* make tidy/check

* give more descriptive var names and add comments

* update xml partition via partition_text and create test

* fix <pre> bug for test_partition_html_with_pre_tag

* make tidy

* refactor and fix tests

* make tidy/check

* ingest-test-fixtures-update

* change list comprehension to for loop

* fix error check
---
 CHANGELOG.md                              |   2 +-
 test_unstructured/documents/test_xml.py   |  14 ++
 test_unstructured/partition/test_email.py |  13 ++
 test_unstructured/partition/test_image.py |   1 -
 test_unstructured/partition/test_text.py  | 104 ++++++++++++-
 unstructured/documents/xml.py             |   9 +-
 unstructured/partition/email.py           |   9 +-
 unstructured/partition/msg.py             |  10 +-
 unstructured/partition/pdf.py             |  20 ++-
 unstructured/partition/text.py            | 173 +++++++++++++++++-----
 unstructured/partition/xml.py             |   4 +
 11 files changed, 314 insertions(+), 45 deletions(-)
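
Usage note (not part of the applied patch): a minimal sketch of how the new
min_partition kwarg is meant to be used together with max_partition, based on
the behavior exercised in test_partition_text_min_max further down. The sample
text and the chosen bounds here are illustrative only, not taken from the patch.

    from unstructured.partition.text import partition_text

    text = "Hi.\n\nHello.\n\nThis is a longer paragraph that stands on its own."

    # Paragraphs shorter than min_partition are merged with neighboring
    # paragraphs; anything longer than max_partition is split on sentence and
    # word boundaries so that words are never cut in half.
    elements = partition_text(text=text, min_partition=10, max_partition=40)
    for element in elements:
        print(len(element.text), element.text)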

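Implementation note (not part of the applied patch): the partition_text flow
added in unstructured/partition/text.py below reduces to the three steps
sketched here. This is a simplified restatement for review, not an exact copy
of the patched code; the sample paragraphs and bounds are made up.

    import re

    from unstructured.nlp.patterns import PARAGRAPH_PATTERN
    from unstructured.partition.text import (
        combine_paragraphs_less_than_min,
        split_content_to_fit_max,
    )

    file_text = "Hi.\n\nHello.\n\nA noticeably longer paragraph that may need to be split in two."

    # 1. Split the raw text into paragraphs.
    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())

    # 2. Merge paragraphs shorter than min_partition into their neighbors,
    #    without letting any merged paragraph grow past max_partition.
    paragraphs = combine_paragraphs_less_than_min(
        split_paragraphs,
        max_partition=60,
        min_partition=10,
    )

    # 3. Break paragraphs longer than max_partition at sentence and word
    #    boundaries so that no word is split across elements.
    chunks = []
    for paragraph in paragraphs:
        chunks.extend(split_content_to_fit_max(paragraph, max_partition=60))

    # partition_text performs these steps itself and also raises ValueError when
    # min_partition exceeds max_partition or the length of the input text.
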
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3a0514540..bf112773a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 ## 0.8.2-dev4
 
 ### Enhancements
-
+* Add `min_partition` kwarg that combines elements below a specified threshold and modifies the splitting of strings longer than `max_partition` so that words are not split.
 * set the file's current position to the beginning after reading the file in `convert_to_bytes`
 * Add slide notes to pptx
 
diff --git a/test_unstructured/documents/test_xml.py b/test_unstructured/documents/test_xml.py
index 9e3e0ae03..bb05bc7b4 100644
--- a/test_unstructured/documents/test_xml.py
+++ b/test_unstructured/documents/test_xml.py
@@ -44,6 +44,20 @@ def test_from_string(sample_document):
     assert type_tag.text.strip() == "10-K"
 
 
+def test_from_string_with_pre_tag():
+    sample_document = """
+    <pre>
+    <SEC-DOCUMENT>
+    <TYPE>10-K
+    <COMPANY>Proctor & Gamble
+    </SEC-DOCUMENT>
+    </pre>
+ """
+    xml_document = XMLDocument.from_string(sample_document)
+    type_tag = xml_document.document_tree.find(".//type")
+    assert type_tag.text.strip() == "10-K"
+
+
 def test_read_with_stylesheet():
     filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
     stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
index 71d8b41a3..7f3aea511 100644
--- a/test_unstructured/partition/test_email.py
+++ b/test_unstructured/partition/test_email.py
@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
         assert element.metadata.filename is None
 
 
+def test_partition_email_from_text_file_max():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with open(filename) as f:
+        elements = partition_email(file=f, content_source="text/plain", max_partition=20)
+    assert len(elements) == 6
+
+
+def test_partition_email_from_text_file_raises_value_error():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with pytest.raises(ValueError), open(filename) as f:
+        partition_email(file=f, content_source="text/plain", min_partition=1000)
+
+
 def test_partition_email_from_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
     with open(filename) as f:
diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py
index 10d529422..77a1f7996 100644
--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():
 
 def test_partition_image_with_ocr_detects_korean_from_file():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
-
     with open(filename, "rb") as f:
         elements = image.partition_image(
             file=f,
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
index 87887237a..de6ef82e6 100644
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@@ -5,7 +5,11 @@ import pytest
 
 from unstructured.cleaners.core import group_broken_paragraphs
 from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
-from unstructured.partition.text import partition_text
+from unstructured.partition.text import (
+    combine_paragraphs_less_than_min,
+    partition_text,
+    split_content_to_fit_max,
+)
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
 
@@ -18,6 +22,31 @@ EXPECTED_OUTPUT = [
     ListItem(text="I love fuzzy blankets"),
 ]
 
+MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
+ because it is just being used as an example. Hi. Hello. Howdy. Hola.
+ The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+    "\n",
+    "",
+)
+
+SHORT_PARAGRAPHS = """This is a story.
+
+This is a story that doesn't matter because it is just being used as an example.
+
+Hi.
+
+Hello.
+
+Howdy.
+
+Hola.
+
+The example is simple and repetitive and long and somewhat boring, but it serves a purpose.
+
+End.
+"""
+
 
 @pytest.mark.parametrize(
     ("filename", "encoding"),
@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt
     assert elements[-1].text.endswith("External links")
 
 
+def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
+    elements = partition_text(filename=filename)
+    elements_max_part = partition_text(filename=filename, max_partition=500)
+    assert len(elements) < len(elements_max_part)
+
+
+def test_partition_text_min_max():
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        min_partition=6,
+    )
+    expected = [
+        "This is a story.",
+        "This is a story that doesn't matter because it is just being used as an example.",
+        "Hi. Hello.",
+        "Howdy.",
+        """Hola. The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+            "\n",
+            "",
+        ),
+    ]
+    for segment, test_segment in zip(segments, expected):
+        assert segment.text == test_segment
+
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        max_partition=20,
+        min_partition=7,
+    )
+    expected = [
+        "This is a story.",
+        "This is a story that",
+        "doesn't matter",
+        "because it is just",
+        "being used as an",
+        "example.",
+        "Hi. Hello.",
+        "Howdy. Hola.",
+        "The example is",
+        "simple and",
+        "repetitive and long",
+        "and somewhat boring,",
+        "but it serves a",
+        "purpose. End.",
+    ]
+    for segment, test_segment in zip(segments, expected):
+        assert segment.text == test_segment
+
+
+def test_split_content_to_fit_max():
+    segments = split_content_to_fit_max(
+        content=MIN_MAX_TEXT,
+        max_partition=75,
+    )
+    assert segments == [
+        "This is a story.",
+        "This is a story that doesn't matter because",
+        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
+        "The example is simple and repetitive and long",
+        "and somewhat boring, but it serves a purpose. End.",
+    ]
+
+
+def test_combine_paragraphs_less_than_min():
+    segments = combine_paragraphs_less_than_min(
+        SHORT_PARAGRAPHS.split("\n\n"),
+        max_partition=1500,
+        min_partition=7,
+    )
+    assert len(segments) < len(SHORT_PARAGRAPHS)
+
+
 def test_partition_text_doesnt_get_page_breaks():
     text = "--------------------"
     elements = partition_text(text=text)
diff --git a/unstructured/documents/xml.py b/unstructured/documents/xml.py
index f7e08be39..0801d6391 100644
--- a/unstructured/documents/xml.py
+++ b/unstructured/documents/xml.py
@@ -7,7 +7,7 @@ from unstructured.file_utils.encoding import read_txt_file
 from unstructured.logger import logger
 from unstructured.partition.text import (
     element_from_text,
-    split_by_paragraph,
+    partition_text,
 )
 
 VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@@ -78,13 +78,16 @@ class XMLDocument(Document):
             # Please use bytes input or XML fragments without declaration.
             except ValueError:
                 document_tree = etree.fromstring(content.encode(), self.parser)
-
         if "<pre>" and "</pre>" in content:
             tree = etree.HTML(content)
             for element in tree.xpath("//pre"):
                 if not element.text:
                     continue
-                text_content = split_by_paragraph(element.text)
+
+                text_content = []
+                for element in partition_text(text=element.text, paragraph_grouper=False):
+                    text_content.append(element.text)
+
                 for text in text_content:
                     element = etree.Element("span")
                     element.text = str(element_from_text(text=text))
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
index 4f3a4d1ea..015f1a881 100644
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@@ -52,7 +52,7 @@ from unstructured.file_utils.filetype import FileType, add_metadata_with_filetyp
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
 from unstructured.partition.html import partition_html
-from unstructured.partition.text import partition_text, split_by_paragraph
+from unstructured.partition.text import partition_text
 
 VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
 
@@ -232,6 +232,7 @@ def partition_email(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an .eml documents into its constituent elements.
@@ -258,6 +259,9 @@
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing the text/plain content.
     """
     if content_source not in VALID_CONTENT_SOURCES:
         raise ValueError(
@@ -270,7 +274,6 @@ def partition_email(
 
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
-
     detected_encoding = "utf-8"
     if filename is not None:
         extracted_encoding, msg = parse_email(filename=filename)
@@ -342,12 +345,12 @@
             continue
 
     elif content_source == "text/plain":
-        list_content = split_by_paragraph(content)
         elements = partition_text(
             text=content,
             encoding=encoding,
             max_partition=max_partition,
             metadata_filename=metadata_filename or filename,
+            min_partition=min_partition,
         )
 
     for idx, element in enumerate(elements):
diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
index 18e15a4f8..280700c51 100644
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@@ -22,6 +22,7 @@ def partition_msg(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
 
@@ -57,7 +61,11 @@
     if "<html>" in text or "</html>" in text:
         elements = partition_html(text=text)
     else:
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )
 
     metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
     for element in elements:
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 703d829e9..d416c76b2 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -51,6 +51,7 @@ def partition_pdf(
     max_partition: Optional[int] = 1500,
     include_metadata: bool = True,
     metadata_filename: Optional[str] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied. Only applies to the "ocr_only" strategy.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
     return partition_pdf_or_image(
@@ -91,6 +95,7 @@
         infer_table_structure=infer_table_structure,
         ocr_languages=ocr_languages,
         max_partition=max_partition,
+        min_partition=min_partition,
         **kwargs,
     )
 
@@ -116,6 +121,7 @@ def partition_pdf_or_image(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@
             ocr_languages=ocr_languages,
             is_image=is_image,
             max_partition=max_partition,
+            min_partition=min_partition,
         )
 
         return layout_elements
@@ -391,6 +398,7 @@
     ocr_languages: str = "eng",
     is_image: bool = False,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
 ):
     """Partitions and image or PDF using Tesseract OCR.
     For PDFs, each page is converted to an image prior to processing."""
@@ -402,7 +410,11 @@
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
         else:
             text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )
     else:
         elements = []
         page_number = 0
@@ -411,7 +423,11 @@
             metadata = ElementMetadata(filename=filename, page_number=page_number)
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
 
-            _elements = partition_text(text=text, max_partition=max_partition)
+            _elements = partition_text(
+                text=text,
+                max_partition=max_partition,
+                min_partition=min_partition,
+            )
             for element in _elements:
                 element.metadata = metadata
                 elements.append(element)
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
index 8fa334e14..a6d799eba 100644
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@@ -1,4 +1,5 @@
 import re
+import textwrap
 from typing import IO, Callable, List, Optional, Tuple
 
 from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
@@ -26,52 +27,125 @@ from unstructured.partition.text_type import (
 )
 
 
-def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
-    paragraphs = re.split(PARAGRAPH_PATTERN, content)
-    if max_partition is None:
-        return paragraphs
+def _split_in_half_at_breakpoint(
+    content: str,
+    breakpoint: str = " ",
+) -> List[str]:
+    """Splits a segment of content at the breakpoint closest to the middle"""
+    mid = len(content) // 2
+    for i in range(len(content) // 2):
+        if content[mid + i] == breakpoint:
+            mid += i
+            break
+        elif content[mid - i] == breakpoint:
+            mid += -i
+            break
 
-    split_paragraphs = []
-    for paragraph in paragraphs:
-        split_paragraphs.extend(
-            _split_to_fit_max_content(paragraph, max_partition=max_partition),
-        )
-    return split_paragraphs
+    return [content[:mid].rstrip(), content[mid:].lstrip()]
 
 
 def _split_content_size_n(content: str, n: int) -> List[str]:
-    """Splits a string into chunks that are at most size n."""
+    """Splits a section of content into chunks that are at most
+    size n without breaking apart words."""
     segments = []
-    for i in range(0, len(content), n):
-        segment = content[i : i + n]  # noqa: E203
-        segments.append(segment)
+    if len(content) < n * 2:
+        segments = list(_split_in_half_at_breakpoint(content))
+    else:
+        segments = textwrap.wrap(content, width=n)
     return segments
 
 
-def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
-    """Splits a section of content so that all of the elements fit into the
+def split_content_to_fit_max(
+    content: str,
+    max_partition: Optional[int] = 1500,
+) -> List[str]:
+    """Splits a paragraph or section of content so that all of the elements fit into the
     max partition window."""
     sentences = sent_tokenize(content)
-    num_sentences = len(sentences)
-
     chunks = []
-    chunk = ""
-
-    for i, sentence in enumerate(sentences):
-        if len(sentence) > max_partition:
-            chunks.extend(_split_content_size_n(sentence, n=max_partition))
-
-        if len(chunk + " " + sentence) > max_partition:
-            chunks.append(chunk)
-            chunk = sentence
+    tmp_chunk = ""
+    for sentence in sentences:
+        if max_partition is not None and len(sentence) > max_partition:
+            if tmp_chunk:
+                chunks.append(tmp_chunk)
+                tmp_chunk = ""
+            segments = _split_content_size_n(sentence, n=max_partition)
+            chunks.extend(segments[:-1])
+            tmp_chunk = segments[-1]
         else:
-            chunk += " " + sentence
-            if i == num_sentences - 1:
-                chunks.append(chunk)
+            if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
+                chunks.append(tmp_chunk)
+                tmp_chunk = sentence
+            else:
+                if not tmp_chunk:
+                    tmp_chunk = sentence
+                else:
+                    tmp_chunk += " " + sentence
+    tmp_chunk = tmp_chunk.strip()
+    if tmp_chunk:
+        chunks.append(tmp_chunk)
     return chunks
 
 
+def combine_paragraphs_less_than_min(
+    split_paragraphs: List[str],
+    max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
+) -> List[str]:
+    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
+    if type(split_paragraphs) is not list:
+        raise ValueError("`split_paragraphs` is not a list")
+    file_content: List[str] = []
+    tmp_paragraph = ""
+    next_index = 0
+    for current_index, paragraph in enumerate(split_paragraphs):
+        if next_index > current_index:
+            continue  # Skip the current iteration if `next_index`` is already updated
+        if min_partition is not None and len(paragraph) < min_partition:
+            # Combine paragraphs that are less than `min_partition``
+            # while not exceeding `max_partition``
+            tmp_paragraph += paragraph + "\n"
+
+            while len(tmp_paragraph.strip()) < min_partition:
+                if current_index + 1 == len(split_paragraphs):
+                    # If it's the last paragraph, append the paragraph
+                    # to the previous content
+                    file_content[-1] += " " + tmp_paragraph.rstrip()
+                    tmp_paragraph = ""
+                    break
+                for offset_index, para in enumerate(
+                    split_paragraphs[current_index + 1 :], start=1  # noqa
+                ):
+                    if (
+                        max_partition is not None
+                        and len(tmp_paragraph + "\n" + para) < max_partition
+                    ):
+                        tmp_paragraph += "\n" + para
+                        # Update `next_index` to skip already combined paragraphs
+                        next_index = offset_index + current_index + 1
+
+                        if len(tmp_paragraph.strip()) > min_partition:
+                            break  # Stop combining if the combined paragraphs
+                            # meet the `min_partition`` requirement
+                    elif (
+                        max_partition is not None
+                        and len(tmp_paragraph) < min_partition
+                        and len(tmp_paragraph + "\n" + para) > max_partition
+                    ):
+                        raise ValueError(
+                            "`min_partition` and `max_partition` are defined too close together",
+                        )
+            # Add the combined paragraph to the final result
+            file_content.append(
+                tmp_paragraph.strip(),
+            )
+            tmp_paragraph = ""
+        else:
+            file_content.append(paragraph)
+    return file_content
+
+
 @process_metadata()
 @add_metadata_with_filetype(FileType.TXT)
 def partition_text(
@@ -83,9 +157,12 @@
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
-    """Partitions an .txt documents into its constituent elements.
+    """Partitions an .txt documents into its constituent paragraph elements.
+    If paragraphs are below "min_partition" or above "max_partition" boundaries,
+    they are combined or split.
     Parameters
     ----------
     filename
@@ -104,10 +181,19 @@
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied.
+    min_partition
+        The minimum number of characters to include in a partition.
     """
     if text is not None and text.strip() == "" and not file and not filename:
         return []
 
+    if (
+        min_partition is not None
+        and max_partition is not None
+        and (min_partition > max_partition or min_partition < 0 or max_partition < 0)
+    ):
+        raise ValueError("Invalid values for min_partition and/or max_partition.")
+
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
 
@@ -120,12 +206,33 @@
     elif text is not None:
         file_text = str(text)
 
-    if paragraph_grouper is not None:
+    if paragraph_grouper is False:
+        pass
+    elif paragraph_grouper is not None:
         file_text = paragraph_grouper(file_text)
     else:
         file_text = group_broken_paragraphs(file_text)
 
-    file_content = split_by_paragraph(file_text, max_partition=max_partition)
+    if min_partition is not None and len(file_text) < min_partition:
+        raise ValueError("`min_partition` cannot be larger than the length of file contents.")
+
+    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
+
+    paragraphs = combine_paragraphs_less_than_min(
+        split_paragraphs=split_paragraphs,
+        max_partition=max_partition,
+        min_partition=min_partition,
+    )
+
+    file_content = []
+
+    for paragraph in paragraphs:
+        file_content.extend(
+            split_content_to_fit_max(
+                content=paragraph,
+                max_partition=max_partition,
+            ),
+        )
 
     elements: List[Element] = []
     metadata = (
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
index 0ef609dfd..6b81f4af5 100644
--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@@ -54,6 +54,7 @@ def partition_xml(
     include_metadata: bool = True,
     encoding: Optional[str] = None,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an XML document into its document elements.
@@ -77,6 +78,8 @@
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied.
+    min_partition
+        The minimum number of characters to include in a partition.
     """
     exactly_one(filename=filename, file=file)
 
@@ -97,6 +100,7 @@
         metadata_filename=metadata_filename,
         include_metadata=include_metadata,
         max_partition=max_partition,
+        min_partition=min_partition,
     )
 
     return elements