feat: add min_partition kwarg that combines elements below a specified threshold (#926)

* add min_partition

* functioning _split_content_to_fit_min_max

* create test and make tidy/check

* fix rebase issues

* fix type hinting, remove unused code, add tests

* various changes and refactoring of methods

* add test, refactor, change var names for debugging purposes

* update test

* make tidy/check

* give more descriptive var names and add comments

* update xml partition via partition_text and create test

* fix <pre> bug for test_partition_html_with_pre_tag

* make tidy

* refactor and fix tests

* make tidy/check

* ingest-test-fixtures-update

* change list comprehension to for loop

* fix error check
Authored by John on 2023-07-24 10:57:24 -05:00; committed via GitHub
parent d0329126ef
commit 676c50a6ec
11 changed files with 314 additions and 45 deletions

View File

@@ -1,7 +1,7 @@
## 0.8.2-dev4
### Enhancements
* Add `min_partition` kwarg that combines elements below a specified threshold and modifies the splitting of strings longer than `max_partition` so that words are not split.
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx
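
A rough usage sketch of the new kwarg pair (the input text and parameter values here are invented for illustration): paragraphs shorter than min_partition are merged with their neighbors, and anything longer than max_partition is split between words.

from unstructured.partition.text import partition_text

text = "Hi.\n\nHello.\n\nThis is a longer paragraph that can stand on its own."
elements = partition_text(text=text, min_partition=10, max_partition=60)
for element in elements:
    print(repr(element.text))
# Expected shape: the two short greetings merge into one element,
# e.g. "Hi. Hello.", while the long paragraph stays intact.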

View File

@@ -44,6 +44,20 @@ def test_from_string(sample_document):
assert type_tag.text.strip() == "10-K"
def test_from_string_with_pre_tag():
sample_document = """
<pre>
<SEC-DOCUMENT>
<TYPE>10-K
<COMPANY>Proctor & Gamble
</SEC-DOCUMENT>
</pre>
"""
xml_document = XMLDocument.from_string(sample_document)
type_tag = xml_document.document_tree.find(".//type")
assert type_tag.text.strip() == "10-K"
def test_read_with_stylesheet():
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")

View File

@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
assert element.metadata.filename is None
def test_partition_email_from_text_file_max():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
with open(filename) as f:
elements = partition_email(file=f, content_source="text/plain", max_partition=20)
assert len(elements) == 6
def test_partition_email_from_text_file_raises_value_error():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
with pytest.raises(ValueError), open(filename) as f:
partition_email(file=f, content_source="text/plain", min_partition=1000)
def test_partition_email_from_text():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
with open(filename) as f:

View File

@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():
def test_partition_image_with_ocr_detects_korean_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
with open(filename, "rb") as f:
elements = image.partition_image(
file=f,

View File

@@ -5,7 +5,11 @@ import pytest
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import partition_text
from unstructured.partition.text import (
    combine_paragraphs_less_than_min,
    partition_text,
    split_content_to_fit_max,
)
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -18,6 +22,31 @@ EXPECTED_OUTPUT = [
ListItem(text="I love fuzzy blankets"),
]
MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
because it is just being used as an example. Hi. Hello. Howdy. Hola.
The example is simple and repetitive and long and somewhat boring,
but it serves a purpose. End.""".replace(
"\n",
"",
)
SHORT_PARAGRAPHS = """This is a story.

This is a story that doesn't matter because it is just being used as an example.

Hi.

Hello.

Howdy.

Hola.

The example is simple and repetitive and long and somewhat boring, but it serves a purpose.

End.
"""
@pytest.mark.parametrize(
("filename", "encoding"),
@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt
assert elements[-1].text.endswith("External links")
def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
elements = partition_text(filename=filename)
elements_max_part = partition_text(filename=filename, max_partition=500)
assert len(elements) < len(elements_max_part)
def test_partition_text_min_max():
segments = partition_text(
text=SHORT_PARAGRAPHS,
min_partition=6,
)
expected = [
"This is a story.",
"This is a story that doesn't matter because it is just being used as an example.",
"Hi. Hello.",
"Howdy.",
"""Hola. The example is simple and repetitive and long and somewhat boring,
but it serves a purpose. End.""".replace(
"\n",
"",
),
]
for segment, test_segment in zip(segments, expected):
assert segment.text == test_segment
segments = partition_text(
text=SHORT_PARAGRAPHS,
max_partition=20,
min_partition=7,
)
expected = [
"This is a story.",
"This is a story that",
"doesn't matter",
"because it is just",
"being used as an",
"example.",
"Hi. Hello.",
"Howdy. Hola.",
"The example is",
"simple and",
"repetitive and long",
"and somewhat boring,",
"but it serves a",
"purpose. End.",
]
for segment, test_segment in zip(segments, expected):
assert segment.text == test_segment
def test_split_content_to_fit_max():
segments = split_content_to_fit_max(
content=MIN_MAX_TEXT,
max_partition=75,
)
assert segments == [
"This is a story.",
"This is a story that doesn't matter because",
"it is just being used as an example. Hi. Hello. Howdy. Hola.",
"The example is simple and repetitive and long",
"and somewhat boring, but it serves a purpose. End.",
]
def test_combine_paragraphs_less_than_min():
segments = combine_paragraphs_less_than_min(
SHORT_PARAGRAPHS.split("\n\n"),
max_partition=1500,
min_partition=7,
)
assert len(segments) < len(SHORT_PARAGRAPHS)
def test_partition_text_doesnt_get_page_breaks():
text = "--------------------"
elements = partition_text(text=text)

View File

@@ -7,7 +7,7 @@ from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
    element_from_text,
    split_by_paragraph,
    partition_text,
)
VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@@ -78,13 +78,16 @@ class XMLDocument(Document):
            # Please use bytes input or XML fragments without declaration.
            except ValueError:
                document_tree = etree.fromstring(content.encode(), self.parser)

            if "<pre>" and "</pre>" in content:
                tree = etree.HTML(content)
                for element in tree.xpath("//pre"):
                    if not element.text:
                        continue
                    text_content = split_by_paragraph(element.text)
                    text_content = []
                    for element in partition_text(text=element.text, paragraph_grouper=False):
                        text_content.append(element.text)
                    for text in text_content:
                        element = etree.Element("span")
                        element.text = str(element_from_text(text=text))
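
In effect, text inside a <pre> tag is now partitioned with partition_text (with paragraph grouping disabled) instead of split_by_paragraph. A minimal standalone sketch of the same flow, with invented sample HTML:

from lxml import etree

from unstructured.partition.text import element_from_text, partition_text

content = "<html><body><pre>First line.\n\nSecond line.</pre></body></html>"
tree = etree.HTML(content)
for pre_tag in tree.xpath("//pre"):
    if not pre_tag.text:
        continue
    # paragraph_grouper=False skips regrouping of broken paragraphs
    for element in partition_text(text=pre_tag.text, paragraph_grouper=False):
        span = etree.Element("span")
        span.text = str(element_from_text(text=element.text))
        pre_tag.append(span)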

View File

@@ -52,7 +52,7 @@ from unstructured.file_utils.filetype import FileType, add_metadata_with_filetyp
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.html import partition_html
from unstructured.partition.text import partition_text, split_by_paragraph
from unstructured.partition.text import partition_text
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
@@ -232,6 +232,7 @@ def partition_email(
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions an .eml documents into its constituent elements.
@@ -258,6 +259,9 @@
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
min_partition
The minimum number of characters to include in a partition. Only applies if
processing the text/plain content.
"""
if content_source not in VALID_CONTENT_SOURCES:
raise ValueError(
@@ -270,7 +274,6 @@
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)
detected_encoding = "utf-8"
if filename is not None:
extracted_encoding, msg = parse_email(filename=filename)
@@ -342,12 +345,12 @@
                continue
        elif content_source == "text/plain":
            list_content = split_by_paragraph(content)
            elements = partition_text(
                text=content,
                encoding=encoding,
                max_partition=max_partition,
                metadata_filename=metadata_filename or filename,
                min_partition=min_partition,
            )

    for idx, element in enumerate(elements):
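
A hedged example of the new parameter on partition_email, reusing the fake-email.txt fixture from the tests above; min_partition is only honored for text/plain content:

from unstructured.partition.email import partition_email

with open("example-docs/fake-email.txt") as f:
    elements = partition_email(
        file=f,
        content_source="text/plain",
        max_partition=500,
        min_partition=10,
    )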

View File

@@ -22,6 +22,7 @@ def partition_msg(
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
min_partition
The minimum number of characters to include in a partition. Only applies if
processing text/plain content.
"""
exactly_one(filename=filename, file=file)
@@ -57,7 +61,11 @@
    if "<html>" in text or "</div>" in text:
        elements = partition_html(text=text)
    else:
        elements = partition_text(text=text, max_partition=max_partition)
        elements = partition_text(
            text=text,
            max_partition=max_partition,
            min_partition=min_partition,
        )

    metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
    for element in elements:
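
The same kwarg on partition_msg, sketched with a hypothetical .msg path; plain-text bodies are forwarded to partition_text, while HTML bodies still go through partition_html:

from unstructured.partition.msg import partition_msg

elements = partition_msg(
    filename="example-docs/fake-email.msg",  # hypothetical fixture path
    max_partition=1500,
    min_partition=10,
)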

View File

@@ -51,6 +51,7 @@ def partition_pdf(
max_partition: Optional[int] = 1500,
include_metadata: bool = True,
metadata_filename: Optional[str] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied. Only applies to the "ocr_only" strategy.
min_partition
The minimum number of characters to include in a partition. Only applies if
processing text/plain content.
"""
exactly_one(filename=filename, file=file)
return partition_pdf_or_image(
@@ -91,6 +95,7 @@
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
max_partition=max_partition,
min_partition=min_partition,
**kwargs,
)
@@ -116,6 +121,7 @@ def partition_pdf_or_image(
infer_table_structure: bool = False,
ocr_languages: str = "eng",
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@
ocr_languages=ocr_languages,
is_image=is_image,
max_partition=max_partition,
min_partition=min_partition,
)
return layout_elements
@@ -391,6 +398,7 @@ def _partition_pdf_or_image_with_ocr(
ocr_languages: str = "eng",
is_image: bool = False,
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
):
"""Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
to an image prior to processing."""
@@ -402,7 +410,11 @@
            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
        else:
            text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
        elements = partition_text(text=text, max_partition=max_partition)
        elements = partition_text(
            text=text,
            max_partition=max_partition,
            min_partition=min_partition,
        )
    else:
        elements = []
        page_number = 0
@@ -411,7 +423,11 @@
            metadata = ElementMetadata(filename=filename, page_number=page_number)
            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
            _elements = partition_text(text=text, max_partition=max_partition)
            _elements = partition_text(
                text=text,
                max_partition=max_partition,
                min_partition=min_partition,
            )
            for element in _elements:
                element.metadata = metadata
                elements.append(element)
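
A sketch of the threaded-through parameter on partition_pdf (the filename and the pre-existing strategy kwarg are assumptions here, not part of this diff); like max_partition, min_partition only takes effect when OCR text is re-partitioned via partition_text:

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/layout-parser-paper.pdf",  # illustrative path
    strategy="ocr_only",  # assumed existing kwarg that triggers the OCR path
    max_partition=1500,
    min_partition=15,
)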

View File

@@ -1,4 +1,5 @@
import re
import textwrap
from typing import IO, Callable, List, Optional, Tuple
from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
@@ -26,52 +27,125 @@ from unstructured.partition.text_type import (
)
def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
    paragraphs = re.split(PARAGRAPH_PATTERN, content)
    if max_partition is None:
        return paragraphs
def _split_in_half_at_breakpoint(
    content: str,
    breakpoint: str = " ",
) -> List[str]:
    """Splits a segment of content at the breakpoint closest to the middle"""
    mid = len(content) // 2
    for i in range(len(content) // 2):
        if content[mid + i] == breakpoint:
            mid += i
            break
        elif content[mid - i] == breakpoint:
            mid += -i
            break
    split_paragraphs = []
    for paragraph in paragraphs:
        split_paragraphs.extend(
            _split_to_fit_max_content(paragraph, max_partition=max_partition),
        )
    return split_paragraphs
    return [content[:mid].rstrip(), content[mid:].lstrip()]
def _split_content_size_n(content: str, n: int) -> List[str]:
    """Splits a string into chunks that are at most size n."""
    """Splits a section of content into chunks that are at most
    size n without breaking apart words."""
    segments = []
    for i in range(0, len(content), n):
        segment = content[i : i + n]  # noqa: E203
        segments.append(segment)
    if len(content) < n * 2:
        segments = list(_split_in_half_at_breakpoint(content))
    else:
        segments = textwrap.wrap(content, width=n)
    return segments
def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
    """Splits a section of content so that all of the elements fit into the
def split_content_to_fit_max(
    content: str,
    max_partition: Optional[int] = 1500,
) -> List[str]:
    """Splits a paragraph or section of content so that all of the elements fit into the
    max partition window."""
    sentences = sent_tokenize(content)
    num_sentences = len(sentences)
    chunks = []
    chunk = ""
    for i, sentence in enumerate(sentences):
        if len(sentence) > max_partition:
            chunks.extend(_split_content_size_n(sentence, n=max_partition))
        if len(chunk + " " + sentence) > max_partition:
            chunks.append(chunk)
            chunk = sentence
    tmp_chunk = ""
    for sentence in sentences:
        if max_partition is not None and len(sentence) > max_partition:
            if tmp_chunk:
                chunks.append(tmp_chunk)
                tmp_chunk = ""
            segments = _split_content_size_n(sentence, n=max_partition)
            chunks.extend(segments[:-1])
            tmp_chunk = segments[-1]
        else:
            chunk += " " + sentence
            if i == num_sentences - 1:
                chunks.append(chunk)
            if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
                chunks.append(tmp_chunk)
                tmp_chunk = sentence
            else:
                if not tmp_chunk:
                    tmp_chunk = sentence
                else:
                    tmp_chunk += " " + sentence
    tmp_chunk = tmp_chunk.strip()
    if tmp_chunk:
        chunks.append(tmp_chunk)
    return chunks
def combine_paragraphs_less_than_min(
    split_paragraphs: List[str],
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
) -> List[str]:
    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
    if type(split_paragraphs) is not list:
        raise ValueError("`split_paragraphs` is not a list")
    file_content: List[str] = []
    tmp_paragraph = ""
    next_index = 0
    for current_index, paragraph in enumerate(split_paragraphs):
        if next_index > current_index:
            continue  # Skip the current iteration if `next_index` is already updated
        if min_partition is not None and len(paragraph) < min_partition:
            # Combine paragraphs that are less than `min_partition`
            # while not exceeding `max_partition`
            tmp_paragraph += paragraph + "\n"
            while len(tmp_paragraph.strip()) < min_partition:
                if current_index + 1 == len(split_paragraphs):
                    # If it's the last paragraph, append the paragraph
                    # to the previous content
                    file_content[-1] += " " + tmp_paragraph.rstrip()
                    tmp_paragraph = ""
                    break
                for offset_index, para in enumerate(
                    split_paragraphs[current_index + 1 :], start=1  # noqa
                ):
                    if (
                        max_partition is not None
                        and len(tmp_paragraph + "\n" + para) < max_partition
                    ):
                        tmp_paragraph += "\n" + para
                        # Update `next_index` to skip already combined paragraphs
                        next_index = offset_index + current_index + 1
                        if len(tmp_paragraph.strip()) > min_partition:
                            break  # Stop combining if the combined paragraphs
                            # meet the `min_partition` requirement
                    elif (
                        max_partition is not None
                        and len(tmp_paragraph) < min_partition
                        and len(tmp_paragraph + "\n" + para) > max_partition
                    ):
                        raise ValueError(
                            "`min_partition` and `max_partition` are defined too close together",
                        )
            # Add the combined paragraph to the final result
            file_content.append(
                tmp_paragraph.strip(),
            )
            tmp_paragraph = ""
        else:
            file_content.append(paragraph)
    return file_content
@process_metadata()
@add_metadata_with_filetype(FileType.TXT)
def partition_text(
@@ -83,9 +157,12 @@ def partition_text(
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
    **kwargs,
) -> List[Element]:
    """Partitions a .txt document into its constituent elements.
    """Partitions a .txt document into its constituent paragraph elements.
    If paragraphs are below the `min_partition` or above the `max_partition`
    boundaries, they are combined or split.
Parameters
----------
filename
@@ -104,10 +181,19 @@
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied.
min_partition
The minimum number of characters to include in a partition.
"""
    if text is not None and text.strip() == "" and not file and not filename:
        return []

    if (
        min_partition is not None
        and max_partition is not None
        and (min_partition > max_partition or min_partition < 0 or max_partition < 0)
    ):
        raise ValueError("Invalid values for min_partition and/or max_partition.")

    # Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file, text=text)
@@ -120,12 +206,33 @@
    elif text is not None:
        file_text = str(text)

    if paragraph_grouper is not None:
    if paragraph_grouper is False:
        pass
    elif paragraph_grouper is not None:
        file_text = paragraph_grouper(file_text)
    else:
        file_text = group_broken_paragraphs(file_text)

    file_content = split_by_paragraph(file_text, max_partition=max_partition)
    if min_partition is not None and len(file_text) < min_partition:
        raise ValueError("`min_partition` cannot be larger than the length of file contents.")

    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
    paragraphs = combine_paragraphs_less_than_min(
        split_paragraphs=split_paragraphs,
        max_partition=max_partition,
        min_partition=min_partition,
    )
    file_content = []
    for paragraph in paragraphs:
        file_content.extend(
            split_content_to_fit_max(
                content=paragraph,
                max_partition=max_partition,
            ),
        )

    elements: List[Element] = []
    metadata = (
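
A hedged walk-through of the two helpers above on a made-up paragraph list: combine_paragraphs_less_than_min first merges paragraphs shorter than min_partition (never exceeding max_partition), then split_content_to_fit_max caps each merged paragraph at max_partition without breaking words:

from unstructured.partition.text import (
    combine_paragraphs_less_than_min,
    split_content_to_fit_max,
)

paragraphs = ["Hi.", "Hello.", "A somewhat longer closing paragraph."]
combined = combine_paragraphs_less_than_min(
    paragraphs,
    max_partition=100,
    min_partition=8,
)
# combined == ["Hi.\n\nHello.", "A somewhat longer closing paragraph."]

chunks = []
for paragraph in combined:
    chunks.extend(split_content_to_fit_max(content=paragraph, max_partition=20))
# chunks == ["Hi. Hello.", "A somewhat longer", "closing paragraph."]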

View File

@@ -54,6 +54,7 @@ def partition_xml(
include_metadata: bool = True,
encoding: Optional[str] = None,
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions an XML document into its document elements.
@@ -77,6 +78,8 @@
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied.
min_partition
The minimum number of characters to include in a partition.
"""
exactly_one(filename=filename, file=file)
@@ -97,6 +100,7 @@
metadata_filename=metadata_filename,
include_metadata=include_metadata,
max_partition=max_partition,
min_partition=min_partition,
)
return elements
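
And the pass-through on partition_xml, using the factbook.xml example document referenced in the tests above; min_partition simply travels alongside max_partition into partition_text:

from unstructured.partition.xml import partition_xml

elements = partition_xml(
    filename="example-docs/factbook.xml",
    max_partition=1500,
    min_partition=10,
)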