Mirror of https://github.com/Unstructured-IO/unstructured.git
feat: add min_partition kwarg that combines elements below a specified threshold (#926)
* add min_partition
* functioning _split_content_to_fit_min_max
* create test and make tidy/check
* fix rebase issues
* fix type hinting, remove unused code, add tests
* various changes and refactoring of methods
* add test, refactor, change var names for debugging purposes
* update test
* make tidy/check
* give more descriptive var names and add comments
* update xml partition via partition_text and create test
* fix <pre> bug for test_partition_html_with_pre_tag
* make tidy
* refactor and fix tests
* make tidy/check
* ingest-test-fixtures-update
* change list comprehension to for loop
* fix error check
This commit is contained in:
parent d0329126ef
commit 676c50a6ec
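Before the diff itself, a minimal usage sketch of what this change enables (hypothetical inline text; exact element counts depend on the input):

from unstructured.partition.text import partition_text

# Hypothetical input: short paragraphs separated by blank lines.
text = "Hi.\n\nHello there.\n\nThis is a longer paragraph that stands on its own."

# Default behavior: each paragraph becomes its own element.
elements = partition_text(text=text)

# With min_partition, paragraphs shorter than the threshold are combined
# with their neighbors (without exceeding max_partition), so fewer,
# larger elements come back.
combined = partition_text(text=text, min_partition=10, max_partition=1500)
assert len(combined) <= len(elements)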
@@ -1,7 +1,7 @@
 ## 0.8.2-dev4

 ### Enhancements

+* Add min_partition kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max_partition so words are not split.
 * set the file's current position to the beginning after reading the file in `convert_to_bytes`
 * Add slide notes to pptx
@@ -44,6 +44,20 @@ def test_from_string(sample_document):
     assert type_tag.text.strip() == "10-K"
+
+
+def test_from_string_with_pre_tag():
+    sample_document = """
+<pre>
+<SEC-DOCUMENT>
+<TYPE>10-K
+<COMPANY>Proctor & Gamble
+</SEC-DOCUMENT>
+</pre>
+"""
+    xml_document = XMLDocument.from_string(sample_document)
+    type_tag = xml_document.document_tree.find(".//type")
+    assert type_tag.text.strip() == "10-K"


 def test_read_with_stylesheet():
     filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
     stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
     assert element.metadata.filename is None
+
+
+def test_partition_email_from_text_file_max():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with open(filename) as f:
+        elements = partition_email(file=f, content_source="text/plain", max_partition=20)
+    assert len(elements) == 6
+
+
+def test_partition_email_from_text_file_raises_value_error():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with pytest.raises(ValueError), open(filename) as f:
+        partition_email(file=f, content_source="text/plain", min_partition=1000)


 def test_partition_email_from_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
     with open(filename) as f:
@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():

 def test_partition_image_with_ocr_detects_korean_from_file():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
-
     with open(filename, "rb") as f:
         elements = image.partition_image(
             file=f,
@@ -5,7 +5,11 @@ import pytest

 from unstructured.cleaners.core import group_broken_paragraphs
 from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
-from unstructured.partition.text import partition_text
+from unstructured.partition.text import (
+    combine_paragraphs_less_than_min,
+    partition_text,
+    split_content_to_fit_max,
+)

 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -18,6 +22,31 @@ EXPECTED_OUTPUT = [
     ListItem(text="I love fuzzy blankets"),
 ]

+
+MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
+ because it is just being used as an example. Hi. Hello. Howdy. Hola.
+ The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+    "\n",
+    "",
+)
+
+SHORT_PARAGRAPHS = """This is a story.
+
+This is a story that doesn't matter because it is just being used as an example.
+
+Hi.
+
+Hello.
+
+Howdy.
+
+Hola.
+
+The example is simple and repetitive and long and somewhat boring, but it serves a purpose.
+
+End.
+"""
+
+
 @pytest.mark.parametrize(
     ("filename", "encoding"),
@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt"):
     assert elements[-1].text.endswith("External links")
+
+
+def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
+    elements = partition_text(filename=filename)
+    elements_max_part = partition_text(filename=filename, max_partition=500)
+    assert len(elements) < len(elements_max_part)
+
+
+def test_partition_text_min_max():
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        min_partition=6,
+    )
+    expected = [
+        "This is a story.",
+        "This is a story that doesn't matter because it is just being used as an example.",
+        "Hi. Hello.",
+        "Howdy.",
+        """Hola. The example is simple and repetitive and long and somewhat boring,
+ but it serves a purpose. End.""".replace(
+            "\n",
+            "",
+        ),
+    ]
+    for segment, test_segment in zip(segments, expected):
+        assert segment.text == test_segment
+
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        max_partition=20,
+        min_partition=7,
+    )
+    expected = [
+        "This is a story.",
+        "This is a story that",
+        "doesn't matter",
+        "because it is just",
+        "being used as an",
+        "example.",
+        "Hi. Hello.",
+        "Howdy. Hola.",
+        "The example is",
+        "simple and",
+        "repetitive and long",
+        "and somewhat boring,",
+        "but it serves a",
+        "purpose. End.",
+    ]
+    for segment, test_segment in zip(segments, expected):
+        assert segment.text == test_segment
+
+
+def test_split_content_to_fit_max():
+    segments = split_content_to_fit_max(
+        content=MIN_MAX_TEXT,
+        max_partition=75,
+    )
+    assert segments == [
+        "This is a story.",
+        "This is a story that doesn't matter because",
+        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
+        "The example is simple and repetitive and long",
+        "and somewhat boring, but it serves a purpose. End.",
+    ]
+
+
+def test_combine_paragraphs_less_than_min():
+    segments = combine_paragraphs_less_than_min(
+        SHORT_PARAGRAPHS.split("\n\n"),
+        max_partition=1500,
+        min_partition=7,
+    )
+    assert len(segments) < len(SHORT_PARAGRAPHS)
+
+
 def test_partition_text_doesnt_get_page_breaks():
     text = "--------------------"
     elements = partition_text(text=text)
@@ -7,7 +7,7 @@ from unstructured.file_utils.encoding import read_txt_file
 from unstructured.logger import logger
 from unstructured.partition.text import (
     element_from_text,
-    split_by_paragraph,
+    partition_text,
 )

 VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@@ -78,13 +78,16 @@ class XMLDocument(Document):
             # Please use bytes input or XML fragments without declaration.
             except ValueError:
                 document_tree = etree.fromstring(content.encode(), self.parser)

         if "<pre>" and "</pre>" in content:
             tree = etree.HTML(content)
             for element in tree.xpath("//pre"):
                 if not element.text:
                     continue
-                text_content = split_by_paragraph(element.text)
+                text_content = []
+                for element in partition_text(text=element.text, paragraph_grouper=False):
+                    text_content.append(element.text)
+
                 for text in text_content:
                     element = etree.Element("span")
                     element.text = str(element_from_text(text=text))
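The effect of the new branch above, sketched in isolation: paragraph_grouper=False is the sentinel the rewritten partition_text checks for, so line-oriented <pre> content skips group_broken_paragraphs instead of being re-wrapped (hypothetical inline sample):

from unstructured.partition.text import partition_text

# Hypothetical <pre> payload: line-oriented text where broken-paragraph
# grouping would merge lines that should stay separate.
pre_text = "<TYPE>10-K\n\n<COMPANY>Proctor & Gamble"

# paragraph_grouper=False skips group_broken_paragraphs entirely, so each
# blank-line-separated chunk is partitioned as-is.
for element in partition_text(text=pre_text, paragraph_grouper=False):
    print(element.text)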
@@ -52,7 +52,7 @@ from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
 from unstructured.partition.html import partition_html
-from unstructured.partition.text import partition_text, split_by_paragraph
+from unstructured.partition.text import partition_text

 VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
@@ -232,6 +232,7 @@ def partition_email(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an .eml documents into its constituent elements.
@@ -258,6 +259,9 @@ def partition_email(
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing the text/plain content.
     """
     if content_source not in VALID_CONTENT_SOURCES:
         raise ValueError(
@@ -270,7 +274,6 @@ def partition_email(

     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
-
     detected_encoding = "utf-8"
     if filename is not None:
         extracted_encoding, msg = parse_email(filename=filename)
@@ -342,12 +345,12 @@ def partition_email(
             continue

     elif content_source == "text/plain":
-        list_content = split_by_paragraph(content)
         elements = partition_text(
             text=content,
             encoding=encoding,
             max_partition=max_partition,
             metadata_filename=metadata_filename or filename,
+            min_partition=min_partition,
         )

     for idx, element in enumerate(elements):
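Mirroring the tests earlier in the diff, a usage sketch for the text/plain email path (fake-email.txt is the example document those tests use):

from unstructured.partition.email import partition_email

with open("example-docs/fake-email.txt") as f:
    elements = partition_email(
        file=f,
        content_source="text/plain",
        max_partition=500,
        min_partition=10,
    )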
@@ -22,6 +22,7 @@ def partition_msg(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@ def partition_msg(
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
@@ -57,7 +61,11 @@ def partition_msg(
     if "<html>" in text or "</div>" in text:
         elements = partition_html(text=text)
     else:
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )

     metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
     for element in elements:
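A usage sketch for the .msg path, which only reaches partition_text when the body is not HTML (example-docs/fake-email.msg is assumed here, matching the repo's other msg tests):

from unstructured.partition.msg import partition_msg

# Short fragments in a plain-text body are merged up to min_partition;
# anything over max_partition is split at sentence/word boundaries.
elements = partition_msg(
    filename="example-docs/fake-email.msg",
    max_partition=1500,
    min_partition=10,
)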
@@ -51,6 +51,7 @@ def partition_pdf(
     max_partition: Optional[int] = 1500,
     include_metadata: bool = True,
     metadata_filename: Optional[str] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@ def partition_pdf(
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied. Only applies to the "ocr_only" strategy.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
     return partition_pdf_or_image(
@@ -91,6 +95,7 @@ def partition_pdf(
         infer_table_structure=infer_table_structure,
         ocr_languages=ocr_languages,
         max_partition=max_partition,
+        min_partition=min_partition,
         **kwargs,
     )
@@ -116,6 +121,7 @@ def partition_pdf_or_image(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@ def partition_pdf_or_image(
             ocr_languages=ocr_languages,
             is_image=is_image,
             max_partition=max_partition,
+            min_partition=min_partition,
         )

     return layout_elements
@@ -391,6 +398,7 @@ def _partition_pdf_or_image_with_ocr(
     ocr_languages: str = "eng",
     is_image: bool = False,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
 ):
     """Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
     to an image prior to processing."""
@@ -402,7 +410,11 @@ def _partition_pdf_or_image_with_ocr(
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
         else:
             text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )
     else:
         elements = []
         page_number = 0
@@ -411,7 +423,11 @@ def _partition_pdf_or_image_with_ocr(
             metadata = ElementMetadata(filename=filename, page_number=page_number)
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")

-            _elements = partition_text(text=text, max_partition=max_partition)
+            _elements = partition_text(
+                text=text,
+                max_partition=max_partition,
+                min_partition=min_partition,
+            )
             for element in _elements:
                 element.metadata = metadata
                 elements.append(element)
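A hedged usage sketch for the OCR path: min_partition is simply threaded down to partition_text, so per the docstring above it only shapes output when the text comes from Tesseract (the "ocr_only" strategy). The file path is an example document assumed from the repo:

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/layout-parser-paper.pdf",
    strategy="ocr_only",  # min/max partitioning applies on this code path
    max_partition=1000,
    min_partition=50,
)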
@@ -1,4 +1,5 @@
 import re
+import textwrap
 from typing import IO, Callable, List, Optional, Tuple

 from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
@@ -26,52 +27,125 @@ from unstructured.partition.text_type import (
 )


-def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
-    paragraphs = re.split(PARAGRAPH_PATTERN, content)
-    if max_partition is None:
-        return paragraphs
-
-    split_paragraphs = []
-    for paragraph in paragraphs:
-        split_paragraphs.extend(
-            _split_to_fit_max_content(paragraph, max_partition=max_partition),
-        )
-    return split_paragraphs
+def _split_in_half_at_breakpoint(
+    content: str,
+    breakpoint: str = " ",
+) -> List[str]:
+    """Splits a segment of content at the breakpoint closest to the middle"""
+    mid = len(content) // 2
+    for i in range(len(content) // 2):
+        if content[mid + i] == breakpoint:
+            mid += i
+            break
+        elif content[mid - i] == breakpoint:
+            mid += -i
+            break
+
+    return [content[:mid].rstrip(), content[mid:].lstrip()]


 def _split_content_size_n(content: str, n: int) -> List[str]:
-    """Splits a string into chunks that are at most size n."""
+    """Splits a section of content into chunks that are at most
+    size n without breaking apart words."""
     segments = []
-    for i in range(0, len(content), n):
-        segment = content[i : i + n]  # noqa: E203
-        segments.append(segment)
+    if len(content) < n * 2:
+        segments = list(_split_in_half_at_breakpoint(content))
+    else:
+        segments = textwrap.wrap(content, width=n)
     return segments


-def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
-    """Splits a section of content so that all of the elements fit into the
+def split_content_to_fit_max(
+    content: str,
+    max_partition: Optional[int] = 1500,
+) -> List[str]:
+    """Splits a paragraph or section of content so that all of the elements fit into the
     max partition window."""
     sentences = sent_tokenize(content)
-    num_sentences = len(sentences)

     chunks = []
-    chunk = ""
-
-    for i, sentence in enumerate(sentences):
-        if len(sentence) > max_partition:
-            chunks.extend(_split_content_size_n(sentence, n=max_partition))
-
-        if len(chunk + " " + sentence) > max_partition:
-            chunks.append(chunk)
-            chunk = sentence
-        else:
-            chunk += " " + sentence
-
-        if i == num_sentences - 1:
-            chunks.append(chunk)
+    tmp_chunk = ""
+    for sentence in sentences:
+        if max_partition is not None and len(sentence) > max_partition:
+            if tmp_chunk:
+                chunks.append(tmp_chunk)
+                tmp_chunk = ""
+            segments = _split_content_size_n(sentence, n=max_partition)
+            chunks.extend(segments[:-1])
+            tmp_chunk = segments[-1]
+        else:
+            if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
+                chunks.append(tmp_chunk)
+                tmp_chunk = sentence
+            else:
+                if not tmp_chunk:
+                    tmp_chunk = sentence
+                else:
+                    tmp_chunk += " " + sentence
+    tmp_chunk = tmp_chunk.strip()
+    if tmp_chunk:
+        chunks.append(tmp_chunk)

     return chunks


+def combine_paragraphs_less_than_min(
+    split_paragraphs: List[str],
+    max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
+) -> List[str]:
+    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
+    if type(split_paragraphs) is not list:
+        raise ValueError("`split_paragraphs` is not a list")
+    file_content: List[str] = []
+    tmp_paragraph = ""
+    next_index = 0
+    for current_index, paragraph in enumerate(split_paragraphs):
+        if next_index > current_index:
+            continue  # Skip the current iteration if `next_index` is already updated
+        if min_partition is not None and len(paragraph) < min_partition:
+            # Combine paragraphs that are less than `min_partition`
+            # while not exceeding `max_partition`
+            tmp_paragraph += paragraph + "\n"
+
+            while len(tmp_paragraph.strip()) < min_partition:
+                if current_index + 1 == len(split_paragraphs):
+                    # If it's the last paragraph, append the paragraph
+                    # to the previous content
+                    file_content[-1] += " " + tmp_paragraph.rstrip()
+                    tmp_paragraph = ""
+                    break
+                for offset_index, para in enumerate(
+                    split_paragraphs[current_index + 1 :], start=1  # noqa
+                ):
+                    if (
+                        max_partition is not None
+                        and len(tmp_paragraph + "\n" + para) < max_partition
+                    ):
+                        tmp_paragraph += "\n" + para
+                        # Update `next_index` to skip already combined paragraphs
+                        next_index = offset_index + current_index + 1
+
+                        if len(tmp_paragraph.strip()) > min_partition:
+                            break  # Stop combining if the combined paragraphs
+                            # meet the `min_partition` requirement
+                    elif (
+                        max_partition is not None
+                        and len(tmp_paragraph) < min_partition
+                        and len(tmp_paragraph + "\n" + para) > max_partition
+                    ):
+                        raise ValueError(
+                            "`min_partition` and `max_partition` are defined too close together",
+                        )
+            if tmp_paragraph:
+                # Add the combined paragraph to the final result
+                file_content.append(
+                    tmp_paragraph.strip(),
+                )
+                tmp_paragraph = ""
+        else:
+            file_content.append(paragraph)
+    return file_content


 @process_metadata()
 @add_metadata_with_filetype(FileType.TXT)
 def partition_text(
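To see why the rewritten splitter no longer breaks words, here is a self-contained sketch of the midpoint-breakpoint idea (a renamed stand-in for _split_in_half_at_breakpoint, not the library function itself):

import textwrap
from typing import List


def split_in_half_at_space(content: str) -> List[str]:
    # Walk outward from the middle until a space is found, then split there.
    mid = len(content) // 2
    for i in range(len(content) // 2):
        if content[mid + i] == " ":
            mid += i
            break
        elif content[mid - i] == " ":
            mid -= i
            break
    return [content[:mid].rstrip(), content[mid:].lstrip()]


# Strings shorter than 2 * n are split near the middle at a space...
print(split_in_half_at_space("the quick brown fox jumps over the dog"))
# ['the quick brown fox', 'jumps over the dog']

# ...while longer strings go through textwrap.wrap, which also respects
# word boundaries when filling lines up to the given width.
print(textwrap.wrap("the quick brown fox jumps over the dog " * 5, width=25))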
@@ -83,9 +157,12 @@ def partition_text(
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
-    """Partitions an .txt documents into its constituent elements.
+    """Partitions a .txt document into its constituent paragraph elements.
+    If paragraphs are below "min_partition" or above "max_partition" boundaries,
+    they are combined or split.
     Parameters
     ----------
     filename
@@ -104,10 +181,19 @@ def partition_text(
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied.
+    min_partition
+        The minimum number of characters to include in a partition.
     """
     if text is not None and text.strip() == "" and not file and not filename:
         return []

+    if (
+        min_partition is not None
+        and max_partition is not None
+        and (min_partition > max_partition or min_partition < 0 or max_partition < 0)
+    ):
+        raise ValueError("Invalid values for min_partition and/or max_partition.")
+
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
@@ -120,12 +206,33 @@ def partition_text(
     elif text is not None:
         file_text = str(text)

-    if paragraph_grouper is not None:
+    if paragraph_grouper is False:
+        pass
+    elif paragraph_grouper is not None:
         file_text = paragraph_grouper(file_text)
     else:
         file_text = group_broken_paragraphs(file_text)

-    file_content = split_by_paragraph(file_text, max_partition=max_partition)
+    if min_partition is not None and len(file_text) < min_partition:
+        raise ValueError("`min_partition` cannot be larger than the length of file contents.")
+
+    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
+
+    paragraphs = combine_paragraphs_less_than_min(
+        split_paragraphs=split_paragraphs,
+        max_partition=max_partition,
+        min_partition=min_partition,
+    )
+
+    file_content = []
+
+    for paragraph in paragraphs:
+        file_content.extend(
+            split_content_to_fit_max(
+                content=paragraph,
+                max_partition=max_partition,
+            ),
+        )
+
     elements: List[Element] = []
     metadata = (
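Putting the pieces together, the new partition_text flow is: split on PARAGRAPH_PATTERN, combine paragraphs shorter than min_partition, then split anything longer than max_partition. A rough sketch of that order using the helpers the diff introduces (toy input; exact chunking depends on the sentence tokenizer):

import re

from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition.text import (
    combine_paragraphs_less_than_min,
    split_content_to_fit_max,
)

text = "Hi.\n\nHello.\n\nA noticeably longer paragraph that may need to be split."

# Step 1: regex split into paragraphs.
split_paragraphs = re.split(PARAGRAPH_PATTERN, text.strip())
# Step 2: merge paragraphs below min_partition, capped by max_partition.
paragraphs = combine_paragraphs_less_than_min(
    split_paragraphs=split_paragraphs,
    max_partition=60,
    min_partition=8,
)
# Step 3: split any paragraph that still exceeds max_partition.
chunks = []
for paragraph in paragraphs:
    chunks.extend(split_content_to_fit_max(content=paragraph, max_partition=60))
print(chunks)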
@@ -54,6 +54,7 @@ def partition_xml(
     include_metadata: bool = True,
     encoding: Optional[str] = None,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an XML document into its document elements.
@@ -77,6 +78,8 @@ def partition_xml(
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied.
+    min_partition
+        The minimum number of characters to include in a partition.
     """
     exactly_one(filename=filename, file=file)
@@ -97,6 +100,7 @@ def partition_xml(
         metadata_filename=metadata_filename,
         include_metadata=include_metadata,
         max_partition=max_partition,
+        min_partition=min_partition,
     )

     return elements
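Finally, a usage sketch for the XML path, which forwards min_partition through to partition_text (factbook.xml is the example document referenced in the tests above):

from unstructured.partition.xml import partition_xml

elements = partition_xml(
    filename="example-docs/factbook.xml",
    max_partition=1500,
    min_partition=10,
)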