mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-12-10 14:42:24 +00:00

feat: add min_partition kwarg that combines elements below a specified threshold (#926)

* add min_partition
* functioning _split_content_to_fit_min_max
* create test and make tidy/check
* fix rebase issues
* fix type hinting, remove unused code, add tests
* various changes and refactoring of methods
* add test, refactor, change var names for debugging purposes
* update test
* make tidy/check
* give more descriptive var names and add comments
* update xml partition via partition_text and create test
* fix <pre> bug for test_partition_html_with_pre_tag
* make tidy
* refactor and fix tests
* make tidy/check
* ingest-test-fixtures-update
* change list comprehension to for loop
* fix error check

This commit is contained in:
parent d0329126ef
commit 676c50a6ec

@@ -1,7 +1,7 @@
## 0.8.2-dev4

### Enhancements

* Add min_partition kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max_partition so words are not split.
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx

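A minimal usage sketch of the new kwargs (hedged: the input text here is illustrative, not taken from the repo's fixtures):

    from unstructured.partition.text import partition_text

    # Paragraphs shorter than min_partition are merged with their neighbors,
    # and paragraphs longer than max_partition are split without breaking words.
    elements = partition_text(
        text="Hi.\n\nHello.\n\nA long enough paragraph that stands on its own.",
        min_partition=7,
        max_partition=1500,
    )
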
@@ -44,6 +44,20 @@ def test_from_string(sample_document):
    assert type_tag.text.strip() == "10-K"


def test_from_string_with_pre_tag():
    sample_document = """
<pre>
<SEC-DOCUMENT>
<TYPE>10-K
<COMPANY>Proctor & Gamble
</SEC-DOCUMENT>
</pre>
"""
    xml_document = XMLDocument.from_string(sample_document)
    type_tag = xml_document.document_tree.find(".//type")
    assert type_tag.text.strip() == "10-K"


def test_read_with_stylesheet():
    filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
    stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")

@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
    assert element.metadata.filename is None


def test_partition_email_from_text_file_max():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with open(filename) as f:
        elements = partition_email(file=f, content_source="text/plain", max_partition=20)
    assert len(elements) == 6


def test_partition_email_from_text_file_raises_value_error():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with pytest.raises(ValueError), open(filename) as f:
        partition_email(file=f, content_source="text/plain", min_partition=1000)


def test_partition_email_from_text():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(filename) as f:

@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():


def test_partition_image_with_ocr_detects_korean_from_file():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")

    with open(filename, "rb") as f:
        elements = image.partition_image(
            file=f,

@@ -5,7 +5,11 @@ import pytest

from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import partition_text
from unstructured.partition.text import (
    combine_paragraphs_less_than_min,
    partition_text,
    split_content_to_fit_max,
)

DIRECTORY = pathlib.Path(__file__).parent.resolve()

@@ -18,6 +22,31 @@ EXPECTED_OUTPUT = [
    ListItem(text="I love fuzzy blankets"),
]

MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
 because it is just being used as an example. Hi. Hello. Howdy. Hola.
 The example is simple and repetitive and long and somewhat boring,
 but it serves a purpose. End.""".replace(
    "\n",
    "",
)

SHORT_PARAGRAPHS = """This is a story.

This is a story that doesn't matter because it is just being used as an example.

Hi.

Hello.

Howdy.

Hola.

The example is simple and repetitive and long and somewhat boring, but it serves a purpose.

End.
"""


@pytest.mark.parametrize(
    ("filename", "encoding"),

@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt"):
    assert elements[-1].text.endswith("External links")


def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
    elements = partition_text(filename=filename)
    elements_max_part = partition_text(filename=filename, max_partition=500)
    assert len(elements) < len(elements_max_part)


def test_partition_text_min_max():
    segments = partition_text(
        text=SHORT_PARAGRAPHS,
        min_partition=6,
    )
    expected = [
        "This is a story.",
        "This is a story that doesn't matter because it is just being used as an example.",
        "Hi. Hello.",
        "Howdy.",
        """Hola. The example is simple and repetitive and long and somewhat boring,
 but it serves a purpose. End.""".replace(
            "\n",
            "",
        ),
    ]
    for segment, test_segment in zip(segments, expected):
        assert segment.text == test_segment

    segments = partition_text(
        text=SHORT_PARAGRAPHS,
        max_partition=20,
        min_partition=7,
    )
    expected = [
        "This is a story.",
        "This is a story that",
        "doesn't matter",
        "because it is just",
        "being used as an",
        "example.",
        "Hi. Hello.",
        "Howdy. Hola.",
        "The example is",
        "simple and",
        "repetitive and long",
        "and somewhat boring,",
        "but it serves a",
        "purpose. End.",
    ]
    for segment, test_segment in zip(segments, expected):
        assert segment.text == test_segment


def test_split_content_to_fit_max():
    segments = split_content_to_fit_max(
        content=MIN_MAX_TEXT,
        max_partition=75,
    )
    assert segments == [
        "This is a story.",
        "This is a story that doesn't matter because",
        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
        "The example is simple and repetitive and long",
        "and somewhat boring, but it serves a purpose. End.",
    ]


def test_combine_paragraphs_less_than_min():
    segments = combine_paragraphs_less_than_min(
        SHORT_PARAGRAPHS.split("\n\n"),
        max_partition=1500,
        min_partition=7,
    )
    assert len(segments) < len(SHORT_PARAGRAPHS)


def test_partition_text_doesnt_get_page_breaks():
    text = "--------------------"
    elements = partition_text(text=text)

@@ -7,7 +7,7 @@ from unstructured.file_utils.encoding import read_txt_file
from unstructured.logger import logger
from unstructured.partition.text import (
    element_from_text,
    split_by_paragraph,
    partition_text,
)

VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]

@@ -78,13 +78,16 @@ class XMLDocument(Document):
            # Please use bytes input or XML fragments without declaration.
            except ValueError:
                document_tree = etree.fromstring(content.encode(), self.parser)

        if "<pre>" in content and "</pre>" in content:
            tree = etree.HTML(content)
            for element in tree.xpath("//pre"):
                if not element.text:
                    continue
                text_content = split_by_paragraph(element.text)

                text_content = []
                for element in partition_text(text=element.text, paragraph_grouper=False):
                    text_content.append(element.text)

                for text in text_content:
                    element = etree.Element("span")
                    element.text = str(element_from_text(text=text))

@@ -52,7 +52,7 @@ from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.html import partition_html
from unstructured.partition.text import partition_text, split_by_paragraph
from unstructured.partition.text import partition_text

VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]

@@ -232,6 +232,7 @@ def partition_email(
    metadata_filename: Optional[str] = None,
    process_attachments: bool = False,
    attachment_partitioner: Optional[Callable] = None,
    min_partition: Optional[int] = 0,
    **kwargs,
) -> List[Element]:
    """Partitions an .eml document into its constituent elements.
@@ -258,6 +259,9 @@
        processing the content of the email itself.
    attachment_partitioner
        The partitioning function to use to process attachments.
    min_partition
        The minimum number of characters to include in a partition. Only applies if
        processing the text/plain content.
    """
    if content_source not in VALID_CONTENT_SOURCES:
        raise ValueError(
@@ -270,7 +274,6 @@

    # Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file, text=text)

    detected_encoding = "utf-8"
    if filename is not None:
        extracted_encoding, msg = parse_email(filename=filename)

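A hedged usage sketch of the new kwarg on partition_email, mirroring the tests above (the fixture path assumes the repo's example-docs directory):

    from unstructured.partition.email import partition_email

    # min_partition only applies to the text/plain payload; a value larger than
    # the body itself raises ValueError (see
    # test_partition_email_from_text_file_raises_value_error above).
    with open("example-docs/fake-email.txt") as f:
        elements = partition_email(file=f, content_source="text/plain", min_partition=15)
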
@@ -342,12 +345,12 @@
                continue

    elif content_source == "text/plain":
        list_content = split_by_paragraph(content)
        elements = partition_text(
            text=content,
            encoding=encoding,
            max_partition=max_partition,
            metadata_filename=metadata_filename or filename,
            min_partition=min_partition,
        )

    for idx, element in enumerate(elements):

@@ -22,6 +22,7 @@ def partition_msg(
    metadata_filename: Optional[str] = None,
    process_attachments: bool = False,
    attachment_partitioner: Optional[Callable] = None,
    min_partition: Optional[int] = 0,
    **kwargs,
) -> List[Element]:
    """Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@
        processing the content of the email itself.
    attachment_partitioner
        The partitioning function to use to process attachments.
    min_partition
        The minimum number of characters to include in a partition. Only applies if
        processing text/plain content.
    """
    exactly_one(filename=filename, file=file)

@@ -57,7 +61,11 @@
    if "<html>" in text or "</div>" in text:
        elements = partition_html(text=text)
    else:
        elements = partition_text(text=text, max_partition=max_partition)
        elements = partition_text(
            text=text,
            max_partition=max_partition,
            min_partition=min_partition,
        )

    metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
    for element in elements:

@@ -51,6 +51,7 @@ def partition_pdf(
    max_partition: Optional[int] = 1500,
    include_metadata: bool = True,
    metadata_filename: Optional[str] = None,
    min_partition: Optional[int] = 0,
    **kwargs,
) -> List[Element]:
    """Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@
    max_partition
        The maximum number of characters to include in a partition. If None is passed,
        no maximum is applied. Only applies to the "ocr_only" strategy.
    min_partition
        The minimum number of characters to include in a partition. Only applies if
        processing text/plain content.
    """
    exactly_one(filename=filename, file=file)
    return partition_pdf_or_image(
@@ -91,6 +95,7 @@
        infer_table_structure=infer_table_structure,
        ocr_languages=ocr_languages,
        max_partition=max_partition,
        min_partition=min_partition,
        **kwargs,
    )

@@ -116,6 +121,7 @@ def partition_pdf_or_image(
    infer_table_structure: bool = False,
    ocr_languages: str = "eng",
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
    **kwargs,
) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@
            ocr_languages=ocr_languages,
            is_image=is_image,
            max_partition=max_partition,
            min_partition=min_partition,
        )

    return layout_elements
@@ -391,6 +398,7 @@ def _partition_pdf_or_image_with_ocr(
    ocr_languages: str = "eng",
    is_image: bool = False,
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
):
    """Partitions an image or PDF using Tesseract OCR. For PDFs, each page is converted
    to an image prior to processing."""
@@ -402,7 +410,11 @@
            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
        else:
            text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
        elements = partition_text(text=text, max_partition=max_partition)
        elements = partition_text(
            text=text,
            max_partition=max_partition,
            min_partition=min_partition,
        )
    else:
        elements = []
        page_number = 0
@@ -411,7 +423,11 @@
            metadata = ElementMetadata(filename=filename, page_number=page_number)
            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")

            _elements = partition_text(text=text, max_partition=max_partition)
            _elements = partition_text(
                text=text,
                max_partition=max_partition,
                min_partition=min_partition,
            )
            for element in _elements:
                element.metadata = metadata
                elements.append(element)

@@ -1,4 +1,5 @@
import re
import textwrap
from typing import IO, Callable, List, Optional, Tuple

from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
@@ -26,52 +27,125 @@ from unstructured.partition.text_type import (
)

def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
    paragraphs = re.split(PARAGRAPH_PATTERN, content)
    if max_partition is None:
        return paragraphs
def _split_in_half_at_breakpoint(
    content: str,
    breakpoint: str = " ",
) -> List[str]:
    """Splits a segment of content at the breakpoint closest to the middle"""
    mid = len(content) // 2
    for i in range(len(content) // 2):
        if content[mid + i] == breakpoint:
            mid += i
            break
        elif content[mid - i] == breakpoint:
            mid += -i
            break

    split_paragraphs = []
    for paragraph in paragraphs:
        split_paragraphs.extend(
            _split_to_fit_max_content(paragraph, max_partition=max_partition),
        )
    return split_paragraphs
    return [content[:mid].rstrip(), content[mid:].lstrip()]

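A quick check of the midpoint scan above (hedged: `_split_in_half_at_breakpoint` is a private helper, so importing it directly is an assumption for illustration):

    from unstructured.partition.text import _split_in_half_at_breakpoint

    # The loop walks outward from the midpoint to the nearest space (the
    # default breakpoint), so the split never lands inside a word.
    assert _split_in_half_at_breakpoint("hello world foo") == ["hello", "world foo"]
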
def _split_content_size_n(content: str, n: int) -> List[str]:
    """Splits a string into chunks that are at most size n."""
    """Splits a section of content into chunks that are at most
    size n without breaking apart words."""
    segments = []
    for i in range(0, len(content), n):
        segment = content[i : i + n]  # noqa: E203
        segments.append(segment)
    if len(content) < n * 2:
        segments = list(_split_in_half_at_breakpoint(content))
    else:
        segments = textwrap.wrap(content, width=n)
    return segments

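A hedged sanity check of the two branches (again a private helper; behavior inferred from the code above):

    from unstructured.partition.text import _split_content_size_n

    # Content shorter than 2 * n is halved at the breakpoint nearest the middle.
    assert _split_content_size_n("aaa bbb", n=5) == ["aaa", "bbb"]
    # Longer content goes through textwrap.wrap, which also breaks only on whitespace.
    assert _split_content_size_n("aaa bbb ccc ddd", n=7) == ["aaa bbb", "ccc ddd"]
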
def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
    """Splits a section of content so that all of the elements fit into the
def split_content_to_fit_max(
    content: str,
    max_partition: Optional[int] = 1500,
) -> List[str]:
    """Splits a paragraph or section of content so that all of the elements fit into the
    max partition window."""
    sentences = sent_tokenize(content)
    num_sentences = len(sentences)

    chunks = []
    chunk = ""

    for i, sentence in enumerate(sentences):
        if len(sentence) > max_partition:
            chunks.extend(_split_content_size_n(sentence, n=max_partition))

        if len(chunk + " " + sentence) > max_partition:
            chunks.append(chunk)
            chunk = sentence
    tmp_chunk = ""
    for sentence in sentences:
        if max_partition is not None and len(sentence) > max_partition:
            if tmp_chunk:
                chunks.append(tmp_chunk)
                tmp_chunk = ""
            segments = _split_content_size_n(sentence, n=max_partition)
            chunks.extend(segments[:-1])
            tmp_chunk = segments[-1]
        else:
            chunk += " " + sentence
            if i == num_sentences - 1:
                chunks.append(chunk)
        if max_partition is not None and len(tmp_chunk + " " + sentence) > max_partition:
            chunks.append(tmp_chunk)
            tmp_chunk = sentence
        else:
            if not tmp_chunk:
                tmp_chunk = sentence
            else:
                tmp_chunk += " " + sentence
    tmp_chunk = tmp_chunk.strip()
    if tmp_chunk:
        chunks.append(tmp_chunk)

    return chunks

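Note the carry-over in the new loop: when an over-long sentence is split, the last chunk stays in tmp_chunk so it can merge with the following sentence. A hedged illustration, assuming NLTK's sentence tokenizer splits the two sentences at the period:

    from unstructured.partition.text import split_content_to_fit_max

    # "aaaa bbbb cccc." is split at the space nearest its middle; the trailing
    # "cccc." chunk is then merged with the short sentence "Dd." that follows.
    assert split_content_to_fit_max("aaaa bbbb cccc. Dd.", max_partition=10) == [
        "aaaa bbbb",
        "cccc. Dd.",
    ]
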
def combine_paragraphs_less_than_min(
    split_paragraphs: List[str],
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
) -> List[str]:
    """Combine paragraphs less than `min_partition` while not exceeding `max_partition`."""
    if type(split_paragraphs) is not list:
        raise ValueError("`split_paragraphs` is not a list")
    file_content: List[str] = []
    tmp_paragraph = ""
    next_index = 0
    for current_index, paragraph in enumerate(split_paragraphs):
        if next_index > current_index:
            continue  # Skip the current iteration if `next_index` is already updated
        if min_partition is not None and len(paragraph) < min_partition:
            # Combine paragraphs that are less than `min_partition`
            # while not exceeding `max_partition`
            tmp_paragraph += paragraph + "\n"

            while len(tmp_paragraph.strip()) < min_partition:
                if current_index + 1 == len(split_paragraphs):
                    # If it's the last paragraph, append the paragraph
                    # to the previous content
                    file_content[-1] += " " + tmp_paragraph.rstrip()
                    tmp_paragraph = ""
                    break
                for offset_index, para in enumerate(
                    split_paragraphs[current_index + 1 :], start=1  # noqa
                ):
                    if (
                        max_partition is not None
                        and len(tmp_paragraph + "\n" + para) < max_partition
                    ):
                        tmp_paragraph += "\n" + para
                        # Update `next_index` to skip already combined paragraphs
                        next_index = offset_index + current_index + 1

                        if len(tmp_paragraph.strip()) > min_partition:
                            break  # Stop combining if the combined paragraphs
                            # meet the `min_partition` requirement
                    elif (
                        max_partition is not None
                        and len(tmp_paragraph) < min_partition
                        and len(tmp_paragraph + "\n" + para) > max_partition
                    ):
                        raise ValueError(
                            "`min_partition` and `max_partition` are defined too close together",
                        )
            # Add the combined paragraph to the final result
            file_content.append(
                tmp_paragraph.strip(),
            )
            tmp_paragraph = ""
        else:
            file_content.append(paragraph)
    return file_content

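A hedged walk-through of the combining behavior (the expected output is traced by hand from the code above, not run against the repo):

    from unstructured.partition.text import combine_paragraphs_less_than_min

    paragraphs = ["Hi.", "Hello.", "A long enough paragraph."]
    # "Hi." is below min_partition, so following paragraphs are joined on
    # newlines until the combined text clears the minimum.
    assert combine_paragraphs_less_than_min(paragraphs, max_partition=1500, min_partition=7) == [
        "Hi.\n\nHello.",
        "A long enough paragraph.",
    ]
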
@process_metadata()
@add_metadata_with_filetype(FileType.TXT)
def partition_text(
@@ -83,9 +157,12 @@ def partition_text(
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
    **kwargs,
) -> List[Element]:
    """Partitions a .txt document into its constituent elements.
    """Partitions a .txt document into its constituent paragraph elements.
    If paragraphs are below "min_partition" or above "max_partition" boundaries,
    they are combined or split.

    Parameters
    ----------
    filename
@@ -104,10 +181,19 @@
    max_partition
        The maximum number of characters to include in a partition. If None is passed,
        no maximum is applied.
    min_partition
        The minimum number of characters to include in a partition.
    """
    if text is not None and text.strip() == "" and not file and not filename:
        return []

    if (
        min_partition is not None
        and max_partition is not None
        and (min_partition > max_partition or min_partition < 0 or max_partition < 0)
    ):
        raise ValueError("Invalid values for min_partition and/or max_partition.")

    # Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file, text=text)

@@ -120,12 +206,33 @@
    elif text is not None:
        file_text = str(text)

    if paragraph_grouper is not None:
    if paragraph_grouper is False:
        pass
    elif paragraph_grouper is not None:
        file_text = paragraph_grouper(file_text)
    else:
        file_text = group_broken_paragraphs(file_text)

    file_content = split_by_paragraph(file_text, max_partition=max_partition)
    if min_partition is not None and len(file_text) < min_partition:
        raise ValueError("`min_partition` cannot be larger than the length of file contents.")

    split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())

    paragraphs = combine_paragraphs_less_than_min(
        split_paragraphs=split_paragraphs,
        max_partition=max_partition,
        min_partition=min_partition,
    )

    file_content = []

    for paragraph in paragraphs:
        file_content.extend(
            split_content_to_fit_max(
                content=paragraph,
                max_partition=max_partition,
            ),
        )

    elements: List[Element] = []
    metadata = (

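The new paragraph_grouper=False branch lets callers opt out of broken-paragraph grouping entirely; the `<pre>` handling in xml.py above relies on it. A minimal sketch:

    from unstructured.partition.text import partition_text

    # paragraph_grouper=False skips group_broken_paragraphs, so the raw line
    # structure of <pre>-style text is preserved when it is partitioned.
    elements = partition_text(text="line one\n\nline two", paragraph_grouper=False)
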
@@ -54,6 +54,7 @@ def partition_xml(
    include_metadata: bool = True,
    encoding: Optional[str] = None,
    max_partition: Optional[int] = 1500,
    min_partition: Optional[int] = 0,
    **kwargs,
) -> List[Element]:
    """Partitions an XML document into its document elements.
@@ -77,6 +78,8 @@
    max_partition
        The maximum number of characters to include in a partition. If None is passed,
        no maximum is applied.
    min_partition
        The minimum number of characters to include in a partition.
    """
    exactly_one(filename=filename, file=file)

@@ -97,6 +100,7 @@
        metadata_filename=metadata_filename,
        include_metadata=include_metadata,
        max_partition=max_partition,
        min_partition=min_partition,
    )

    return elements