Chore: put back function split_by_paragraph (#992)

* put back function

* not really fixes
This commit is contained in:
Yuming Long 2023-07-28 12:09:35 -04:00 committed by GitHub
parent e017e99b5b
commit 8cff756cef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 15 deletions

View File

@ -2,6 +2,8 @@
### Enhancements
* Put back useful function `split_by_paragraph`
### Features
### Fixes

View File

@ -31,6 +31,32 @@ from unstructured.partition.text_type import (
)
def split_by_paragraph(
    file_text: str,
    min_partition: Optional[int] = 0,
    max_partition: Optional[int] = 1500,
) -> List[str]:
    """Break ``file_text`` into a flat list of text chunks.

    The text is stripped of surrounding whitespace and split on
    ``PARAGRAPH_PATTERN``. The resulting pieces are handed to
    ``combine_paragraphs_less_than_min`` together with both partition
    limits, and every paragraph it returns is then run through
    ``split_content_to_fit_max``. All chunks are collected, in order,
    into a single list.

    Args:
        file_text: The raw text to split.
        min_partition: Forwarded to ``combine_paragraphs_less_than_min``
            (presumably the minimum chunk length -- confirm against that
            helper).
        max_partition: Forwarded to both helpers (presumably the maximum
            chunk length -- confirm against those helpers).

    Returns:
        The chunks produced by ``split_content_to_fit_max`` for each
        combined paragraph, concatenated in paragraph order.
    """
    raw_pieces = re.split(PARAGRAPH_PATTERN, file_text.strip())
    merged_paragraphs = combine_paragraphs_less_than_min(
        split_paragraphs=raw_pieces,
        max_partition=max_partition,
        min_partition=min_partition,
    )
    # Flatten the per-paragraph chunk lists into one list, preserving order.
    return [
        chunk
        for paragraph in merged_paragraphs
        for chunk in split_content_to_fit_max(
            content=paragraph,
            max_partition=max_partition,
        )
    ]
def _split_in_half_at_breakpoint(
content: str,
breakpoint: str = " ",
@ -226,24 +252,12 @@ def partition_text(
if min_partition is not None and len(file_text) < min_partition:
raise ValueError("`min_partition` cannot be larger than the length of file contents.")
split_paragraphs = re.split(PARAGRAPH_PATTERN, file_text.strip())
paragraphs = combine_paragraphs_less_than_min(
split_paragraphs=split_paragraphs,
max_partition=max_partition,
file_content = split_by_paragraph(
file_text,
min_partition=min_partition,
max_partition=max_partition,
)
file_content = []
for paragraph in paragraphs:
file_content.extend(
split_content_to_fit_max(
content=paragraph,
max_partition=max_partition,
),
)
elements: List[Element] = []
metadata = (
ElementMetadata(