feat: enable grouping broken paragraphs in partition_text (#456)

* cleaning brick to group broken paragraphs

* docs for group_broken_paragraphs

* add docs for partition_text with grouper

* partition_text and auto with paragraph_grouper

* version and changelog

* typo in the docs

* linting, linting, linting

* switch to using regular expressions
This commit is contained in:
Matt Robinson 2023-04-06 14:35:22 -04:00 committed by GitHub
parent ee52a749c3
commit c99c099158
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 161 additions and 6 deletions

View File

@ -1,8 +1,9 @@
## 0.5.12-dev1 ## 0.5.12-dev2
### Enhancements ### Enhancements
* Use the image registry as a cache when building Docker images. * Use the image registry as a cache when building Docker images.
* Adds the ability for `partition_text` to group together broken paragraphs.
### Features ### Features

View File

@ -395,9 +395,26 @@ Examples:
text = f.read() text = f.read()
elements = partition_text(text=text) elements = partition_text(text=text)
If the text has extra line breaks for formatting purposes, you can group
together the broken text using the ``paragraph_grouper`` kwarg. The
``paragraph_grouper`` kwarg is a function that accepts a string and returns
another string.
Examples:
.. code:: python
from unstructured.partition.text import partition_text
from unstructured.cleaners.core import group_broken_paragraphs
text = """The big brown fox
was walking down the lane.
At the end of the lane, the
fox met a bear."""
partition_text(text=text, paragraph_grouper=group_broken_paragraphs)
######## ########
@ -567,6 +584,46 @@ Examples:
clean_trailing_punctuation("ITEM 1A: RISK FACTORS.") clean_trailing_punctuation("ITEM 1A: RISK FACTORS.")
``group_broken_paragraphs``
---------------------------
Groups together paragraphs that are broken up with line breaks
for visual or formatting purposes. This is common in ``.txt`` files.
By default, ``group_broken_paragraphs`` groups together lines split
by ``\n``. You can change that behavior with the ``line_split``
kwarg. The function considers ``\n\n`` to be a paragraph break by
default. You can change that behavior with the ``paragraph_split`` kwarg.
Both kwargs take compiled regular expressions (``re.Pattern`` objects,
as returned by ``re.compile``), not plain strings.
Examples:
.. code:: python
from unstructured.cleaners.core import group_broken_paragraphs
text = """The big brown fox
was walking down the lane.
At the end of the lane, the
fox met a bear."""
group_broken_paragraphs(text)
.. code:: python
import re
from unstructured.cleaners.core import group_broken_paragraphs
text = """The big brown fox
was walking down the lane.
At the end of the lane, the
fox met a bear."""
line_split_re = re.compile(r"\n\n")
paragraph_split_re = re.compile(r"\n\n\n")
group_broken_paragraphs(text, line_split=line_split_re, paragraph_split=paragraph_split_re)
``replace_unicode_quotes`` ``replace_unicode_quotes``
-------------------------- --------------------------

View File

@ -1,3 +1,5 @@
import re
import pytest import pytest
from unstructured.cleaners import core from unstructured.cleaners import core
@ -166,6 +168,42 @@ def test_clean_postfix(text, pattern, ignore_case, strip, expected):
assert core.clean_postfix(text, pattern, ignore_case, strip) == expected assert core.clean_postfix(text, pattern, ignore_case, strip) == expected
# Checks core.group_broken_paragraphs with its default split patterns: the
# lines of each paragraph are expected to be joined with single spaces while
# the two paragraphs stay separate.
# NOTE(review): blank lines inside the triple-quoted literals appear to have
# been dropped by this rendering -- confirm against the repository copy.
def test_group_broken_paragraphs():
text = """The big red fox
is walking down the lane.
At the end of the lane
the fox met a friendly bear."""
# Compare the cleaned text against the expected re-flowed paragraphs.
assert (
core.group_broken_paragraphs(text)
== """The big red fox is walking down the lane.
At the end of the lane the fox met a friendly bear."""
)
# Checks core.group_broken_paragraphs when the caller supplies its own
# compiled paragraph_split pattern instead of relying on the default.
# NOTE(review): blank lines inside the triple-quoted literals appear to have
# been dropped by this rendering -- confirm against the repository copy.
def test_group_broken_paragraphs_non_default_settings():
text = """The big red fox
is walking down the lane.
At the end of the lane
the fox met a friendly bear."""
# Custom paragraph break: three consecutive line-break runs.
para_split_re = re.compile(r"(\s*\n\s*){3}")
clean_text = core.group_broken_paragraphs(text, paragraph_split=para_split_re)
# The expected output is the same re-flowed text as in the default-settings test.
assert (
clean_text
== """The big red fox is walking down the lane.
At the end of the lane the fox met a friendly bear."""
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
# NOTE(yuming): Tests combined cleaners # NOTE(yuming): Tests combined cleaners
( (

View File

@ -3,6 +3,7 @@ import pathlib
import pytest import pytest
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import partition_text from unstructured.partition.text import partition_text
@ -95,3 +96,17 @@ def test_partition_text_captures_everything_even_with_linebreaks():
Title(text="VERY IMPORTANT MEMO"), Title(text="VERY IMPORTANT MEMO"),
Address(text="DOYLESTOWN, PA 18901"), Address(text="DOYLESTOWN, PA 18901"),
] ]
# Checks that partition_text applies the callable passed as paragraph_grouper
# before partitioning: each re-flowed paragraph is expected to come back as a
# single NarrativeText element.
# NOTE(review): blank lines inside the triple-quoted literal appear to have
# been dropped by this rendering -- confirm against the repository copy.
def test_partition_text_groups_broken_paragraphs():
text = """The big brown fox
was walking down the lane.
At the end of the lane,
the fox met a bear."""
# group_broken_paragraphs is used as the str -> str grouping function.
elements = partition_text(text=text, paragraph_grouper=group_broken_paragraphs)
assert elements == [
NarrativeText(text="The big brown fox was walking down the lane."),
NarrativeText(text="At the end of the lane, the fox met a bear."),
]

View File

@ -1 +1 @@
__version__ = "0.5.12-dev1" # pragma: no cover __version__ = "0.5.12-dev2" # pragma: no cover

View File

@ -3,7 +3,11 @@ import re
import sys import sys
import unicodedata import unicodedata
from unstructured.nlp.patterns import UNICODE_BULLETS_RE from unstructured.nlp.patterns import (
DOUBLE_PARAGRAPH_PATTERN_RE,
PARAGRAPH_PATTERN_RE,
UNICODE_BULLETS_RE,
)
def clean_non_ascii_chars(text) -> str: def clean_non_ascii_chars(text) -> str:
@ -57,6 +61,30 @@ def clean_ordered_bullets(text) -> str:
return text_cl return text_cl
def group_broken_paragraphs(
    text: str,
    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Groups paragraphs that have line breaks for visual/formatting purposes.

    For example:

    '''The big red fox
    is walking down the lane.

    At the end of the lane
    the fox met a bear.'''

    Gets converted to

    '''The big red fox is walking down the lane.

    At the end of the lane the fox met a bear.'''

    Parameters
    ----------
    text
        The text to clean.
    line_split
        Compiled regular expression matching a line break inside a paragraph.
        Every match within a paragraph is replaced with a single space.
    paragraph_split
        Compiled regular expression matching a paragraph break. The text is
        split into paragraphs on this pattern.

    Returns
    -------
    The non-blank paragraphs, each with its line breaks replaced by spaces,
    joined together with a blank line between them.
    """
    paragraphs = paragraph_split.split(text)
    # When paragraph_split contains capturing groups, split() also returns the
    # captured separator text, and None for groups that did not take part in
    # the match. Skip both kinds of entries along with blank paragraphs.
    clean_paragraphs = [
        line_split.sub(" ", para) for para in paragraphs if para and para.strip()
    ]
    return "\n\n".join(clean_paragraphs)
# TODO(robinson) - There's likely a cleaner way to accomplish this and get all of the # TODO(robinson) - There's likely a cleaner way to accomplish this and get all of the
# unicode characters instead of just the quotes. Doing this for now since quotes are # unicode characters instead of just the quotes. Doing this for now since quotes are
# an issue that are popping up in the SEC filings tests # an issue that are popping up in the SEC filings tests

View File

@ -70,6 +70,9 @@ EMAIL_HEAD_RE = re.compile(EMAIL_HEAD_PATTERN)
# (including \r and \n chars) on either side # (including \r and \n chars) on either side
PARAGRAPH_PATTERN = r"\s*\n\s*" # noqa: W605 NOTE(harrell) PARAGRAPH_PATTERN = r"\s*\n\s*" # noqa: W605 NOTE(harrell)
# Compiled form of PARAGRAPH_PATTERN.
PARAGRAPH_PATTERN_RE = re.compile(PARAGRAPH_PATTERN)
# Two consecutive PARAGRAPH_PATTERN matches. The repeated part is a capturing
# group, so splitting on this pattern also yields the captured separator text.
DOUBLE_PARAGRAPH_PATTERN_RE = re.compile(f"({PARAGRAPH_PATTERN}){{2}}")
# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01 # IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
IP_ADDRESS_PATTERN = ( IP_ADDRESS_PATTERN = (
"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}", # noqa: W605 NOTE(harrell) "[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}", # noqa: W605 NOTE(harrell)

View File

@ -1,4 +1,4 @@
from typing import IO, Optional from typing import IO, Callable, Optional
from unstructured.file_utils.filetype import FileType, detect_filetype from unstructured.file_utils.filetype import FileType, detect_filetype
from unstructured.partition.doc import partition_doc from unstructured.partition.doc import partition_doc
@ -24,6 +24,7 @@ def partition(
include_page_breaks: bool = False, include_page_breaks: bool = False,
strategy: str = "hi_res", strategy: str = "hi_res",
encoding: str = "utf-8", encoding: str = "utf-8",
paragraph_grouper: Optional[Callable[[str], str]] = None,
): ):
"""Partitions a document into its constituent elements. Will use libmagic to determine """Partitions a document into its constituent elements. Will use libmagic to determine
the file's type and route it to the appropriate partitioning function. Applies the default the file's type and route it to the appropriate partitioning function. Applies the default
@ -95,7 +96,12 @@ def partition(
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
) )
elif filetype == FileType.TXT: elif filetype == FileType.TXT:
return partition_text(filename=filename, file=file, encoding=encoding) return partition_text(
filename=filename,
file=file,
encoding=encoding,
paragraph_grouper=paragraph_grouper,
)
elif filetype == FileType.PPT: elif filetype == FileType.PPT:
return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks) return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
elif filetype == FileType.PPTX: elif filetype == FileType.PPTX:

View File

@ -1,5 +1,5 @@
import re import re
from typing import IO, List, Optional from typing import IO, Callable, List, Optional
from unstructured.cleaners.core import clean_bullets from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import ( from unstructured.documents.elements import (
@ -30,6 +30,7 @@ def partition_text(
file: Optional[IO] = None, file: Optional[IO] = None,
text: Optional[str] = None, text: Optional[str] = None,
encoding: Optional[str] = "utf-8", encoding: Optional[str] = "utf-8",
paragraph_grouper: Optional[Callable[[str], str]] = None,
) -> List[Element]: ) -> List[Element]:
"""Partitions an .txt documents into its constituent elements. """Partitions an .txt documents into its constituent elements.
Parameters Parameters
@ -42,6 +43,9 @@ def partition_text(
The string representation of the .txt document. The string representation of the .txt document.
encoding encoding
The encoding method used to decode the text input. If None, utf-8 will be used. The encoding method used to decode the text input. If None, utf-8 will be used.
paragraph_grouper
A str -> str function for fixing paragraphs that are interrupted by line breaks
for formatting purposes.
""" """
if text is not None and text.strip() == "" and not file and not filename: if text is not None and text.strip() == "" and not file and not filename:
return [] return []
@ -64,6 +68,9 @@ def partition_text(
elif text is not None: elif text is not None:
file_text = str(text) file_text = str(text)
if paragraph_grouper is not None:
file_text = paragraph_grouper(file_text)
file_content = split_by_paragraph(file_text) file_content = split_by_paragraph(file_text)
elements: List[Element] = [] elements: List[Element] = []