mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-27 16:11:02 +00:00
feat: enable grouping broken paragraphs in partition_text (#456)
* cleaning brick to group broken paragraphs
* docs for group_broken_paragraphs
* add docs for partition_text with grouper
* partition_text and auto with paragraph_grouper
* version and changelog
* typo in the docs
* linting, linting, linting
* switch to using regular expressions
This commit is contained in:
parent ee52a749c3
commit c99c099158
@@ -1,8 +1,9 @@
-## 0.5.12-dev1
+## 0.5.12-dev2
 
 ### Enhancements
 
 * Use the image registry as a cache when building Docker images.
+* Adds the ability for `partition_text` to group together broken paragraphs.
 
 ### Features
 
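For orientation, the sketch below shows the behavior that changelog entry describes, mirroring the test added later in this diff; the sample text is illustrative.

.. code:: python

  from unstructured.cleaners.core import group_broken_paragraphs
  from unstructured.partition.text import partition_text

  # Broken lines are regrouped before the text is split into elements.
  text = (
      "The big brown fox\nwas walking down the lane.\n\n"
      "At the end of the lane,\nthe fox met a bear."
  )

  elements = partition_text(text=text, paragraph_grouper=group_broken_paragraphs)
  # Per the new test, this yields two NarrativeText elements:
  # "The big brown fox was walking down the lane." and
  # "At the end of the lane, the fox met a bear."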
@@ -395,9 +395,26 @@ Examples:
       text = f.read()
 
   elements = partition_text(text=text)
 
+If the text has extra line breaks for formatting purposes, you can group
+together the broken text using the ``paragraph_grouper`` kwarg. The
+``paragraph_grouper`` kwarg is a function that accepts a string and returns
+another string.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.text import partition_text
+  from unstructured.cleaners.core import group_broken_paragraphs
+
+  text = """The big brown fox
+  was walking down the lane.
+
+  At the end of the lane, the
+  fox met a bear."""
+
+  partition_text(text=text, paragraph_grouper=group_broken_paragraphs)
+
 ########
@@ -567,6 +584,46 @@ Examples:
   clean_trailing_punctuation("ITEM 1A: RISK FACTORS.")
 
 
+``group_broken_paragraphs``
+---------------------------
+
+Groups together paragraphs that are broken up with line breaks
+for visual or formatting purposes. This is common in ``.txt`` files.
+By default, ``group_broken_paragraphs`` groups together lines split
+by ``\n``. You can change that behavior with the ``line_split``
+kwarg. The function considers ``\n\n`` to be a paragraph break by
+default. You can change that behavior with the ``paragraph_split`` kwarg.
+Both kwargs accept compiled regular expressions.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.cleaners.core import group_broken_paragraphs
+
+  text = """The big brown fox
+  was walking down the lane.
+
+  At the end of the lane, the
+  fox met a bear."""
+
+  group_broken_paragraphs(text)
+
+.. code:: python
+
+  import re
+
+  from unstructured.cleaners.core import group_broken_paragraphs
+
+  para_split_re = re.compile(r"(\s*\n\s*){3}")
+
+  text = """The big brown fox
+
+  was walking down the lane.
+
+
+  At the end of the lane, the
+
+  fox met a bear."""
+
+  group_broken_paragraphs(text, paragraph_split=para_split_re)
+
+
 ``replace_unicode_quotes``
 --------------------------
 
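For reference, a sketch of the value the first example above returns, derived from the implementation added later in this diff; the assertion is illustrative rather than part of the docs.

.. code:: python

  from unstructured.cleaners.core import group_broken_paragraphs

  text = (
      "The big brown fox\nwas walking down the lane.\n\n"
      "At the end of the lane, the\nfox met a bear."
  )

  # Single line breaks collapse to spaces; the blank line stays a paragraph break.
  assert group_broken_paragraphs(text) == (
      "The big brown fox was walking down the lane."
      "\n\n"
      "At the end of the lane, the fox met a bear."
  )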
@@ -1,3 +1,5 @@
+import re
+
 import pytest
 
 from unstructured.cleaners import core
@@ -166,6 +168,42 @@ def test_clean_postfix(text, pattern, ignore_case, strip, expected):
     assert core.clean_postfix(text, pattern, ignore_case, strip) == expected
 
 
+def test_group_broken_paragraphs():
+    text = """The big red fox
+is walking down the lane.
+
+At the end of the lane
+the fox met a friendly bear."""
+
+    assert (
+        core.group_broken_paragraphs(text)
+        == """The big red fox is walking down the lane.
+
+At the end of the lane the fox met a friendly bear."""
+    )
+
+
+def test_group_broken_paragraphs_non_default_settings():
+    text = """The big red fox
+
+is walking down the lane.
+
+
+At the end of the lane
+
+the fox met a friendly bear."""
+
+    para_split_re = re.compile(r"(\s*\n\s*){3}")
+
+    clean_text = core.group_broken_paragraphs(text, paragraph_split=para_split_re)
+    assert (
+        clean_text
+        == """The big red fox is walking down the lane.
+
+At the end of the lane the fox met a friendly bear."""
+    )
+
+
 @pytest.mark.parametrize(
     # NOTE(yuming): Tests combined cleaners
     (
@@ -3,6 +3,7 @@ import pathlib
 
 import pytest
 
+from unstructured.cleaners.core import group_broken_paragraphs
 from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
 from unstructured.partition.text import partition_text
 
@@ -95,3 +96,17 @@ def test_partition_text_captures_everything_even_with_linebreaks():
         Title(text="VERY IMPORTANT MEMO"),
         Address(text="DOYLESTOWN, PA 18901"),
     ]
+
+
+def test_partition_text_groups_broken_paragraphs():
+    text = """The big brown fox
+was walking down the lane.
+
+At the end of the lane,
+the fox met a bear."""
+
+    elements = partition_text(text=text, paragraph_grouper=group_broken_paragraphs)
+    assert elements == [
+        NarrativeText(text="The big brown fox was walking down the lane."),
+        NarrativeText(text="At the end of the lane, the fox met a bear."),
+    ]
@@ -1 +1 @@
-__version__ = "0.5.12-dev1"  # pragma: no cover
+__version__ = "0.5.12-dev2"  # pragma: no cover
@@ -3,7 +3,11 @@ import re
 import sys
 import unicodedata
 
-from unstructured.nlp.patterns import UNICODE_BULLETS_RE
+from unstructured.nlp.patterns import (
+    DOUBLE_PARAGRAPH_PATTERN_RE,
+    PARAGRAPH_PATTERN_RE,
+    UNICODE_BULLETS_RE,
+)
 
 
 def clean_non_ascii_chars(text) -> str:
@@ -57,6 +61,30 @@ def clean_ordered_bullets(text) -> str:
     return text_cl
 
 
+def group_broken_paragraphs(
+    text: str,
+    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
+    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
+) -> str:
+    """Groups paragraphs that have line breaks for visual/formatting purposes.
+    For example:
+
+    '''The big red fox
+    is walking down the lane.
+
+    At the end of the lane
+    the fox met a bear.'''
+
+    Gets converted to
+
+    '''The big red fox is walking down the lane.
+    At the end of the lane the fox met a bear.'''
+    """
+    paragraphs = paragraph_split.split(text)
+    clean_paragraphs = [line_split.sub(" ", para) for para in paragraphs if para.strip()]
+    return "\n\n".join(clean_paragraphs)
+
+
 # TODO(robinson) - There's likely a cleaner was to accomplish this and get all of the
 # unicode characters instead of just the quotes. Doing this for now since quotes are
 # an issue that are popping up in the SEC filings tests
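A short annotated walk-through of what the new function does, using local copies of the two compiled patterns that the next hunk adds to unstructured.nlp.patterns; the sample text is illustrative.

.. code:: python

  import re

  # Local stand-ins for PARAGRAPH_PATTERN_RE / DOUBLE_PARAGRAPH_PATTERN_RE.
  PARAGRAPH_PATTERN_RE = re.compile(r"\s*\n\s*")
  DOUBLE_PARAGRAPH_PATTERN_RE = re.compile(r"(\s*\n\s*){2}")

  text = (
      "The big red fox\nis walking down the lane.\n\n"
      "At the end of the lane\nthe fox met a bear."
  )

  # 1. Split on blank lines (a doubled line break) into candidate paragraphs.
  paragraphs = DOUBLE_PARAGRAPH_PATTERN_RE.split(text)
  # 2. Drop the whitespace-only fragments re.split emits for the capture group,
  #    then collapse the remaining single breaks inside each paragraph to spaces.
  clean_paragraphs = [PARAGRAPH_PATTERN_RE.sub(" ", p) for p in paragraphs if p.strip()]
  # 3. Re-join with a standard blank line between paragraphs.
  print("\n\n".join(clean_paragraphs))
  # The big red fox is walking down the lane.
  #
  # At the end of the lane the fox met a bear.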
@@ -70,6 +70,9 @@ EMAIL_HEAD_RE = re.compile(EMAIL_HEAD_PATTERN)
 # (incluing \r and \n chars) on either side
 PARAGRAPH_PATTERN = r"\s*\n\s*"  # noqa: W605 NOTE(harrell)
 
+PARAGRAPH_PATTERN_RE = re.compile(PARAGRAPH_PATTERN)
+DOUBLE_PARAGRAPH_PATTERN_RE = re.compile("(" + PARAGRAPH_PATTERN + "){2}")
+
 # IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
 IP_ADDRESS_PATTERN = (
     "[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}",  # noqa: W605 NOTE(harrell)
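A quick sanity check of what the two new compiled patterns match; the assertions are illustrative and not part of the repo.

.. code:: python

  import re

  PARAGRAPH_PATTERN = r"\s*\n\s*"
  PARAGRAPH_PATTERN_RE = re.compile(PARAGRAPH_PATTERN)
  DOUBLE_PARAGRAPH_PATTERN_RE = re.compile("(" + PARAGRAPH_PATTERN + "){2}")

  # A single line break, plus any surrounding whitespace, collapses to one space.
  assert PARAGRAPH_PATTERN_RE.sub(" ", "fox \n  ran") == "fox ran"

  # Two consecutive breaks (a blank line) count as a paragraph boundary ...
  assert DOUBLE_PARAGRAPH_PATTERN_RE.search("one\n\ntwo") is not None
  # ... but a single break does not.
  assert DOUBLE_PARAGRAPH_PATTERN_RE.search("one\ntwo") is None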
@@ -1,4 +1,4 @@
-from typing import IO, Optional
+from typing import IO, Callable, Optional
 
 from unstructured.file_utils.filetype import FileType, detect_filetype
 from unstructured.partition.doc import partition_doc
@@ -24,6 +24,7 @@ def partition(
     include_page_breaks: bool = False,
     strategy: str = "hi_res",
     encoding: str = "utf-8",
+    paragraph_grouper: Optional[Callable[[str], str]] = None,
 ):
     """Partitions a document into its constituent elements. Will use libmagic to determine
     the file's type and route it to the appropriate partitioning function. Applies the default
@@ -95,7 +96,12 @@ def partition(
             include_page_breaks=include_page_breaks,
         )
     elif filetype == FileType.TXT:
-        return partition_text(filename=filename, file=file, encoding=encoding)
+        return partition_text(
+            filename=filename,
+            file=file,
+            encoding=encoding,
+            paragraph_grouper=paragraph_grouper,
+        )
     elif filetype == FileType.PPT:
         return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
     elif filetype == FileType.PPTX:
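A hedged sketch of the same option flowing through the auto-partition path; the file name here is a placeholder, not a file from the repo.

.. code:: python

  from unstructured.cleaners.core import group_broken_paragraphs
  from unstructured.partition.auto import partition

  # For .txt input, partition() now forwards paragraph_grouper to partition_text().
  elements = partition(filename="example.txt", paragraph_grouper=group_broken_paragraphs)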
@@ -1,5 +1,5 @@
 import re
-from typing import IO, List, Optional
+from typing import IO, Callable, List, Optional
 
 from unstructured.cleaners.core import clean_bullets
 from unstructured.documents.elements import (
@@ -30,6 +30,7 @@ def partition_text(
     file: Optional[IO] = None,
     text: Optional[str] = None,
     encoding: Optional[str] = "utf-8",
+    paragraph_grouper: Optional[Callable[[str], str]] = None,
 ) -> List[Element]:
     """Partitions an .txt documents into its constituent elements.
     Parameters
@@ -42,6 +43,9 @@ def partition_text(
         The string representation of the .txt document.
     encoding
         The encoding method used to decode the text input. If None, utf-8 will be used.
+    paragraph_grouper
+        A str -> str function for fixing paragraphs that are interrupted by line breaks
+        for formatting purposes.
     """
     if text is not None and text.strip() == "" and not file and not filename:
         return []
@@ -64,6 +68,9 @@ def partition_text(
     elif text is not None:
         file_text = str(text)
 
+    if paragraph_grouper is not None:
+        file_text = paragraph_grouper(file_text)
+
     file_content = split_by_paragraph(file_text)
 
     elements: List[Element] = []
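Because the new kwarg is typed Optional[Callable[[str], str]], any string-to-string function can be supplied, not just group_broken_paragraphs; a toy example (not from the repo):

.. code:: python

  from unstructured.partition.text import partition_text

  def collapse_all_breaks(text: str) -> str:
      # Toy grouper: join every non-empty line with a space.
      return " ".join(line.strip() for line in text.splitlines() if line.strip())

  elements = partition_text(
      text="The big brown fox\nwas walking down the lane.",
      paragraph_grouper=collapse_all_breaks,
  )
  # The broken line is treated as a single paragraph before element detection.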