diff --git a/CHANGELOG.md b/CHANGELOG.md index 64d3fd766..c822c44dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ -## 0.5.12-dev1 +## 0.5.12-dev2 ### Enhancements * Use the image registry as a cache when building Docker images. +* Adds the ability for `partition_text` to group together broken paragraphs. ### Features diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 38b4306fa..1c17cc621 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -395,9 +395,26 @@ Examples: text = f.read() elements = partition_text(text=text) +If the text has extra line breaks for formatting purposes, you can group +together the broken text using the ``paragraph_grouper`` kwarg. The +``paragraph_grouper`` kwarg is a function that accepts a string and returns +another string. + +Examples: + +.. code:: python + + from unstructured.partition.text import partition_text + from unstructured.cleaners.core import group_broken_paragraphs + text = """The big brown fox + was walking down the lane. + At the end of the lane, the + fox met a bear.""" + + partition_text(text=text, paragraph_grouper=group_broken_paragraphs) ######## @@ -567,6 +584,46 @@ Examples: clean_trailing_punctuation("ITEM 1A: RISK FACTORS.") +``group_broken_paragraphs`` +--------------------------- + +Groups together paragraphs that are broken up with line breaks +for visual or formatting purposes. This is common in ``.txt`` files. +By default, ``group_broken_paragraphs`` groups together lines split +by ``\n``. You can change that behavior with the ``line_split`` +kwarg. The function considers ``\n\n`` to be a paragraph break by +default. You can change that behavior with the ``paragraph_split`` kwarg. + +Examples: + +.. code:: python + + from unstructured.cleaners.core import group_broken_paragraphs + + text = """The big brown fox + was walking down the lane. + + At the end of the lane, the + fox met a bear.""" + + group_broken_paragraphs(text) + +.. 
def test_group_broken_paragraphs():
    """Default settings: single line breaks are joined, blank lines keep paragraphs apart."""
    broken_text = (
        "The big red fox\n"
        "is walking down the lane.\n"
        "\n"
        "At the end of the lane\n"
        "the fox met a friendly bear."
    )
    expected_text = (
        "The big red fox is walking down the lane.\n"
        "\n"
        "At the end of the lane the fox met a friendly bear."
    )

    grouped_text = core.group_broken_paragraphs(broken_text)

    assert grouped_text == expected_text
def test_partition_text_groups_broken_paragraphs():
    """partition_text applies the paragraph grouper before splitting into elements."""
    broken_text = (
        "The big brown fox\n"
        "was walking down the lane.\n"
        "\n"
        "At the end of the lane,\n"
        "the fox met a bear."
    )

    partitioned = partition_text(
        text=broken_text,
        paragraph_grouper=group_broken_paragraphs,
    )

    first_paragraph = NarrativeText(text="The big brown fox was walking down the lane.")
    second_paragraph = NarrativeText(text="At the end of the lane, the fox met a bear.")
    assert partitioned == [first_paragraph, second_paragraph]
def group_broken_paragraphs(
    text: str,
    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Groups paragraphs that have line breaks for visual/formatting purposes.
    For example:

    '''The big red fox
    is walking down the lane.

    At the end of the lane
    the fox met a bear.'''

    Gets converted to

    '''The big red fox is walking down the lane.

    At the end of the lane the fox met a bear.'''

    Parameters
    ----------
    text
        The text to clean up.
    line_split
        Pattern matching a line break inside a paragraph. Every match is replaced
        with a single space. A plain string is also accepted and is compiled as a
        regular expression.
    paragraph_split
        Pattern matching the break between two paragraphs. A plain string is also
        accepted and is compiled as a regular expression.

    Returns
    -------
    The non-empty paragraphs, each with its line breaks replaced by spaces, joined
    together with one blank line between them.
    """
    # NOTE: a plain str also has a .split method, but "sep".split(text) splits the
    # separator rather than the text, so string arguments used to produce an empty
    # result without any error. Compile them so both spellings behave the same.
    if isinstance(line_split, str):
        line_split = re.compile(line_split)
    if isinstance(paragraph_split, str):
        paragraph_split = re.compile(paragraph_split)

    paragraphs = paragraph_split.split(text)
    # re.split also returns the text captured by any group in the pattern, and None
    # for a group that did not take part in the match. The truthiness check drops
    # None and empty strings; the strip() check drops whitespace-only pieces.
    clean_paragraphs = [
        line_split.sub(" ", para) for para in paragraphs if para and para.strip()
    ]
    return "\n\n".join(clean_paragraphs)
strategy: str = "hi_res", encoding: str = "utf-8", + paragraph_grouper: Optional[Callable[[str], str]] = None, ): """Partitions a document into its constituent elements. Will use libmagic to determine the file's type and route it to the appropriate partitioning function. Applies the default @@ -95,7 +96,12 @@ def partition( include_page_breaks=include_page_breaks, ) elif filetype == FileType.TXT: - return partition_text(filename=filename, file=file, encoding=encoding) + return partition_text( + filename=filename, + file=file, + encoding=encoding, + paragraph_grouper=paragraph_grouper, + ) elif filetype == FileType.PPT: return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks) elif filetype == FileType.PPTX: diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index db4d47f33..4bd96f007 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -1,5 +1,5 @@ import re -from typing import IO, List, Optional +from typing import IO, Callable, List, Optional from unstructured.cleaners.core import clean_bullets from unstructured.documents.elements import ( @@ -30,6 +30,7 @@ def partition_text( file: Optional[IO] = None, text: Optional[str] = None, encoding: Optional[str] = "utf-8", + paragraph_grouper: Optional[Callable[[str], str]] = None, ) -> List[Element]: """Partitions an .txt documents into its constituent elements. Parameters @@ -42,6 +43,9 @@ def partition_text( The string representation of the .txt document. encoding The encoding method used to decode the text input. If None, utf-8 will be used. + paragraph_grouper + A str -> str function for fixing paragraphs that are interrupted by line breaks + for formatting purposes.
""" if text is not None and text.strip() == "" and not file and not filename: return [] @@ -64,6 +68,9 @@ def partition_text( elif text is not None: file_text = str(text) + if paragraph_grouper is not None: + file_text = paragraph_grouper(file_text) + file_content = split_by_paragraph(file_text) elements: List[Element] = []