
import re
import pytest
from unstructured.cleaners import core
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "\x88This text contains non-ascii characters!\x88",
            "This text contains non-ascii characters!",
        ),
        ("\x93A lovely quote!\x94", "A lovely quote!"),
        ("● An excellent point! ●●●", " An excellent point! "),
        ("Item\xa01A", "Item1A"),
        ("Our dog's bowl.", "Our dog's bowl."),
        ("5 w=E2=80=99s", "5 w=E2=80=99s"),
    ],
)
def test_clean_non_ascii_chars(text, expected):
    """Characters outside the ASCII range are dropped; ASCII text is untouched."""
    cleaned = core.clean_non_ascii_chars(text)
    assert cleaned == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("● An excellent point!", "An excellent point!"),
        ("● An excellent point! ●●●", "An excellent point! ●●●"),
        ("An excellent point!", "An excellent point!"),
        ("Morse code! ●●●", "Morse code! ●●●"),
    ],
)
def test_clean_bullets(text, expected):
    """Only a leading bullet is stripped; interior/trailing bullets are kept.

    Exercises both the standalone cleaner and the same behavior routed
    through the combined ``core.clean`` entry point with ``bullets=True``.
    """
    assert core.clean_bullets(text=text) == expected
    assert core.clean(text=text, bullets=True) == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("1. Introduction:", "Introduction:"),
        ("a. Introduction:", "Introduction:"),
        ("20.3 Morse code ●●●", "Morse code ●●●"),
        ("5.3.1 Convolutional Networks ", "Convolutional Networks"),
        ("D.b.C Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("2.b.1 Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("eins. Neural Networks", "eins. Neural Networks"),
        ("bb.c Feed Forward Neural Networks", "Feed Forward Neural Networks"),
        ("aaa.ccc Metrics", "aaa.ccc Metrics"),
        (" version = 3.8", " version = 3.8"),
        ("1 2. 3 4", "1 2. 3 4"),
        ("1) 2. 3 4", "1) 2. 3 4"),
        ("2,3. Morse code 3. ●●●", "2,3. Morse code 3. ●●●"),
        ("1..2.3 four", "1..2.3 four"),
        ("Fig. 2: The relationship", "Fig. 2: The relationship"),
        ("23 is everywhere", "23 is everywhere"),
    ],
)
def test_clean_ordered_bullets(text, expected):
    """Ordered-bullet prefixes (``1.``, ``a.``, ``5.3.1`` …) are removed.

    Lookalikes that are not ordered bullets (words ending in a period,
    assignments, figure captions, bare numbers) must pass through unchanged.
    """
    result = core.clean_ordered_bullets(text=text)
    assert result == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("The æther is a classic element.", "The aether is a classic element."),
        ("In old texts, Æsop's fables are", "In old texts, AEsop's fables are"),
        # NOTE(review): several of the input strings below look identical to
        # their expected outputs; the originals most likely contain Unicode
        # ligature code points (e.g. U+FB00 "ff", U+FB01 "fi") that render
        # exactly like the plain ASCII sequences — verify these literals
        # against version control before editing them.
        ("The buffer zone is there.", "The buffer zone is there."),
        ("The file was found in the system.", "The file was found in the system."),
        ("She had a flower in her hair.", "She had a flower in her hair."),
        ("The coffin was placed in the grave.", "The coffin was placed in the grave."),
        ("The buffle zone was clearly marked.", "The buffle zone was clearly marked."),
        ("The craſtsman worked with dedication.", "The craftsman worked with dedication."),
        ("The symbol ʪ is very rare.", "The symbol ls is very rare."),
        ("The word 'cœur' means 'heart' in French.", "The word 'coeur' means 'heart' in French."),
        ("The word 'Œuvre' refers to the works", "The word 'OEuvre' refers to the works"),
        ("The ȹ symbol is used in some contexts.", "The qp symbol is used in some contexts."),
        ("The postman delivers mail daily.", "The postman delivers mail daily."),
        (
            "The symbol ʦ can be found in certain alphabets.",
            "The symbol ts can be found in certain alphabets.",
        ),
    ],
)
def test_clean_ligatures(text, expected):
    """Each ligature character is expanded to its multi-letter ASCII-ish form."""
    assert core.clean_ligatures(text=text) == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("\x93A lovely quote!\x94", "“A lovely quote!”"),
        # NOTE(review): the expected value below may have lost curly single
        # quotes (U+2018/U+2019) during extraction — confirm against VCS.
        ("\x91A lovely quote!\x92", "A lovely quote!"),
        ("Our dog's bowl.", "Our dog's bowl."),
    ],
)
def test_replace_unicode_quotes(text, expected):
    """Windows-1252 quote bytes are mapped to their proper Unicode quotes."""
    assert core.replace_unicode_quotes(text=text) == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    # NOTE(review): the expected value may have lost a non-ASCII apostrophe
    # (=E2=80=99 decodes to U+2019) during extraction — verify against VCS.
    [("5 w=E2=80=99s", "5 ws")],
)
def test_replace_mime_encodings(text, expected):
    """MIME quoted-printable escapes (=XX) decode into their characters."""
    decoded = core.replace_mime_encodings(text=text)
    assert decoded == expected
def test_replace_mime_encodings_works_with_different_encodings():
    """An explicit ``encoding`` controls how the decoded MIME bytes become text."""
    source = "5 w=E2=80-99s=E2=80-92"
    decoded = core.replace_mime_encodings(text=source, encoding="latin-1")
    assert decoded == "5 wâ\x80-99sâ\x80-92"
def test_replace_mime_encodings_works_with_right_to_left_encodings():
    """Right-to-left codecs such as iso-8859-8 (Hebrew) decode correctly."""
    decoded = core.replace_mime_encodings(text="=EE=E0=E9=E4", encoding="iso-8859-8")
    assert decoded == "מאיה"
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("“A lovely quote!”", "A lovely quote"),
        ("A lovely quote!", "A lovely quote"),
        ("'()[]{};:'\",.?/\\-_", ""),
    ],
)
def test_remove_punctuation(text, expected):
    """All punctuation — ASCII and Unicode quotes alike — is stripped out."""
    assert core.remove_punctuation(text) == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("RISK\n\nFACTORS", "RISK FACTORS"),
        ("Item\xa01A", "Item 1A"),
        (" Risk factors ", "Risk factors"),
        ("Risk factors ", "Risk factors"),
    ],
)
def test_clean_extra_whitespace(text, expected):
    """Runs of whitespace (newlines, NBSP, padding) collapse to single spaces.

    Also checks the same behavior via ``core.clean(extra_whitespace=True)``.
    """
    assert core.clean_extra_whitespace(text) == expected
    assert core.clean(text=text, extra_whitespace=True) == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Risk-factors", "Risk factors"),
        ("Risk factors", "Risk factors"),
        ("Risk\u2013factors", "Risk factors"),
        ("Risk factors-\u2013", "Risk factors"),
    ],
)
def test_clean_dashes(text, expected):
    """Hyphens and en-dashes (U+2013) are normalized away, trailing ones dropped.

    Also checks the same behavior via ``core.clean(dashes=True)``.
    """
    assert core.clean_dashes(text) == expected
    assert core.clean(text=text, dashes=True) == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Item 1A:", "Item 1A"),
        ("Item 1A;", "Item 1A"),
        ("Item 1A.", "Item 1A"),
        ("Item 1A,", "Item 1A"),
        ("Item, 1A: ", "Item, 1A"),
    ],
)
def test_clean_trailing_punctuation(text, expected):
    """Only trailing punctuation is removed; interior punctuation survives.

    Also checks the same behavior via ``core.clean(trailing_punctuation=True)``.
    """
    assert core.clean_trailing_punctuation(text) == expected
    assert core.clean(text=text, trailing_punctuation=True) == expected
@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
        ("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
    ],
)
def test_clean_prefix(text, pattern, ignore_case, strip, expected):
    """A leading pattern match is removed, honoring case and strip options."""
    cleaned = core.clean_prefix(text, pattern, ignore_case, strip)
    assert cleaned == expected
@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        ("The END! END", r"(END|STOP)", False, True, "The END!"),
        ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
        ("The END! END", r"(END|STOP)", False, False, "The END! "),
        ("The END! end", r"(END|STOP)", True, True, "The END!"),
    ],
)
def test_clean_postfix(text, pattern, ignore_case, strip, expected):
    """A trailing pattern match is removed, honoring case and strip options."""
    cleaned = core.clean_postfix(text, pattern, ignore_case, strip)
    assert cleaned == expected
def test_group_broken_paragraphs():
    """Hard-wrapped lines within a paragraph are re-joined into one line."""
    # NOTE(review): the multiline literals here (and the two-line expected
    # output) suggest the original file contained blank separator lines inside
    # these strings that were dropped during extraction — confirm against VCS.
    text = """The big red fox
is walking down the lane.
At the end of the lane
the fox met a friendly bear."""
    assert (
        core.group_broken_paragraphs(text)
        == """The big red fox is walking down the lane.
At the end of the lane the fox met a friendly bear."""
    )
def test_group_broken_paragraphs_non_default_settings():
    """A custom ``paragraph_split`` regex can be supplied to the grouper."""
    # NOTE(review): blank separator lines inside these multiline literals
    # appear to have been lost during extraction — confirm against VCS.
    text = """The big red fox
is walking down the lane.
At the end of the lane
the fox met a friendly bear."""
    # Paragraph boundaries: a run of three whitespace-wrapped newlines.
    para_split_re = re.compile(r"(\s*\n\s*){3}")
    clean_text = core.group_broken_paragraphs(text, paragraph_split=para_split_re)
    assert (
        clean_text
        == """The big red fox is walking down the lane.
At the end of the lane the fox met a friendly bear."""
    )
def test_group_broken_paragraphs_with_bullets():
    """Bulleted (○) paragraphs are grouped into one list entry per bullet."""
    # NOTE(review): the expected output has two ○-prefixed entries but the
    # visible input shows only one ○ — a bullet marker (and blank separator
    # lines) may have been lost during extraction; confirm against VCS.
    text = """○The big red fox
is walking down the lane.
At the end of the lane
the fox met a friendly bear."""
    assert core.group_bullet_paragraph(text) == [
        "○The big red fox is walking down the lane. ",
        "○At the end of the lane the fox met a friendly bear.",
    ]
def test_group_bullet_paragraph_with_e_bullets():
    """A leading "e " (an OCR artifact for a bullet) is normalized to "· "."""
    # NOTE(review): blank separator lines inside this multiline literal may
    # have been lost during extraction — confirm against VCS.
    text = """e The big red fox
is walking down the lane.
e At the end of the lane
the fox met a friendly bear."""
    assert core.group_bullet_paragraph(text) == [
        "· The big red fox is walking down the lane. ",
        "· At the end of the lane the fox met a friendly bear.",
    ]
@pytest.mark.parametrize(
    # NOTE(yuming): Tests combined cleaners
    (
        "text",
        "extra_whitespace",
        "dashes",
        "bullets",
        "lowercase",
        "trailing_punctuation",
        "expected",
    ),
    [
        (" Risk-factors ", True, True, False, False, False, "Risk factors"),
        ("● Point! ●●● ", True, False, True, False, False, "Point! ●●●"),
        ("Risk- factors ", True, False, False, True, False, "risk- factors"),
        ("Risk factors: ", True, False, False, False, True, "Risk factors"),
        ("● Risk-factors●●● ", False, True, True, False, False, "Risk factors●●●"),
        ("Risk-factors ", False, True, False, True, False, "risk factors"),
        ("Risk-factors: ", False, True, False, False, True, "Risk factors"),
        ("● Point! ●●● ", False, False, True, True, False, "point! ●●●"),
        ("● Point! ●●●: ", False, False, True, False, True, "Point! ●●●"),
        ("Risk factors: ", False, False, False, True, True, "risk factors"),
    ],
)
def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected):
    """``core.clean`` composes the individual cleaners selected by its flags.

    Each case enables exactly two flags to check pairwise composition.
    """
    assert (
        core.clean(
            text=text,
            extra_whitespace=extra_whitespace,
            dashes=dashes,
            bullets=bullets,
            trailing_punctuation=trailing_punctuation,
            lowercase=lowercase,
        )
        == expected
    )
def test_bytes_string_to_string():
    """Mojibake (UTF-8 bytes mis-read as single chars) round-trips to real text."""
    mojibake = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb"
    assert core.bytes_string_to_string(mojibake, "utf-8") == "每日新闻"