# unstructured/test_unstructured/cleaners/test_core.py

import pytest

from unstructured.cleaners import core


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Non-ASCII characters wrapping the sentence are removed.
        (
            "\x88This text contains non-ascii characters!\x88",
            "This text contains non-ascii characters!",
        ),
        ("\x93A lovely quote!\x94", "A lovely quote!"),
        # Bullet glyphs are removed, but the adjacent spaces are kept.
        ("● An excellent point! ●●●", " An excellent point! "),
        # A non-breaking space is deleted, not replaced by a plain space.
        ("Item\xa01A", "Item1A"),
        # Input that is already pure ASCII comes back unchanged.
        ("Our dog&apos;s bowl.", "Our dog&apos;s bowl."),
        ("5 w=E2=80=99s", "5 w=E2=80=99s"),
    ],
)
def test_clean_non_ascii_chars(text, expected):
    """Check ``clean_non_ascii_chars`` against each (input, expected) pair."""
    cleaned = core.clean_non_ascii_chars(text)
    assert cleaned == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # A leading bullet (and the space after it) is stripped.
        ("● An excellent point!", "An excellent point!"),
        # Only the leading bullet goes; bullets later in the text remain.
        ("● An excellent point! ●●●", "An excellent point! ●●●"),
        # Text that does not start with a bullet is returned as-is.
        ("An excellent point!", "An excellent point!"),
        ("Morse code! ●●●", "Morse code! ●●●"),
    ],
)
def test_clean_bullets(text, expected):
    """Check bullet removal both directly and via ``clean(bullets=True)``."""
    direct_result = core.clean_bullets(text=text)
    assert direct_result == expected

    combined_result = core.clean(text=text, bullets=True)
    assert combined_result == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Leading ordered-bullet markers that are expected to be stripped.
        ("1. Introduction:", "Introduction:"),
        ("a. Introduction:", "Introduction:"),
        ("20.3 Morse code ●●●", "Morse code ●●●"),
        ("5.3.1 Convolutional Networks ", "Convolutional Networks"),
        ("D.b.C Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("2.b.1 Recurrent Neural Networks", "Recurrent Neural Networks"),
        # Mixed bag: some prefixes look like bullets but must be left alone.
        ("eins. Neural Networks", "eins. Neural Networks"),
        ("bb.c Feed Forward Neural Networks", "Feed Forward Neural Networks"),
        ("aaa.ccc Metrics", "aaa.ccc Metrics"),
        # Inputs below are expected to come back unchanged.
        (" version = 3.8", " version = 3.8"),
        ("1 2. 3 4", "1 2. 3 4"),
        ("1) 2. 3 4", "1) 2. 3 4"),
        ("2,3. Morse code 3. ●●●", "2,3. Morse code 3. ●●●"),
        ("1..2.3 four", "1..2.3 four"),
        ("Fig. 2: The relationship", "Fig. 2: The relationship"),
        ("23 is everywhere", "23 is everywhere"),
    ],
)
def test_clean_ordered_bullets(text, expected):
    """Check ``clean_ordered_bullets`` against each (input, expected) pair."""
    cleaned = core.clean_ordered_bullets(text=text)
    assert cleaned == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # \x93 / \x94 become curly double quotes.
        ("\x93A lovely quote!\x94", "“A lovely quote!”"),
        # \x91 / \x92 become curly single quotes.
        ("\x91A lovely quote!\x92", "‘A lovely quote!’"),
        # The HTML apostrophe entity becomes a plain apostrophe.
        ("Our dog&apos;s bowl.", "Our dog's bowl."),
    ],
)
def test_replace_unicode_quotes(text, expected):
    """Check ``replace_unicode_quotes`` against each (input, expected) pair."""
    replaced = core.replace_unicode_quotes(text=text)
    assert replaced == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    # The escaped sequence =E2=80=99 is expected to decode to a curly apostrophe.
    [("5 w=E2=80=99s", "5 w’s")],
)
def test_replace_mime_encodings(text, expected):
    """Check ``replace_mime_encodings`` with its default encoding."""
    decoded = core.replace_mime_encodings(text=text)
    assert decoded == expected


def test_replace_mime_encodings_works_with_different_encodings():
    """Check ``replace_mime_encodings`` when ``encoding="latin-1"`` is passed.

    The expected string shows the ``=E2=80`` escapes decoded while the
    ``-99`` / ``-92`` fragments stay literal.
    """
    mime_text = "5 w=E2=80-99s=E2=80-92"
    decoded = core.replace_mime_encodings(text=mime_text, encoding="latin-1")
    assert decoded == "5 wâ\x80-99sâ\x80-92"


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Curly quotes and the exclamation mark are all removed.
        ("“A lovely quote!”", "A lovely quote"),
        ("‘A lovely quote!’", "A lovely quote"),
        # A string made only of ASCII punctuation collapses to empty.
        ("'()[]{};:'\",.?/\\-_", ""),
    ],
)
def test_remove_punctuation(text, expected):
    """Check ``remove_punctuation`` against each (input, expected) pair."""
    stripped = core.remove_punctuation(text)
    assert stripped == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Consecutive newlines collapse to one space.
        ("RISK\n\nFACTORS", "RISK FACTORS"),
        # A non-breaking space becomes a regular space.
        ("Item\xa01A", "Item 1A"),
        # Leading/trailing whitespace is trimmed; inner runs are collapsed.
        ("  Risk factors ", "Risk factors"),
        ("Risk   factors ", "Risk factors"),
    ],
)
def test_clean_extra_whitespace(text, expected):
    """Check whitespace cleanup directly and via ``clean(extra_whitespace=True)``."""
    direct_result = core.clean_extra_whitespace(text)
    assert direct_result == expected

    combined_result = core.clean(text=text, extra_whitespace=True)
    assert combined_result == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # A hyphen between words turns into a space.
        ("Risk-factors", "Risk factors"),
        # A spaced en dash yields three spaces (its own plus the two around it).
        ("Risk – factors", "Risk   factors"),
        ("Risk\u2013factors", "Risk factors"),
        # Trailing dashes leave no trailing whitespace behind.
        ("Risk factors-\u2013", "Risk factors"),
    ],
)
def test_clean_dashes(text, expected):
    """Check dash replacement both directly and via ``clean(dashes=True)``."""
    direct_result = core.clean_dashes(text)
    assert direct_result == expected

    combined_result = core.clean(text=text, dashes=True)
    assert combined_result == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # One trailing punctuation mark of each kind is dropped.
        ("Item 1A:", "Item 1A"),
        ("Item 1A;", "Item 1A"),
        ("Item 1A.", "Item 1A"),
        ("Item 1A,", "Item 1A"),
        # Interior punctuation stays; trailing colon plus space is removed.
        ("Item, 1A: ", "Item, 1A"),
    ],
)
def test_clean_trailing_punctuation(text, expected):
    """Check trailing-punctuation removal directly and via ``clean``."""
    direct_result = core.clean_trailing_punctuation(text)
    assert direct_result == expected

    combined_result = core.clean(text=text, trailing_punctuation=True)
    assert combined_result == expected


@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        # Either alternative of the pattern is removed from the front only.
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        # With strip=False the space after the prefix is preserved.
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
        # With ignore_case=True a lowercase prefix matches the uppercase pattern.
        ("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
    ],
)
def test_clean_prefix(text, pattern, ignore_case, strip, expected):
    """Check ``clean_prefix`` across pattern, case and strip combinations."""
    without_prefix = core.clean_prefix(text, pattern, ignore_case, strip)
    assert without_prefix == expected


@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        # Either alternative of the pattern is removed from the end only.
        ("The END! END", r"(END|STOP)", False, True, "The END!"),
        ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
        # With strip=False the space before the postfix is preserved.
        ("The END! END", r"(END|STOP)", False, False, "The END! "),
        # With ignore_case=True a lowercase postfix matches the uppercase pattern.
        ("The END! end", r"(END|STOP)", True, True, "The END!"),
    ],
)
def test_clean_postfix(text, pattern, ignore_case, strip, expected):
    """Check ``clean_postfix`` across pattern, case and strip combinations."""
    without_postfix = core.clean_postfix(text, pattern, ignore_case, strip)
    assert without_postfix == expected


@pytest.mark.parametrize(
    # NOTE(yuming): Tests combined cleaners
    (
        "text",
        "extra_whitespace",
        "dashes",
        "bullets",
        "lowercase",
        "trailing_punctuation",
        "expected",
    ),
    [
        # Each row enables exactly two of the five cleaning flags.
        ("  Risk-factors ", True, True, False, False, False, "Risk factors"),
        ("● Point!  ●●● ", True, False, True, False, False, "Point! ●●●"),
        ("Risk- factors ", True, False, False, True, False, "risk- factors"),
        ("Risk   factors: ", True, False, False, False, True, "Risk factors"),
        ("● Risk-factors●●● ", False, True, True, False, False, "Risk factors●●●"),
        ("Risk-factors ", False, True, False, True, False, "risk factors"),
        ("Risk-factors: ", False, True, False, False, True, "Risk factors"),
        ("● Point! ●●● ", False, False, True, True, False, "point! ●●●"),
        ("● Point! ●●●: ", False, False, True, False, True, "Point! ●●●"),
        ("Risk factors: ", False, False, False, True, True, "risk factors"),
    ],
)
def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected):
    """Check ``clean`` with pairs of cleaning options switched on together."""
    cleaned = core.clean(
        text=text,
        extra_whitespace=extra_whitespace,
        dashes=dashes,
        bullets=bullets,
        trailing_punctuation=trailing_punctuation,
        lowercase=lowercase,
    )
    assert cleaned == expected
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								import pytest
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured.cleaners import core
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
-												feat: add staging brick to clean non-ascii characters from unicode (#366)


											
										
										
											2023-03-15 13:31:51 +09:00
+								@pytest.mark.parametrize(
 								    ("text", "expected"),
 								    [
 								        (
 								            "\x88This text contains non-ascii characters!\x88",
 								            "This text contains non-ascii characters!",
 								        ),
 								        ("\x93A lovely quote!\x94", "A lovely quote!"),
 								        ("● An excellent point! ●●●", " An excellent point! "),
 								        ("Item\xa01A", "Item1A"),
 								        ("Our dog&apos;s bowl.", "Our dog&apos;s bowl."),
 								        ("5 w=E2=80=99s", "5 w=E2=80=99s"),
 								    ],
 								)
 								def test_clean_non_ascii_chars(text, expected):
 								    assert core.clean_non_ascii_chars(text) == expected
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "expected"),
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								    [
 								        ("● An excellent point!", "An excellent point!"),
 								        ("● An excellent point! ●●●", "An excellent point! ●●●"),
 								        ("An excellent point!", "An excellent point!"),
 								        ("Morse code! ●●●", "Morse code! ●●●"),
 								    ],
 								)
 								def test_clean_bullets(text, expected):
 								    assert core.clean_bullets(text=text) == expected
 								    assert core.clean(text=text, bullets=True) == expected
-												feat: new bricks for removing and extracting ordered bullets (#128)

* feat: new cleaning brick for ordered bullets

* test: add test for cleaning ordered bullets

* feat: new brick for extracting ordered bullets

* test: add test for extracting ordered bullets

* docs: update CHANGELOG and bump new dev version

* chore: change extract ordered bullets return type to tuple

* chore: made tidy

* chore: regex to split on pattern instead of built-in

* chore: catch ValueError, made tidy and fix incompatible type

* chore: assertion statements in one line of code

* docs: add documentation for new clean and extract bricks to bricks.rst

* docs: refactor CHANGELOG 0.3.5.dev5 to dev6 with new bullets

* docs: update CHANGELOG 0.3.6-dev0 changes and bump version

Co-authored-by: Sebastian Laverde <sebastian@unstructured.io>
											
										
										
											2023-01-05 17:06:26 +01:00
+								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "expected"),
-												feat: new bricks for removing and extracting ordered bullets (#128)

* feat: new cleaning brick for ordered bullets

* test: add test for cleaning ordered bullets

* feat: new brick for extracting ordered bullets

* test: add test for extracting ordered bullets

* docs: update CHANGELOG and bump new dev version

* chore: change extract ordered bullets return type to tuple

* chore: made tidy

* chore: regex to split on pattern instead of built-in

* chore: catch ValueError, made tidy and fix incompatible type

* chore: assertion statements in one line of code

* docs: add documentation for new clean and extract bricks to bricks.rst

* docs: refactor CHANGELOG 0.3.5.dev5 to dev6 with new bullets

* docs: update CHANGELOG 0.3.6-dev0 changes and bump version

Co-authored-by: Sebastian Laverde <sebastian@unstructured.io>
											
										
										
											2023-01-05 17:06:26 +01:00
+								    [
 								        ("1. Introduction:", "Introduction:"),
 								        ("a. Introduction:", "Introduction:"),
 								        ("20.3 Morse code ●●●", "Morse code ●●●"),
 								        ("5.3.1 Convolutional Networks ", "Convolutional Networks"),
 								        ("D.b.C Recurrent Neural Networks", "Recurrent Neural Networks"),
 								        ("2.b.1 Recurrent Neural Networks", "Recurrent Neural Networks"),
 								        ("eins. Neural Networks", "eins. Neural Networks"),
 								        ("bb.c Feed Forward Neural Networks", "Feed Forward Neural Networks"),
 								        ("aaa.ccc Metrics", "aaa.ccc Metrics"),
 								        (" version = 3.8", " version = 3.8"),
 								        ("1 2. 3 4", "1 2. 3 4"),
 								        ("1) 2. 3 4", "1) 2. 3 4"),
 								        ("2,3. Morse code 3. ●●●", "2,3. Morse code 3. ●●●"),
 								        ("1..2.3 four", "1..2.3 four"),
 								        ("Fig. 2: The relationship", "Fig. 2: The relationship"),
 								        ("23 is everywhere", "23 is everywhere"),
 								    ],
 								)
 								def test_clean_ordered_bullets(text, expected):
 								    assert core.clean_ordered_bullets(text=text) == expected
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "expected"),
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								    [
 								        ("\x93A lovely quote!\x94", "“A lovely quote!”"),
 								        ("\x91A lovely quote!\x92", "‘A lovely quote!’"),
-												feat: Add html escape quotes to cleaning brick (#84)

* feat: Add html escape quotes to cleaning brick

* bump changelog
											
										
										
											2022-11-29 10:58:31 -05:00
+								        ("Our dog&apos;s bowl.", "Our dog's bowl."),
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								    ],
 								)
 								def test_replace_unicode_quotes(text, expected):
 								    assert core.replace_unicode_quotes(text=text) == expected
-												feat: add `partition_email` cleaning brick (#104)

* fix for processing deeply embedded list elements

* fix types in mime encodings cleaner

* first pass on partition_email

* tests for email

* test for mime encodings

* changelog bump

* added note about \n=

* linting, linting, linting

* added email docs

* add partition_email to the readme

* add one more test
											
										
										
											2022-12-19 13:02:44 -05:00
+								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "expected"),
-												feat: add `partition_email` cleaning brick (#104)

* fix for processing deeply embedded list elements

* fix types in mime encodings cleaner

* first pass on partition_email

* tests for email

* test for mime encodings

* changelog bump

* added note about \n=

* linting, linting, linting

* added email docs

* add partition_email to the readme

* add one more test
											
										
										
											2022-12-19 13:02:44 -05:00
+								    [("5 w=E2=80=99s", "5 w’s")],
 								)
 								def test_replace_mime_encodings(text, expected):
 								    assert core.replace_mime_encodings(text=text) == expected
-												fix: allow `replace_mime_encodings` to accept an `encoding` kwarg (#453)

* changelog and version

* added test
											
										
										
											2023-04-05 18:53:38 -04:00
+								def test_replace_mime_encodings_works_with_different_encodings():
 								    text = "5 w=E2=80-99s=E2=80-92"
 								    assert core.replace_mime_encodings(text=text, encoding="latin-1") == "5 wâ\x80-99sâ\x80-92"
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "expected"),
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								    [
 								        ("“A lovely quote!”", "A lovely quote"),
 								        ("‘A lovely quote!’", "A lovely quote"),
 								        ("'()[]{};:'\",.?/\\-_", ""),
 								    ],
 								)
 								def test_remove_punctuation(text, expected):
 								    assert core.remove_punctuation(text) == expected
 								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "expected"),
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								    [
 								        ("RISK\n\nFACTORS", "RISK FACTORS"),
 								        ("Item\xa01A", "Item 1A"),
 								        ("  Risk factors ", "Risk factors"),
 								        ("Risk   factors ", "Risk factors"),
 								    ],
 								)
 								def test_clean_extra_whitespace(text, expected):
 								    assert core.clean_extra_whitespace(text) == expected
 								    assert core.clean(text=text, extra_whitespace=True) == expected
 								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "expected"),
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								    [
 								        ("Risk-factors", "Risk factors"),
 								        ("Risk – factors", "Risk   factors"),
 								        ("Risk\u2013factors", "Risk factors"),
 								        ("Risk factors-\u2013", "Risk factors"),
 								    ],
 								)
 								def test_clean_dashes(text, expected):
 								    assert core.clean_dashes(text) == expected
 								    assert core.clean(text=text, dashes=True) == expected
 								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "expected"),
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								    [
 								        ("Item 1A:", "Item 1A"),
 								        ("Item 1A;", "Item 1A"),
 								        ("Item 1A.", "Item 1A"),
 								        ("Item 1A,", "Item 1A"),
 								        ("Item, 1A: ", "Item, 1A"),
 								    ],
 								)
 								def test_clean_trailing_punctuation(text, expected):
 								    assert core.clean_trailing_punctuation(text) == expected
 								    assert core.clean(text=text, trailing_punctuation=True) == expected
-												feat: Cleaning bricks for removing prefixes and postfixes (#62)

* added prefix and postfix cleaners

* added test for pre and postfix cleaners

* added docs for prefix and postfix bricks

* changelog and bump version

* add dev to version
											
										
										
											2022-11-10 12:24:58 -05:00
+								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "pattern", "ignore_case", "strip", "expected"),
-												feat: Cleaning bricks for removing prefixes and postfixes (#62)

* added prefix and postfix cleaners

* added test for pre and postfix cleaners

* added docs for prefix and postfix bricks

* changelog and bump version

* add dev to version
											
										
										
											2022-11-10 12:24:58 -05:00
+								    [
 								        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
 								        ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
 								        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
 								        ("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
 								    ],
 								)
 								def test_clean_prefix(text, pattern, ignore_case, strip, expected):
 								    assert core.clean_prefix(text, pattern, ignore_case, strip) == expected
 								@pytest.mark.parametrize(
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    ("text", "pattern", "ignore_case", "strip", "expected"),
-												feat: Cleaning bricks for removing prefixes and postfixes (#62)

* added prefix and postfix cleaners

* added test for pre and postfix cleaners

* added docs for prefix and postfix bricks

* changelog and bump version

* add dev to version
											
										
										
											2022-11-10 12:24:58 -05:00
+								    [
 								        ("The END! END", r"(END|STOP)", False, True, "The END!"),
 								        ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
 								        ("The END! END", r"(END|STOP)", False, False, "The END! "),
 								        ("The END! end", r"(END|STOP)", True, True, "The END!"),
 								    ],
 								)
 								def test_clean_postfix(text, pattern, ignore_case, strip, expected):
 								    """Check core.clean_postfix against each parametrized case.

 								    The positional arguments are forwarded unchanged, in the order
 								    (text, pattern, ignore_case, strip), and the returned string must
 								    equal ``expected``.
 								    """
 								    cleaned = core.clean_postfix(text, pattern, ignore_case, strip)
 								    assert cleaned == expected
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								@pytest.mark.parametrize(
 								    # NOTE(yuming): Tests combined cleaners
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    (
 								        "text",
 								        "extra_whitespace",
 								        "dashes",
 								        "bullets",
 								        "lowercase",
 								        "trailing_punctuation",
 								        "expected",
 								    ),
-												Initial Release

											
										
										
											2022-06-29 14:35:19 -04:00
+								    [
 								        ("  Risk-factors ", True, True, False, False, False, "Risk factors"),
 								        ("● Point!  ●●● ", True, False, True, False, False, "Point! ●●●"),
 								        ("Risk- factors ", True, False, False, True, False, "risk- factors"),
 								        ("Risk   factors: ", True, False, False, False, True, "Risk factors"),
 								        ("● Risk-factors●●● ", False, True, True, False, False, "Risk factors●●●"),
 								        ("Risk-factors ", False, True, False, True, False, "risk factors"),
 								        ("Risk-factors: ", False, True, False, False, True, "Risk factors"),
 								        ("● Point! ●●● ", False, False, True, True, False, "point! ●●●"),
 								        ("● Point! ●●●: ", False, False, True, False, True, "Point! ●●●"),
 								        ("Risk factors: ", False, False, False, True, True, "risk factors"),
 								    ],
 								)
 								def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected):
 								    """Check core.clean with several cleaner flags toggled together.

 								    Every flag is passed by keyword so each parametrized row maps
 								    directly onto the options of ``core.clean``; the returned string
 								    must equal ``expected``.
 								    """
 								    flags = {
 								        "extra_whitespace": extra_whitespace,
 								        "dashes": dashes,
 								        "bullets": bullets,
 								        "trailing_punctuation": trailing_punctuation,
 								        "lowercase": lowercase,
 								    }
 								    cleaned = core.clean(text=text, **flags)
 								    assert cleaned == expected