mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-12 11:35:53 +00:00

* Apply import sorting
  ruff . --select I --fix
* Remove unnecessary open mode parameter
  ruff . --select UP015 --fix
* Use f-string formatting rather than .format
* Remove extraneous parentheses
  Also use "" instead of str()
* Resolve missing trailing commas
  ruff . --select COM --fix
* Rewrite list() and dict() calls using literals
  ruff . --select C4 --fix
* Add () to pytest.fixture, use tuples for parametrize, etc.
  ruff . --select PT --fix
* Simplify code: merge conditionals, context managers
  ruff . --select SIM --fix
* Import without unnecessary alias
  ruff . --select PLR0402 --fix
* Apply formatting via black
* Rewrite ValueError somewhat
  Slightly unrelated to the rest of the PR
* Apply formatting to tests via black
* Update expected exception message to match 0d81564
* Satisfy E501 line too long in test
* Update changelog & version
* Add ruff to make tidy and test deps
* Run 'make tidy'
* Update changelog & version
* Update changelog & version
* Add ruff to 'check' target
  Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
182 lines
6.2 KiB
Python
182 lines
6.2 KiB
Python
import pytest
|
||
|
||
from unstructured.cleaners import core
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # A leading bullet is removed; bullets elsewhere in the text are kept.
        ("● An excellent point!", "An excellent point!"),
        ("● An excellent point! ●●●", "An excellent point! ●●●"),
        # Text without a leading bullet is expected to come back unchanged.
        ("An excellent point!", "An excellent point!"),
        ("Morse code! ●●●", "Morse code! ●●●"),
    ],
)
def test_clean_bullets(text, expected):
    """Check bullet cleaning via ``clean_bullets`` and via ``clean(bullets=True)``.

    Both entry points must produce the same ``expected`` string for ``text``.
    """
    assert core.clean_bullets(text=text) == expected
    assert core.clean(text=text, bullets=True) == expected
|
||
|
||
|
||
# NOTE(review): runs of whitespace inside these literals may have been
# collapsed when this file was extracted — confirm against upstream.
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Inputs whose leading ordered-bullet marker is expected to be removed.
        ("1. Introduction:", "Introduction:"),
        ("a. Introduction:", "Introduction:"),
        ("20.3 Morse code ●●●", "Morse code ●●●"),
        ("5.3.1 Convolutional Networks ", "Convolutional Networks"),
        ("D.b.C Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("2.b.1 Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("eins. Neural Networks", "eins. Neural Networks"),
        ("bb.c Feed Forward Neural Networks", "Feed Forward Neural Networks"),
        # Inputs that are expected to be returned unchanged.
        ("aaa.ccc Metrics", "aaa.ccc Metrics"),
        (" version = 3.8", " version = 3.8"),
        ("1 2. 3 4", "1 2. 3 4"),
        ("1) 2. 3 4", "1) 2. 3 4"),
        ("2,3. Morse code 3. ●●●", "2,3. Morse code 3. ●●●"),
        ("1..2.3 four", "1..2.3 four"),
        ("Fig. 2: The relationship", "Fig. 2: The relationship"),
        ("23 is everywhere", "23 is everywhere"),
    ],
)
def test_clean_ordered_bullets(text, expected):
    """Check that ``clean_ordered_bullets`` maps ``text`` to ``expected``."""
    assert core.clean_ordered_bullets(text=text) == expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # \x93/\x94 and \x91/\x92 are expected to become curly double/single quotes.
        ("\x93A lovely quote!\x94", "“A lovely quote!”"),
        ("\x91A lovely quote!\x92", "‘A lovely quote!’"),
        # A plain ASCII apostrophe is expected to be left alone.
        ("Our dog's bowl.", "Our dog's bowl."),
    ],
)
def test_replace_unicode_quotes(text, expected):
    """Check that ``replace_unicode_quotes`` maps ``text`` to ``expected``."""
    assert core.replace_unicode_quotes(text=text) == expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    # The "=E2=80=99" escape sequence is expected to decode to a right single quote.
    [("5 w=E2=80=99s", "5 w’s")],
)
def test_replace_mime_encodings(text, expected):
    """Check that ``replace_mime_encodings`` maps ``text`` to ``expected``."""
    assert core.replace_mime_encodings(text=text) == expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Curly quotes and the exclamation mark are all expected to be stripped.
        ("“A lovely quote!”", "A lovely quote"),
        ("‘A lovely quote!’", "A lovely quote"),
        # A string made up only of punctuation is expected to become empty.
        ("'()[]{};:'\",.?/\\-_", ""),
    ],
)
def test_remove_punctuation(text, expected):
    """Check that ``remove_punctuation`` maps ``text`` to ``expected``."""
    assert core.remove_punctuation(text) == expected
|
||
|
||
|
||
# NOTE(review): runs of whitespace inside these literals may have been
# collapsed when this file was extracted — confirm against upstream.
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("RISK\n\nFACTORS", "RISK FACTORS"),
        ("Item\xa01A", "Item 1A"),
        (" Risk factors ", "Risk factors"),
        ("Risk factors ", "Risk factors"),
    ],
)
def test_clean_extra_whitespace(text, expected):
    """Check whitespace cleaning via both public entry points.

    ``clean_extra_whitespace`` and ``clean(extra_whitespace=True)`` must both
    produce ``expected`` for ``text``.
    """
    assert core.clean_extra_whitespace(text) == expected
    assert core.clean(text=text, extra_whitespace=True) == expected
|
||
|
||
|
||
# NOTE(review): runs of whitespace inside these literals may have been
# collapsed when this file was extracted — confirm against upstream.
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Risk-factors", "Risk factors"),
        ("Risk – factors", "Risk factors"),
        ("Risk\u2013factors", "Risk factors"),
        ("Risk factors-\u2013", "Risk factors"),
    ],
)
def test_clean_dashes(text, expected):
    """Check dash cleaning via both public entry points.

    ``clean_dashes`` and ``clean(dashes=True)`` must both produce ``expected``
    for ``text``; the cases cover ASCII hyphens and U+2013 dashes.
    """
    assert core.clean_dashes(text) == expected
    assert core.clean(text=text, dashes=True) == expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Item 1A:", "Item 1A"),
        ("Item 1A;", "Item 1A"),
        ("Item 1A.", "Item 1A"),
        ("Item 1A,", "Item 1A"),
        # Only the trailing punctuation goes; the interior comma is kept.
        ("Item, 1A: ", "Item, 1A"),
    ],
)
def test_clean_trailing_punctuation(text, expected):
    """Check trailing-punctuation cleaning via both public entry points.

    ``clean_trailing_punctuation`` and ``clean(trailing_punctuation=True)``
    must both produce ``expected`` for ``text``.
    """
    assert core.clean_trailing_punctuation(text) == expected
    assert core.clean(text=text, trailing_punctuation=True) == expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        # With strip=False the space after the removed prefix is expected to remain.
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
        # With ignore_case=True a lowercase prefix is expected to match the pattern.
        ("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
    ],
)
def test_clean_prefix(text, pattern, ignore_case, strip, expected):
    """Check ``clean_prefix`` for each ``ignore_case`` / ``strip`` combination listed.

    The arguments are passed positionally in the order
    ``(text, pattern, ignore_case, strip)``.
    """
    assert core.clean_prefix(text, pattern, ignore_case, strip) == expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        ("The END! END", r"(END|STOP)", False, True, "The END!"),
        ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
        # With strip=False the space before the removed postfix is expected to remain.
        ("The END! END", r"(END|STOP)", False, False, "The END! "),
        # With ignore_case=True a lowercase postfix is expected to match the pattern.
        ("The END! end", r"(END|STOP)", True, True, "The END!"),
    ],
)
def test_clean_postfix(text, pattern, ignore_case, strip, expected):
    """Check ``clean_postfix`` for each ``ignore_case`` / ``strip`` combination listed.

    The arguments are passed positionally in the order
    ``(text, pattern, ignore_case, strip)``.
    """
    assert core.clean_postfix(text, pattern, ignore_case, strip) == expected
|
||
|
||
|
||
# NOTE(review): runs of whitespace inside these literals may have been
# collapsed when this file was extracted — confirm against upstream.
@pytest.mark.parametrize(
    # NOTE(yuming): Tests combined cleaners
    (
        "text",
        "extra_whitespace",
        "dashes",
        "bullets",
        "lowercase",
        "trailing_punctuation",
        "expected",
    ),
    [
        (" Risk-factors ", True, True, False, False, False, "Risk factors"),
        ("● Point! ●●● ", True, False, True, False, False, "Point! ●●●"),
        ("Risk- factors ", True, False, False, True, False, "risk- factors"),
        ("Risk factors: ", True, False, False, False, True, "Risk factors"),
        ("● Risk-factors●●● ", False, True, True, False, False, "Risk factors●●●"),
        ("Risk-factors ", False, True, False, True, False, "risk factors"),
        ("Risk-factors: ", False, True, False, False, True, "Risk factors"),
        ("● Point! ●●● ", False, False, True, True, False, "point! ●●●"),
        ("● Point! ●●●: ", False, False, True, False, True, "Point! ●●●"),
        ("Risk factors: ", False, False, False, True, True, "risk factors"),
    ],
)
def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected):
    """Check ``clean`` with pairs of cleaning options enabled at once.

    Each case turns on exactly two of the five boolean flags and compares the
    result with ``expected``. All options are forwarded as keyword arguments.
    """
    assert (
        core.clean(
            text=text,
            extra_whitespace=extra_whitespace,
            dashes=dashes,
            bullets=bullets,
            trailing_punctuation=trailing_punctuation,
            lowercase=lowercase,
        )
        == expected
    )
|