mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-09 10:05:56 +00:00
114 lines
3.7 KiB
Python
114 lines
3.7 KiB
Python
![]() |
import pytest
|
|||
|
|
|||
|
import unstructured.cleaners.core as core
|
|||
|
|
|||
|
|
|||
|
@pytest.mark.parametrize(
|
|||
|
"text, expected",
|
|||
|
[
|
|||
|
("● An excellent point!", "An excellent point!"),
|
|||
|
("● An excellent point! ●●●", "An excellent point! ●●●"),
|
|||
|
("An excellent point!", "An excellent point!"),
|
|||
|
("Morse code! ●●●", "Morse code! ●●●"),
|
|||
|
],
|
|||
|
)
|
|||
|
def test_clean_bullets(text, expected):
|
|||
|
assert core.clean_bullets(text=text) == expected
|
|||
|
assert core.clean(text=text, bullets=True) == expected
|
|||
|
|
|||
|
|
|||
|
@pytest.mark.parametrize(
|
|||
|
"text, expected",
|
|||
|
[
|
|||
|
("\x93A lovely quote!\x94", "“A lovely quote!”"),
|
|||
|
("\x91A lovely quote!\x92", "‘A lovely quote!’"),
|
|||
|
],
|
|||
|
)
|
|||
|
def test_replace_unicode_quotes(text, expected):
|
|||
|
assert core.replace_unicode_quotes(text=text) == expected
|
|||
|
|
|||
|
|
|||
|
@pytest.mark.parametrize(
|
|||
|
"text, expected",
|
|||
|
[
|
|||
|
("“A lovely quote!”", "A lovely quote"),
|
|||
|
("‘A lovely quote!’", "A lovely quote"),
|
|||
|
("'()[]{};:'\",.?/\\-_", ""),
|
|||
|
],
|
|||
|
)
|
|||
|
def test_remove_punctuation(text, expected):
|
|||
|
assert core.remove_punctuation(text) == expected
|
|||
|
|
|||
|
|
|||
|
@pytest.mark.parametrize(
|
|||
|
"text, expected",
|
|||
|
[
|
|||
|
("RISK\n\nFACTORS", "RISK FACTORS"),
|
|||
|
("Item\xa01A", "Item 1A"),
|
|||
|
(" Risk factors ", "Risk factors"),
|
|||
|
("Risk factors ", "Risk factors"),
|
|||
|
],
|
|||
|
)
|
|||
|
def test_clean_extra_whitespace(text, expected):
|
|||
|
assert core.clean_extra_whitespace(text) == expected
|
|||
|
assert core.clean(text=text, extra_whitespace=True) == expected
|
|||
|
|
|||
|
|
|||
|
@pytest.mark.parametrize(
|
|||
|
"text, expected",
|
|||
|
[
|
|||
|
("Risk-factors", "Risk factors"),
|
|||
|
("Risk – factors", "Risk factors"),
|
|||
|
("Risk\u2013factors", "Risk factors"),
|
|||
|
("Risk factors-\u2013", "Risk factors"),
|
|||
|
],
|
|||
|
)
|
|||
|
def test_clean_dashes(text, expected):
|
|||
|
assert core.clean_dashes(text) == expected
|
|||
|
assert core.clean(text=text, dashes=True) == expected
|
|||
|
|
|||
|
|
|||
|
@pytest.mark.parametrize(
|
|||
|
"text, expected",
|
|||
|
[
|
|||
|
("Item 1A:", "Item 1A"),
|
|||
|
("Item 1A;", "Item 1A"),
|
|||
|
("Item 1A.", "Item 1A"),
|
|||
|
("Item 1A,", "Item 1A"),
|
|||
|
("Item, 1A: ", "Item, 1A"),
|
|||
|
],
|
|||
|
)
|
|||
|
def test_clean_trailing_punctuation(text, expected):
|
|||
|
assert core.clean_trailing_punctuation(text) == expected
|
|||
|
assert core.clean(text=text, trailing_punctuation=True) == expected
|
|||
|
|
|||
|
|
|||
|
@pytest.mark.parametrize(
|
|||
|
# NOTE(yuming): Tests combined cleaners
|
|||
|
"text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected",
|
|||
|
[
|
|||
|
(" Risk-factors ", True, True, False, False, False, "Risk factors"),
|
|||
|
("● Point! ●●● ", True, False, True, False, False, "Point! ●●●"),
|
|||
|
("Risk- factors ", True, False, False, True, False, "risk- factors"),
|
|||
|
("Risk factors: ", True, False, False, False, True, "Risk factors"),
|
|||
|
("● Risk-factors●●● ", False, True, True, False, False, "Risk factors●●●"),
|
|||
|
("Risk-factors ", False, True, False, True, False, "risk factors"),
|
|||
|
("Risk-factors: ", False, True, False, False, True, "Risk factors"),
|
|||
|
("● Point! ●●● ", False, False, True, True, False, "point! ●●●"),
|
|||
|
("● Point! ●●●: ", False, False, True, False, True, "Point! ●●●"),
|
|||
|
("Risk factors: ", False, False, False, True, True, "risk factors"),
|
|||
|
],
|
|||
|
)
|
|||
|
def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected):
|
|||
|
assert (
|
|||
|
core.clean(
|
|||
|
text=text,
|
|||
|
extra_whitespace=extra_whitespace,
|
|||
|
dashes=dashes,
|
|||
|
bullets=bullets,
|
|||
|
trailing_punctuation=trailing_punctuation,
|
|||
|
lowercase=lowercase,
|
|||
|
)
|
|||
|
== expected
|
|||
|
)
|