Matt Robinson 7a74cdda86
feat: add partition_email cleaning brick (#104)
* fix for processing deeply embedded list elements

* fix types in mime encodings cleaner

* first pass on partition_email

* tests for email

* test for mime encodings

* changelog bump

* added note about \n=

* linting, linting, linting

* added email docs

* add partition_email to the readme

* add one more test
2022-12-19 18:02:44 +00:00

149 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytest
import unstructured.cleaners.core as core
@pytest.mark.parametrize(
"text, expected",
[
("● An excellent point!", "An excellent point!"),
("● An excellent point! ●●●", "An excellent point! ●●●"),
("An excellent point!", "An excellent point!"),
("Morse code! ●●●", "Morse code! ●●●"),
],
)
def test_clean_bullets(text, expected):
assert core.clean_bullets(text=text) == expected
assert core.clean(text=text, bullets=True) == expected
@pytest.mark.parametrize(
"text, expected",
[
("\x93A lovely quote!\x94", "“A lovely quote!”"),
("\x91A lovely quote!\x92", "A lovely quote!"),
("Our dog's bowl.", "Our dog's bowl."),
],
)
def test_replace_unicode_quotes(text, expected):
assert core.replace_unicode_quotes(text=text) == expected
@pytest.mark.parametrize(
"text, expected",
[("5 w=E2=80=99s", "5 ws")],
)
def test_replace_mime_encodings(text, expected):
assert core.replace_mime_encodings(text=text) == expected
@pytest.mark.parametrize(
"text, expected",
[
("“A lovely quote!”", "A lovely quote"),
("A lovely quote!", "A lovely quote"),
("'()[]{};:'\",.?/\\-_", ""),
],
)
def test_remove_punctuation(text, expected):
assert core.remove_punctuation(text) == expected
@pytest.mark.parametrize(
"text, expected",
[
("RISK\n\nFACTORS", "RISK FACTORS"),
("Item\xa01A", "Item 1A"),
(" Risk factors ", "Risk factors"),
("Risk factors ", "Risk factors"),
],
)
def test_clean_extra_whitespace(text, expected):
assert core.clean_extra_whitespace(text) == expected
assert core.clean(text=text, extra_whitespace=True) == expected
@pytest.mark.parametrize(
"text, expected",
[
("Risk-factors", "Risk factors"),
("Risk factors", "Risk factors"),
("Risk\u2013factors", "Risk factors"),
("Risk factors-\u2013", "Risk factors"),
],
)
def test_clean_dashes(text, expected):
assert core.clean_dashes(text) == expected
assert core.clean(text=text, dashes=True) == expected
@pytest.mark.parametrize(
"text, expected",
[
("Item 1A:", "Item 1A"),
("Item 1A;", "Item 1A"),
("Item 1A.", "Item 1A"),
("Item 1A,", "Item 1A"),
("Item, 1A: ", "Item, 1A"),
],
)
def test_clean_trailing_punctuation(text, expected):
assert core.clean_trailing_punctuation(text) == expected
assert core.clean(text=text, trailing_punctuation=True) == expected
@pytest.mark.parametrize(
"text, pattern, ignore_case, strip, expected",
[
("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
],
)
def test_clean_prefix(text, pattern, ignore_case, strip, expected):
assert core.clean_prefix(text, pattern, ignore_case, strip) == expected
@pytest.mark.parametrize(
"text, pattern, ignore_case, strip, expected",
[
("The END! END", r"(END|STOP)", False, True, "The END!"),
("The END! STOP", r"(END|STOP)", False, True, "The END!"),
("The END! END", r"(END|STOP)", False, False, "The END! "),
("The END! end", r"(END|STOP)", True, True, "The END!"),
],
)
def test_clean_postfix(text, pattern, ignore_case, strip, expected):
assert core.clean_postfix(text, pattern, ignore_case, strip) == expected
@pytest.mark.parametrize(
# NOTE(yuming): Tests combined cleaners
"text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected",
[
(" Risk-factors ", True, True, False, False, False, "Risk factors"),
("● Point! ●●● ", True, False, True, False, False, "Point! ●●●"),
("Risk- factors ", True, False, False, True, False, "risk- factors"),
("Risk factors: ", True, False, False, False, True, "Risk factors"),
("● Risk-factors●●● ", False, True, True, False, False, "Risk factors●●●"),
("Risk-factors ", False, True, False, True, False, "risk factors"),
("Risk-factors: ", False, True, False, False, True, "Risk factors"),
("● Point! ●●● ", False, False, True, True, False, "point! ●●●"),
("● Point! ●●●: ", False, False, True, False, True, "Point! ●●●"),
("Risk factors: ", False, False, False, True, True, "risk factors"),
],
)
def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected):
assert (
core.clean(
text=text,
extra_whitespace=extra_whitespace,
dashes=dashes,
bullets=bullets,
trailing_punctuation=trailing_punctuation,
lowercase=lowercase,
)
== expected
)