shreyanid 433d6af1bc
fix: format Arabic and Hebrew annotated encodings (#823)
* add modified Arabic and Hebrew encodings

* add calls to format_encoding_str so the encoding is checked before use

* add formatting to detect_filetype()

* explicitly provide a default value for a null encoding parameter

* fix format of annotated encodings list

* add Hebrew base64 test file

* small lint fixes

* update changelog

* bump version to -dev2
2023-06-27 18:15:02 -07:00
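The commits above revolve around normalizing encoding labels, including annotated Arabic and Hebrew variants (e.g. ISO-8859-6-I, ISO-8859-8-I), with format_encoding_str before they are handed to a decoder. As a rough illustration only, the sketch below shows the kind of normalization involved; the mapping table and the function body are assumptions made for this note, not the library's actual implementation.

# Hedged sketch: hypothetical normalization of annotated right-to-left encoding labels.
# Labels such as "ISO-8859-8-I" are IANA charset names that Python's codec registry does
# not accept directly, so they are mapped to a base codec before any decoding happens.
ANNOTATED_ENCODING_MAP = {  # illustrative mapping, not the library's actual table
    "iso-8859-6-i": "iso-8859-6",  # Arabic, logical ordering
    "iso-8859-6-e": "iso-8859-6",  # Arabic, explicit directionality
    "iso-8859-8-i": "iso-8859-8",  # Hebrew, logical ordering
    "iso-8859-8-e": "iso-8859-8",  # Hebrew, explicit directionality
}

def format_encoding_str(encoding: str) -> str:
    """Normalize an encoding label so it can be passed safely to bytes.decode()."""
    # Fall back to utf-8 when no encoding is supplied (per the commit note above).
    formatted = (encoding or "utf-8").strip().lower().replace("_", "-")
    return ANNOTATED_ENCODING_MAP.get(formatted, formatted)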


import re

import pytest

from unstructured.cleaners import core


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "\x88This text contains non-ascii characters!\x88",
            "This text contains non-ascii characters!",
        ),
        ("\x93A lovely quote!\x94", "A lovely quote!"),
        ("● An excellent point! ●●●", " An excellent point! "),
        ("Item\xa01A", "Item1A"),
        ("Our dog's bowl.", "Our dog's bowl."),
        ("5 w=E2=80=99s", "5 w=E2=80=99s"),
    ],
)
def test_clean_non_ascii_chars(text, expected):
    assert core.clean_non_ascii_chars(text) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("● An excellent point!", "An excellent point!"),
        ("● An excellent point! ●●●", "An excellent point! ●●●"),
        ("An excellent point!", "An excellent point!"),
        ("Morse code! ●●●", "Morse code! ●●●"),
    ],
)
def test_clean_bullets(text, expected):
    assert core.clean_bullets(text=text) == expected
    assert core.clean(text=text, bullets=True) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("1. Introduction:", "Introduction:"),
        ("a. Introduction:", "Introduction:"),
        ("20.3 Morse code ●●●", "Morse code ●●●"),
        ("5.3.1 Convolutional Networks ", "Convolutional Networks"),
        ("D.b.C Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("2.b.1 Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("eins. Neural Networks", "eins. Neural Networks"),
        ("bb.c Feed Forward Neural Networks", "Feed Forward Neural Networks"),
        ("aaa.ccc Metrics", "aaa.ccc Metrics"),
        (" version = 3.8", " version = 3.8"),
        ("1 2. 3 4", "1 2. 3 4"),
        ("1) 2. 3 4", "1) 2. 3 4"),
        ("2,3. Morse code 3. ●●●", "2,3. Morse code 3. ●●●"),
        ("1..2.3 four", "1..2.3 four"),
        ("Fig. 2: The relationship", "Fig. 2: The relationship"),
        ("23 is everywhere", "23 is everywhere"),
    ],
)
def test_clean_ordered_bullets(text, expected):
    assert core.clean_ordered_bullets(text=text) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("\x93A lovely quote!\x94", "“A lovely quote!”"),
        ("\x91A lovely quote!\x92", "‘A lovely quote!’"),
        ("Our dog's bowl.", "Our dog's bowl."),
    ],
)
def test_replace_unicode_quotes(text, expected):
    assert core.replace_unicode_quotes(text=text) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [("5 w=E2=80=99s", "5 w’s")],
)
def test_replace_mime_encodings(text, expected):
    assert core.replace_mime_encodings(text=text) == expected


def test_replace_mime_encodings_works_with_different_encodings():
    text = "5 w=E2=80-99s=E2=80-92"
    assert core.replace_mime_encodings(text=text, encoding="latin-1") == "5 wâ\x80-99sâ\x80-92"


def test_replace_mime_encodings_works_with_right_to_left_encodings():
    text = "=EE=E0=E9=E4"
    assert core.replace_mime_encodings(text=text, encoding="iso-8859-8") == "מאיה"


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("“A lovely quote!”", "A lovely quote"),
        ("A lovely quote!", "A lovely quote"),
        ("'()[]{};:'\",.?/\\-_", ""),
    ],
)
def test_remove_punctuation(text, expected):
    assert core.remove_punctuation(text) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("RISK\n\nFACTORS", "RISK FACTORS"),
        ("Item\xa01A", "Item 1A"),
        (" Risk factors ", "Risk factors"),
        ("Risk factors ", "Risk factors"),
    ],
)
def test_clean_extra_whitespace(text, expected):
    assert core.clean_extra_whitespace(text) == expected
    assert core.clean(text=text, extra_whitespace=True) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Risk-factors", "Risk factors"),
        ("Risk factors", "Risk factors"),
        ("Risk\u2013factors", "Risk factors"),
        ("Risk factors-\u2013", "Risk factors"),
    ],
)
def test_clean_dashes(text, expected):
    assert core.clean_dashes(text) == expected
    assert core.clean(text=text, dashes=True) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Item 1A:", "Item 1A"),
        ("Item 1A;", "Item 1A"),
        ("Item 1A.", "Item 1A"),
        ("Item 1A,", "Item 1A"),
        ("Item, 1A: ", "Item, 1A"),
    ],
)
def test_clean_trailing_punctuation(text, expected):
    assert core.clean_trailing_punctuation(text) == expected
    assert core.clean(text=text, trailing_punctuation=True) == expected


@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
        ("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
    ],
)
def test_clean_prefix(text, pattern, ignore_case, strip, expected):
    assert core.clean_prefix(text, pattern, ignore_case, strip) == expected


@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        ("The END! END", r"(END|STOP)", False, True, "The END!"),
        ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
        ("The END! END", r"(END|STOP)", False, False, "The END! "),
        ("The END! end", r"(END|STOP)", True, True, "The END!"),
    ],
)
def test_clean_postfix(text, pattern, ignore_case, strip, expected):
    assert core.clean_postfix(text, pattern, ignore_case, strip) == expected


def test_group_broken_paragraphs():
    text = """The big red fox
is walking down the lane.

At the end of the lane
the fox met a friendly bear."""

    assert (
        core.group_broken_paragraphs(text)
        == """The big red fox is walking down the lane.

At the end of the lane the fox met a friendly bear."""
    )


def test_group_broken_paragraphs_non_default_settings():
    text = """The big red fox
is walking down the lane.


At the end of the lane
the fox met a friendly bear."""

    para_split_re = re.compile(r"(\s*\n\s*){3}")
    clean_text = core.group_broken_paragraphs(text, paragraph_split=para_split_re)

    assert (
        clean_text
        == """The big red fox is walking down the lane.

At the end of the lane the fox met a friendly bear."""
    )


@pytest.mark.parametrize(
    # NOTE(yuming): Tests combined cleaners
    (
        "text",
        "extra_whitespace",
        "dashes",
        "bullets",
        "lowercase",
        "trailing_punctuation",
        "expected",
    ),
    [
        (" Risk-factors ", True, True, False, False, False, "Risk factors"),
        ("● Point! ●●● ", True, False, True, False, False, "Point! ●●●"),
        ("Risk- factors ", True, False, False, True, False, "risk- factors"),
        ("Risk factors: ", True, False, False, False, True, "Risk factors"),
        ("● Risk-factors●●● ", False, True, True, False, False, "Risk factors●●●"),
        ("Risk-factors ", False, True, False, True, False, "risk factors"),
        ("Risk-factors: ", False, True, False, False, True, "Risk factors"),
        ("● Point! ●●● ", False, False, True, True, False, "point! ●●●"),
        ("● Point! ●●●: ", False, False, True, False, True, "Point! ●●●"),
        ("Risk factors: ", False, False, False, True, True, "risk factors"),
    ],
)
def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected):
    assert (
        core.clean(
            text=text,
            extra_whitespace=extra_whitespace,
            dashes=dashes,
            bullets=bullets,
            trailing_punctuation=trailing_punctuation,
            lowercase=lowercase,
        )
        == expected
    )


def test_bytes_string_to_string():
    text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb"
    assert core.bytes_string_to_string(text, "utf-8") == "每日新闻"
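
For reference, a small usage sketch of how these cleaners compose on a MIME quoted-printable, right-to-left payload. It relies only on call signatures exercised by the tests above (replace_mime_encodings and clean); the inline expected values follow the behavior those tests assert.

from unstructured.cleaners import core

# Quoted-printable bytes for a Hebrew word, plus stray surrounding whitespace.
raw = " =EE=E0=E9=E4  "

decoded = core.replace_mime_encodings(text=raw, encoding="iso-8859-8")  # " מאיה  "
cleaned = core.clean(text=decoded, extra_whitespace=True)  # "מאיה"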