2023-04-06 14:35:22 -04:00
|
|
|
|
import re
|
|
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
|
import pytest
|
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
|
from unstructured.cleaners import core
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
|
|
|
2023-03-15 13:31:51 +09:00
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Non-ASCII characters at both ends of the string are dropped.
        (
            "\x88This text contains non-ascii characters!\x88",
            "This text contains non-ascii characters!",
        ),
        ("\x93A lovely quote!\x94", "A lovely quote!"),
        # Bullets are removed, but the surrounding spaces stay.
        ("● An excellent point! ●●●", " An excellent point! "),
        # The non-breaking space is removed outright, not replaced by a space.
        ("Item\xa01A", "Item1A"),
        # Pure-ASCII inputs come back unchanged.
        ("Our dog's bowl.", "Our dog's bowl."),
        ("5 w=E2=80=99s", "5 w=E2=80=99s"),
    ],
)
def test_clean_non_ascii_chars(text, expected):
    """Check ``core.clean_non_ascii_chars`` against each (input, expected) pair."""
    cleaned = core.clean_non_ascii_chars(text)
    assert cleaned == expected
|
|
|
|
|
|
|
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("● An excellent point!", "An excellent point!"),
        # Only the leading bullet is removed; bullets later in the text survive.
        ("● An excellent point! ●●●", "An excellent point! ●●●"),
        ("An excellent point!", "An excellent point!"),
        ("Morse code! ●●●", "Morse code! ●●●"),
    ],
)
def test_clean_bullets(text, expected):
    """Check bullet removal via ``clean_bullets`` and via ``clean(bullets=True)``."""
    direct = core.clean_bullets(text=text)
    assert direct == expected

    # The combined cleaner must agree with the dedicated one.
    via_clean = core.clean(text=text, bullets=True)
    assert via_clean == expected
|
|
|
|
|
|
|
|
|
|
|
2023-01-05 17:06:26 +01:00
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Leading ordered markers that are expected to be stripped.
        ("1. Introduction:", "Introduction:"),
        ("a. Introduction:", "Introduction:"),
        ("20.3 Morse code ●●●", "Morse code ●●●"),
        ("5.3.1 Convolutional Networks ", "Convolutional Networks"),
        ("D.b.C Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("2.b.1 Recurrent Neural Networks", "Recurrent Neural Networks"),
        ("eins. Neural Networks", "eins. Neural Networks"),
        ("bb.c Feed Forward Neural Networks", "Feed Forward Neural Networks"),
        # Inputs that are expected to pass through untouched.
        ("aaa.ccc Metrics", "aaa.ccc Metrics"),
        (" version = 3.8", " version = 3.8"),
        ("1 2. 3 4", "1 2. 3 4"),
        ("1) 2. 3 4", "1) 2. 3 4"),
        ("2,3. Morse code 3. ●●●", "2,3. Morse code 3. ●●●"),
        ("1..2.3 four", "1..2.3 four"),
        ("Fig. 2: The relationship", "Fig. 2: The relationship"),
        ("23 is everywhere", "23 is everywhere"),
    ],
)
def test_clean_ordered_bullets(text, expected):
    """Check ``core.clean_ordered_bullets`` against each (input, expected) pair."""
    cleaned = core.clean_ordered_bullets(text=text)
    assert cleaned == expected
|
|
|
|
|
|
|
|
|
|
|
Add clean_ligatures to core cleaners (#1326)
# Background
[Ligatures](https://en.wikipedia.org/wiki/Ligature_(writing)#Ligatures_in_Unicode_(Latin_alphabets))
can sometimes show up during the text extraction process when they
should not. Very common examples of this are with the Latin `f` related
ligatures which can be **very subtle** to spot by eye (see example
below), but can wreak havoc later.
```python
"ﬀ": "ff",
"ﬁ": "fi",
"ﬂ": "fl",
"ﬃ": "ffi",
"ﬄ": "ffl",
```
Several libraries already do something like this. Most recently,
`pdfplumber` added this sort of capability as part of the text
extraction process, see https://github.com/jsvine/pdfplumber/issues/598
Instead of incorporating any sort of breaking change to the PDF text
processing in `unstructured`, it is best to add this as another cleaner
and allow users to opt in. In turn, the `clean_ligatures` method has
been added in this PR - with accompanying tests.
# Example
Here is an example PDF that causes the issue. For example: `Beneﬁts`
(extracted with the single-character `ﬁ` ligature), which should be `Benefits`.
[example.pdf](https://github.com/Unstructured-IO/unstructured/files/12544344/example.pdf)
```bash
curl -X 'POST' \
'https://api.unstructured.io/general/v0/general' \
-H 'accept: application/json' \
-H 'Content-Type: multipart/form-data' \
-H 'unstructured-api-key: ${UNSTRUCTURED_API_KEY}' \
-F 'files=@example.pdf' \
-s | jq -C .
```
# Notes
An initial list of mappings was added with the most common ligatures.
There is some subjectivity to this, but this should be a relatively safe
starting set. Can always be expanded as needed.
2023-09-07 17:30:18 -04:00
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("The æther is a classic element.", "The aether is a classic element."),
        ("In old texts, Æsop's fables are", "In old texts, AEsop's fables are"),
        # The Latin presentation-form ligatures (U+FB00-U+FB06) are written as
        # escapes: they are visually indistinguishable from the expanded
        # letters, and tooling that silently expands them would turn these
        # cases into no-ops that no longer exercise the cleaner.
        ("The bu\ufb00er zone is there.", "The buffer zone is there."),  # ff
        ("The \ufb01le was found in the system.", "The file was found in the system."),  # fi
        ("She had a \ufb02ower in her hair.", "She had a flower in her hair."),  # fl
        ("The co\ufb03n was placed in the grave.", "The coffin was placed in the grave."),  # ffi
        ("The bu\ufb04e zone was clearly marked.", "The buffle zone was clearly marked."),  # ffl
        # U+FB05 is the long-s + t ligature; this case expects it to read "ft".
        ("The cra\ufb05sman worked with dedication.", "The craftsman worked with dedication."),
        ("The symbol ʪ is very rare.", "The symbol ls is very rare."),
        ("The word 'cœur' means 'heart' in French.", "The word 'coeur' means 'heart' in French."),
        ("The word 'Œuvre' refers to the works", "The word 'OEuvre' refers to the works"),
        ("The ȹ symbol is used in some contexts.", "The qp symbol is used in some contexts."),
        ("The po\ufb06man delivers mail daily.", "The postman delivers mail daily."),  # st
        (
            "The symbol ʦ can be found in certain alphabets.",
            "The symbol ts can be found in certain alphabets.",
        ),
    ],
)
def test_clean_ligatures(text, expected):
    """Check that ``core.clean_ligatures`` expands each ligature to plain letters.

    Every ``text`` contains one ligature character; ``expected`` is the same
    sentence with that character spelled out.
    """
    assert core.clean_ligatures(text=text) == expected
|
|
|
|
|
|
|
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # \x93/\x94 become curly double quotes.
        ("\x93A lovely quote!\x94", "“A lovely quote!”"),
        # \x91/\x92 become curly single quotes.
        ("\x91A lovely quote!\x92", "‘A lovely quote!’"),
        ("Our dog's bowl.", "Our dog's bowl."),
    ],
)
def test_replace_unicode_quotes(text, expected):
    """Check ``core.replace_unicode_quotes`` against each (input, expected) pair."""
    replaced = core.replace_unicode_quotes(text=text)
    assert replaced == expected
|
|
|
|
|
|
|
|
|
|
|
2022-12-19 13:02:44 -05:00
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # "=E2=80=99" is expected to decode to a right single quotation mark.
        ("5 w=E2=80=99s", "5 w’s"),
    ],
)
def test_replace_mime_encodings(text, expected):
    """Check ``core.replace_mime_encodings`` with its default encoding."""
    decoded = core.replace_mime_encodings(text=text)
    assert decoded == expected
|
|
|
|
|
|
|
|
|
|
|
2023-04-05 18:53:38 -04:00
|
|
|
|
def test_replace_mime_encodings_works_with_different_encodings():
    """Check ``core.replace_mime_encodings`` when ``encoding="latin-1"`` is passed."""
    encoded = "5 w=E2=80-99s=E2=80-92"
    decoded = core.replace_mime_encodings(text=encoded, encoding="latin-1")
    assert decoded == "5 wâ\x80-99sâ\x80-92"
|
|
|
|
|
|
|
|
|
|
|
2023-06-27 18:15:02 -07:00
|
|
|
|
def test_replace_mime_encodings_works_with_right_to_left_encodings():
    """Check ``core.replace_mime_encodings`` when ``encoding="iso-8859-8"`` is passed."""
    encoded = "=EE=E0=E9=E4"
    decoded = core.replace_mime_encodings(text=encoded, encoding="iso-8859-8")
    assert decoded == "מאיה"
|
|
|
|
|
|
|
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Curly quotes count as punctuation, as does the "!".
        ("“A lovely quote!”", "A lovely quote"),
        ("‘A lovely quote!’", "A lovely quote"),
        # A string made only of punctuation collapses to the empty string.
        ("'()[]{};:'\",.?/\\-_", ""),
    ],
)
def test_remove_punctuation(text, expected):
    """Check ``core.remove_punctuation`` against each (input, expected) pair."""
    stripped = core.remove_punctuation(text)
    assert stripped == expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Newlines and the non-breaking space each turn into a single space.
        ("RISK\n\nFACTORS", "RISK FACTORS"),
        ("Item\xa01A", "Item 1A"),
        # Leading and trailing whitespace is trimmed.
        (" Risk factors ", "Risk factors"),
        ("Risk factors ", "Risk factors"),
    ],
)
def test_clean_extra_whitespace(text, expected):
    """Check whitespace cleanup via ``clean_extra_whitespace`` and ``clean``."""
    direct = core.clean_extra_whitespace(text)
    assert direct == expected

    # The combined cleaner must agree with the dedicated one.
    via_clean = core.clean(text=text, extra_whitespace=True)
    assert via_clean == expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Risk-factors", "Risk factors"),
        # NOTE(review): if each dash is swapped for one space (as the case above
        # suggests), this expectation would contain three spaces. Runs of spaces
        # may have been collapsed when this file was captured - confirm.
        ("Risk – factors", "Risk factors"),
        ("Risk\u2013factors", "Risk factors"),
        # Trailing dashes leave no trailing whitespace behind.
        ("Risk factors-\u2013", "Risk factors"),
    ],
)
def test_clean_dashes(text, expected):
    """Check dash cleanup via ``clean_dashes`` and via ``clean(dashes=True)``."""
    direct = core.clean_dashes(text)
    assert direct == expected

    # The combined cleaner must agree with the dedicated one.
    via_clean = core.clean(text=text, dashes=True)
    assert via_clean == expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Item 1A:", "Item 1A"),
        ("Item 1A;", "Item 1A"),
        ("Item 1A.", "Item 1A"),
        ("Item 1A,", "Item 1A"),
        # Punctuation in the middle of the text is kept; only the tail is cleaned.
        ("Item, 1A: ", "Item, 1A"),
    ],
)
def test_clean_trailing_punctuation(text, expected):
    """Check tail cleanup via ``clean_trailing_punctuation`` and via ``clean``."""
    direct = core.clean_trailing_punctuation(text)
    assert direct == expected

    # The combined cleaner must agree with the dedicated one.
    via_clean = core.clean(text=text, trailing_punctuation=True)
    assert via_clean == expected
|
|
|
|
|
|
|
|
|
|
|
2022-11-10 12:24:58 -05:00
|
|
|
|
@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        # Only the leading match is removed; the later "SUMMARY" stays.
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        # strip=False keeps the space that followed the prefix.
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
        # ignore_case=True lets a lower-case prefix match an upper-case pattern.
        ("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
    ],
)
def test_clean_prefix(text, pattern, ignore_case, strip, expected):
    """Check ``core.clean_prefix`` for each pattern / flag combination."""
    cleaned = core.clean_prefix(text, pattern, ignore_case, strip)
    assert cleaned == expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "pattern", "ignore_case", "strip", "expected"),
    [
        # Only the trailing match is removed; the earlier "END" stays.
        ("The END! END", r"(END|STOP)", False, True, "The END!"),
        ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
        # strip=False keeps the space that preceded the postfix.
        ("The END! END", r"(END|STOP)", False, False, "The END! "),
        # ignore_case=True lets a lower-case postfix match an upper-case pattern.
        ("The END! end", r"(END|STOP)", True, True, "The END!"),
    ],
)
def test_clean_postfix(text, pattern, ignore_case, strip, expected):
    """Check ``core.clean_postfix`` for each pattern / flag combination."""
    cleaned = core.clean_postfix(text, pattern, ignore_case, strip)
    assert cleaned == expected
|
|
|
|
|
|
|
|
|
|
|
2023-04-06 14:35:22 -04:00
|
|
|
|
def test_group_broken_paragraphs():
    """Check that hard-wrapped lines are joined while the paragraph break is kept."""
    # Two paragraphs, each wrapped over two lines, separated by one blank line.
    broken = (
        "The big red fox\n"
        "is walking down the lane.\n"
        "\n"
        "At the end of the lane\n"
        "the fox met a friendly bear."
    )
    grouped = (
        "The big red fox is walking down the lane.\n"
        "\n"
        "At the end of the lane the fox met a friendly bear."
    )

    assert core.group_broken_paragraphs(broken) == grouped
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_group_broken_paragraphs_non_default_settings():
    """Check ``group_broken_paragraphs`` with a caller-supplied paragraph splitter."""
    # Wrapped lines are separated by one blank line here; the two paragraphs
    # are separated by two blank lines (three consecutive newlines).
    broken = (
        "The big red fox\n"
        "\n"
        "is walking down the lane.\n"
        "\n"
        "\n"
        "At the end of the lane\n"
        "\n"
        "the fox met a friendly bear."
    )
    grouped = (
        "The big red fox is walking down the lane.\n"
        "\n"
        "At the end of the lane the fox met a friendly bear."
    )

    # The custom splitter matches three newline-bearing whitespace runs in a row.
    three_newlines = re.compile(r"(\s*\n\s*){3}")

    result = core.group_broken_paragraphs(broken, paragraph_split=three_newlines)
    assert result == grouped
|
|
|
|
|
|
|
|
|
|
|
2023-08-15 11:35:54 -05:00
|
|
|
|
def test_group_broken_paragraphs_with_bullets():
    """Check that ``group_bullet_paragraph`` returns one joined string per bullet."""
    # Two "○" bullets, each wrapped over two lines, separated by one blank line.
    bulleted = (
        "○The big red fox\n"
        "is walking down the lane.\n"
        "\n"
        "○At the end of the lane\n"
        "the fox met a friendly bear."
    )
    grouped = [
        "○The big red fox is walking down the lane. ",
        "○At the end of the lane the fox met a friendly bear.",
    ]

    assert core.group_bullet_paragraph(bulleted) == grouped
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_group_bullet_paragraph_with_e_bullets():
    """Check that a leading "e " marker comes back as a "·" bullet."""
    # Same layout as the "○" case, but each item starts with a literal "e ".
    bulleted = (
        "e The big red fox\n"
        "is walking down the lane.\n"
        "\n"
        "e At the end of the lane\n"
        "the fox met a friendly bear."
    )
    grouped = [
        "· The big red fox is walking down the lane. ",
        "· At the end of the lane the fox met a friendly bear.",
    ]

    assert core.group_bullet_paragraph(bulleted) == grouped
|
|
|
|
|
|
|
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
|
@pytest.mark.parametrize(
    # NOTE(yuming): Tests combined cleaners
    (
        "text",
        "extra_whitespace",
        "dashes",
        "bullets",
        "lowercase",
        "trailing_punctuation",
        "expected",
    ),
    [
        # Each row switches on exactly two of the five cleaners.
        (" Risk-factors ", True, True, False, False, False, "Risk factors"),
        ("● Point! ●●● ", True, False, True, False, False, "Point! ●●●"),
        ("Risk- factors ", True, False, False, True, False, "risk- factors"),
        ("Risk factors: ", True, False, False, False, True, "Risk factors"),
        ("● Risk-factors●●● ", False, True, True, False, False, "Risk factors●●●"),
        ("Risk-factors ", False, True, False, True, False, "risk factors"),
        ("Risk-factors: ", False, True, False, False, True, "Risk factors"),
        ("● Point! ●●● ", False, False, True, True, False, "point! ●●●"),
        ("● Point! ●●●: ", False, False, True, False, True, "Point! ●●●"),
        ("Risk factors: ", False, False, False, True, True, "risk factors"),
    ],
)
def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected):
    """Check ``core.clean`` with pairs of cleaner flags enabled together."""
    flags = {
        "extra_whitespace": extra_whitespace,
        "dashes": dashes,
        "bullets": bullets,
        "trailing_punctuation": trailing_punctuation,
        "lowercase": lowercase,
    }
    cleaned = core.clean(text=text, **flags)
    assert cleaned == expected
|
2023-04-13 15:39:08 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_bytes_string_to_string():
    """Check that a str holding raw UTF-8 byte values is decoded to real text."""
    # Each character below carries one byte of the UTF-8 encoding of the expected text.
    byte_like = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb"
    decoded = core.bytes_string_to_string(byte_like, "utf-8")
    assert decoded == "每日新闻"
|