Tom Aarsen 5eb1466acc
Resolve various style issues to improve overall code quality (#282)
* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so also required me to fix some issues that are not auto-fixable. I suppressed two of them with a noqa: SIM115; the one in __init__ in particular may need some attention. That said, that refactor is out of scope for this PR.
2023-02-27 11:30:54 -05:00

103 lines
3.1 KiB
Python

import datetime
import pytest
from unstructured.cleaners import extract
# Email-metadata sample shared by the IP address, IP address name, MAPI id and
# datetime extraction tests in this module.
# NOTE: the backslash that ends the second line of the literal is a line
# continuation, so the "n" starting the third line is joined directly onto
# "mapi id" with no newline in between.
# NOTE(review): that split looks like it may have been meant as an escaped
# newline -- confirm before reformatting this literal.
EMAIL_META_DATA_INPUT = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
\n ABC.DEF.local ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
def test_get_indexed_match_raises_with_bad_index():
    """A negative match index must be rejected with a ValueError."""
    haystack, pattern = "BLAH BLAH BLAH", "BLAH"
    negative_index = -1
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, pattern, negative_index)
def test_get_indexed_match_raises_with_index_too_high():
    """An index of 4 against a text with three "BLAH"s must raise ValueError."""
    haystack, pattern = "BLAH BLAH BLAH", "BLAH"
    out_of_range_index = 4
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, pattern, out_of_range_index)
def test_extract_text_before():
    """Check the text returned for pattern "BLAH" at match index 1."""
    sample = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    leading_text = extract.extract_text_before(sample, "BLAH", 1)
    assert leading_text == "Teacher: BLAH"
def test_extract_text_after():
    """Check the text returned for pattern "BLAH;" at match index 0."""
    sample = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    trailing_text = extract.extract_text_after(sample, "BLAH;", 0)
    assert trailing_text == "Student: BLAH BLAH BLAH!"
def test_extract_email_address():
    """The bracketed address is expected back lower-cased, as a one-item list."""
    sender = "Im Rabn <Im.Rabn@npf.gov.nr>"
    addresses = extract.extract_email_address(sender)
    assert addresses == ["im.rabn@npf.gov.nr"]
def test_extract_ip_address():
    """Both bracketed addresses in the shared sample are expected, in order."""
    found = extract.extract_ip_address(EMAIL_META_DATA_INPUT)
    first_address = "ba23::58b5:2236:45g2:88h2"
    second_address = "ba23::58b5:2236:45g2:88h2%25"
    assert found == [first_address, second_address]
def test_extract_ip_address_name():
    """The host name is expected once per occurrence in the shared sample."""
    names = extract.extract_ip_address_name(EMAIL_META_DATA_INPUT)
    host = "ABC.DEF.local"
    assert names == [host, host]
def test_extract_mapi_id():
    """The single MAPI id in the shared sample is returned as a one-item list."""
    mapi_ids = extract.extract_mapi_id(EMAIL_META_DATA_INPUT)
    assert mapi_ids == ["32.88.5467.123"]
def test_extract_datetimetz():
    """The timestamp in the shared sample is expected as an aware datetime."""
    # 43200 seconds is the +1200 offset that appears in the sample text.
    sample_offset = datetime.timezone(datetime.timedelta(seconds=43200))
    expected_moment = datetime.datetime(2021, 3, 26, 11, 4, 9, tzinfo=sample_offset)
    assert extract.extract_datetimetz(EMAIL_META_DATA_INPUT) == expected_moment
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Bare number: expected back unchanged.
        ("215-867-5309", "215-867-5309"),
        # Number behind a label: only the number is expected.
        ("Phone Number: +1 215.867.5309", "+1 215.867.5309"),
        # No number at all: an empty string is expected.
        ("Phone Number: Just Kidding", ""),
    ],
)
def test_extract_us_phone_number(text, expected):
    """Compare the phone number extracted from ``text`` with ``expected``."""
    assert extract.extract_us_phone_number(text) == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Inputs for which at least one bullet component is expected.
        ("1. Introduction:", ("1", None, None)),
        ("a. Introduction:", ("a", None, None)),
        ("20.3 Morse code ●●●", ("20", "3", None)),
        ("5.3.1 Convolutional Networks ", ("5", "3", "1")),
        ("D.b.C Recurrent Neural Networks", ("D", "b", "C")),
        ("2.b.1 Recurrent Neural Networks", ("2", "b", "1")),
        # From here on, every case but "bb.c" expects all three slots as None.
        ("eins. Neural Networks", (None, None, None)),
        ("bb.c Feed Forward Neural Networks", ("bb", "c", None)),
        ("aaa.ccc Metrics", (None, None, None)),
        (" version = 3.8", (None, None, None)),
        ("1 2. 3 4", (None, None, None)),
        ("1) 2. 3 4", (None, None, None)),
        ("2,3. Morse code 3. ●●●", (None, None, None)),
        ("1..2.3 four", (None, None, None)),
        ("Fig. 2: The relationship", (None, None, None)),
        ("23 is everywhere", (None, None, None)),
    ],
)
def test_extract_ordered_bullets(text, expected):
    """Compare the three-part bullet tuple extracted from ``text`` with ``expected``."""
    bullets = extract.extract_ordered_bullets(text=text)
    assert bullets == expected