mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-25 09:57:15 +00:00

* Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
103 lines
3.1 KiB
Python
103 lines
3.1 KiB
Python
import datetime
|
|
|
|
import pytest
|
|
|
|
from unstructured.cleaners import extract
|
|
|
|
# Sample email header text used as shared input by the IP-address, IP-address-name,
# MAPI-id and datetime extraction tests in this module.
# NOTE(review): the backslash that ends the "with mapi id" line is followed by a line
# starting with a bare "n"; this looks like a split "\n" escape -- confirm it is intended.
EMAIL_META_DATA_INPUT = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
|
\n ABC.DEF.local ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
|
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
|
|
|
|
|
def test_get_indexed_match_raises_with_bad_index():
    """Asking for match index -1 must raise a ValueError."""
    haystack, needle = "BLAH BLAH BLAH", "BLAH"
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, needle, -1)
|
|
|
|
|
|
def test_get_indexed_match_raises_with_index_too_high():
    """Asking for match index 4 in a text with three occurrences must raise a ValueError."""
    haystack, needle = "BLAH BLAH BLAH", "BLAH"
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, needle, 4)
|
|
|
|
|
|
def test_extract_text_before():
    """With index 1, the text ahead of the second "BLAH" is returned."""
    sentence = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    leading = extract.extract_text_before(sentence, "BLAH", 1)
    assert leading == "Teacher: BLAH"
|
|
|
|
|
|
def test_extract_text_after():
    """With index 0, the text following the first "BLAH;" is returned."""
    sentence = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    trailing = extract.extract_text_after(sentence, "BLAH;", 0)
    assert trailing == "Student: BLAH BLAH BLAH!"
|
|
|
|
|
|
def test_extract_email_address():
    """The address inside the angle brackets is returned, lowercased, in a list."""
    sender = "Im Rabn <Im.Rabn@npf.gov.nr>"
    addresses = extract.extract_email_address(sender)
    assert addresses == ["im.rabn@npf.gov.nr"]
|
|
|
|
|
|
def test_extract_ip_address():
    """Both bracketed addresses in the sample header are returned, in order."""
    expected_addresses = [
        "ba23::58b5:2236:45g2:88h2",
        "ba23::58b5:2236:45g2:88h2%25",
    ]
    found = extract.extract_ip_address(EMAIL_META_DATA_INPUT)
    assert found == expected_addresses
|
|
|
|
|
|
def test_extract_ip_address_name():
    """The host name preceding each bracketed address is returned, once per address."""
    expected_names = ["ABC.DEF.local", "ABC.DEF.local"]
    found = extract.extract_ip_address_name(EMAIL_META_DATA_INPUT)
    assert found == expected_names
|
|
|
|
|
|
def test_extract_mapi_id():
    """The dotted MAPI id in the sample header is returned as a one-element list."""
    mapi_ids = extract.extract_mapi_id(EMAIL_META_DATA_INPUT)
    assert mapi_ids == ["32.88.5467.123"]
|
|
|
|
|
|
def test_extract_datetimetz():
    """The timestamp in the sample header parses to an aware datetime at +12:00."""
    # 43200 seconds == 12 hours, matching the "+1200" suffix in the input.
    plus_twelve = datetime.timezone(datetime.timedelta(seconds=43200))
    expected = datetime.datetime(2021, 3, 26, 11, 4, 9, tzinfo=plus_twelve)
    assert extract.extract_datetimetz(EMAIL_META_DATA_INPUT) == expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("215-867-5309", "215-867-5309"),
        ("Phone Number: +1 215.867.5309", "+1 215.867.5309"),
        ("Phone Number: Just Kidding", ""),
    ],
)
def test_extract_us_phone_number(text, expected):
    """A phone number in the text is returned as-is; text without one yields ""."""
    assert extract.extract_us_phone_number(text) == expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Leading bullets of up to three dot-separated levels are split out.
        ("1. Introduction:", ("1", None, None)),
        ("a. Introduction:", ("a", None, None)),
        ("20.3 Morse code ●●●", ("20", "3", None)),
        ("5.3.1 Convolutional Networks ", ("5", "3", "1")),
        ("D.b.C Recurrent Neural Networks", ("D", "b", "C")),
        ("2.b.1 Recurrent Neural Networks", ("2", "b", "1")),
        ("eins. Neural Networks", (None, None, None)),
        ("bb.c Feed Forward Neural Networks", ("bb", "c", None)),
        # Inputs below are expected to yield no bullet at all.
        ("aaa.ccc Metrics", (None, None, None)),
        (" version = 3.8", (None, None, None)),
        ("1 2. 3 4", (None, None, None)),
        ("1) 2. 3 4", (None, None, None)),
        ("2,3. Morse code 3. ●●●", (None, None, None)),
        ("1..2.3 four", (None, None, None)),
        ("Fig. 2: The relationship", (None, None, None)),
        ("23 is everywhere", (None, None, None)),
    ],
)
def test_extract_ordered_bullets(text, expected):
    """Each input maps to a 3-tuple of bullet levels, with None for absent levels."""
    bullets = extract.extract_ordered_bullets(text=text)
    assert bullets == expected
|