2022-11-10 16:35:37 -05:00
|
|
|
import pytest
|
|
|
|
|
|
|
|
import unstructured.cleaners.extract as extract
|
|
|
|
|
|
|
|
|
|
|
|
def test_get_indexed_match_raises_with_bad_index():
    """A negative index passed to _get_indexed_match must raise ValueError."""
    haystack = "BLAH BLAH BLAH"
    needle = "BLAH"
    negative_index = -1
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, needle, negative_index)
|
|
|
|
|
|
|
|
|
|
|
|
def test_get_indexed_match_raises_with_index_too_high():
    """Index 4 on a text with three "BLAH" occurrences must raise ValueError."""
    haystack = "BLAH BLAH BLAH"
    needle = "BLAH"
    out_of_range_index = 4
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, needle, out_of_range_index)
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_text_before():
    """extract_text_before with pattern "BLAH" and index 1 yields "Teacher: BLAH"."""
    dialogue = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    leading_part = extract.extract_text_before(dialogue, "BLAH", 1)
    assert leading_part == "Teacher: BLAH"
|
|
|
|
|
|
|
|
|
|
|
|
def test_extract_text_after():
    """extract_text_after with pattern "BLAH;" and index 0 yields the student part."""
    dialogue = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    trailing_part = extract.extract_text_after(dialogue, "BLAH;", 0)
    assert trailing_part == "Student: BLAH BLAH BLAH!"
|
2023-01-03 13:31:05 -05:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text, expected",
    [
        # Each pair is (input text, value extract_us_phone_number should return).
        ("215-867-5309", "215-867-5309"),
        ("Phone Number: +1 215.867.5309", "+1 215.867.5309"),
        # Input without a phone number is expected to give an empty string.
        ("Phone Number: Just Kidding", ""),
    ],
)
def test_extract_us_phone_number(text, expected):
    """Compare extract_us_phone_number's result with the expected value per case."""
    assert extract.extract_us_phone_number(text) == expected
|
2023-01-05 17:06:26 +01:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text, expected",
    [
        # Each pair is (input text, 3-tuple extract_ordered_bullets should return).
        # A tuple of three None values is the expected result when no bullet
        # is to be extracted from the text.
        ("1. Introduction:", ("1", None, None)),
        ("a. Introduction:", ("a", None, None)),
        ("20.3 Morse code ●●●", ("20", "3", None)),
        ("5.3.1 Convolutional Networks ", ("5", "3", "1")),
        ("D.b.C Recurrent Neural Networks", ("D", "b", "C")),
        ("2.b.1 Recurrent Neural Networks", ("2", "b", "1")),
        ("eins. Neural Networks", (None, None, None)),
        ("bb.c Feed Forward Neural Networks", ("bb", "c", None)),
        ("aaa.ccc Metrics", (None, None, None)),
        (" version = 3.8", (None, None, None)),
        ("1 2. 3 4", (None, None, None)),
        ("1) 2. 3 4", (None, None, None)),
        ("2,3. Morse code 3. ●●●", (None, None, None)),
        ("1..2.3 four", (None, None, None)),
        ("Fig. 2: The relationship", (None, None, None)),
        ("23 is everywhere", (None, None, None)),
    ],
)
def test_extract_ordered_bullets(text, expected):
    """Compare extract_ordered_bullets' result with the expected tuple per case."""
    bullet_parts = extract.extract_ordered_bullets(text=text)
    assert bullet_parts == expected
|