62 lines
2.1 KiB
Python
Raw Normal View History

import pytest
import unstructured.cleaners.extract as extract
def test_get_indexed_match_raises_with_bad_index():
    """A negative match index passed to ``_get_indexed_match`` raises ``ValueError``."""
    with pytest.raises(ValueError):
        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", -1)
def test_get_indexed_match_raises_with_index_too_high():
    """An index beyond the available matches raises ``ValueError``.

    The sample text contains "BLAH" only three times, so index 4 cannot
    refer to any match.
    """
    with pytest.raises(ValueError):
        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", 4)
def test_extract_text_before():
    """``extract_text_before`` returns the text preceding the indexed match.

    With pattern "BLAH" and index 1 (the second occurrence), only
    "Teacher: BLAH" is expected back.
    """
    text = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    assert extract.extract_text_before(text, "BLAH", 1) == "Teacher: BLAH"
def test_extract_text_after():
    """``extract_text_after`` returns the text following the indexed match.

    With pattern "BLAH;" and index 0 (the first occurrence), the expected
    result is the remainder of the sentence without the separating space.
    """
    text = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    assert extract.extract_text_after(text, "BLAH;", 0) == "Student: BLAH BLAH BLAH!"
@pytest.mark.parametrize(
    "text, expected",
    [
        ("215-867-5309", "215-867-5309"),
        ("Phone Number: +1 215.867.5309", "+1 215.867.5309"),
        # No phone number in the text: an empty string is expected.
        ("Phone Number: Just Kidding", ""),
    ],
)
def test_extract_us_phone_number(text, expected):
    """``extract_us_phone_number`` returns the phone number found in ``text``.

    Each case pairs an input string with the exact string the extractor is
    expected to return.
    """
    phone_number = extract.extract_us_phone_number(text)
    assert phone_number == expected
@pytest.mark.parametrize(
    "text, expected",
    [
        # Inputs whose leading bullet is expected to be split into up to
        # three components; unused positions are None.
        ("1. Introduction:", ("1", None, None)),
        ("a. Introduction:", ("a", None, None)),
        ("20.3 Morse code ●●●", ("20", "3", None)),
        ("5.3.1 Convolutional Networks ", ("5", "3", "1")),
        ("D.b.C Recurrent Neural Networks", ("D", "b", "C")),
        ("2.b.1 Recurrent Neural Networks", ("2", "b", "1")),
        ("eins. Neural Networks", (None, None, None)),
        ("bb.c Feed Forward Neural Networks", ("bb", "c", None)),
        # Inputs that are expected to yield no bullet at all.
        ("aaa.ccc Metrics", (None, None, None)),
        (" version = 3.8", (None, None, None)),
        ("1 2. 3 4", (None, None, None)),
        ("1) 2. 3 4", (None, None, None)),
        ("2,3. Morse code 3. ●●●", (None, None, None)),
        ("1..2.3 four", (None, None, None)),
        ("Fig. 2: The relationship", (None, None, None)),
        ("23 is everywhere", (None, None, None)),
    ],
)
def test_extract_ordered_bullets(text, expected):
    """``extract_ordered_bullets`` returns the expected 3-tuple for ``text``.

    Each case pairs an input string with the exact tuple the extractor is
    expected to return.
    """
    assert extract.extract_ordered_bullets(text=text) == expected