Sebastian Laverde Alfonso 5a47eb06e9
feat: new bricks for removing and extracting ordered bullets (#128)
* feat: new cleaning brick for ordered bullets

* test: add test for cleaning ordered bullets

* feat: new brick for extracting ordered bullets

* test: add test for extracting ordered bullets

* docs: update CHANGELOG and bump new dev version

* chore: change extract ordered bullets return type to tuple

* chore: made tidy

* chore: regex to split on pattern instead of built-in

* chore: catch ValueError, made tidy and fix incompatible type

* chore: assertion statements in one line of code

* docs: add documentation for new clean and extract bricks to bricks.rst

* docs: refactor CHANGELOG 0.3.5.dev5 to dev6 with new bullets

* docs: update CHANGELOG 0.3.6-dev0 changes and bump version

Co-authored-by: Sebastian Laverde <sebastian@unstructured.io>
2023-01-05 17:06:26 +01:00

62 lines
2.1 KiB
Python

import pytest
import unstructured.cleaners.extract as extract
def test_get_indexed_match_raises_with_bad_index():
    """A negative match index must be rejected with ``ValueError``."""
    haystack, pattern = "BLAH BLAH BLAH", "BLAH"
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, pattern, -1)
def test_get_indexed_match_raises_with_index_too_high():
    """An index beyond the available matches must be rejected with ``ValueError``."""
    # The text contains three occurrences of the pattern; index 4 is requested.
    haystack, pattern = "BLAH BLAH BLAH", "BLAH"
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, pattern, 4)
def test_extract_text_before():
    """Check the text returned for the ``BLAH`` occurrence at index 1."""
    source = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    leading = extract.extract_text_before(source, "BLAH", 1)
    wanted = "Teacher: BLAH"
    assert leading == wanted
def test_extract_text_after():
    """Check the text returned for the ``BLAH;`` occurrence at index 0."""
    source = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    trailing = extract.extract_text_after(source, "BLAH;", 0)
    wanted = "Student: BLAH BLAH BLAH!"
    assert trailing == wanted
@pytest.mark.parametrize(
    "text, expected",
    [
        # Bare number: returned unchanged.
        ("215-867-5309", "215-867-5309"),
        # Number embedded after a label, with country code and dot separators.
        ("Phone Number: +1 215.867.5309", "+1 215.867.5309"),
        # No phone number present: an empty string is expected.
        ("Phone Number: Just Kidding", ""),
    ],
)
def test_extract_us_phone_number(text, expected):
    """``extract_us_phone_number`` should return ``expected`` for each ``text``."""
    assert extract.extract_us_phone_number(text) == expected
@pytest.mark.parametrize(
    "text, expected",
    [
        # Each expected value is a 3-tuple of bullet levels; levels that are
        # absent are None.
        ("1. Introduction:", ("1", None, None)),
        ("a. Introduction:", ("a", None, None)),
        ("20.3 Morse code ●●●", ("20", "3", None)),
        ("5.3.1 Convolutional Networks ", ("5", "3", "1")),
        ("D.b.C Recurrent Neural Networks", ("D", "b", "C")),
        ("2.b.1 Recurrent Neural Networks", ("2", "b", "1")),
        # A four-letter prefix is expected to yield no bullet...
        ("eins. Neural Networks", (None, None, None)),
        # ...while a two-letter prefix is.
        ("bb.c Feed Forward Neural Networks", ("bb", "c", None)),
        # Remaining inputs are all expected to yield no bullet at all.
        ("aaa.ccc Metrics", (None, None, None)),
        (" version = 3.8", (None, None, None)),
        ("1 2. 3 4", (None, None, None)),
        ("1) 2. 3 4", (None, None, None)),
        ("2,3. Morse code 3. ●●●", (None, None, None)),
        ("1..2.3 four", (None, None, None)),
        ("Fig. 2: The relationship", (None, None, None)),
        ("23 is everywhere", (None, None, None)),
    ],
)
def test_extract_ordered_bullets(text, expected):
    """``extract_ordered_bullets`` should return ``expected`` for each ``text``."""
    bullets = extract.extract_ordered_bullets(text=text)
    assert bullets == expected