unstructured/test_unstructured/nlp/test_partition.py

145 lines
5.1 KiB
Python
Raw Normal View History

2022-06-29 14:35:19 -04:00
import pytest
import unstructured.nlp.partition as partition
from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "ITEM 5(a).: MARKET FOR REGISTRANTS COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
            "ISSUER PURCHASES OF EQUITY SECURITIES",
            False,
        ),  # All-caps heading
        (
            "Item 5(a).: Market For Registrants Common Equity, Related Stockholder Matters and "
            "Issuer Purchases of Equity Securities",
            False,
        ),  # Title-case heading
        (
            "There is a market for registrants common equity, related stockholder matters and "
            "issuer purchases of equity securities.",
            True,
        ),  # Same words written as an ordinary sentence
    ],
)
def test_headings_are_not_narrative_text(text, expected):
    """Check that heading-style strings are rejected and a plain sentence is accepted.

    Nothing is monkeypatched in this test, so ``partition`` is exercised with
    whatever tokenizers it is normally wired to.
    """
    is_narrative = partition.is_possible_narrative_text(text)
    assert is_narrative == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Ask the teacher for an apple.", True),
        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
        ("7", False),  # Fails because it is numeric
        ("intellectual property", False),  # Fails because it does not contain a verb
        ("", False),  # Fails because it is empty
    ],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
    """Check ``is_possible_narrative_text`` against the ``mock_nltk`` stand-ins.

    The three tokenizer/tagger names on ``partition`` are replaced for the
    duration of the test, and the function is called with ``cap_threshold=0.3``.
    """
    stubs = {
        "word_tokenize": mock_word_tokenize,
        "pos_tag": mock_pos_tag,
        "sent_tokenize": mock_sent_tokenize,
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(partition, attr_name, stub)

    is_narrative = partition.is_possible_narrative_text(text, cap_threshold=0.3)
    assert is_narrative is expected
@pytest.mark.parametrize(
    "text, expected",
    [
        # Short capitalized phrase: expected to be accepted as a title.
        ("Intellectual Property", True),
        (
            "Ask the teacher for an apple. You might a gold star.",
            False,
        ),  # Too many sentences
        ("7", False),  # Fails because it is numeric
        ("", False),  # Fails because it is empty
        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
    ],
)
def test_is_possible_title(text, expected, monkeypatch):
    """Check ``is_possible_title`` with the ``mock_nltk`` tokenizers patched in.

    ``sent_tokenize`` and ``word_tokenize`` on ``partition`` are replaced for
    the duration of the test; the result must be exactly the expected bool.
    """
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
    # Named for what it holds: the title verdict, not a verb check.
    is_title = partition.is_possible_title(text)
    assert is_title is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("• This is a fine point!", True),
        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
        ("‣ This is a fine point!", True),
        # NOTE(review): no bullet glyph is visible on the next row in this view;
        # confirm the literal still starts with the intended bullet character.
        (" This is a fine point!", True),
        ("⁌ This is a fine point!", True),
        ("⁍ This is a fine point!", True),
        ("∙ This is a fine point!", True),
        ("○ This is a fine point!", True),
        ("● This is a fine point!", True),
        ("◘ This is a fine point!", True),
        ("◦ This is a fine point!", True),
        ("☙ This is a fine point!", True),
        ("❥ This is a fine point!", True),
        ("❧ This is a fine point!", True),
        ("⦾ This is a fine point!", True),
        ("⦿ This is a fine point!", True),
        # NOTE(review): same as above, the leading glyph on the next row is not
        # visible here; confirm it survived copy/paste.
        (" This is a fine point!", True),
        ("* This is a fine point!", True),
        ("This is NOT a fine point!", False),  # No bullet point
        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
    ],
)
def test_is_bulletized_text(text, expected):
    """Check ``is_bulleted_text`` for each bullet glyph in the table.

    The rows expect ``True`` when a bullet leads the string (optionally after
    a space) and ``False`` when there is no bullet or it appears only later.
    """
    is_bulleted = partition.is_bulleted_text(text)
    assert is_bulleted is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Ask the teacher for an apple", True),
        ("Intellectual property", False),
    ],
)
def test_contains_verb(text, expected, monkeypatch):
    """Check ``contains_verb`` using the ``mock_nltk`` tokenizer and tagger."""
    stubs = {
        "word_tokenize": mock_word_tokenize,
        "pos_tag": mock_pos_tag,
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(partition, attr_name, stub)

    assert partition.contains_verb(text) is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        # NOTE(review): presumably rejected because it is more than one
        # sentence, as the text itself suggests; confirm against the implementation.
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
    ],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
    """Check ``exceeds_cap_ratio`` at ``threshold=0.3`` with mocked tokenizers."""
    stubs = {
        "word_tokenize": mock_word_tokenize,
        "sent_tokenize": mock_sent_tokenize,
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(partition, attr_name, stub)

    over_ratio = partition.exceeds_cap_ratio(text, threshold=0.3)
    assert over_ratio is expected
def test_sentence_count(monkeypatch):
    """Check that a two-sentence string is counted as two sentences.

    ``sent_tokenize`` on ``partition`` is replaced with the ``mock_nltk``
    stand-in for the duration of the test.
    """
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)

    two_sentences = "Hi my name is Matt. I work with Crag."
    counted = partition.sentence_count(two_sentences)
    assert counted == 2
def test_item_titles():
    """Check that an "ITEM ..." heading is counted as fewer than two sentences.

    Nothing is monkeypatched here, so ``partition`` uses its normal tokenizer.
    """
    item_heading = "ITEM 1(A). THIS IS A TITLE"
    # NOTE(review): the positional 3 is presumably a minimum sentence length;
    # confirm against ``sentence_count``'s signature.
    counted = partition.sentence_count(item_heading, 3)
    assert counted < 2
def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Check that the first element from the sample PDF is its title.

    ``filename`` is a relative path, so the test depends on the working
    directory it is run from.
    """
    elements = partition.partition_pdf(filename)
    first_element = elements[0]
    expected_title = (
        "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    assert first_element["type"] == "Title"
    assert first_element["text"] == expected_title