2022-06-29 14:35:19 -04:00
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
import unstructured.nlp.partition as partition
|
|
|
|
|
|
|
|
|
|
from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text, expected",
    [
        # Upper-cased "ITEM" heading.
        (
            "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
            "ISSUER PURCHASES OF EQUITY SECURITIES",
            False,
        ),
        # Same heading in title case.
        (
            "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
            "Issuer Purchases of Equity Securities",
            False,
        ),
        # Same vocabulary written as an ordinary sentence.
        (
            "There is a market for registrant’s common equity, related stockholder matters and "
            "issuer purchases of equity securities.",
            True,
        ),
    ],
)
def test_headings_are_not_narrative_text(text, expected):
    """Check that heading-style strings are rejected as narrative text.

    The two "Item 5(a)" headings are expected to yield ``False`` from
    ``partition.is_possible_narrative_text``; the sentence built from the same
    words is expected to yield ``True``.
    """
    verdict = partition.is_possible_narrative_text(text)
    assert verdict == expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text, expected",
    [
        ("Ask the teacher for an apple.", True),
        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
        ("7", False),  # Fails because it is numeric
        ("intellectual property", False),  # Fails because it does not contain a verb
        ("", False),  # Fails because it is empty
    ],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
    """Exercise ``is_possible_narrative_text`` with a 0.3 capitalization threshold.

    The tokenizers and the POS tagger referenced by ``partition`` are replaced
    with the ``mock_nltk`` stand-ins before the call.
    """
    # Patch in the same order as the individual setattr calls would run.
    stubs = (
        ("word_tokenize", mock_word_tokenize),
        ("pos_tag", mock_pos_tag),
        ("sent_tokenize", mock_sent_tokenize),
    )
    for attr_name, stub in stubs:
        monkeypatch.setattr(partition, attr_name, stub)

    is_narrative = partition.is_possible_narrative_text(text, cap_threshold=0.3)
    assert is_narrative is expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text, expected",
    [
        # Expected to be accepted as a title (the previous comment claimed this
        # case "fails", which contradicted the expected value of True).
        ("Intellectual Property", True),
        (
            "Ask the teacher for an apple. You might a gold star.",
            False,
        ),  # Too many sentences
        ("7", False),  # Fails because it is numeric
        ("", False),  # Fails because it is empty
        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
    ],
)
def test_is_possible_title(text, expected, monkeypatch):
    """Exercise ``partition.is_possible_title`` on title and non-title strings.

    The sentence and word tokenizers referenced by ``partition`` are replaced
    with the ``mock_nltk`` stand-ins before the call.
    """
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)

    # Named for what it holds: the title verdict, not a verb check.
    is_title = partition.is_possible_title(text)
    assert is_title is expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text, expected",
    [
        ("• This is a fine point!", True),
        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
        ("‣ This is a fine point!", True),
        ("⁃ This is a fine point!", True),
        ("⁌ This is a fine point!", True),
        ("⁍ This is a fine point!", True),
        ("∙ This is a fine point!", True),
        ("○ This is a fine point!", True),
        ("● This is a fine point!", True),
        ("◘ This is a fine point!", True),
        ("◦ This is a fine point!", True),
        ("☙ This is a fine point!", True),
        ("❥ This is a fine point!", True),
        ("❧ This is a fine point!", True),
        ("⦾ This is a fine point!", True),
        ("⦿ This is a fine point!", True),
        # Written as an escape so the bullet stays visible in editors and diffs.
        # NOTE(review): the original literal showed no visible glyph before the
        # text; it is assumed to be the private-use bullet U+F0B7 -- confirm
        # against the bullet set used by is_bulleted_text.
        ("\uf0b7 This is a fine point!", True),
        ("* This is a fine point!", True),
        ("This is NOT a fine point!", False),  # No bullet point
        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
    ],
)
def test_is_bulletized_text(text, expected):
    """Check ``partition.is_bulleted_text`` against a range of bullet glyphs.

    Strings that start with a bullet (optionally preceded by a space) are
    expected to yield ``True``; strings with no bullet, or with bullets only
    later in the text, are expected to yield ``False``.
    """
    assert partition.is_bulleted_text(text) is expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text, expected",
    [
        ("Ask the teacher for an apple", True),
        ("Intellectual property", False),
    ],
)
def test_contains_verb(text, expected, monkeypatch):
    """Check ``partition.contains_verb`` using the mocked tokenizer and tagger."""
    for attr_name, stub in (
        ("word_tokenize", mock_word_tokenize),
        ("pos_tag", mock_pos_tag),
    ):
        monkeypatch.setattr(partition, attr_name, stub)

    verb_found = partition.contains_verb(text)
    assert verb_found is expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text, expected",
    [
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
    ],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
    """Check ``partition.exceeds_cap_ratio`` with ``threshold=0.3``.

    Both tokenizers referenced by ``partition`` are swapped for the
    ``mock_nltk`` stand-ins before the call.
    """
    for attr_name, stub in (
        ("word_tokenize", mock_word_tokenize),
        ("sent_tokenize", mock_sent_tokenize),
    ):
        monkeypatch.setattr(partition, attr_name, stub)

    over_ratio = partition.exceeds_cap_ratio(text, threshold=0.3)
    assert over_ratio is expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_sentence_count(monkeypatch):
    """A two-sentence string is counted as two with the mocked sentence tokenizer."""
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)

    counted = partition.sentence_count("Hi my name is Matt. I work with Crag.")
    assert counted == 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_item_titles():
    """An "ITEM 1(A)." heading must count as fewer than two sentences."""
    heading = "ITEM 1(A). THIS IS A TITLE"
    # The second positional argument is passed as 3; presumably a minimum
    # sentence length -- confirm against sentence_count's signature.
    counted = partition.sentence_count(heading, 3)
    assert counted < 2
|
2022-11-16 17:48:30 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Partition the sample PDF and check the first returned element.

    The first element is expected to have type ``"Title"`` and to carry the
    paper's title as its text.
    """
    elements = partition.partition_pdf(filename)

    expected_title = (
        "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    assert elements[0]["type"] == "Title"
    assert elements[0]["text"] == expected_title
|