unstructured/test_unstructured/nlp/test_partition.py
2022-09-26 14:55:20 -07:00

136 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytest
import unstructured.nlp.partition as partition
from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # All-caps item heading.
        (
            "ITEM 5(a).: MARKET FOR REGISTRANTS COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
            "ISSUER PURCHASES OF EQUITY SECURITIES",
            False,
        ),
        # The same heading in title case.
        (
            "Item 5(a).: Market For Registrants Common Equity, Related Stockholder Matters and "
            "Issuer Purchases of Equity Securities",
            False,
        ),
        # The same words written as an ordinary sentence.
        (
            "There is a market for registrants common equity, related stockholder matters and "
            "issuer purchases of equity securities.",
            True,
        ),
    ],
)
def test_headings_are_not_narrative_text(text, expected):
    """Check that heading-style text is rejected as narrative text.

    Calls ``partition.is_possible_narrative_text`` with its default arguments
    and without any tokenizer mocks, then compares the result to ``expected``.
    """
    is_narrative = partition.is_possible_narrative_text(text)
    assert is_narrative == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Ask the teacher for an apple.", True),
        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
        ("7", False),  # Fails because it is numeric
        ("intellectual property", False),  # Fails because it does not contain a verb
        ("", False),  # Fails because it is empty
    ],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
    """Check ``is_possible_narrative_text`` against the ``mock_nltk`` stand-ins.

    The tokenizer and tagger names on ``partition`` are replaced with the
    mocks before the call, and the function is run with ``cap_threshold=0.3``.
    """
    stubs = (
        ("word_tokenize", mock_word_tokenize),
        ("pos_tag", mock_pos_tag),
        ("sent_tokenize", mock_sent_tokenize),
    )
    for attr_name, stub in stubs:
        monkeypatch.setattr(partition, attr_name, stub)

    is_narrative = partition.is_possible_narrative_text(text, cap_threshold=0.3)
    assert is_narrative is expected
@pytest.mark.parametrize(
    "text, expected",
    [
        # Short capitalized phrase: this row expects the title check to pass.
        ("Intellectual Property", True),
        # NOTE(review): "You might a gold star." reads like a typo, but the input is
        # kept verbatim because the mock tokenizers may depend on the exact string.
        (
            "Ask the teacher for an apple. You might a gold star.",
            False,
        ),  # Too many sentences
        ("7", False),  # Fails because it is numeric
        ("", False),  # Fails because it is empty
        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
    ],
)
def test_is_possible_title(text, expected, monkeypatch):
    """Check ``partition.is_possible_title`` against the ``mock_nltk`` tokenizers.

    ``sent_tokenize`` and ``word_tokenize`` on ``partition`` are replaced with
    the mocks, then the function's result must be the exact boolean
    ``expected`` (identity comparison, so a merely truthy value fails).
    """
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
    is_title = partition.is_possible_title(text)
    assert is_title is expected
@pytest.mark.parametrize(
    "text, expected",
    [
        ("• This is a fine point!", True),
        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
        ("‣ This is a fine point!", True),
        # NOTE(review): this row had no visible bullet glyph in the reviewed copy of
        # the file (flagged as containing ambiguous Unicode). Restored as U+2043
        # HYPHEN BULLET, which fits the ascending code-point order of the
        # neighbouring rows. Confirm against the bullet list in the partition module.
        ("\u2043 This is a fine point!", True),
        ("⁌ This is a fine point!", True),
        ("⁍ This is a fine point!", True),
        ("∙ This is a fine point!", True),
        ("○ This is a fine point!", True),
        ("● This is a fine point!", True),
        ("◘ This is a fine point!", True),
        ("◦ This is a fine point!", True),
        ("☙ This is a fine point!", True),
        ("❥ This is a fine point!", True),
        ("❧ This is a fine point!", True),
        ("⦾ This is a fine point!", True),
        ("⦿ This is a fine point!", True),
        # NOTE(review): this row's glyph was also missing. It is presumably the
        # private-use bullet U+F0B7, written as an escape so it stays visible in
        # review. Confirm against the bullet list in the partition module.
        ("\uf0b7 This is a fine point!", True),
        ("* This is a fine point!", True),
        ("This is NOT a fine point!", False),  # No bullet point
        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
    ],
)
def test_is_bulletized_text(text, expected):
    """Check that ``partition.is_bulleted_text`` detects a leading bullet glyph.

    Positive rows start with a bullet character (one of them after a leading
    space). Negative rows have no bullet, or only have bullets later in the
    string. The result must be the exact boolean ``expected``.
    """
    assert partition.is_bulleted_text(text) is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Ask the teacher for an apple", True),
        ("Intellectual property", False),
    ],
)
def test_contains_verb(text, expected, monkeypatch):
    """Check ``partition.contains_verb`` using the mock tokenizer and tagger.

    ``word_tokenize`` and ``pos_tag`` on ``partition`` are swapped for the
    ``mock_nltk`` versions before the call.
    """
    for attr_name, stub in (
        ("word_tokenize", mock_word_tokenize),
        ("pos_tag", mock_pos_tag),
    ):
        monkeypatch.setattr(partition, attr_name, stub)

    verb_found = partition.contains_verb(text)
    assert verb_found is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
    ],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
    """Check ``partition.exceeds_cap_ratio`` at ``threshold=0.3``.

    The word and sentence tokenizers on ``partition`` are replaced with the
    ``mock_nltk`` versions. The title-cased phrase is expected to exceed the
    ratio, while the sentence-case text and the two-sentence all-caps text
    are not.
    """
    for attr_name, stub in (
        ("word_tokenize", mock_word_tokenize),
        ("sent_tokenize", mock_sent_tokenize),
    ):
        monkeypatch.setattr(partition, attr_name, stub)

    over_ratio = partition.exceeds_cap_ratio(text, threshold=0.3)
    assert over_ratio is expected
def test_sentence_count(monkeypatch):
    """Check that ``partition.sentence_count`` reports two sentences.

    ``sent_tokenize`` on ``partition`` is replaced with the ``mock_nltk``
    version before counting.
    """
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
    n_sentences = partition.sentence_count("Hi my name is Matt. I work with Crag.")
    assert n_sentences == 2
def test_item_titles():
    """Check that an item heading counts as fewer than two sentences.

    No tokenizer is patched here, so ``partition.sentence_count`` runs with
    whatever ``sent_tokenize`` the module normally uses.
    """
    heading = "ITEM 1(A). THIS IS A TITLE"
    # NOTE(review): the second positional argument (3) is presumably a
    # minimum-length cutoff for a fragment to count as a sentence. Confirm
    # against the signature of partition.sentence_count.
    n_sentences = partition.sentence_count(heading, 3)
    assert n_sentences < 2