unstructured/test_unstructured/nlp/test_partition.py
Sebastian Laverde Alfonso baa15d0098
feat: new partitioning brick that calls the document image analysis API (#68)
* docs: add new feature to the CHANGELOG.md, bump the version, update __version__.py

* feat: new partition to call the document image analysis API

* fix: remove duplicated dependency on partition.py

* fix: linting error due to line-length > 100

* test: add test to call partition_pdf brick

* chore: new short example-doc pdf for speed up in test X8

* fix: add missing return statement to _read to pass check

* feat: new partitioning brick to call doc parse API

* docs: version update fix in CHANGELOG

* refactor: no nested ifs

* docs: documentation for new brick partition_pdf

* refactor: made tidy

* docs: minor doc refactor

Co-authored-by: Sebastian Laverde <sebastian@unstructured.io>
2022-11-16 17:48:30 +01:00

145 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytest
import unstructured.nlp.partition as partition
from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
@pytest.mark.parametrize(
    "text, expected",
    [
        # Heading written entirely in capitals.
        (
            "ITEM 5(a).: MARKET FOR REGISTRANTS COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
            "ISSUER PURCHASES OF EQUITY SECURITIES",
            False,
        ),
        # Same heading in title case.
        (
            "Item 5(a).: Market For Registrants Common Equity, Related Stockholder Matters and "
            "Issuer Purchases of Equity Securities",
            False,
        ),
        # Same words phrased as an ordinary sentence.
        (
            "There is a market for registrants common equity, related stockholder matters and "
            "issuer purchases of equity securities.",
            True,
        ),
    ],
)
def test_headings_are_not_narrative_text(text, expected):
    """Check that heading-style text is rejected as narrative text.

    The two heading variants are expected to yield ``False`` and the plain
    sentence ``True``. Nothing on ``partition`` is monkeypatched in this test,
    and ``is_possible_narrative_text`` is called with its default arguments.
    """
    # Identity comparison, as in the other tests of this module, so that a value
    # that is merely truthy/falsy (e.g. 1 or 0) cannot satisfy the assertion.
    assert partition.is_possible_narrative_text(text) is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Ask the teacher for an apple.", True),
        ("Ask Me About Intellectual Property", False),  # Over the capitalization threshold
        ("7", False),  # Numeric text is rejected
        ("intellectual property", False),  # Rejected: no verb in the text
        ("", False),  # Empty text is rejected
    ],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
    """Exercise ``is_possible_narrative_text`` with the NLTK helpers stubbed out.

    The tokenizers and the POS tagger on ``partition`` are replaced by the
    ``mock_nltk`` stand-ins, and the check is run with ``cap_threshold=0.3``.
    """
    stubs = {
        "word_tokenize": mock_word_tokenize,
        "pos_tag": mock_pos_tag,
        "sent_tokenize": mock_sent_tokenize,
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(partition, attr_name, stub)

    is_narrative = partition.is_possible_narrative_text(text, cap_threshold=0.3)
    assert is_narrative is expected
@pytest.mark.parametrize(
    "text, expected",
    [
        ("Intellectual Property", True),  # Short single phrase: expected to pass as a title
        (
            "Ask the teacher for an apple. You might a gold star.",
            False,
        ),  # Too many sentences
        ("7", False),  # Fails because it is numeric
        ("", False),  # Fails because it is empty
        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
    ],
)
def test_is_possible_title(text, expected, monkeypatch):
    """Exercise ``is_possible_title`` with the NLTK tokenizers stubbed out.

    ``sent_tokenize`` and ``word_tokenize`` on ``partition`` are replaced by
    the ``mock_nltk`` stand-ins before the check is run with default arguments.
    """
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
    # Named for what it holds: the outcome of the title check, not a verb check.
    is_title = partition.is_possible_title(text)
    assert is_title is expected
# Each case pairs a candidate string with whether it is expected to be
# recognized as bulleted text. Most cases differ only in the leading glyph.
@pytest.mark.parametrize(
    "text, expected",
    [
        ("• This is a fine point!", True),
        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
        ("‣ This is a fine point!", True),
        # NOTE(review): no bullet glyph is visible in the next literal; it may hold a
        # non-printing or private-use character that did not survive rendering —
        # confirm against the repository copy before editing this line.
        (" This is a fine point!", True),
        ("⁌ This is a fine point!", True),
        ("⁍ This is a fine point!", True),
        ("∙ This is a fine point!", True),
        ("○ This is a fine point!", True),
        ("● This is a fine point!", True),
        ("◘ This is a fine point!", True),
        ("◦ This is a fine point!", True),
        ("☙ This is a fine point!", True),
        ("❥ This is a fine point!", True),
        ("❧ This is a fine point!", True),
        ("⦾ This is a fine point!", True),
        ("⦿ This is a fine point!", True),
        # NOTE(review): same concern as above — this literal looks identical to the
        # earlier glyph-less case; presumably a different, invisible bullet — confirm.
        (" This is a fine point!", True),
        ("* This is a fine point!", True),
        ("This is NOT a fine point!", False),  # No bullet point
        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
    ],
)
def test_is_bulletized_text(text, expected):
    # Runs partition.is_bulleted_text on each case; nothing is monkeypatched here.
    assert partition.is_bulleted_text(text) is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Ask the teacher for an apple", True),
        ("Intellectual property", False),
    ],
)
def test_contains_verb(text, expected, monkeypatch):
    """Check ``contains_verb`` with the word tokenizer and POS tagger stubbed out."""
    replacements = (
        ("word_tokenize", mock_word_tokenize),
        ("pos_tag", mock_pos_tag),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(partition, attr_name, stub)

    assert partition.contains_verb(text) is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
    ],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
    """Check ``exceeds_cap_ratio`` at ``threshold=0.3`` with stubbed tokenizers."""
    for attr_name, stub in (
        ("word_tokenize", mock_word_tokenize),
        ("sent_tokenize", mock_sent_tokenize),
    ):
        monkeypatch.setattr(partition, attr_name, stub)

    over_cap = partition.exceeds_cap_ratio(text, threshold=0.3)
    assert over_cap is expected
def test_sentence_count(monkeypatch):
    """Check that a two-sentence string is counted as two sentences.

    ``sent_tokenize`` on ``partition`` is replaced by the ``mock_nltk`` stub.
    """
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
    counted = partition.sentence_count("Hi my name is Matt. I work with Crag.")
    assert counted == 2
def test_item_titles():
    """Check that an item heading is counted as fewer than two sentences.

    ``sentence_count`` receives ``3`` as its second positional argument
    (presumably a minimum sentence length — confirm against its signature).
    Nothing is monkeypatched in this test.
    """
    heading = "ITEM 1(A). THIS IS A TITLE"
    counted = partition.sentence_count(heading, 3)
    assert counted < 2
def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Check the first element returned by ``partition_pdf`` for the sample PDF.

    The first item of the response must have type ``"Title"`` and carry the
    paper's title as its text. NOTE(review): ``filename`` is a relative path,
    so this presumably relies on the working directory — confirm how the
    suite is invoked.
    """
    expected_title = (
        "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    elements = partition.partition_pdf(filename)
    first_element = elements[0]
    assert first_element["type"] == "Title"
    assert first_element["text"] == expected_title