mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 07:05:20 +00:00

* docs: add new feature to the CHANGELOG.md, bump the version, update __version__.py * feat: new partition to call the document image analysis API * fix: remove duplicated dependency on partition.py * fix: linting error due to line-length > 100 * test: add test to call partition_pdf brick * chore: new short example-doc pdf for speed up in test X8 * fix: add missing return statement to _read to pass check * feat: new partitioning brick to call doc parse API * docs: version update fix in CHANGELOG * refactor: no nested ifs * docs: documentation for new brick partition_pdf * refactor: made tidy * docs: minor doc refactor Co-authored-by: Sebastian Laverde <sebastian@unstructured.io>
145 lines
5.1 KiB
Python
145 lines
5.1 KiB
Python
import pytest
|
||
|
||
import unstructured.nlp.partition as partition
|
||
|
||
from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # All-caps section heading.
        (
            "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
            "ISSUER PURCHASES OF EQUITY SECURITIES",
            False,
        ),
        # The same heading in title case.
        (
            "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
            "Issuer Purchases of Equity Securities",
            False,
        ),
        # The same words written as an ordinary sentence.
        (
            "There is a market for registrant’s common equity, related stockholder matters and "
            "issuer purchases of equity securities.",
            True,
        ),
    ],
)
def test_headings_are_not_narrative_text(text, expected):
    """Heading-style strings are rejected as narrative text; a plain sentence is accepted.

    No tokenizer patching is done here, so the check runs against whatever
    ``partition`` uses by default.
    """
    is_narrative = partition.is_possible_narrative_text(text)
    assert is_narrative == expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Ask the teacher for an apple.", True),
        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
        ("7", False),  # Rejected: purely numeric
        ("intellectual property", False),  # Rejected: does not contain a verb
        ("", False),  # Rejected: empty string
    ],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
    """Check ``is_possible_narrative_text`` with ``cap_threshold=0.3`` on each parametrized case.

    The tokenizer and tagger names on the ``partition`` module are replaced
    with the ``mock_nltk`` stand-ins before the call.
    """
    replacements = (
        ("word_tokenize", mock_word_tokenize),
        ("pos_tag", mock_pos_tag),
        ("sent_tokenize", mock_sent_tokenize),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(partition, attr_name, stub)

    is_narrative = partition.is_possible_narrative_text(text, cap_threshold=0.3)
    assert is_narrative is expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Intellectual Property", True),  # Short capitalized phrase is accepted as a title
        (
            "Ask the teacher for an apple. You might a gold star.",
            False,
        ),  # Too many sentences
        ("7", False),  # Rejected: purely numeric
        ("", False),  # Rejected: empty string
        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
    ],
)
def test_is_possible_title(text, expected, monkeypatch):
    """Check ``is_possible_title`` against each parametrized case.

    The sentence and word tokenizer names on the ``partition`` module are
    replaced with the ``mock_nltk`` stand-ins before the call.
    """
    replacements = (
        ("sent_tokenize", mock_sent_tokenize),
        ("word_tokenize", mock_word_tokenize),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(partition, attr_name, stub)

    is_title = partition.is_possible_title(text)
    assert is_title is expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("• This is a fine point!", True),
        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
        ("‣ This is a fine point!", True),
        ("⁃ This is a fine point!", True),
        ("⁌ This is a fine point!", True),
        ("⁍ This is a fine point!", True),
        ("∙ This is a fine point!", True),
        ("○ This is a fine point!", True),
        ("● This is a fine point!", True),
        ("◘ This is a fine point!", True),
        ("◦ This is a fine point!", True),
        ("☙ This is a fine point!", True),
        ("❥ This is a fine point!", True),
        ("❧ This is a fine point!", True),
        ("⦾ This is a fine point!", True),
        ("⦿ This is a fine point!", True),
        # Private-use bullet glyph, written as an escape because the literal character is
        # invisible in most editors and is easily dropped when the file is copied.
        # NOTE(review): presumably U+F0B7 -- confirm against the bullet pattern list.
        ("\uf0b7 This is a fine point!", True),
        ("* This is a fine point!", True),
        ("This is NOT a fine point!", False),  # No bullet point
        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
    ],
)
def test_is_bulletized_text(text, expected):
    """Check that ``is_bulleted_text`` is True only when the text starts with a bullet marker.

    The cases cover a range of Unicode bullet glyphs, an asterisk, a bullet
    preceded by a space, text with no bullet, and bullets that appear only
    after other text.
    """
    is_bulleted = partition.is_bulleted_text(text)
    assert is_bulleted is expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Ask the teacher for an apple", True),
        ("Intellectual property", False),
    ],
)
def test_contains_verb(text, expected, monkeypatch):
    """Check ``contains_verb`` on a phrase expected to have a verb and one expected not to.

    The word tokenizer and POS tagger names on the ``partition`` module are
    replaced with the ``mock_nltk`` stand-ins before the call.
    """
    replacements = (
        ("word_tokenize", mock_word_tokenize),
        ("pos_tag", mock_pos_tag),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(partition, attr_name, stub)

    verb_found = partition.contains_verb(text)
    assert verb_found is expected
|
||
|
||
|
||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
    ],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
    """Check ``exceeds_cap_ratio`` with ``threshold=0.3`` on each parametrized case.

    The word and sentence tokenizer names on the ``partition`` module are
    replaced with the ``mock_nltk`` stand-ins before the call.
    """
    replacements = (
        ("word_tokenize", mock_word_tokenize),
        ("sent_tokenize", mock_sent_tokenize),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(partition, attr_name, stub)

    over_threshold = partition.exceeds_cap_ratio(text, threshold=0.3)
    assert over_threshold is expected
|
||
|
||
|
||
def test_sentence_count(monkeypatch):
    """Two period-terminated sentences should be counted as two.

    The sentence tokenizer name on the ``partition`` module is replaced with
    the ``mock_nltk`` stand-in before the call.
    """
    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)

    two_sentences = "Hi my name is Matt. I work with Crag."
    counted = partition.sentence_count(two_sentences)
    assert counted == 2
|
||
|
||
|
||
def test_item_titles():
    """An "ITEM ..." heading should count as fewer than two sentences.

    ``3`` is passed as the second positional argument to ``sentence_count``
    (presumably a minimum sentence length -- confirm against its signature).
    No tokenizer patching is done in this test.
    """
    heading = "ITEM 1(A). THIS IS A TITLE"
    counted = partition.sentence_count(heading, 3)
    assert counted < 2
|
||
|
||
|
||
def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Partition the sample PDF and check that the first element is the paper's title.

    NOTE(review): ``partition_pdf`` may call an external document-analysis
    service -- confirm before relying on this test in an offline run.
    """
    expected_title = (
        "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )

    first_element = partition.partition_pdf(filename)[0]

    assert first_element["type"] == "Title"
    assert first_element["text"] == expected_title
|