mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
chore: Reorganize partition bricks under partition directory (#76)
* move partition_pdf to partition folder * move partition.py * refactor partitioning bricks into partition directory * import to nlp for backward compatibility * update docs * update version and bump changelog * fix typo in changelog * update readme reference
This commit is contained in:
parent
53fcf4e912
commit
08e091c5a9
@ -1,6 +1,7 @@
|
||||
## 0.3.0-dev1
|
||||
## 0.3.0-dev2
|
||||
|
||||
* Removing the local PDF parsing code and any dependencies and tests.
|
||||
* Reorganizes the staging bricks in the unstructured.partition module
|
||||
|
||||
## 0.2.6
|
||||
|
||||
|
@ -91,7 +91,7 @@ titles and narrative text.
|
||||
You can use the following workflow to parse PDF documents.
|
||||
|
||||
```python
|
||||
from unstructured.nlp.partition import partition_pdf
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
elements = partition_pdf("example-docs/layout-parser-paper.pdf")
|
||||
print(elements)
|
||||
|
@ -14,6 +14,24 @@ The partitioning bricks in ``unstructured`` differentiate between different sect
|
||||
of text in a document. For example, the partitioning bricks can help distinguish between
|
||||
titles, narrative text, and tables.
|
||||
|
||||
|
||||
``partition_pdf``
|
||||
---------------------
|
||||
|
||||
The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
|
||||
The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
|
||||
if desired.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
# Returns a List[Element] present in the pages of the parsed pdf document
|
||||
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
|
||||
|
||||
|
||||
``is_bulleted_text``
|
||||
----------------------
|
||||
|
||||
@ -24,7 +42,7 @@ Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.nlp.partition import is_bulleted_text
|
||||
from unstructured.partition.text_type import is_bulleted_text
|
||||
|
||||
# Returns True
|
||||
is_bulleted_text("● An excellent point!")
|
||||
@ -52,7 +70,7 @@ Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.nlp.partition import is_possible_narrative_text
|
||||
from unstructured.partition.text_type import is_possible_narrative_text
|
||||
|
||||
# Returns True because the example passes all the checks
|
||||
example_1 = "Make sure you brush your teeth before you go to bed."
|
||||
@ -83,7 +101,7 @@ Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.nlp.partition import is_possible_title
|
||||
from unstructured.partition.text_type import is_possible_title
|
||||
|
||||
# Returns True because the text passes all the tests
|
||||
example_2 = "ITEM 1A. RISK FACTORS"
|
||||
@ -116,7 +134,7 @@ Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.nlp.partition import contains_verb
|
||||
from unstructured.partition.text_type import contains_verb
|
||||
|
||||
# Returns True because the text contains a verb
|
||||
example_1 = "I am going to run to the store to pick up some milk."
|
||||
@ -139,7 +157,7 @@ Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.nlp.partition import sentence_count
|
||||
from unstructured.partition.text_type import sentence_count
|
||||
|
||||
example = "Look at me! I am a document with two sentences."
|
||||
|
||||
@ -162,7 +180,7 @@ Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.nlp.partition import exceeds_cap_ratio
|
||||
from unstructured.partition.text_type import exceeds_cap_ratio
|
||||
|
||||
# Returns True because the text is more than 30% caps
|
||||
example_1 = "LOOK AT ME I AM YELLING"
|
||||
@ -176,22 +194,6 @@ Examples:
|
||||
exceeds_cap_ratio(example_2, threshold=0.01)
|
||||
|
||||
|
||||
``partition_pdf``
|
||||
---------------------
|
||||
|
||||
The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
|
||||
The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
|
||||
if desired.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.nlp.partition import partition_pdf
|
||||
|
||||
# Returns a List[Element] present in the pages of the parsed pdf document
|
||||
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
|
||||
|
||||
|
||||
########
|
||||
Cleaning
|
||||
|
0
test_unstructured/__init__.py
Normal file
0
test_unstructured/__init__.py
Normal file
0
test_unstructured/nlp/__init__.py
Normal file
0
test_unstructured/nlp/__init__.py
Normal file
@ -1,144 +1,7 @@
|
||||
import pytest
|
||||
|
||||
import unstructured.nlp.partition as partition
|
||||
|
||||
from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
(
|
||||
"ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
|
||||
"ISSUER PURCHASES OF EQUITY SECURITIES",
|
||||
False,
|
||||
),
|
||||
(
|
||||
"Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
|
||||
"Issuer Purchases of Equity Securities",
|
||||
False,
|
||||
),
|
||||
(
|
||||
"There is a market for registrant’s common equity, related stockholder matters and "
|
||||
"issuer purchases of equity securities.",
|
||||
True,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_headings_are_not_narrative_text(text, expected):
|
||||
assert partition.is_possible_narrative_text(text) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
("Ask the teacher for an apple.", True),
|
||||
("Ask Me About Intellectual Property", False), # Exceeds the cap threshold
|
||||
("7", False), # Fails because it is numeric
|
||||
("intellectual property", False), # Fails because it does not contain a verb
|
||||
("", False), # Fails because it is empty
|
||||
],
|
||||
)
|
||||
def test_is_possible_narrative_text(text, expected, monkeypatch):
|
||||
monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
|
||||
monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
|
||||
monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
|
||||
has_verb = partition.is_possible_narrative_text(text, cap_threshold=0.3)
|
||||
assert has_verb is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
("Intellectual Property", True), # Fails because it exceeds the cap threshold
|
||||
(
|
||||
"Ask the teacher for an apple. You might a gold star.",
|
||||
False,
|
||||
), # Too many sentences
|
||||
("7", False), # Fails because it is numeric
|
||||
("", False), # Fails because it is empty
|
||||
("ITEM 1A. RISK FACTORS", True), # Two "sentences", but both are short
|
||||
],
|
||||
)
|
||||
def test_is_possible_title(text, expected, monkeypatch):
|
||||
monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
|
||||
monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
|
||||
has_verb = partition.is_possible_title(text)
|
||||
assert has_verb is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
("• This is a fine point!", True),
|
||||
(" • This is a fine point!", True), # Has an extra space in front of the bullet
|
||||
("‣ This is a fine point!", True),
|
||||
("⁃ This is a fine point!", True),
|
||||
("⁌ This is a fine point!", True),
|
||||
("⁍ This is a fine point!", True),
|
||||
("∙ This is a fine point!", True),
|
||||
("○ This is a fine point!", True),
|
||||
("● This is a fine point!", True),
|
||||
("◘ This is a fine point!", True),
|
||||
("◦ This is a fine point!", True),
|
||||
("☙ This is a fine point!", True),
|
||||
("❥ This is a fine point!", True),
|
||||
("❧ This is a fine point!", True),
|
||||
("⦾ This is a fine point!", True),
|
||||
("⦿ This is a fine point!", True),
|
||||
(" This is a fine point!", True),
|
||||
("* This is a fine point!", True),
|
||||
("This is NOT a fine point!", False), # No bullet point
|
||||
("I love morse code! ● ● ● --- ● ● ●", False), # Not at the beginning
|
||||
],
|
||||
)
|
||||
def test_is_bulletized_text(text, expected):
|
||||
assert partition.is_bulleted_text(text) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
("Ask the teacher for an apple", True),
|
||||
("Intellectual property", False),
|
||||
],
|
||||
)
|
||||
def test_contains_verb(text, expected, monkeypatch):
|
||||
monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
|
||||
monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
|
||||
has_verb = partition.contains_verb(text)
|
||||
assert has_verb is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
("Intellectual Property in the United States", True),
|
||||
("Intellectual property helps incentivize innovation.", False),
|
||||
("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
|
||||
],
|
||||
)
|
||||
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
|
||||
monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
|
||||
monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
|
||||
assert partition.exceeds_cap_ratio(text, threshold=0.3) is expected
|
||||
|
||||
|
||||
def test_sentence_count(monkeypatch):
|
||||
monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
|
||||
text = "Hi my name is Matt. I work with Crag."
|
||||
assert partition.sentence_count(text) == 2
|
||||
|
||||
|
||||
def test_item_titles():
|
||||
text = "ITEM 1(A). THIS IS A TITLE"
|
||||
assert partition.sentence_count(text, 3) < 2
|
||||
|
||||
|
||||
def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
|
||||
partition_pdf_response = partition.partition_pdf(filename)
|
||||
assert partition_pdf_response[0]["type"] == "Title"
|
||||
assert (
|
||||
partition_pdf_response[0]["text"]
|
||||
== "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
|
||||
# flake8: noqa
|
||||
from unstructured.nlp.partition import (
|
||||
is_bulleted_text,
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
partition_pdf,
|
||||
)
|
||||
|
@ -2,7 +2,7 @@ from typing import List, Tuple
|
||||
|
||||
import unstructured.nlp.tokenize as tokenize
|
||||
|
||||
from mock_nltk import mock_sent_tokenize, mock_word_tokenize
|
||||
from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
|
||||
|
||||
|
||||
def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
|
||||
|
10
test_unstructured/partition/test_pdf.py
Normal file
10
test_unstructured/partition/test_pdf.py
Normal file
@ -0,0 +1,10 @@
|
||||
import unstructured.partition.pdf as pdf
|
||||
|
||||
|
||||
def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
    """The API response for the fast-parse fixture starts with the paper's title."""
    elements = pdf.partition_pdf(filename)
    first_element = elements[0]
    assert first_element["type"] == "Title"
    expected_title = (
        "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    assert first_element["text"] == expected_title
|
135
test_unstructured/partition/test_text_type.py
Normal file
135
test_unstructured/partition/test_text_type.py
Normal file
@ -0,0 +1,135 @@
|
||||
import pytest
|
||||
|
||||
import unstructured.partition.text_type as text_type
|
||||
|
||||
from test_unstructured.nlp.mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
|
||||
|
||||
|
||||
# Section headings (all-caps or title-case) must not be classified as narrative
# text, while a full sentence about the same subject must be.
HEADING_NARRATIVE_CASES = [
    (
        "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
        "ISSUER PURCHASES OF EQUITY SECURITIES",
        False,
    ),
    (
        "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
        "Issuer Purchases of Equity Securities",
        False,
    ),
    (
        "There is a market for registrant’s common equity, related stockholder matters and "
        "issuer purchases of equity securities.",
        True,
    ),
]


@pytest.mark.parametrize("text, expected", HEADING_NARRATIVE_CASES)
def test_headings_are_not_narrative_text(text, expected):
    assert text_type.is_possible_narrative_text(text) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, expected",
    [
        ("Ask the teacher for an apple.", True),
        ("Ask Me About Intellectual Property", False),  # Too many capitalized words
        ("7", False),  # Purely numeric text is rejected
        ("intellectual property", False),  # No verb present
        ("", False),  # Empty text is rejected
    ],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
    """Narrative-text detection with the NLTK-backed helpers mocked out."""
    # Patch the tokenizers/tagger so the test does not need NLTK models.
    for attr, mock in [
        ("word_tokenize", mock_word_tokenize),
        ("pos_tag", mock_pos_tag),
        ("sent_tokenize", mock_sent_tokenize),
    ]:
        monkeypatch.setattr(text_type, attr, mock)
    assert text_type.is_possible_narrative_text(text, cap_threshold=0.3) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, expected",
    [
        ("Intellectual Property", True),  # Capitalized, single short phrase -> valid title
        (
            "Ask the teacher for an apple. You might a gold star.",
            False,
        ),  # Too many sentences
        ("7", False),  # Fails because it is numeric
        ("", False),  # Fails because it is empty
        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
    ],
)
def test_is_possible_title(text, expected, monkeypatch):
    """Title detection with deterministic tokenizer mocks."""
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    is_title = text_type.is_possible_title(text)
    assert is_title is expected
|
||||
|
||||
|
||||
# NOTE(review): one case below renders as a bare leading space before "This is a
# fine point!" — it presumably contains a bullet glyph that does not display in
# this view; verify the character against the UNICODE_BULLETS_RE pattern.
@pytest.mark.parametrize(
    "text, expected",
    [
        ("• This is a fine point!", True),
        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
        ("‣ This is a fine point!", True),
        ("⁃ This is a fine point!", True),
        ("⁌ This is a fine point!", True),
        ("⁍ This is a fine point!", True),
        ("∙ This is a fine point!", True),
        ("○ This is a fine point!", True),
        ("● This is a fine point!", True),
        ("◘ This is a fine point!", True),
        ("◦ This is a fine point!", True),
        ("☙ This is a fine point!", True),
        ("❥ This is a fine point!", True),
        ("❧ This is a fine point!", True),
        ("⦾ This is a fine point!", True),
        ("⦿ This is a fine point!", True),
        (" This is a fine point!", True),
        ("* This is a fine point!", True),
        ("This is NOT a fine point!", False),  # No bullet point
        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
    ],
)
def test_is_bulletized_text(text, expected):
    """Every recognized bullet glyph (plus '*') at the start of the string marks
    bulleted text; bullets appearing later in the string do not."""
    assert text_type.is_bulleted_text(text) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, expected",
    [
        ("Ask the teacher for an apple", True),
        ("Intellectual property", False),
    ],
)
def test_contains_verb(text, expected, monkeypatch):
    """Verb detection with the tokenizer and POS tagger mocked out."""
    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    assert text_type.contains_verb(text) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, expected",
    [
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
    ],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
    """Cap-ratio check with deterministic tokenizer mocks at threshold=0.3."""
    # Replace the NLTK-backed tokenizers with deterministic mocks.
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    exceeds = text_type.exceeds_cap_ratio(text, threshold=0.3)
    assert exceeds is expected
|
||||
|
||||
|
||||
def test_sentence_count(monkeypatch):
    """Two plain sentences tokenize to a count of 2."""
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    sample = "Hi my name is Matt. I work with Crag."
    assert text_type.sentence_count(sample) == 2
|
||||
|
||||
|
||||
def test_item_titles():
    """An "ITEM ..." heading counts as fewer than two sentences of min length 3,
    so the period after "1(A)." does not split it into a multi-sentence text."""
    heading = "ITEM 1(A). THIS IS A TITLE"
    assert text_type.sentence_count(heading, 3) < 2
|
@ -1 +1 @@
|
||||
__version__ = "0.3.0-dev1" # pragma: no cover
|
||||
__version__ = "0.3.0-dev2" # pragma: no cover
|
||||
|
@ -15,7 +15,7 @@ from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
|
||||
from unstructured.documents.base import Page
|
||||
from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
|
||||
from unstructured.documents.xml import XMLDocument
|
||||
from unstructured.nlp.partition import (
|
||||
from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
|
@ -1,160 +1,7 @@
|
||||
"""partition.py implements logic for partitioning plain text documents into sections."""
|
||||
import sys
|
||||
import requests # type: ignore
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final, List, Optional
|
||||
else:
|
||||
from typing import Final, List, Optional
|
||||
|
||||
from unstructured.cleaners.core import remove_punctuation
|
||||
from unstructured.documents.elements import Element, Text
|
||||
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
|
||||
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
|
||||
from unstructured.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
||||
|
||||
|
||||
def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a narrative text section."""
|
||||
if len(text) == 0:
|
||||
logger.debug("Not narrative. Text is empty.")
|
||||
return False
|
||||
|
||||
if text.isnumeric():
|
||||
logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
|
||||
return False
|
||||
|
||||
if exceeds_cap_ratio(text, threshold=cap_threshold):
|
||||
logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
|
||||
return False
|
||||
|
||||
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
|
||||
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a valid title."""
|
||||
if len(text) == 0:
|
||||
logger.debug("Not a title. Text is empty.")
|
||||
return False
|
||||
|
||||
if text.isnumeric():
|
||||
logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
|
||||
return False
|
||||
|
||||
# NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
|
||||
# that sometimes get tokenized as separate sentences due to the period, but are still
|
||||
# valid titles
|
||||
if sentence_count(text, min_length=sentence_min_length) > 1:
|
||||
logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def is_bulleted_text(text: str) -> bool:
|
||||
"""Checks to see if the section of text is part of a bulleted list."""
|
||||
return UNICODE_BULLETS_RE.match(text.strip()) is not None
|
||||
|
||||
|
||||
def contains_verb(text: str) -> bool:
|
||||
"""Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
|
||||
that indicates that it is not narrative text."""
|
||||
pos_tags = pos_tag(text)
|
||||
for _, tag in pos_tags:
|
||||
if tag in POS_VERB_TAGS:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def sentence_count(text: str, min_length: Optional[int] = None) -> int:
|
||||
"""Checks the sentence count for a section of text. Titles should not be more than one
|
||||
sentence.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text
|
||||
The string of the text to count
|
||||
min_length
|
||||
The min number of words a section needs to be for it to be considered a sentence.
|
||||
"""
|
||||
sentences = sent_tokenize(text)
|
||||
count = 0
|
||||
for sentence in sentences:
|
||||
sentence = remove_punctuation(sentence)
|
||||
words = [word for word in word_tokenize(sentence) if word != "."]
|
||||
if min_length and len(words) < min_length:
|
||||
logger.debug(
|
||||
f"Skipping sentence because does not exceed {min_length} word tokens\n"
|
||||
f"{sentence}"
|
||||
# flake8: noqa
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
)
|
||||
continue
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
|
||||
"""Checks the title ratio in a section of text. If a sufficient proportion of the text is
|
||||
capitalized."""
|
||||
# NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
|
||||
# The assumption is that sections with multiple sentences are not titles.
|
||||
if sentence_count(text, 3) > 1:
|
||||
logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
|
||||
return False
|
||||
|
||||
tokens = word_tokenize(text)
|
||||
capitalized = sum([word.istitle() or word.isupper() for word in tokens])
|
||||
ratio = capitalized / len(tokens)
|
||||
return ratio > threshold
|
||||
|
||||
|
||||
def partition_pdf(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
url: str = "https://ml.unstructured.io/",
|
||||
template: Optional[str] = "base-model",
|
||||
token: Optional[str] = None,
|
||||
) -> List[Element]:
|
||||
"""Calls the document parsing API.
|
||||
Parameters
|
||||
----------
|
||||
filename
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object as bytes --> open(filename, "rb").
|
||||
template
|
||||
A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
|
||||
url
|
||||
A string endpoint to self-host an inference API, if desired.
|
||||
token
|
||||
A string defining the authentication token for a self-host url.
|
||||
"""
|
||||
if not filename and not file:
|
||||
raise FileNotFoundError("No filename nor file were specified")
|
||||
|
||||
healthcheck_response = requests.models.Response()
|
||||
if not token:
|
||||
healthcheck_response = requests.get(url=f"{url}healthcheck")
|
||||
|
||||
if healthcheck_response.status_code != 200:
|
||||
return [Text(text="error: endpoint api healthcheck has failed!")]
|
||||
|
||||
url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
|
||||
file_ = (filename, file if file else open(filename, "rb"))
|
||||
response = requests.post(
|
||||
url=url,
|
||||
headers={"Authorization": f"Bearer {token}" if token else ""},
|
||||
files={"file": file_},
|
||||
)
|
||||
if response.status_code == 200:
|
||||
pages = response.json()["pages"]
|
||||
return [element for page in pages for element in page["elements"]]
|
||||
else:
|
||||
return [Text(text=f"error: response status code = {response.status_code}")]
|
||||
|
55
unstructured/partition/pdf.py
Normal file
55
unstructured/partition/pdf.py
Normal file
@ -0,0 +1,55 @@
|
||||
import requests # type: ignore
|
||||
|
||||
import sys
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import List, Optional
|
||||
else:
|
||||
from typing import List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element, Text
|
||||
|
||||
|
||||
def partition_pdf(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/",
    template: Optional[str] = "base-model",
    token: Optional[str] = None,
) -> List[Element]:
    """Calls the document parsing API.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    template
        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
    url
        A string endpoint to self-host an inference API, if desired.
    token
        A string defining the authentication token for a self-host url.

    Returns
    -------
    A list of element dicts parsed from the PDF pages, or a single Text element
    describing the error when the healthcheck or the parse request fails.

    Raises
    ------
    FileNotFoundError
        If neither ``filename`` nor ``file`` is provided.
    """
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")

    # FIX: previously a blank requests.models.Response() (status_code is None) was
    # checked even when a token was supplied, so every authenticated call returned
    # the healthcheck error. Only perform -- and check -- the healthcheck for
    # anonymous requests against the hosted endpoint.
    if not token:
        healthcheck_response = requests.get(url=f"{url}healthcheck")
        if healthcheck_response.status_code != 200:
            return [Text(text="error: endpoint api healthcheck has failed!")]

    # NOTE(review): assumes ``url`` ends with a trailing slash, as the default does.
    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"

    # FIX: close the file handle we open ourselves -- the original leaked it.
    opened_file = None
    try:
        if not file:
            opened_file = open(filename, "rb")
        file_ = (filename, file if file else opened_file)
        response = requests.post(
            url=url,
            headers={"Authorization": f"Bearer {token}" if token else ""},
            files={"file": file_},
        )
    finally:
        if opened_file is not None:
            opened_file.close()

    if response.status_code == 200:
        pages = response.json()["pages"]
        return [element for page in pages for element in page["elements"]]
    else:
        return [Text(text=f"error: response status code = {response.status_code}")]
|
113
unstructured/partition/text_type.py
Normal file
113
unstructured/partition/text_type.py
Normal file
@ -0,0 +1,113 @@
|
||||
"""partition.py implements logic for partitioning plain text documents into sections."""
|
||||
import sys
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final, List, Optional
|
||||
else:
|
||||
from typing import Final, List, Optional
|
||||
|
||||
from unstructured.cleaners.core import remove_punctuation
|
||||
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
|
||||
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
|
||||
from unstructured.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
||||
|
||||
|
||||
def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
    """Checks to see if the text passes all of the checks for a narrative text section.

    Parameters
    ----------
    text
        The candidate text section.
    cap_threshold
        Maximum proportion of capitalized word tokens allowed for narrative text.
    """
    if not text:
        logger.debug("Not narrative. Text is empty.")
        return False

    if text.isnumeric():
        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
        return False

    if exceeds_cap_ratio(text, threshold=cap_threshold):
        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
        return False

    # A short section (fewer than two sentences of >=3 words) must contain a verb
    # to qualify as narrative text.
    too_short = sentence_count(text, 3) < 2
    if too_short and not contains_verb(text):
        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
        return False

    return True
|
||||
|
||||
|
||||
def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
    """Checks to see if the text passes all of the checks for a valid title.

    Parameters
    ----------
    text
        The candidate title text.
    sentence_min_length
        Sentences shorter than this many word tokens are ignored when deciding
        whether the text spans multiple sentences.
    """
    if not text:
        logger.debug("Not a title. Text is empty.")
        return False

    if text.isnumeric():
        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
        return False

    # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
    # that sometimes get tokenized as separate sentences due to the period, but are still
    # valid titles
    spans_multiple_sentences = sentence_count(text, min_length=sentence_min_length) > 1
    if spans_multiple_sentences:
        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
        return False

    return True
|
||||
|
||||
|
||||
def is_bulleted_text(text: str) -> bool:
    """Checks to see if the section of text is part of a bulleted list."""
    # A bullet glyph must appear at the very start of the (stripped) text.
    stripped = text.strip()
    return bool(UNICODE_BULLETS_RE.match(stripped))
|
||||
|
||||
|
||||
def contains_verb(text: str) -> bool:
    """Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
    that indicates that it is not narrative text."""
    # Any verb-family POS tag (see POS_VERB_TAGS) qualifies.
    return any(tag in POS_VERB_TAGS for _, tag in pos_tag(text))
|
||||
|
||||
|
||||
def sentence_count(text: str, min_length: Optional[int] = None) -> int:
    """Checks the sentence count for a section of text. Titles should not be more than one
    sentence.

    Parameters
    ----------
    text
        The string of the text to count
    min_length
        The min number of words a section needs to be for it to be considered a sentence.
    """
    count = 0
    for raw_sentence in sent_tokenize(text):
        # Strip punctuation so trailing periods do not count as word tokens.
        sentence = remove_punctuation(raw_sentence)
        word_count = sum(1 for word in word_tokenize(sentence) if word != ".")
        if min_length and word_count < min_length:
            logger.debug(
                f"Skipping sentence because does not exceed {min_length} word tokens\n"
                f"{sentence}"
            )
            continue
        count += 1
    return count
|
||||
|
||||
|
||||
def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
    """Checks the title ratio in a section of text. Returns True if a sufficient
    proportion of the word tokens is capitalized.

    Parameters
    ----------
    text
        The section of text to check.
    threshold
        The fraction of capitalized (title- or upper-case) tokens above which the
        text is considered to exceed the cap ratio.
    """
    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
    # The assumption is that sections with multiple sentences are not titles.
    if sentence_count(text, 3) > 1:
        # FIX: the original message said "does not contain multiple sentences",
        # the opposite of the condition that triggers this branch.
        logger.debug(f"Text contains multiple sentences:\n\n{text}")
        return False

    tokens = word_tokenize(text)
    # FIX: guard against ZeroDivisionError for whitespace/punctuation-only input
    # that tokenizes to nothing; with no tokens there is nothing capitalized.
    if not tokens:
        return False

    capitalized = sum(word.istitle() or word.isupper() for word in tokens)
    ratio = capitalized / len(tokens)
    return ratio > threshold
|
Loading…
x
Reference in New Issue
Block a user