Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-06-27 02:30:08 +00:00)

chore: Reorganize partition bricks under partition directory (#76)

* move partition_pdf to partition folder
* move partition.py
* refactor partitioning bricks into partition directory
* import to nlp for backward compatibility
* update docs
* update version and bump changelog
* fix typo in changelog
* update readme reference

This commit is contained in:
parent 53fcf4e912
commit 08e091c5a9
@@ -1,6 +1,7 @@
-## 0.3.0-dev1
+## 0.3.0-dev2
 
 * Removing the local PDF parsing code and any dependencies and tests.
+* Reorganizes the staging bricks in the unstructured.partition module
 
 ## 0.2.6
@@ -91,7 +91,7 @@ titles and narrative text.
 You can use the following workflow to parse PDF documents.
 
 ```python
-from unstructured.nlp.partition import partition_pdf
+from unstructured.partition.pdf import partition_pdf
 
 elements = partition_pdf("example-docs/layout-parser-paper.pdf")
 print(elements)
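Each returned element is a plain dict with `"type"` (e.g. `"Title"`) and `"text"` keys, as the new `test_partition_pdf` below asserts. A short sketch of walking the result, assuming the hosted inference API is reachable:

```python
from unstructured.partition.pdf import partition_pdf

# Each element is a dict with "type" and "text" keys
elements = partition_pdf("example-docs/layout-parser-paper.pdf")
for element in elements:
    print(element["type"], "-", element["text"])
```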
@@ -14,6 +14,24 @@ The partitioning bricks in ``unstructured`` differentiate between different sections
 of text in a document. For example, the partitioning bricks can help distinguish between
 titles, narrative text, and tables.
 
+``partition_pdf``
+---------------------
+
+The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
+The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
+if desired.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.pdf import partition_pdf
+
+  # Returns a List[Element] present in the pages of the parsed pdf document
+  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
+
+
 ``is_bulleted_text``
 ----------------------
@@ -24,7 +42,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_bulleted_text
+  from unstructured.partition.text_type import is_bulleted_text
 
   # Returns True
   is_bulleted_text("● An excellent point!")
@@ -52,7 +70,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_possible_narrative_text
+  from unstructured.partition.text_type import is_possible_narrative_text
 
   # Returns True because the example passes all the checks
   example_1 = "Make sure you brush your teeth before you go to bed."
@@ -83,7 +101,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_possible_title
+  from unstructured.partition.text_type import is_possible_title
 
   # Returns True because the text passes all the tests
   example_2 = "ITEM 1A. RISK FACTORS"
@@ -116,7 +134,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import contains_verb
+  from unstructured.partition.text_type import contains_verb
 
   # Returns True because the text contains a verb
   example_1 = "I am going to run to the store to pick up some milk."
@@ -139,7 +157,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import sentence_count
+  from unstructured.partition.text_type import sentence_count
 
   example = "Look at me! I am a document with two sentences."
 
@@ -162,7 +180,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import exceeds_cap_ratio
+  from unstructured.partition.text_type import exceeds_cap_ratio
 
   # Returns True because the text is more than 30% caps
   example_1 = "LOOK AT ME I AM YELLING"
@@ -176,22 +194,6 @@ Examples:
 
   exceeds_cap_ratio(example_2, threshold=0.01)
 
-
-``partition_pdf``
----------------------
-
-The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
-The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
-if desired.
-
-Examples:
-
-.. code:: python
-
-  from unstructured.nlp.partition import partition_pdf
-
-  # Returns a List[Element] present in the pages of the parsed pdf document
-  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
 
 ########
 Cleaning
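The relocated ``partition_pdf`` docs note that ``url`` and ``token`` exist so users can self host an inference API. A minimal sketch of such a call (the endpoint and token below are hypothetical placeholders; the trailing slash matters because the implementation appends ``layout/pdf`` to ``url``):

.. code:: python

  from unstructured.partition.pdf import partition_pdf

  # Hypothetical self-hosted endpoint and token; the token is sent as a
  # Bearer credential in the Authorization header
  elements = partition_pdf(
      "example-docs/layout-parser-paper-fast.pdf",
      url="https://inference.internal.example/",
      token="<your-api-token>",
  )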
New file: test_unstructured/__init__.py (empty)
New file: test_unstructured/nlp/__init__.py (empty)
@@ -1,144 +1,7 @@
-import pytest
+# flake8: noqa
 
-from unstructured.nlp.partition import (
-    is_bulleted_text,
-    is_possible_narrative_text,
-    is_possible_title,
-    partition_pdf,
-)
+import unstructured.nlp.partition as partition
+
+from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
-[... the remaining removed tests reappear below as test_unstructured/partition/test_pdf.py and
-test_unstructured/partition/test_text_type.py, with references to the partition module renamed ...]
@@ -2,7 +2,7 @@ from typing import List, Tuple
 
 import unstructured.nlp.tokenize as tokenize
 
-from mock_nltk import mock_sent_tokenize, mock_word_tokenize
+from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 
 
 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
New file: test_unstructured/partition/test_pdf.py (10 lines)

import unstructured.partition.pdf as pdf


def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
    partition_pdf_response = pdf.partition_pdf(filename)
    assert partition_pdf_response[0]["type"] == "Title"
    assert (
        partition_pdf_response[0]["text"]
        == "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
New file: test_unstructured/partition/test_text_type.py (135 lines)

import pytest

import unstructured.partition.text_type as text_type

from test_unstructured.nlp.mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize


@pytest.mark.parametrize(
    "text, expected",
    [
        (
            "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
            "ISSUER PURCHASES OF EQUITY SECURITIES",
            False,
        ),
        (
            "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
            "Issuer Purchases of Equity Securities",
            False,
        ),
        (
            "There is a market for registrant’s common equity, related stockholder matters and "
            "issuer purchases of equity securities.",
            True,
        ),
    ],
)
def test_headings_are_not_narrative_text(text, expected):
    assert text_type.is_possible_narrative_text(text) == expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("Ask the teacher for an apple.", True),
        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
        ("7", False),  # Fails because it is numeric
        ("intellectual property", False),  # Fails because it does not contain a verb
        ("", False),  # Fails because it is empty
    ],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
    assert has_verb is expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("Intellectual Property", True),  # Fails because it exceeds the cap threshold
        (
            "Ask the teacher for an apple. You might a gold star.",
            False,
        ),  # Too many sentences
        ("7", False),  # Fails because it is numeric
        ("", False),  # Fails because it is empty
        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
    ],
)
def test_is_possible_title(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    has_verb = text_type.is_possible_title(text)
    assert has_verb is expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("• This is a fine point!", True),
        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
        ("‣ This is a fine point!", True),
        ("⁃ This is a fine point!", True),
        ("⁌ This is a fine point!", True),
        ("⁍ This is a fine point!", True),
        ("∙ This is a fine point!", True),
        ("○ This is a fine point!", True),
        ("● This is a fine point!", True),
        ("◘ This is a fine point!", True),
        ("◦ This is a fine point!", True),
        ("☙ This is a fine point!", True),
        ("❥ This is a fine point!", True),
        ("❧ This is a fine point!", True),
        ("⦾ This is a fine point!", True),
        ("⦿ This is a fine point!", True),
        (" This is a fine point!", True),
        ("* This is a fine point!", True),
        ("This is NOT a fine point!", False),  # No bullet point
        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
    ],
)
def test_is_bulletized_text(text, expected):
    assert text_type.is_bulleted_text(text) is expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("Ask the teacher for an apple", True),
        ("Intellectual property", False),
    ],
)
def test_contains_verb(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
    has_verb = text_type.contains_verb(text)
    assert has_verb is expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
    ],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    assert text_type.exceeds_cap_ratio(text, threshold=0.3) is expected


def test_sentence_count(monkeypatch):
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    text = "Hi my name is Matt. I work with Crag."
    assert text_type.sentence_count(text) == 2


def test_item_titles():
    text = "ITEM 1(A). THIS IS A TITLE"
    assert text_type.sentence_count(text, 3) < 2
@@ -1 +1 @@
-__version__ = "0.3.0-dev1"  # pragma: no cover
+__version__ = "0.3.0-dev2"  # pragma: no cover
@@ -15,7 +15,7 @@ from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
 from unstructured.documents.base import Page
 from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
 from unstructured.documents.xml import XMLDocument
-from unstructured.nlp.partition import (
+from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,
     is_possible_title,
@@ -1,160 +1,7 @@
-"""partition.py implements logic for partitioning plain text documents into sections."""
-import sys
-
-import requests  # type: ignore
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Final, List, Optional
-else:
-    from typing import Final, List, Optional
-
-from unstructured.cleaners.core import remove_punctuation
-from unstructured.documents.elements import Element, Text
-from unstructured.nlp.patterns import UNICODE_BULLETS_RE
-from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
-from unstructured.logger import get_logger
-
-logger = get_logger()
-
-POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
-[... the removed bodies of is_possible_narrative_text, is_possible_title, is_bulleted_text,
-contains_verb, sentence_count, exceeds_cap_ratio, and partition_pdf reappear unchanged in
-unstructured/partition/pdf.py and unstructured/partition/text_type.py below ...]
+# flake8: noqa
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.text_type import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+)
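A quick sketch of what this shim buys, assuming the package layout introduced by this commit: the old `unstructured.nlp.partition` import path re-exports the very objects that now live under `unstructured.partition`.

```python
# Old import path still works through the backward-compatibility shim
from unstructured.nlp.partition import is_bulleted_text

# New canonical location
from unstructured.partition.text_type import is_bulleted_text as canonical

# The shim re-exports the same function object
assert is_bulleted_text is canonical
```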
New file: unstructured/partition/pdf.py (55 lines)

import requests  # type: ignore

import sys

if sys.version_info < (3, 8):
    from typing_extensions import List, Optional
else:
    from typing import List, Optional

from unstructured.documents.elements import Element, Text


def partition_pdf(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/",
    template: Optional[str] = "base-model",
    token: Optional[str] = None,
) -> List[Element]:
    """Calls the document parsing API.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    template
        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
    url
        A string endpoint to self-host an inference API, if desired.
    token
        A string defining the authentication token for a self-host url.
    """
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")

    healthcheck_response = requests.models.Response()
    if not token:
        healthcheck_response = requests.get(url=f"{url}healthcheck")

    if healthcheck_response.status_code != 200:
        return [Text(text="error: endpoint api healthcheck has failed!")]

    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
    file_ = (filename, file if file else open(filename, "rb"))
    response = requests.post(
        url=url,
        headers={"Authorization": f"Bearer {token}" if token else ""},
        files={"file": file_},
    )
    if response.status_code == 200:
        pages = response.json()["pages"]
        return [element for page in pages for element in page["elements"]]
    else:
        return [Text(text=f"error: response status code = {response.status_code}")]
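A usage sketch for the `file` parameter, assuming the API endpoint is reachable: the `(filename, file)` tuple the function builds is what gets posted, so an already-open binary handle can be passed alongside the path.

```python
from unstructured.partition.pdf import partition_pdf

# Passing an open binary handle; the filename is still used as the
# upload name in the multipart request
with open("example-docs/layout-parser-paper-fast.pdf", "rb") as f:
    elements = partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf", file=f)
```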
New file: unstructured/partition/text_type.py (113 lines)

"""partition.py implements logic for partitioning plain text documents into sections."""
import sys

if sys.version_info < (3, 8):
    from typing_extensions import Final, List, Optional
else:
    from typing import Final, List, Optional

from unstructured.cleaners.core import remove_punctuation
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
from unstructured.logger import get_logger

logger = get_logger()

POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]


def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
    """Checks to see if the text passes all of the checks for a narrative text section."""
    if len(text) == 0:
        logger.debug("Not narrative. Text is empty.")
        return False

    if text.isnumeric():
        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
        return False

    if exceeds_cap_ratio(text, threshold=cap_threshold):
        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
        return False

    if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
        return False

    return True


def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
    """Checks to see if the text passes all of the checks for a valid title."""
    if len(text) == 0:
        logger.debug("Not a title. Text is empty.")
        return False

    if text.isnumeric():
        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
        return False

    # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
    # that sometimes get tokenized as separate sentences due to the period, but are still
    # valid titles
    if sentence_count(text, min_length=sentence_min_length) > 1:
        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
        return False

    return True


def is_bulleted_text(text: str) -> bool:
    """Checks to see if the section of text is part of a bulleted list."""
    return UNICODE_BULLETS_RE.match(text.strip()) is not None


def contains_verb(text: str) -> bool:
    """Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
    that indicates that it is not narrative text."""
    pos_tags = pos_tag(text)
    for _, tag in pos_tags:
        if tag in POS_VERB_TAGS:
            return True
    return False


def sentence_count(text: str, min_length: Optional[int] = None) -> int:
    """Checks the sentence count for a section of text. Titles should not be more than one
    sentence.

    Parameters
    ----------
    text
        The string of the text to count
    min_length
        The min number of words a section needs to be for it to be considered a sentence.
    """
    sentences = sent_tokenize(text)
    count = 0
    for sentence in sentences:
        sentence = remove_punctuation(sentence)
        words = [word for word in word_tokenize(sentence) if word != "."]
        if min_length and len(words) < min_length:
            logger.debug(
                f"Skipping sentence because does not exceed {min_length} word tokens\n"
                f"{sentence}"
            )
            continue
        count += 1
    return count


def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
    """Checks the title ratio in a section of text. If a sufficient proportion of the text is
    capitalized."""
    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
    # The assumption is that sections with multiple sentences are not titles.
    if sentence_count(text, 3) > 1:
        logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
        return False

    tokens = word_tokenize(text)
    capitalized = sum([word.istitle() or word.isupper() for word in tokens])
    ratio = capitalized / len(tokens)
    return ratio > threshold
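As a closing usage sketch of the `min_length` parameter (mirroring `test_item_titles` above, and requiring the NLTK tokenizers that `unstructured.nlp.tokenize` wraps):

```python
from unstructured.partition.text_type import sentence_count

text = "ITEM 1(A). THIS IS A TITLE"
# Each tokenized "sentence" is under three word tokens once punctuation
# is stripped, so min_length=3 filters the short fragments out
assert sentence_count(text, min_length=3) < 2
```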