chore: Reorganize partition bricks under partition directory (#76)

* move partition_pdf to partition folder

* move partition.py

* refactor partitioning bricks into partition directory

* import to nlp for backward compatibility

* update docs

* update version and bump changelog

* fix typo in changelog

* update readme reference
Matt Robinson 2022-11-21 17:27:23 -05:00 committed by GitHub
parent 53fcf4e912
commit 08e091c5a9
14 changed files with 356 additions and 330 deletions

View File

@@ -1,6 +1,7 @@
-## 0.3.0-dev1
+## 0.3.0-dev2
 * Removing the local PDF parsing code and any dependencies and tests.
+* Reorganizes the staging bricks in the unstructured.partition module
 ## 0.2.6

View File

@@ -91,7 +91,7 @@ titles and narrative text.
 You can use the following workflow to parse PDF documents.

 ```python
-from unstructured.nlp.partition import partition_pdf
+from unstructured.partition.pdf import partition_pdf

 elements = partition_pdf("example-docs/layout-parser-paper.pdf")
 print(doc)

View File

@@ -14,6 +14,24 @@ The partitioning bricks in ``unstructured`` differentiate between different sect
 of text in a document. For example, the partitioning bricks can help distinguish between
 titles, narrative text, and tables.

+``partition_pdf``
+---------------------
+
+The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
+The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
+if desired.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.pdf import partition_pdf
+
+  # Returns a List[Element] present in the pages of the parsed pdf document
+  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
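Since the new ``url`` and ``token`` parameters exist to support self-hosting, a call against a self-hosted endpoint might look like the sketch below; the localhost URL and token value are illustrative assumptions, not part of this diff.

```python
from unstructured.partition.pdf import partition_pdf

# Hypothetical self-hosted inference endpoint; URL and token are placeholders
elements = partition_pdf(
    "example-docs/layout-parser-paper-fast.pdf",
    url="http://localhost:8000/",  # assumed: keeps the trailing slash, as in the default URL
    token="my-api-token",  # assumed: bearer token for the self-hosted service
)
```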
 ``is_bulleted_text``
 ----------------------

@@ -24,7 +42,7 @@ Examples:
 .. code:: python

-  from unstructured.nlp.partition import is_bulleted_text
+  from unstructured.partition.text_type import is_bulleted_text

   # Returns True
   is_bulleted_text("● An excellent point!")

@@ -52,7 +70,7 @@ Examples:
 .. code:: python

-  from unstructured.nlp.partition import is_possible_narrative_text
+  from unstructured.partition.text_type import is_possible_narrative_text

   # Returns True because the example passes all the checks
   example_1 = "Make sure you brush your teeth before you go to bed."

@@ -83,7 +101,7 @@ Examples:
 .. code:: python

-  from unstructured.nlp.partition import is_possible_title
+  from unstructured.partition.text_type import is_possible_title

   # Returns True because the text passes all the tests
   example_2 = "ITEM 1A. RISK FACTORS"

@@ -116,7 +134,7 @@ Examples:
 .. code:: python

-  from unstructured.nlp.partition import contains_verb
+  from unstructured.partition.text_type import contains_verb

   # Returns True because the text contains a verb
   example_1 = "I am going to run to the store to pick up some milk."

@@ -139,7 +157,7 @@ Examples:
 .. code:: python

-  from unstructured.nlp.partition import sentence_count
+  from unstructured.partition.text_type import sentence_count

   example = "Look at me! I am a document with two sentences."

@@ -162,7 +180,7 @@ Examples:
 .. code:: python

-  from unstructured.nlp.partition import exceeds_cap_ratio
+  from unstructured.partition.text_type import exceeds_cap_ratio

   # Returns True because the text is more than 30% caps
   example_1 = "LOOK AT ME I AM YELLING"

@@ -176,22 +194,6 @@ Examples:
   exceeds_cap_ratio(example_2, threshold=0.01)

-``partition_pdf``
----------------------
-
-The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
-The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
-if desired.
-
-Examples:
-
-.. code:: python
-
-  from unstructured.nlp.partition import partition_pdf
-
-  # Returns a List[Element] present in the pages of the parsed pdf document
-  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
-
 ########
 Cleaning

View File

View File

View File

@@ -1,144 +1,7 @@
-import pytest
-
-import unstructured.nlp.partition as partition
-
-from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        (
-            "ITEM 5(a).: MARKET FOR REGISTRANTS COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
-            "ISSUER PURCHASES OF EQUITY SECURITIES",
-            False,
-        ),
-        (
-            "Item 5(a).: Market For Registrants Common Equity, Related Stockholder Matters and "
-            "Issuer Purchases of Equity Securities",
-            False,
-        ),
-        (
-            "There is a market for registrants common equity, related stockholder matters and "
-            "issuer purchases of equity securities.",
-            True,
-        ),
-    ],
-)
-def test_headings_are_not_narrative_text(text, expected):
-    assert partition.is_possible_narrative_text(text) == expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Ask the teacher for an apple.", True),
-        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
-        ("7", False),  # Fails because it is numeric
-        ("intellectual property", False),  # Fails because it does not contain a verb
-        ("", False),  # Fails because it is empty
-    ],
-)
-def test_is_possible_narrative_text(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    has_verb = partition.is_possible_narrative_text(text, cap_threshold=0.3)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Intellectual Property", True),  # Fails because it exceeds the cap threshold
-        (
-            "Ask the teacher for an apple. You might a gold star.",
-            False,
-        ),  # Too many sentences
-        ("7", False),  # Fails because it is numeric
-        ("", False),  # Fails because it is empty
-        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
-    ],
-)
-def test_is_possible_title(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    has_verb = partition.is_possible_title(text)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("• This is a fine point!", True),
-        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
-        ("‣ This is a fine point!", True),
-        (" This is a fine point!", True),
-        ("⁌ This is a fine point!", True),
-        ("⁍ This is a fine point!", True),
-        ("∙ This is a fine point!", True),
-        ("○ This is a fine point!", True),
-        ("● This is a fine point!", True),
-        ("◘ This is a fine point!", True),
-        ("◦ This is a fine point!", True),
-        ("☙ This is a fine point!", True),
-        ("❥ This is a fine point!", True),
-        ("❧ This is a fine point!", True),
-        ("⦾ This is a fine point!", True),
-        ("⦿ This is a fine point!", True),
-        (" This is a fine point!", True),
-        ("* This is a fine point!", True),
-        ("This is NOT a fine point!", False),  # No bullet point
-        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
-    ],
-)
-def test_is_bulletized_text(text, expected):
-    assert partition.is_bulleted_text(text) is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Ask the teacher for an apple", True),
-        ("Intellectual property", False),
-    ],
-)
-def test_contains_verb(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
-    has_verb = partition.contains_verb(text)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Intellectual Property in the United States", True),
-        ("Intellectual property helps incentivize innovation.", False),
-        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
-    ],
-)
-def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    assert partition.exceeds_cap_ratio(text, threshold=0.3) is expected
-
-
-def test_sentence_count(monkeypatch):
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    text = "Hi my name is Matt. I work with Crag."
-    assert partition.sentence_count(text) == 2
-
-
-def test_item_titles():
-    text = "ITEM 1(A). THIS IS A TITLE"
-    assert partition.sentence_count(text, 3) < 2
-
-
-def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
-    partition_pdf_response = partition.partition_pdf(filename)
-    assert partition_pdf_response[0]["type"] == "Title"
-    assert (
-        partition_pdf_response[0]["text"]
-        == "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
-    )
+# flake8: noqa
+from unstructured.nlp.partition import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+    partition_pdf,
+)

View File

@@ -2,7 +2,7 @@ from typing import List, Tuple
 import unstructured.nlp.tokenize as tokenize

-from mock_nltk import mock_sent_tokenize, mock_word_tokenize
+from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize


 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:

View File

@@ -0,0 +1,10 @@
+import unstructured.partition.pdf as pdf
+
+
+def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
+    partition_pdf_response = pdf.partition_pdf(filename)
+    assert partition_pdf_response[0]["type"] == "Title"
+    assert (
+        partition_pdf_response[0]["text"]
+        == "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    )

View File

@@ -0,0 +1,135 @@
+import pytest
+
+import unstructured.partition.text_type as text_type
+
+from test_unstructured.nlp.mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        (
+            "ITEM 5(a).: MARKET FOR REGISTRANTS COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
+            "ISSUER PURCHASES OF EQUITY SECURITIES",
+            False,
+        ),
+        (
+            "Item 5(a).: Market For Registrants Common Equity, Related Stockholder Matters and "
+            "Issuer Purchases of Equity Securities",
+            False,
+        ),
+        (
+            "There is a market for registrants common equity, related stockholder matters and "
+            "issuer purchases of equity securities.",
+            True,
+        ),
+    ],
+)
+def test_headings_are_not_narrative_text(text, expected):
+    assert text_type.is_possible_narrative_text(text) == expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Ask the teacher for an apple.", True),
+        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
+        ("7", False),  # Fails because it is numeric
+        ("intellectual property", False),  # Fails because it does not contain a verb
+        ("", False),  # Fails because it is empty
+    ],
+)
+def test_is_possible_narrative_text(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Intellectual Property", True),  # Fails because it exceeds the cap threshold
+        (
+            "Ask the teacher for an apple. You might a gold star.",
+            False,
+        ),  # Too many sentences
+        ("7", False),  # Fails because it is numeric
+        ("", False),  # Fails because it is empty
+        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
+    ],
+)
+def test_is_possible_title(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    has_verb = text_type.is_possible_title(text)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("• This is a fine point!", True),
+        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
+        ("‣ This is a fine point!", True),
+        (" This is a fine point!", True),
+        ("⁌ This is a fine point!", True),
+        ("⁍ This is a fine point!", True),
+        ("∙ This is a fine point!", True),
+        ("○ This is a fine point!", True),
+        ("● This is a fine point!", True),
+        ("◘ This is a fine point!", True),
+        ("◦ This is a fine point!", True),
+        ("☙ This is a fine point!", True),
+        ("❥ This is a fine point!", True),
+        ("❧ This is a fine point!", True),
+        ("⦾ This is a fine point!", True),
+        ("⦿ This is a fine point!", True),
+        (" This is a fine point!", True),
+        ("* This is a fine point!", True),
+        ("This is NOT a fine point!", False),  # No bullet point
+        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
+    ],
+)
+def test_is_bulletized_text(text, expected):
+    assert text_type.is_bulleted_text(text) is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Ask the teacher for an apple", True),
+        ("Intellectual property", False),
+    ],
+)
+def test_contains_verb(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
+    has_verb = text_type.contains_verb(text)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Intellectual Property in the United States", True),
+        ("Intellectual property helps incentivize innovation.", False),
+        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
+    ],
+)
+def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    assert text_type.exceeds_cap_ratio(text, threshold=0.3) is expected
+
+
+def test_sentence_count(monkeypatch):
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    text = "Hi my name is Matt. I work with Crag."
+    assert text_type.sentence_count(text) == 2
+
+
+def test_item_titles():
+    text = "ITEM 1(A). THIS IS A TITLE"
+    assert text_type.sentence_count(text, 3) < 2

View File

@@ -1 +1 @@
-__version__ = "0.3.0-dev1"  # pragma: no cover
+__version__ = "0.3.0-dev2"  # pragma: no cover

View File

@@ -15,7 +15,7 @@ from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
 from unstructured.documents.base import Page
 from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
 from unstructured.documents.xml import XMLDocument
-from unstructured.nlp.partition import (
+from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,
     is_possible_title,

View File

@@ -1,160 +1,7 @@
-"""partition.py implements logic for partitioning plain text documents into sections."""
-import sys
-
-import requests  # type: ignore
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Final, List, Optional
-else:
-    from typing import Final, List, Optional
-
-from unstructured.cleaners.core import remove_punctuation
-from unstructured.documents.elements import Element, Text
-from unstructured.nlp.patterns import UNICODE_BULLETS_RE
-from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
-from unstructured.logger import get_logger
-
-logger = get_logger()
-
-POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
-
-
-def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
-    """Checks to see if the text passes all of the checks for a narrative text section."""
-    if len(text) == 0:
-        logger.debug("Not narrative. Text is empty.")
-        return False
-
-    if text.isnumeric():
-        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
-        return False
-
-    if exceeds_cap_ratio(text, threshold=cap_threshold):
-        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
-        return False
-
-    if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
-        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
-        return False
-
-    return True
-
-
-def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
-    """Checks to see if the text passes all of the checks for a valid title."""
-    if len(text) == 0:
-        logger.debug("Not a title. Text is empty.")
-        return False
-
-    if text.isnumeric():
-        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
-        return False
-
-    # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
-    # that sometimes get tokenized as separate sentences due to the period, but are still
-    # valid titles
-    if sentence_count(text, min_length=sentence_min_length) > 1:
-        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
-        return False
-
-    return True
-
-
-def is_bulleted_text(text: str) -> bool:
-    """Checks to see if the section of text is part of a bulleted list."""
-    return UNICODE_BULLETS_RE.match(text.strip()) is not None
-
-
-def contains_verb(text: str) -> bool:
-    """Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
-    that indicates that it is not narrative text."""
-    pos_tags = pos_tag(text)
-    for _, tag in pos_tags:
-        if tag in POS_VERB_TAGS:
-            return True
-    return False
-
-
-def sentence_count(text: str, min_length: Optional[int] = None) -> int:
-    """Checks the sentence count for a section of text. Titles should not be more than one
-    sentence.
-
-    Parameters
-    ----------
-    text
-        The string of the text to count
-    min_length
-        The min number of words a section needs to be for it to be considered a sentence.
-    """
-    sentences = sent_tokenize(text)
-    count = 0
-    for sentence in sentences:
-        sentence = remove_punctuation(sentence)
-        words = [word for word in word_tokenize(sentence) if word != "."]
-        if min_length and len(words) < min_length:
-            logger.debug(
-                f"Skipping sentence because does not exceed {min_length} word tokens\n"
-                f"{sentence}"
-            )
-            continue
-        count += 1
-    return count
-
-
-def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
-    """Checks the title ratio in a section of text. If a sufficient proportion of the text is
-    capitalized."""
-    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
-    # The assumption is that sections with multiple sentences are not titles.
-    if sentence_count(text, 3) > 1:
-        logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
-        return False
-
-    tokens = word_tokenize(text)
-    capitalized = sum([word.istitle() or word.isupper() for word in tokens])
-    ratio = capitalized / len(tokens)
-    return ratio > threshold
-
-
-def partition_pdf(
-    filename: str = "",
-    file: Optional[bytes] = None,
-    url: str = "https://ml.unstructured.io/",
-    template: Optional[str] = "base-model",
-    token: Optional[str] = None,
-) -> List[Element]:
-    """Calls the document parsing API.
-
-    Parameters
-    ----------
-    filename
-        A string defining the target filename path.
-    file
-        A file-like object as bytes --> open(filename, "rb").
-    template
-        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
-    url
-        A string endpoint to self-host an inference API, if desired.
-    token
-        A string defining the authentication token for a self-host url.
-    """
-    if not filename and not file:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    healthcheck_response = requests.models.Response()
-    if not token:
-        healthcheck_response = requests.get(url=f"{url}healthcheck")
-
-    if healthcheck_response.status_code != 200:
-        return [Text(text="error: endpoint api healthcheck has failed!")]
-
-    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
-
-    file_ = (filename, file if file else open(filename, "rb"))
-    response = requests.post(
-        url=url,
-        headers={"Authorization": f"Bearer {token}" if token else ""},
-        files={"file": file_},
-    )
-
-    if response.status_code == 200:
-        pages = response.json()["pages"]
-        return [element for page in pages for element in page["elements"]]
-    else:
-        return [Text(text=f"error: response status code = {response.status_code}")]
+# flake8: noqa
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.text_type import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+)

View File

@@ -0,0 +1,55 @@
+import requests  # type: ignore
+
+import sys
+
+if sys.version_info < (3, 8):
+    from typing_extensions import List, Optional
+else:
+    from typing import List, Optional
+
+from unstructured.documents.elements import Element, Text
+
+
+def partition_pdf(
+    filename: str = "",
+    file: Optional[bytes] = None,
+    url: str = "https://ml.unstructured.io/",
+    template: Optional[str] = "base-model",
+    token: Optional[str] = None,
+) -> List[Element]:
+    """Calls the document parsing API.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object as bytes --> open(filename, "rb").
+    template
+        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
+    url
+        A string endpoint to self-host an inference API, if desired.
+    token
+        A string defining the authentication token for a self-host url.
+    """
+    if not filename and not file:
+        raise FileNotFoundError("No filename nor file were specified")
+
+    healthcheck_response = requests.models.Response()
+    if not token:
+        healthcheck_response = requests.get(url=f"{url}healthcheck")
+
+    if healthcheck_response.status_code != 200:
+        return [Text(text="error: endpoint api healthcheck has failed!")]
+
+    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
+
+    file_ = (filename, file if file else open(filename, "rb"))
+    response = requests.post(
+        url=url,
+        headers={"Authorization": f"Bearer {token}" if token else ""},
+        files={"file": file_},
+    )
+
+    if response.status_code == 200:
+        pages = response.json()["pages"]
+        return [element for page in pages for element in page["elements"]]
+    else:
+        return [Text(text=f"error: response status code = {response.status_code}")]
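A minimal usage sketch for this module, reusing the repo's example document; it assumes the default hosted API is reachable and is illustrative rather than part of the diff:

```python
from unstructured.partition.pdf import partition_pdf

# Parse by path; the function opens the file and posts it to the parsing API
elements = partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf")

# Or read the bytes yourself and pass them via the `file` parameter
with open("example-docs/layout-parser-paper-fast.pdf", "rb") as f:
    elements = partition_pdf(file=f.read())
```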

View File

@@ -0,0 +1,113 @@
+"""partition.py implements logic for partitioning plain text documents into sections."""
+import sys
+
+if sys.version_info < (3, 8):
+    from typing_extensions import Final, List, Optional
+else:
+    from typing import Final, List, Optional
+
+from unstructured.cleaners.core import remove_punctuation
+from unstructured.nlp.patterns import UNICODE_BULLETS_RE
+from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
+from unstructured.logger import get_logger
+
+logger = get_logger()
+
+POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
+
+
+def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
+    """Checks to see if the text passes all of the checks for a narrative text section."""
+    if len(text) == 0:
+        logger.debug("Not narrative. Text is empty.")
+        return False
+
+    if text.isnumeric():
+        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
+        return False
+
+    if exceeds_cap_ratio(text, threshold=cap_threshold):
+        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
+        return False
+
+    if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
+        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
+        return False
+
+    return True
+
+
+def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
+    """Checks to see if the text passes all of the checks for a valid title."""
+    if len(text) == 0:
+        logger.debug("Not a title. Text is empty.")
+        return False
+
+    if text.isnumeric():
+        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
+        return False
+
+    # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
+    # that sometimes get tokenized as separate sentences due to the period, but are still
+    # valid titles
+    if sentence_count(text, min_length=sentence_min_length) > 1:
+        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
+        return False
+
+    return True
+
+
+def is_bulleted_text(text: str) -> bool:
+    """Checks to see if the section of text is part of a bulleted list."""
+    return UNICODE_BULLETS_RE.match(text.strip()) is not None
+
+
+def contains_verb(text: str) -> bool:
+    """Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
+    that indicates that it is not narrative text."""
+    pos_tags = pos_tag(text)
+    for _, tag in pos_tags:
+        if tag in POS_VERB_TAGS:
+            return True
+    return False
+
+
+def sentence_count(text: str, min_length: Optional[int] = None) -> int:
+    """Checks the sentence count for a section of text. Titles should not be more than one
+    sentence.
+
+    Parameters
+    ----------
+    text
+        The string of the text to count
+    min_length
+        The min number of words a section needs to be for it to be considered a sentence.
+    """
+    sentences = sent_tokenize(text)
+    count = 0
+    for sentence in sentences:
+        sentence = remove_punctuation(sentence)
+        words = [word for word in word_tokenize(sentence) if word != "."]
+        if min_length and len(words) < min_length:
+            logger.debug(
+                f"Skipping sentence because does not exceed {min_length} word tokens\n"
+                f"{sentence}"
+            )
+            continue
+        count += 1
+    return count
+
+
+def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
+    """Checks the title ratio in a section of text. If a sufficient proportion of the text is
+    capitalized."""
+    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
+    # The assumption is that sections with multiple sentences are not titles.
+    if sentence_count(text, 3) > 1:
+        logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
+        return False
+
+    tokens = word_tokenize(text)
+    capitalized = sum([word.istitle() or word.isupper() for word in tokens])
+    ratio = capitalized / len(tokens)
+    return ratio > threshold
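These heuristics compose into the narrative-text and title checks; a quick sketch of how they behave, drawn from the examples in the docs and tests above (assumes the NLTK tokenizers wrapped by ``unstructured.nlp.tokenize`` are available):

```python
from unstructured.partition.text_type import (
    exceeds_cap_ratio,
    is_bulleted_text,
    is_possible_narrative_text,
    is_possible_title,
    sentence_count,
)

is_bulleted_text("● An excellent point!")  # True: begins with a unicode bullet
is_possible_narrative_text("Make sure you brush your teeth before you go to bed.")  # True
is_possible_title("ITEM 1A. RISK FACTORS")  # True: both "sentences" fall under the min length
sentence_count("Look at me! I am a document with two sentences.")  # 2
sentence_count("ITEM 1(A). THIS IS A TITLE", min_length=3)  # 1: the short fragment is skipped
exceeds_cap_ratio("LOOK AT ME I AM YELLING")  # True: far above the 0.3 default threshold
```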