chore: Reorganize partition bricks under partition directory (#76)

* move partition_pdf to partition folder

* move partition.py

* refactor partitioning bricks into partition directory

* import into nlp for backward compatibility

* update docs

* update version and bump changelog

* fix typo in changelog

* update readme reference
Matt Robinson 2022-11-21 17:27:23 -05:00 committed by GitHub
parent 53fcf4e912
commit 08e091c5a9
14 changed files with 356 additions and 330 deletions

View File

@ -1,6 +1,7 @@
## 0.3.0-dev1
## 0.3.0-dev2
* Removing the local PDF parsing code and any dependencies and tests.
* Reorganizes the partitioning bricks in the unstructured.partition module
## 0.2.6

View File

@ -91,7 +91,7 @@ titles and narrative text.
You can use the following workflow to parse PDF documents.
```python
from unstructured.nlp.partition import partition_pdf
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf("example-docs/layout-parser-paper.pdf")
print(elements)

View File

@ -14,6 +14,24 @@ The partitioning bricks in ``unstructured`` differentiate between different sect
of text in a document. For example, the partitioning bricks can help distinguish between
titles, narrative text, and tables.
``partition_pdf``
---------------------
The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
The ``url`` and ``token`` parameters let users self-host an inference API, if desired.
Examples:
.. code:: python
from unstructured.partition.pdf import partition_pdf
# Returns a List[Element] of the elements present in the pages of the parsed PDF document
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
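If you are self-hosting the inference API, point ``url`` at your deployment and pass your
``token``. A sketch (the endpoint and token below are hypothetical):
.. code:: python

    from unstructured.partition.pdf import partition_pdf

    # Hypothetical self-hosted endpoint; url and token are parameters of partition_pdf
    elements = partition_pdf(
        "example-docs/layout-parser-paper-fast.pdf",
        url="http://localhost:8000/",
        token="<my-token>",
    )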
``is_bulleted_text``
----------------------
@ -24,7 +42,7 @@ Examples:
.. code:: python
from unstructured.nlp.partition import is_bulleted_text
from unstructured.partition.text_type import is_bulleted_text
# Returns True
is_bulleted_text("● An excellent point!")
@ -52,7 +70,7 @@ Examples:
.. code:: python
from unstructured.nlp.partition import is_possible_narrative_text
from unstructured.partition.text_type import is_possible_narrative_text
# Returns True because the example passes all the checks
example_1 = "Make sure you brush your teeth before you go to bed."
@ -83,7 +101,7 @@ Examples:
.. code:: python
from unstructured.nlp.partition import is_possible_title
from unstructured.partition.text_type import is_possible_title
# Returns True because the text passes all the tests
example_2 = "ITEM 1A. RISK FACTORS"
@ -116,7 +134,7 @@ Examples:
.. code:: python
from unstructured.nlp.partition import contains_verb
from unstructured.partition.text_type import contains_verb
# Returns True because the text contains a verb
example_1 = "I am going to run to the store to pick up some milk."
@ -139,7 +157,7 @@ Examples:
.. code:: python
from unstructured.nlp.partition import sentence_count
from unstructured.partition.text_type import sentence_count
example = "Look at me! I am a document with two sentences."
@ -162,7 +180,7 @@ Examples:
.. code:: python
from unstructured.nlp.partition import exceeds_cap_ratio
from unstructured.partition.text_type import exceeds_cap_ratio
# Returns True because the text is more than 30% caps
example_1 = "LOOK AT ME I AM YELLING"
@ -176,22 +194,6 @@ Examples:
exceeds_cap_ratio(example_2, threshold=0.01)
``partition_pdf``
---------------------
The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
if desired.
Examples:
.. code:: python
from unstructured.nlp.partition import partition_pdf
# Returns a List[Element] present in the pages of the parsed pdf document
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
########
Cleaning

View File

@ -1,144 +1,7 @@
import pytest
import unstructured.nlp.partition as partition
from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
@pytest.mark.parametrize(
"text, expected",
[
(
"ITEM 5(a).: MARKET FOR REGISTRANTS COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
"ISSUER PURCHASES OF EQUITY SECURITIES",
False,
),
(
"Item 5(a).: Market For Registrants Common Equity, Related Stockholder Matters and "
"Issuer Purchases of Equity Securities",
False,
),
(
"There is a market for registrants common equity, related stockholder matters and "
"issuer purchases of equity securities.",
True,
),
],
)
def test_headings_are_not_narrative_text(text, expected):
assert partition.is_possible_narrative_text(text) == expected
@pytest.mark.parametrize(
"text, expected",
[
("Ask the teacher for an apple.", True),
("Ask Me About Intellectual Property", False), # Exceeds the cap threshold
("7", False), # Fails because it is numeric
("intellectual property", False), # Fails because it does not contain a verb
("", False), # Fails because it is empty
],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
has_verb = partition.is_possible_narrative_text(text, cap_threshold=0.3)
assert has_verb is expected
@pytest.mark.parametrize(
"text, expected",
[
("Intellectual Property", True), # Fails because it exceeds the cap threshold
(
"Ask the teacher for an apple. You might a gold star.",
False,
), # Too many sentences
("7", False), # Fails because it is numeric
("", False), # Fails because it is empty
("ITEM 1A. RISK FACTORS", True), # Two "sentences", but both are short
],
)
def test_is_possible_title(text, expected, monkeypatch):
monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
has_verb = partition.is_possible_title(text)
assert has_verb is expected
@pytest.mark.parametrize(
"text, expected",
[
("• This is a fine point!", True),
(" • This is a fine point!", True), # Has an extra space in front of the bullet
("‣ This is a fine point!", True),
(" This is a fine point!", True),
("⁌ This is a fine point!", True),
("⁍ This is a fine point!", True),
("∙ This is a fine point!", True),
("○ This is a fine point!", True),
("● This is a fine point!", True),
("◘ This is a fine point!", True),
("◦ This is a fine point!", True),
("☙ This is a fine point!", True),
("❥ This is a fine point!", True),
("❧ This is a fine point!", True),
("⦾ This is a fine point!", True),
("⦿ This is a fine point!", True),
(" This is a fine point!", True),
("* This is a fine point!", True),
("This is NOT a fine point!", False), # No bullet point
("I love morse code! ● ● ● --- ● ● ●", False), # Not at the beginning
],
)
def test_is_bulletized_text(text, expected):
assert partition.is_bulleted_text(text) is expected
@pytest.mark.parametrize(
"text, expected",
[
("Ask the teacher for an apple", True),
("Intellectual property", False),
],
)
def test_contains_verb(text, expected, monkeypatch):
monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
has_verb = partition.contains_verb(text)
assert has_verb is expected
@pytest.mark.parametrize(
"text, expected",
[
("Intellectual Property in the United States", True),
("Intellectual property helps incentivize innovation.", False),
("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
assert partition.exceeds_cap_ratio(text, threshold=0.3) is expected
def test_sentence_count(monkeypatch):
monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
text = "Hi my name is Matt. I work with Crag."
assert partition.sentence_count(text) == 2
def test_item_titles():
text = "ITEM 1(A). THIS IS A TITLE"
assert partition.sentence_count(text, 3) < 2
def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
partition_pdf_response = partition.partition_pdf(filename)
assert partition_pdf_response[0]["type"] == "Title"
assert (
partition_pdf_response[0]["text"]
== "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
)
# flake8: noqa
from unstructured.nlp.partition import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_title,
partition_pdf,
)

View File

@ -2,7 +2,7 @@ from typing import List, Tuple
import unstructured.nlp.tokenize as tokenize
from mock_nltk import mock_sent_tokenize, mock_word_tokenize
from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:

View File

@ -0,0 +1,10 @@
import unstructured.partition.pdf as pdf
def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
partition_pdf_response = pdf.partition_pdf(filename)
assert partition_pdf_response[0]["type"] == "Title"
assert (
partition_pdf_response[0]["text"]
== "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
)
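
The relocated test treats each element in the API response as a plain dict. A sketch of the shape the assertions above rely on (keys taken directly from the test):

```python
# Shape asserted above - each page element is a dict with "type" and "text" keys
first_element = {
    "type": "Title",
    "text": "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis",
}
```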

View File

@ -0,0 +1,135 @@
import pytest
import unstructured.partition.text_type as text_type
from test_unstructured.nlp.mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
@pytest.mark.parametrize(
"text, expected",
[
(
"ITEM 5(a).: MARKET FOR REGISTRANTS COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
"ISSUER PURCHASES OF EQUITY SECURITIES",
False,
),
(
"Item 5(a).: Market For Registrants Common Equity, Related Stockholder Matters and "
"Issuer Purchases of Equity Securities",
False,
),
(
"There is a market for registrants common equity, related stockholder matters and "
"issuer purchases of equity securities.",
True,
),
],
)
def test_headings_are_not_narrative_text(text, expected):
assert text_type.is_possible_narrative_text(text) == expected
@pytest.mark.parametrize(
"text, expected",
[
("Ask the teacher for an apple.", True),
("Ask Me About Intellectual Property", False), # Exceeds the cap threshold
("7", False), # Fails because it is numeric
("intellectual property", False), # Fails because it does not contain a verb
("", False), # Fails because it is empty
],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
assert has_verb is expected
@pytest.mark.parametrize(
"text, expected",
[
("Intellectual Property", True), # Fails because it exceeds the cap threshold
(
"Ask the teacher for an apple. You might a gold star.",
False,
), # Too many sentences
("7", False), # Fails because it is numeric
("", False), # Fails because it is empty
("ITEM 1A. RISK FACTORS", True), # Two "sentences", but both are short
],
)
def test_is_possible_title(text, expected, monkeypatch):
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
has_verb = text_type.is_possible_title(text)
assert has_verb is expected
@pytest.mark.parametrize(
"text, expected",
[
("• This is a fine point!", True),
(" • This is a fine point!", True), # Has an extra space in front of the bullet
("‣ This is a fine point!", True),
(" This is a fine point!", True),
("⁌ This is a fine point!", True),
("⁍ This is a fine point!", True),
("∙ This is a fine point!", True),
("○ This is a fine point!", True),
("● This is a fine point!", True),
("◘ This is a fine point!", True),
("◦ This is a fine point!", True),
("☙ This is a fine point!", True),
("❥ This is a fine point!", True),
("❧ This is a fine point!", True),
("⦾ This is a fine point!", True),
("⦿ This is a fine point!", True),
(" This is a fine point!", True),
("* This is a fine point!", True),
("This is NOT a fine point!", False), # No bullet point
("I love morse code! ● ● ● --- ● ● ●", False), # Not at the beginning
],
)
def test_is_bulletized_text(text, expected):
assert text_type.is_bulleted_text(text) is expected
@pytest.mark.parametrize(
"text, expected",
[
("Ask the teacher for an apple", True),
("Intellectual property", False),
],
)
def test_contains_verb(text, expected, monkeypatch):
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
has_verb = text_type.contains_verb(text)
assert has_verb is expected
@pytest.mark.parametrize(
"text, expected",
[
("Intellectual Property in the United States", True),
("Intellectual property helps incentivize innovation.", False),
("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
assert text_type.exceeds_cap_ratio(text, threshold=0.3) is expected
def test_sentence_count(monkeypatch):
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
text = "Hi my name is Matt. I work with Crag."
assert text_type.sentence_count(text) == 2
def test_item_titles():
text = "ITEM 1(A). THIS IS A TITLE"
assert text_type.sentence_count(text, 3) < 2

View File

@ -1 +1 @@
__version__ = "0.3.0-dev1" # pragma: no cover
__version__ = "0.3.0-dev2" # pragma: no cover

View File

@ -15,7 +15,7 @@ from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
from unstructured.documents.base import Page
from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
from unstructured.documents.xml import XMLDocument
from unstructured.nlp.partition import (
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_title,

View File

@ -1,160 +1,7 @@
"""partition.py implements logic for partitioning plain text documents into sections."""
import sys
import requests # type: ignore
if sys.version_info < (3, 8):
from typing_extensions import Final, List, Optional
else:
from typing import Final, List, Optional
from unstructured.cleaners.core import remove_punctuation
from unstructured.documents.elements import Element, Text
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
from unstructured.logger import get_logger
logger = get_logger()
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
"""Checks to see if the text passes all of the checks for a narrative text section."""
if len(text) == 0:
logger.debug("Not narrative. Text is empty.")
return False
if text.isnumeric():
logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
return False
if exceeds_cap_ratio(text, threshold=cap_threshold):
logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
return False
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
return False
return True
def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
"""Checks to see if the text passes all of the checks for a valid title."""
if len(text) == 0:
logger.debug("Not a title. Text is empty.")
return False
if text.isnumeric():
logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
return False
# NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
# that sometimes get tokenized as separate sentences due to the period, but are still
# valid titles
if sentence_count(text, min_length=sentence_min_length) > 1:
logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
return False
return True
def is_bulleted_text(text: str) -> bool:
"""Checks to see if the section of text is part of a bulleted list."""
return UNICODE_BULLETS_RE.match(text.strip()) is not None
def contains_verb(text: str) -> bool:
"""Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
that indicates that it is not narrative text."""
pos_tags = pos_tag(text)
for _, tag in pos_tags:
if tag in POS_VERB_TAGS:
return True
return False
def sentence_count(text: str, min_length: Optional[int] = None) -> int:
"""Checks the sentence count for a section of text. Titles should not be more than one
sentence.
Parameters
----------
text
The string of the text to count
min_length
The min number of words a section needs to be for it to be considered a sentence.
"""
sentences = sent_tokenize(text)
count = 0
for sentence in sentences:
sentence = remove_punctuation(sentence)
words = [word for word in word_tokenize(sentence) if word != "."]
if min_length and len(words) < min_length:
logger.debug(
f"Skipping sentence because does not exceed {min_length} word tokens\n"
f"{sentence}"
)
continue
count += 1
return count
def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
"""Checks the title ratio in a section of text. If a sufficient proportion of the text is
capitalized."""
# NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
# The assumption is that sections with multiple sentences are not titles.
if sentence_count(text, 3) > 1:
logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
return False
tokens = word_tokenize(text)
capitalized = sum([word.istitle() or word.isupper() for word in tokens])
ratio = capitalized / len(tokens)
return ratio > threshold
def partition_pdf(
filename: str = "",
file: Optional[bytes] = None,
url: str = "https://ml.unstructured.io/",
template: Optional[str] = "base-model",
token: Optional[str] = None,
) -> List[Element]:
"""Calls the document parsing API.
Parameters
----------
filename
A string defining the target filename path.
file
A file-like object as bytes --> open(filename, "rb").
template
A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
url
A string endpoint to self-host an inference API, if desired.
token
A string defining the authentication token for a self-host url.
"""
if not filename and not file:
raise FileNotFoundError("No filename nor file were specified")
healthcheck_response = requests.models.Response()
if not token:
healthcheck_response = requests.get(url=f"{url}healthcheck")
if healthcheck_response.status_code != 200:
return [Text(text="error: endpoint api healthcheck has failed!")]
url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
file_ = (filename, file if file else open(filename, "rb"))
response = requests.post(
url=url,
headers={"Authorization": f"Bearer {token}" if token else ""},
files={"file": file_},
)
if response.status_code == 200:
pages = response.json()["pages"]
return [element for page in pages for element in page["elements"]]
else:
return [Text(text=f"error: response status code = {response.status_code}")]
# flake8: noqa
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_title,
)

View File

@ -0,0 +1,55 @@
import requests # type: ignore
import sys
if sys.version_info < (3, 8):
from typing_extensions import List, Optional
else:
from typing import List, Optional
from unstructured.documents.elements import Element, Text
def partition_pdf(
filename: str = "",
file: Optional[bytes] = None,
url: str = "https://ml.unstructured.io/",
template: Optional[str] = "base-model",
token: Optional[str] = None,
) -> List[Element]:
"""Calls the document parsing API.
Parameters
----------
filename
A string defining the target filename path.
file
A file-like object as bytes --> open(filename, "rb").
template
A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
url
A string endpoint to self-host an inference API, if desired.
token
A string defining the authentication token for a self-host url.
"""
if not filename and not file:
raise FileNotFoundError("Neither filename nor file was specified")
healthcheck_response = requests.models.Response()
if not token:
healthcheck_response = requests.get(url=f"{url}healthcheck")
if healthcheck_response.status_code != 200:
return [Text(text="error: endpoint api healthcheck has failed!")]
url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
file_ = (filename, file if file else open(filename, "rb"))
response = requests.post(
url=url,
headers={"Authorization": f"Bearer {token}" if token else ""},
files={"file": file_},
)
if response.status_code == 200:
pages = response.json()["pages"]
return [element for page in pages for element in page["elements"]]
else:
return [Text(text=f"error: response status code = {response.status_code}")]

View File

@ -0,0 +1,113 @@
"""partition.py implements logic for partitioning plain text documents into sections."""
import sys
if sys.version_info < (3, 8):
from typing_extensions import Final, List, Optional
else:
from typing import Final, List, Optional
from unstructured.cleaners.core import remove_punctuation
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
from unstructured.logger import get_logger
logger = get_logger()
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
"""Checks to see if the text passes all of the checks for a narrative text section."""
if len(text) == 0:
logger.debug("Not narrative. Text is empty.")
return False
if text.isnumeric():
logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
return False
if exceeds_cap_ratio(text, threshold=cap_threshold):
logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
return False
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
return False
return True
def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
"""Checks to see if the text passes all of the checks for a valid title."""
if len(text) == 0:
logger.debug("Not a title. Text is empty.")
return False
if text.isnumeric():
logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
return False
# NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
# that sometimes get tokenized as separate sentences due to the period, but are still
# valid titles
if sentence_count(text, min_length=sentence_min_length) > 1:
logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
return False
return True
def is_bulleted_text(text: str) -> bool:
"""Checks to see if the section of text is part of a bulleted list."""
return UNICODE_BULLETS_RE.match(text.strip()) is not None
def contains_verb(text: str) -> bool:
"""Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
that indicates that it is not narrative text."""
pos_tags = pos_tag(text)
for _, tag in pos_tags:
if tag in POS_VERB_TAGS:
return True
return False
def sentence_count(text: str, min_length: Optional[int] = None) -> int:
"""Checks the sentence count for a section of text. Titles should not be more than one
sentence.
Parameters
----------
text
The string of the text to count
min_length
The min number of words a section needs to be for it to be considered a sentence.
"""
sentences = sent_tokenize(text)
count = 0
for sentence in sentences:
sentence = remove_punctuation(sentence)
words = [word for word in word_tokenize(sentence) if word != "."]
if min_length and len(words) < min_length:
logger.debug(
f"Skipping sentence because does not exceed {min_length} word tokens\n"
f"{sentence}"
)
continue
count += 1
return count
def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
"""Checks the title ratio in a section of text. If a sufficient proportion of the text is
capitalized."""
# NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
# The assumption is that sections with multiple sentences are not titles.
if sentence_count(text, 3) > 1:
logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
return False
tokens = word_tokenize(text)
capitalized = sum([word.istitle() or word.isupper() for word in tokens])
ratio = capitalized / len(tokens)
return ratio > threshold
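
A few usage sketches for the relocated text-type checks. Exact outputs depend on NLTK's tokenizer and POS tagger, so the annotated results are expectations rather than guarantees:

```python
from unstructured.partition.text_type import (
    contains_verb,
    exceeds_cap_ratio,
    sentence_count,
)

contains_verb("Ask the teacher for an apple.")  # True - "Ask" is tagged as a verb
contains_verb("Intellectual property")          # False - no verb tags

sentence_count("ITEM 1A. RISK FACTORS")                # 2 - the period splits the heading
sentence_count("ITEM 1A. RISK FACTORS", min_length=3)  # 0 - both fragments are under three words

exceeds_cap_ratio("LOOK AT ME I AM YELLING")  # True - every token is capitalized
```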