Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-06-27 02:30:08 +00:00)

chore: Reorganize partition bricks under partition directory (#76)

* move partition_pdf to partition folder
* move partition.py
* refactor partitioning bricks into partition directory
* import to nlp for backward compatibility
* update docs
* update version and bump changelog
* fix typo in changelog
* update readme reference

This commit is contained in:
parent 53fcf4e912
commit 08e091c5a9
@@ -1,6 +1,7 @@
-## 0.3.0-dev1
+## 0.3.0-dev2
 
 * Removing the local PDF parsing code and any dependencies and tests.
+* Reorganizes the staging bricks in the unstructured.partition module
 
 ## 0.2.6
@@ -91,7 +91,7 @@ titles and narrative text.
 You can use the following workflow to parse PDF documents.
 
 ```python
-from unstructured.nlp.partition import partition_pdf
+from unstructured.partition.pdf import partition_pdf
 
 elements = partition_pdf("example-docs/layout-parser-paper.pdf")
 print(elements)
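Each returned element is a plain dict with `"type"` (e.g. `"Title"`) and `"text"` keys, as the new `test_partition_pdf` below asserts. A short sketch of walking the result, assuming the hosted inference API is reachable:

```python
from unstructured.partition.pdf import partition_pdf

# Each element is a dict with "type" and "text" keys
elements = partition_pdf("example-docs/layout-parser-paper.pdf")
for element in elements:
    print(element["type"], "-", element["text"])
```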
@@ -14,6 +14,24 @@ The partitioning bricks in ``unstructured`` differentiate between different sections
 of text in a document. For example, the partitioning bricks can help distinguish between
 titles, narrative text, and tables.
 
+``partition_pdf``
+---------------------
+
+The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
+The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
+if desired.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.pdf import partition_pdf
+
+  # Returns a List[Element] present in the pages of the parsed pdf document
+  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
+
+
 ``is_bulleted_text``
 ----------------------
@@ -24,7 +42,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_bulleted_text
+  from unstructured.partition.text_type import is_bulleted_text
 
   # Returns True
   is_bulleted_text("● An excellent point!")
@@ -52,7 +70,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_possible_narrative_text
+  from unstructured.partition.text_type import is_possible_narrative_text
 
   # Returns True because the example passes all the checks
   example_1 = "Make sure you brush your teeth before you go to bed."
@@ -83,7 +101,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_possible_title
+  from unstructured.partition.text_type import is_possible_title
 
   # Returns True because the text passes all the tests
   example_2 = "ITEM 1A. RISK FACTORS"
@@ -116,7 +134,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import contains_verb
+  from unstructured.partition.text_type import contains_verb
 
   # Returns True because the text contains a verb
   example_1 = "I am going to run to the store to pick up some milk."
@@ -139,7 +157,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import sentence_count
+  from unstructured.partition.text_type import sentence_count
 
   example = "Look at me! I am a document with two sentences."
 
@@ -162,7 +180,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import exceeds_cap_ratio
+  from unstructured.partition.text_type import exceeds_cap_ratio
 
   # Returns True because the text is more than 30% caps
   example_1 = "LOOK AT ME I AM YELLING"
@@ -176,22 +194,6 @@ Examples:
 
   exceeds_cap_ratio(example_2, threshold=0.01)
 
-
-``partition_pdf``
----------------------
-
-The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
-The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
-if desired.
-
-Examples:
-
-.. code:: python
-
-  from unstructured.nlp.partition import partition_pdf
-
-  # Returns a List[Element] present in the pages of the parsed pdf document
-  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
 
 ########
 Cleaning
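The relocated ``partition_pdf`` docs note that ``url`` and ``token`` exist so users can self host an inference API. A minimal sketch of such a call (the endpoint and token below are hypothetical placeholders; the trailing slash matters because the implementation appends ``layout/pdf`` to ``url``):

.. code:: python

  from unstructured.partition.pdf import partition_pdf

  # Hypothetical self-hosted endpoint and token; the token is sent as a
  # Bearer credential in the Authorization header
  elements = partition_pdf(
      "example-docs/layout-parser-paper-fast.pdf",
      url="https://inference.internal.example/",
      token="<your-api-token>",
  )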
New file: test_unstructured/__init__.py (empty)
New file: test_unstructured/nlp/__init__.py (empty)
@@ -1,144 +1,7 @@
-import pytest
+# flake8: noqa
 
-from unstructured.nlp.partition import (
-    is_bulleted_text,
-    is_possible_narrative_text,
-    is_possible_title,
-    partition_pdf,
-)
+import unstructured.nlp.partition as partition
+
+from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
-[... the remaining removed tests reappear below as test_unstructured/partition/test_pdf.py and
-test_unstructured/partition/test_text_type.py, with references to the partition module renamed ...]
@@ -2,7 +2,7 @@ from typing import List, Tuple
 
 import unstructured.nlp.tokenize as tokenize
 
-from mock_nltk import mock_sent_tokenize, mock_word_tokenize
+from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 
 
 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
New file: test_unstructured/partition/test_pdf.py (10 lines)

import unstructured.partition.pdf as pdf


def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
    partition_pdf_response = pdf.partition_pdf(filename)
    assert partition_pdf_response[0]["type"] == "Title"
    assert (
        partition_pdf_response[0]["text"]
        == "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
New file: test_unstructured/partition/test_text_type.py (135 lines)

import pytest

import unstructured.partition.text_type as text_type

from test_unstructured.nlp.mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize


@pytest.mark.parametrize(
    "text, expected",
    [
        (
            "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
            "ISSUER PURCHASES OF EQUITY SECURITIES",
            False,
        ),
        (
            "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
            "Issuer Purchases of Equity Securities",
            False,
        ),
        (
            "There is a market for registrant’s common equity, related stockholder matters and "
            "issuer purchases of equity securities.",
            True,
        ),
    ],
)
def test_headings_are_not_narrative_text(text, expected):
    assert text_type.is_possible_narrative_text(text) == expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("Ask the teacher for an apple.", True),
        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
        ("7", False),  # Fails because it is numeric
        ("intellectual property", False),  # Fails because it does not contain a verb
        ("", False),  # Fails because it is empty
    ],
)
def test_is_possible_narrative_text(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
    assert has_verb is expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("Intellectual Property", True),  # Fails because it exceeds the cap threshold
        (
            "Ask the teacher for an apple. You might a gold star.",
            False,
        ),  # Too many sentences
        ("7", False),  # Fails because it is numeric
        ("", False),  # Fails because it is empty
        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
    ],
)
def test_is_possible_title(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    has_verb = text_type.is_possible_title(text)
    assert has_verb is expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("• This is a fine point!", True),
        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
        ("‣ This is a fine point!", True),
        ("⁃ This is a fine point!", True),
        ("⁌ This is a fine point!", True),
        ("⁍ This is a fine point!", True),
        ("∙ This is a fine point!", True),
        ("○ This is a fine point!", True),
        ("● This is a fine point!", True),
        ("◘ This is a fine point!", True),
        ("◦ This is a fine point!", True),
        ("☙ This is a fine point!", True),
        ("❥ This is a fine point!", True),
        ("❧ This is a fine point!", True),
        ("⦾ This is a fine point!", True),
        ("⦿ This is a fine point!", True),
        (" This is a fine point!", True),
        ("* This is a fine point!", True),
        ("This is NOT a fine point!", False),  # No bullet point
        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
    ],
)
def test_is_bulletized_text(text, expected):
    assert text_type.is_bulleted_text(text) is expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("Ask the teacher for an apple", True),
        ("Intellectual property", False),
    ],
)
def test_contains_verb(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
    has_verb = text_type.contains_verb(text)
    assert has_verb is expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
    ],
)
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    assert text_type.exceeds_cap_ratio(text, threshold=0.3) is expected


def test_sentence_count(monkeypatch):
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    text = "Hi my name is Matt. I work with Crag."
    assert text_type.sentence_count(text) == 2


def test_item_titles():
    text = "ITEM 1(A). THIS IS A TITLE"
    assert text_type.sentence_count(text, 3) < 2
@@ -1 +1 @@
-__version__ = "0.3.0-dev1"  # pragma: no cover
+__version__ = "0.3.0-dev2"  # pragma: no cover
@@ -15,7 +15,7 @@ from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
 from unstructured.documents.base import Page
 from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
 from unstructured.documents.xml import XMLDocument
-from unstructured.nlp.partition import (
+from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,
     is_possible_title,
@@ -1,160 +1,7 @@
-"""partition.py implements logic for partitioning plain text documents into sections."""
-import sys
-
-import requests  # type: ignore
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Final, List, Optional
-else:
-    from typing import Final, List, Optional
-
-from unstructured.cleaners.core import remove_punctuation
-from unstructured.documents.elements import Element, Text
-from unstructured.nlp.patterns import UNICODE_BULLETS_RE
-from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
-from unstructured.logger import get_logger
-
-logger = get_logger()
-
-POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
-[... the removed bodies of is_possible_narrative_text, is_possible_title, is_bulleted_text,
-contains_verb, sentence_count, exceeds_cap_ratio, and partition_pdf reappear unchanged in
-unstructured/partition/pdf.py and unstructured/partition/text_type.py below ...]
+# flake8: noqa
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.text_type import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+)
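A quick sketch of what this shim buys, assuming the package layout introduced by this commit: the old `unstructured.nlp.partition` import path re-exports the very objects that now live under `unstructured.partition`.

```python
# Old import path still works through the backward-compatibility shim
from unstructured.nlp.partition import is_bulleted_text

# New canonical location
from unstructured.partition.text_type import is_bulleted_text as canonical

# The shim re-exports the same function object
assert is_bulleted_text is canonical
```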
New file: unstructured/partition/pdf.py (55 lines)

import requests  # type: ignore

import sys

if sys.version_info < (3, 8):
    from typing_extensions import List, Optional
else:
    from typing import List, Optional

from unstructured.documents.elements import Element, Text


def partition_pdf(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/",
    template: Optional[str] = "base-model",
    token: Optional[str] = None,
) -> List[Element]:
    """Calls the document parsing API.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    template
        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
    url
        A string endpoint to self-host an inference API, if desired.
    token
        A string defining the authentication token for a self-host url.
    """
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")

    healthcheck_response = requests.models.Response()
    if not token:
        healthcheck_response = requests.get(url=f"{url}healthcheck")

    if healthcheck_response.status_code != 200:
        return [Text(text="error: endpoint api healthcheck has failed!")]

    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
    file_ = (filename, file if file else open(filename, "rb"))
    response = requests.post(
        url=url,
        headers={"Authorization": f"Bearer {token}" if token else ""},
        files={"file": file_},
    )
    if response.status_code == 200:
        pages = response.json()["pages"]
        return [element for page in pages for element in page["elements"]]
    else:
        return [Text(text=f"error: response status code = {response.status_code}")]
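A usage sketch for the `file` parameter, assuming the API endpoint is reachable: the `(filename, file)` tuple the function builds is what gets posted, so an already-open binary handle can be passed alongside the path.

```python
from unstructured.partition.pdf import partition_pdf

# Passing an open binary handle; the filename is still used as the
# upload name in the multipart request
with open("example-docs/layout-parser-paper-fast.pdf", "rb") as f:
    elements = partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf", file=f)
```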
New file: unstructured/partition/text_type.py (113 lines)

"""partition.py implements logic for partitioning plain text documents into sections."""
import sys

if sys.version_info < (3, 8):
    from typing_extensions import Final, List, Optional
else:
    from typing import Final, List, Optional

from unstructured.cleaners.core import remove_punctuation
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
from unstructured.logger import get_logger

logger = get_logger()

POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]


def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
    """Checks to see if the text passes all of the checks for a narrative text section."""
    if len(text) == 0:
        logger.debug("Not narrative. Text is empty.")
        return False

    if text.isnumeric():
        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
        return False

    if exceeds_cap_ratio(text, threshold=cap_threshold):
        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
        return False

    if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
        return False

    return True


def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
    """Checks to see if the text passes all of the checks for a valid title."""
    if len(text) == 0:
        logger.debug("Not a title. Text is empty.")
        return False

    if text.isnumeric():
        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
        return False

    # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
    # that sometimes get tokenized as separate sentences due to the period, but are still
    # valid titles
    if sentence_count(text, min_length=sentence_min_length) > 1:
        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
        return False

    return True


def is_bulleted_text(text: str) -> bool:
    """Checks to see if the section of text is part of a bulleted list."""
    return UNICODE_BULLETS_RE.match(text.strip()) is not None


def contains_verb(text: str) -> bool:
    """Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
    that indicates that it is not narrative text."""
    pos_tags = pos_tag(text)
    for _, tag in pos_tags:
        if tag in POS_VERB_TAGS:
            return True
    return False


def sentence_count(text: str, min_length: Optional[int] = None) -> int:
    """Checks the sentence count for a section of text. Titles should not be more than one
    sentence.

    Parameters
    ----------
    text
        The string of the text to count
    min_length
        The min number of words a section needs to be for it to be considered a sentence.
    """
    sentences = sent_tokenize(text)
    count = 0
    for sentence in sentences:
        sentence = remove_punctuation(sentence)
        words = [word for word in word_tokenize(sentence) if word != "."]
        if min_length and len(words) < min_length:
            logger.debug(
                f"Skipping sentence because does not exceed {min_length} word tokens\n"
                f"{sentence}"
            )
            continue
        count += 1
    return count


def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
    """Checks the title ratio in a section of text. If a sufficient proportion of the text is
    capitalized."""
    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
    # The assumption is that sections with multiple sentences are not titles.
    if sentence_count(text, 3) > 1:
        logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
        return False

    tokens = word_tokenize(text)
    capitalized = sum([word.istitle() or word.isupper() for word in tokens])
    ratio = capitalized / len(tokens)
    return ratio > threshold
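As a closing usage sketch of the `min_length` parameter (mirroring `test_item_titles` above, and requiring the NLTK tokenizers that `unstructured.nlp.tokenize` wraps):

```python
from unstructured.partition.text_type import sentence_count

text = "ITEM 1(A). THIS IS A TITLE"
# Each tokenized "sentence" is under three word tokens once punctuation
# is stripped, so min_length=3 filters the short fragments out
assert sentence_count(text, min_length=3) < 2
```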