From 08e091c5a9061ad7653fbcd2c318b08c585d7b21 Mon Sep 17 00:00:00 2001
From: Matt Robinson
Date: Mon, 21 Nov 2022 17:27:23 -0500
Subject: [PATCH] chore: Reorganize partition bricks under partition directory
 (#76)

* move partition_pdf to partition folder

* move partition.py

* refactor partitioning bricks into partition directory

* import to nlp for backward compatibility

* update docs

* update version and bump changelog

* fix typo in changelog

* update readme reference
---
 CHANGELOG.md                                  |   3 +-
 README.md                                     |   2 +-
 docs/source/bricks.rst                        |  46 ++---
 test_unstructured/__init__.py                 |   0
 test_unstructured/nlp/__init__.py             |   0
 test_unstructured/nlp/test_partition.py       | 149 +---------------
 test_unstructured/nlp/test_tokenize.py        |   2 +-
 test_unstructured/partition/test_pdf.py       |  10 ++
 test_unstructured/partition/test_text_type.py | 135 ++++++++++++++
 unstructured/__version__.py                   |   2 +-
 unstructured/documents/html.py                |   2 +-
 unstructured/nlp/partition.py                 | 167 +-----------------
 unstructured/partition/pdf.py                 |  55 ++++++
 unstructured/partition/text_type.py           | 113 ++++++++++++
 14 files changed, 356 insertions(+), 330 deletions(-)
 create mode 100644 test_unstructured/__init__.py
 create mode 100644 test_unstructured/nlp/__init__.py
 create mode 100644 test_unstructured/partition/test_pdf.py
 create mode 100644 test_unstructured/partition/test_text_type.py
 create mode 100644 unstructured/partition/pdf.py
 create mode 100644 unstructured/partition/text_type.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index daebd51d8..f17315c8f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.3.0-dev1
+## 0.3.0-dev2
 
 * Removing the local PDF parsing code and any dependencies and tests.
+* Reorganizes the partitioning bricks in the unstructured.partition module
 
 ## 0.2.6
 
diff --git a/README.md b/README.md
index 5585cf310..8ab87165e 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ titles and narrative text.
 You can use the following workflow to parse PDF documents.
 
 ```python
-from unstructured.nlp.partition import partition_pdf
+from unstructured.partition.pdf import partition_pdf
 
 elements = partition_pdf("example-docs/layout-parser-paper.pdf")
 print(elements)
 ```
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
index ed8db280e..e19c3e87e 100644
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@@ -14,6 +14,24 @@ The partitioning bricks in ``unstructured`` differentiate between different sect
 of text in a document. For example, the partitioning bricks can help distinguish between
 titles, narrative text, and tables.
 
+
+``partition_pdf``
+---------------------
+
+The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
+The ``url`` and ``token`` parameters are intended to let users self-host an inference API,
+if desired.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.pdf import partition_pdf
+
+  # Returns a List[Element] of the elements detected in the parsed PDF document
+  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
+
+
 ``is_bulleted_text``
 ----------------------
 
@@ -24,7 +42,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_bulleted_text
+  from unstructured.partition.text_type import is_bulleted_text
 
   # Returns True
   is_bulleted_text("● An excellent point!")
@@ -52,7 +70,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_possible_narrative_text
+  from unstructured.partition.text_type import is_possible_narrative_text
 
   # Returns True because the example passes all the checks
   example_1 = "Make sure you brush your teeth before you go to bed."
@@ -83,7 +101,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_possible_title
+  from unstructured.partition.text_type import is_possible_title
 
   # Returns True because the text passes all the tests
   example_2 = "ITEM 1A. RISK FACTORS"
@@ -116,7 +134,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import contains_verb
+  from unstructured.partition.text_type import contains_verb
 
   # Returns True because the text contains a verb
   example_1 = "I am going to run to the store to pick up some milk."
@@ -139,7 +157,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import sentence_count
+  from unstructured.partition.text_type import sentence_count
 
   example = "Look at me! I am a document with two sentences."
 
@@ -162,7 +180,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import exceeds_cap_ratio
+  from unstructured.partition.text_type import exceeds_cap_ratio
 
   # Returns True because the text is more than 30% caps
   example_1 = "LOOK AT ME I AM YELLING"
@@ -174,23 +192,7 @@ Examples:
 
   # Returns True because the text is more than 1% caps
   exceeds_cap_ratio(example_2, threshold=0.01)
-
-
-``partition_pdf``
----------------------
-The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
-The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
-if desired.
-
-Examples:
-
-.. code:: python
-
-  from unstructured.nlp.partition import partition_pdf
-
-  # Returns a List[Element] present in the pages of the parsed pdf document
-  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
 
 
 ########
diff --git a/test_unstructured/__init__.py b/test_unstructured/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test_unstructured/nlp/__init__.py b/test_unstructured/nlp/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test_unstructured/nlp/test_partition.py b/test_unstructured/nlp/test_partition.py
index 0f9034902..2382933e5 100644
--- a/test_unstructured/nlp/test_partition.py
+++ b/test_unstructured/nlp/test_partition.py
@@ -1,144 +1,7 @@
-import pytest
-
-import unstructured.nlp.partition as partition
-
-from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        (
-            "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
-            "ISSUER PURCHASES OF EQUITY SECURITIES",
-            False,
-        ),
-        (
-            "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
-            "Issuer Purchases of Equity Securities",
-            False,
-        ),
-        (
-            "There is a market for registrant’s common equity, related stockholder matters and "
-            "issuer purchases of equity securities.",
-            True,
-        ),
-    ],
+# flake8: noqa
+from unstructured.nlp.partition import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+    partition_pdf,
 )
-def test_headings_are_not_narrative_text(text, expected):
-    assert partition.is_possible_narrative_text(text) == expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Ask the teacher for an apple.", True),
-        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
-        ("7", False),  # Fails because it is numeric
-        ("intellectual property", False),  # Fails because it does not contain a verb
-        ("", False),  # Fails because it is empty
-    ],
-)
-def test_is_possible_narrative_text(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    has_verb = partition.is_possible_narrative_text(text, cap_threshold=0.3)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Intellectual Property", True),  # Fails because it exceeds the cap threshold
-        (
-            "Ask the teacher for an apple. You might a gold star.",
-            False,
-        ),  # Too many sentences
-        ("7", False),  # Fails because it is numeric
-        ("", False),  # Fails because it is empty
-        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
-    ],
-)
-def test_is_possible_title(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    has_verb = partition.is_possible_title(text)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("• This is a fine point!", True),
-        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
-        ("‣ This is a fine point!", True),
-        ("⁃ This is a fine point!", True),
-        ("⁌ This is a fine point!", True),
-        ("⁍ This is a fine point!", True),
-        ("∙ This is a fine point!", True),
-        ("○ This is a fine point!", True),
-        ("● This is a fine point!", True),
-        ("◘ This is a fine point!", True),
-        ("◦ This is a fine point!", True),
-        ("☙ This is a fine point!", True),
-        ("❥ This is a fine point!", True),
-        ("❧ This is a fine point!", True),
-        ("⦾ This is a fine point!", True),
-        ("⦿ This is a fine point!", True),
-        (" This is a fine point!", True),
-        ("* This is a fine point!", True),
-        ("This is NOT a fine point!", False),  # No bullet point
-        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
-    ],
-)
-def test_is_bulletized_text(text, expected):
-    assert partition.is_bulleted_text(text) is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Ask the teacher for an apple", True),
-        ("Intellectual property", False),
-    ],
-)
-def test_contains_verb(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
-    has_verb = partition.contains_verb(text)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Intellectual Property in the United States", True),
-        ("Intellectual property helps incentivize innovation.", False),
-        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
-    ],
-)
-def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    assert partition.exceeds_cap_ratio(text, threshold=0.3) is expected
-
-
-def test_sentence_count(monkeypatch):
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    text = "Hi my name is Matt. I work with Crag."
-    assert partition.sentence_count(text) == 2
-
-
-def test_item_titles():
-    text = "ITEM 1(A). THIS IS A TITLE"
-    assert partition.sentence_count(text, 3) < 2
-
-
-def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
-    partition_pdf_response = partition.partition_pdf(filename)
-    assert partition_pdf_response[0]["type"] == "Title"
-    assert (
-        partition_pdf_response[0]["text"]
-        == "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
-    )
diff --git a/test_unstructured/nlp/test_tokenize.py b/test_unstructured/nlp/test_tokenize.py
index 88b8f9686..42ccde3cd 100644
--- a/test_unstructured/nlp/test_tokenize.py
+++ b/test_unstructured/nlp/test_tokenize.py
@@ -2,7 +2,7 @@ from typing import List, Tuple
 
 import unstructured.nlp.tokenize as tokenize
 
-from mock_nltk import mock_sent_tokenize, mock_word_tokenize
+from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 
 
 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py
new file mode 100644
index 000000000..56c3a2c13
--- /dev/null
+++ b/test_unstructured/partition/test_pdf.py
@@ -0,0 +1,10 @@
+import unstructured.partition.pdf as pdf
+
+
+def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
+    partition_pdf_response = pdf.partition_pdf(filename)
+    assert partition_pdf_response[0]["type"] == "Title"
+    assert (
+        partition_pdf_response[0]["text"]
+        == "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    )
diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py
new file mode 100644
index 000000000..060251c2b
--- /dev/null
+++ b/test_unstructured/partition/test_text_type.py
@@ -0,0 +1,135 @@
+import pytest
+
+import unstructured.partition.text_type as text_type
+
+from test_unstructured.nlp.mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        (
+            "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
+            "ISSUER PURCHASES OF EQUITY SECURITIES",
+            False,
+        ),
+        (
+            "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
+            "Issuer Purchases of Equity Securities",
+            False,
+        ),
+        (
+            "There is a market for registrant’s common equity, related stockholder matters and "
+            "issuer purchases of equity securities.",
+            True,
+        ),
+    ],
+)
+def test_headings_are_not_narrative_text(text, expected):
+    assert text_type.is_possible_narrative_text(text) == expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Ask the teacher for an apple.", True),
+        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
+        ("7", False),  # Fails because it is numeric
+        ("intellectual property", False),  # Fails because it does not contain a verb
+        ("", False),  # Fails because it is empty
+    ],
+)
+def test_is_possible_narrative_text(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Intellectual Property", True),  # Passes even though it exceeds the cap threshold
+        (
+            "Ask the teacher for an apple. You might get a gold star.",
+            False,
+        ),  # Too many sentences
+        ("7", False),  # Fails because it is numeric
+        ("", False),  # Fails because it is empty
+        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
+    ],
+)
+def test_is_possible_title(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    has_verb = text_type.is_possible_title(text)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("• This is a fine point!", True),
+        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
+        ("‣ This is a fine point!", True),
+        ("⁃ This is a fine point!", True),
+        ("⁌ This is a fine point!", True),
+        ("⁍ This is a fine point!", True),
+        ("∙ This is a fine point!", True),
+        ("○ This is a fine point!", True),
+        ("● This is a fine point!", True),
+        ("◘ This is a fine point!", True),
+        ("◦ This is a fine point!", True),
+        ("☙ This is a fine point!", True),
+        ("❥ This is a fine point!", True),
+        ("❧ This is a fine point!", True),
+        ("⦾ This is a fine point!", True),
+        ("⦿ This is a fine point!", True),
+        (" This is a fine point!", True),
+        ("* This is a fine point!", True),
+        ("This is NOT a fine point!", False),  # No bullet point
+        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
+    ],
+)
+def test_is_bulletized_text(text, expected):
+    assert text_type.is_bulleted_text(text) is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Ask the teacher for an apple", True),
+        ("Intellectual property", False),
+    ],
+)
+def test_contains_verb(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
+    has_verb = text_type.contains_verb(text)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Intellectual Property in the United States", True),
+        ("Intellectual property helps incentivize innovation.", False),
+        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
+    ],
+)
+def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    assert text_type.exceeds_cap_ratio(text, threshold=0.3) is expected
+
+
+def test_sentence_count(monkeypatch):
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    text = "Hi my name is Matt. I work with Crag."
+    assert text_type.sentence_count(text) == 2
+
+
+def test_item_titles():
+    text = "ITEM 1(A). THIS IS A TITLE"
+    assert text_type.sentence_count(text, 3) < 2
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 086e68431..73c410303 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.0-dev1"  # pragma: no cover
+__version__ = "0.3.0-dev2"  # pragma: no cover
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
index 5abe8198c..a8f5091c3 100644
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -15,7 +15,7 @@ from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
 from unstructured.documents.base import Page
 from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
 from unstructured.documents.xml import XMLDocument
-from unstructured.nlp.partition import (
+from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,
     is_possible_title,
diff --git a/unstructured/nlp/partition.py b/unstructured/nlp/partition.py
index 0649452f6..3ffa9919c 100644
--- a/unstructured/nlp/partition.py
+++ b/unstructured/nlp/partition.py
@@ -1,160 +1,7 @@
-"""partition.py implements logic for partitioning plain text documents into sections."""
-import sys
-import requests  # type: ignore
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Final, List, Optional
-else:
-    from typing import Final, List, Optional
-
-from unstructured.cleaners.core import remove_punctuation
-from unstructured.documents.elements import Element, Text
-from unstructured.nlp.patterns import UNICODE_BULLETS_RE
-from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
-from unstructured.logger import get_logger
-
-logger = get_logger()
-
-POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
-
-
-def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
-    """Checks to see if the text passes all of the checks for a narrative text section."""
-    if len(text) == 0:
-        logger.debug("Not narrative. Text is empty.")
-        return False
-
-    if text.isnumeric():
-        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
-        return False
-
-    if exceeds_cap_ratio(text, threshold=cap_threshold):
-        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
-        return False
-
-    if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
-        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
-        return False
-
-    return True
-
-
-def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
-    """Checks to see if the text passes all of the checks for a valid title."""
-    if len(text) == 0:
-        logger.debug("Not a title. Text is empty.")
-        return False
-
-    if text.isnumeric():
-        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
-        return False
-
-    # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
-    # that sometimes get tokenized as separate sentences due to the period, but are still
-    # valid titles
-    if sentence_count(text, min_length=sentence_min_length) > 1:
-        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
-        return False
-
-    return True
-
-
-def is_bulleted_text(text: str) -> bool:
-    """Checks to see if the section of text is part of a bulleted list."""
-    return UNICODE_BULLETS_RE.match(text.strip()) is not None
-
-
-def contains_verb(text: str) -> bool:
-    """Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
-    that indicates that it is not narrative text."""
-    pos_tags = pos_tag(text)
-    for _, tag in pos_tags:
-        if tag in POS_VERB_TAGS:
-            return True
-    return False
-
-
-def sentence_count(text: str, min_length: Optional[int] = None) -> int:
-    """Checks the sentence count for a section of text. Titles should not be more than one
-    sentence.
-
-    Parameters
-    ----------
-    text
-        The string of the text to count
-    min_length
-        The min number of words a section needs to be for it to be considered a sentence.
-    """
-    sentences = sent_tokenize(text)
-    count = 0
-    for sentence in sentences:
-        sentence = remove_punctuation(sentence)
-        words = [word for word in word_tokenize(sentence) if word != "."]
-        if min_length and len(words) < min_length:
-            logger.debug(
-                f"Skipping sentence because does not exceed {min_length} word tokens\n"
-                f"{sentence}"
-            )
-            continue
-        count += 1
-    return count
-
-
-def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
-    """Checks the title ratio in a section of text. If a sufficient proportion of the text is
-    capitalized."""
-    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
-    # The assumption is that sections with multiple sentences are not titles.
-    if sentence_count(text, 3) > 1:
-        logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
-        return False
-
-    tokens = word_tokenize(text)
-    capitalized = sum([word.istitle() or word.isupper() for word in tokens])
-    ratio = capitalized / len(tokens)
-    return ratio > threshold
-
-
-def partition_pdf(
-    filename: str = "",
-    file: Optional[bytes] = None,
-    url: str = "https://ml.unstructured.io/",
-    template: Optional[str] = "base-model",
-    token: Optional[str] = None,
-) -> List[Element]:
-    """Calls the document parsing API.
-    Parameters
-    ----------
-    filename
-        A string defining the target filename path.
-    file
-        A file-like object as bytes --> open(filename, "rb").
-    template
-        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
-    url
-        A string endpoint to self-host an inference API, if desired.
-    token
-        A string defining the authentication token for a self-host url.
-    """
-    if not filename and not file:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    healthcheck_response = requests.models.Response()
-    if not token:
-        healthcheck_response = requests.get(url=f"{url}healthcheck")
-
-    if healthcheck_response.status_code != 200:
-        return [Text(text="error: endpoint api healthcheck has failed!")]
-
-    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
-    file_ = (filename, file if file else open(filename, "rb"))
-    response = requests.post(
-        url=url,
-        headers={"Authorization": f"Bearer {token}" if token else ""},
-        files={"file": file_},
-    )
-    if response.status_code == 200:
-        pages = response.json()["pages"]
-        return [element for page in pages for element in page["elements"]]
-    else:
-        return [Text(text=f"error: response status code = {response.status_code}")]
+# flake8: noqa
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.text_type import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+)
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
new file mode 100644
index 000000000..aa4724e1d
--- /dev/null
+++ b/unstructured/partition/pdf.py
@@ -0,0 +1,55 @@
+import sys
+
+import requests  # type: ignore
+
+if sys.version_info < (3, 8):
+    from typing_extensions import List, Optional
+else:
+    from typing import List, Optional
+
+from unstructured.documents.elements import Element, Text
+
+
+def partition_pdf(
+    filename: str = "",
+    file: Optional[bytes] = None,
+    url: str = "https://ml.unstructured.io/",
+    template: Optional[str] = "base-model",
+    token: Optional[str] = None,
+) -> List[Element]:
+    """Calls the document parsing API.
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object as bytes --> open(filename, "rb").
+    template
+        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
+    url
+        A string endpoint to self-host an inference API, if desired.
+    token
+        A string defining the authentication token for a self-host url.
+ """ + if not filename and not file: + raise FileNotFoundError("No filename nor file were specified") + + healthcheck_response = requests.models.Response() + if not token: + healthcheck_response = requests.get(url=f"{url}healthcheck") + + if healthcheck_response.status_code != 200: + return [Text(text="error: endpoint api healthcheck has failed!")] + + url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}" + file_ = (filename, file if file else open(filename, "rb")) + response = requests.post( + url=url, + headers={"Authorization": f"Bearer {token}" if token else ""}, + files={"file": file_}, + ) + if response.status_code == 200: + pages = response.json()["pages"] + return [element for page in pages for element in page["elements"]] + else: + return [Text(text=f"error: response status code = {response.status_code}")] diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py new file mode 100644 index 000000000..ea88d3889 --- /dev/null +++ b/unstructured/partition/text_type.py @@ -0,0 +1,113 @@ +"""partition.py implements logic for partitioning plain text documents into sections.""" +import sys + +if sys.version_info < (3, 8): + from typing_extensions import Final, List, Optional +else: + from typing import Final, List, Optional + +from unstructured.cleaners.core import remove_punctuation +from unstructured.nlp.patterns import UNICODE_BULLETS_RE +from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize +from unstructured.logger import get_logger + +logger = get_logger() + +POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"] + + +def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool: + """Checks to see if the text passes all of the checks for a narrative text section.""" + if len(text) == 0: + logger.debug("Not narrative. Text is empty.") + return False + + if text.isnumeric(): + logger.debug(f"Not narrative. Text is all numeric:\n\n{text}") + return False + + if exceeds_cap_ratio(text, threshold=cap_threshold): + logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}") + return False + + if (sentence_count(text, 3) < 2) and (not contains_verb(text)): + logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}") + return False + + return True + + +def is_possible_title(text: str, sentence_min_length: int = 5) -> bool: + """Checks to see if the text passes all of the checks for a valid title.""" + if len(text) == 0: + logger.debug("Not a title. Text is empty.") + return False + + if text.isnumeric(): + logger.debug(f"Not a title. Text is all numeric:\n\n{text}") + return False + + # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS" + # that sometimes get tokenized as separate sentences due to the period, but are still + # valid titles + if sentence_count(text, min_length=sentence_min_length) > 1: + logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}") + return False + + return True + + +def is_bulleted_text(text: str) -> bool: + """Checks to see if the section of text is part of a bulleted list.""" + return UNICODE_BULLETS_RE.match(text.strip()) is not None + + +def contains_verb(text: str) -> bool: + """Use a POS tagger to check if a segment contains verbs. 
If the section does not have verbs, + that indicates that it is not narrative text.""" + pos_tags = pos_tag(text) + for _, tag in pos_tags: + if tag in POS_VERB_TAGS: + return True + return False + + +def sentence_count(text: str, min_length: Optional[int] = None) -> int: + """Checks the sentence count for a section of text. Titles should not be more than one + sentence. + + Parameters + ---------- + text + The string of the text to count + min_length + The min number of words a section needs to be for it to be considered a sentence. + """ + sentences = sent_tokenize(text) + count = 0 + for sentence in sentences: + sentence = remove_punctuation(sentence) + words = [word for word in word_tokenize(sentence) if word != "."] + if min_length and len(words) < min_length: + logger.debug( + f"Skipping sentence because does not exceed {min_length} word tokens\n" + f"{sentence}" + ) + continue + count += 1 + return count + + +def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool: + """Checks the title ratio in a section of text. If a sufficient proportion of the text is + capitalized.""" + # NOTE(robinson) - Currently limiting this to only sections of text with one sentence. + # The assumption is that sections with multiple sentences are not titles. + if sentence_count(text, 3) > 1: + logger.debug(f"Text does not contain multiple sentences:\n\n{text}") + return False + + tokens = word_tokenize(text) + capitalized = sum([word.istitle() or word.isupper() for word in tokens]) + ratio = capitalized / len(tokens) + return ratio > threshold
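
For reference, a minimal usage sketch of the module layout this patch introduces. It is a sketch under stated assumptions rather than part of the patch: it assumes the repo's `example-docs` fixture is present and the default hosted inference API is reachable, and the dict-shaped elements (with `"type"`/`"text"` keys) follow the tests above.

```python
# Minimal sketch of the import paths after this reorganization.
from unstructured.partition.pdf import partition_pdf            # new canonical location
from unstructured.partition.text_type import is_possible_title  # relocated text-type heuristics

# The legacy path keeps working through the re-exports this patch
# adds to unstructured/nlp/partition.py:
from unstructured.nlp.partition import partition_pdf as legacy_partition_pdf

assert legacy_partition_pdf is partition_pdf  # same function object, just re-exported

# Per test_partition_pdf above, the first element of the example doc is its title.
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
print(elements[0]["type"], "-", elements[0]["text"])

# The text-type heuristics operate on plain strings (see test_is_possible_title).
print(is_possible_title("ITEM 1A. RISK FACTORS"))  # True
```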