From 08e091c5a9061ad7653fbcd2c318b08c585d7b21 Mon Sep 17 00:00:00 2001
From: Matt Robinson
Date: Mon, 21 Nov 2022 17:27:23 -0500
Subject: [PATCH] chore: Reorganize partition bricks under partition directory
 (#76)

* move partition_pdf to partition folder

* move partition.py

* refactor partitioning bricks into partition directory

* import to nlp for backward compatibility

* update docs

* update version and bump changelog

* fix typo in changelog

* update readme reference
---
 CHANGELOG.md                                  |   3 +-
 README.md                                     |   2 +-
 docs/source/bricks.rst                        |  46 ++---
 test_unstructured/__init__.py                 |   0
 test_unstructured/nlp/__init__.py             |   0
 test_unstructured/nlp/test_partition.py       | 149 +---------------
 test_unstructured/nlp/test_tokenize.py        |   2 +-
 test_unstructured/partition/test_pdf.py       |  10 ++
 test_unstructured/partition/test_text_type.py | 135 ++++++++++++++
 unstructured/__version__.py                   |   2 +-
 unstructured/documents/html.py                |   2 +-
 unstructured/nlp/partition.py                 | 167 +-----------------
 unstructured/partition/pdf.py                 |  55 ++++++
 unstructured/partition/text_type.py           | 113 ++++++++++++
 14 files changed, 356 insertions(+), 330 deletions(-)
 create mode 100644 test_unstructured/__init__.py
 create mode 100644 test_unstructured/nlp/__init__.py
 create mode 100644 test_unstructured/partition/test_pdf.py
 create mode 100644 test_unstructured/partition/test_text_type.py
 create mode 100644 unstructured/partition/pdf.py
 create mode 100644 unstructured/partition/text_type.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index daebd51d8..f17315c8f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.3.0-dev1
+## 0.3.0-dev2
 
 * Removing the local PDF parsing code and any dependencies and tests.
+* Reorganizes the partitioning bricks in the unstructured.partition module
 
 ## 0.2.6
 
diff --git a/README.md b/README.md
index 5585cf310..8ab87165e 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ titles and narrative text.
 You can use the following workflow to parse PDF documents.
 
 ```python
-from unstructured.nlp.partition import partition_pdf
+from unstructured.partition.pdf import partition_pdf
 
 elements = partition_pdf("example-docs/layout-parser-paper.pdf")
 print(elements)
 ```
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
index ed8db280e..e19c3e87e 100644
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@@ -14,6 +14,24 @@ The partitioning bricks in ``unstructured`` differentiate between different sect
 of text in a document. For example, the partitioning bricks can help distinguish between
 titles, narrative text, and tables.
 
+
+``partition_pdf``
+---------------------
+
+The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
+The ``url`` and ``token`` parameters are intended to let users self-host an inference API,
+if desired.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.pdf import partition_pdf
+
+  # Returns a List[Element] of the elements detected in the parsed PDF document
+  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
+
+
 ``is_bulleted_text``
 ----------------------
 
@@ -24,7 +42,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_bulleted_text
+  from unstructured.partition.text_type import is_bulleted_text
 
   # Returns True
   is_bulleted_text("● An excellent point!")
@@ -52,7 +70,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_possible_narrative_text
+  from unstructured.partition.text_type import is_possible_narrative_text
 
   # Returns True because the example passes all the checks
   example_1 = "Make sure you brush your teeth before you go to bed."
@@ -83,7 +101,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import is_possible_title
+  from unstructured.partition.text_type import is_possible_title
 
   # Returns True because the text passes all the tests
   example_2 = "ITEM 1A. RISK FACTORS"
@@ -116,7 +134,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import contains_verb
+  from unstructured.partition.text_type import contains_verb
 
   # Returns True because the text contains a verb
   example_1 = "I am going to run to the store to pick up some milk."
@@ -139,7 +157,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import sentence_count
+  from unstructured.partition.text_type import sentence_count
 
   example = "Look at me! I am a document with two sentences."
 
@@ -162,7 +180,7 @@ Examples:
 
 .. code:: python
 
-  from unstructured.nlp.partition import exceeds_cap_ratio
+  from unstructured.partition.text_type import exceeds_cap_ratio
 
   # Returns True because the text is more than 30% caps
   example_1 = "LOOK AT ME I AM YELLING"
@@ -174,23 +192,7 @@ Examples:
 
   # Returns True because the text is more than 1% caps
   exceeds_cap_ratio(example_2, threshold=0.01)
-
-
-``partition_pdf``
----------------------
-The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
-The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
-if desired.
-
-Examples:
-
-.. code:: python
-
-  from unstructured.nlp.partition import partition_pdf
-
-  # Returns a List[Element] present in the pages of the parsed pdf document
-  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
 
 
 ########
diff --git a/test_unstructured/__init__.py b/test_unstructured/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test_unstructured/nlp/__init__.py b/test_unstructured/nlp/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test_unstructured/nlp/test_partition.py b/test_unstructured/nlp/test_partition.py
index 0f9034902..2382933e5 100644
--- a/test_unstructured/nlp/test_partition.py
+++ b/test_unstructured/nlp/test_partition.py
@@ -1,144 +1,7 @@
-import pytest
-
-import unstructured.nlp.partition as partition
-
-from mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        (
-            "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
-            "ISSUER PURCHASES OF EQUITY SECURITIES",
-            False,
-        ),
-        (
-            "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
-            "Issuer Purchases of Equity Securities",
-            False,
-        ),
-        (
-            "There is a market for registrant’s common equity, related stockholder matters and "
-            "issuer purchases of equity securities.",
-            True,
-        ),
-    ],
+# flake8: noqa
+from unstructured.nlp.partition import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+    partition_pdf,
 )
-def test_headings_are_not_narrative_text(text, expected):
-    assert partition.is_possible_narrative_text(text) == expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Ask the teacher for an apple.", True),
-        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
-        ("7", False),  # Fails because it is numeric
-        ("intellectual property", False),  # Fails because it does not contain a verb
-        ("", False),  # Fails because it is empty
-    ],
-)
-def test_is_possible_narrative_text(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    has_verb = partition.is_possible_narrative_text(text, cap_threshold=0.3)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Intellectual Property", True),  # Fails because it exceeds the cap threshold
-        (
-            "Ask the teacher for an apple. You might a gold star.",
-            False,
-        ),  # Too many sentences
-        ("7", False),  # Fails because it is numeric
-        ("", False),  # Fails because it is empty
-        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
-    ],
-)
-def test_is_possible_title(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    has_verb = partition.is_possible_title(text)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("• This is a fine point!", True),
-        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
-        ("‣ This is a fine point!", True),
-        ("⁃ This is a fine point!", True),
-        ("⁌ This is a fine point!", True),
-        ("⁍ This is a fine point!", True),
-        ("∙ This is a fine point!", True),
-        ("○ This is a fine point!", True),
-        ("● This is a fine point!", True),
-        ("◘ This is a fine point!", True),
-        ("◦ This is a fine point!", True),
-        ("☙ This is a fine point!", True),
-        ("❥ This is a fine point!", True),
-        ("❧ This is a fine point!", True),
-        ("⦾ This is a fine point!", True),
-        ("⦿ This is a fine point!", True),
-        (" This is a fine point!", True),
-        ("* This is a fine point!", True),
-        ("This is NOT a fine point!", False),  # No bullet point
-        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
-    ],
-)
-def test_is_bulletized_text(text, expected):
-    assert partition.is_bulleted_text(text) is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Ask the teacher for an apple", True),
-        ("Intellectual property", False),
-    ],
-)
-def test_contains_verb(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "pos_tag", mock_pos_tag)
-    has_verb = partition.contains_verb(text)
-    assert has_verb is expected
-
-
-@pytest.mark.parametrize(
-    "text, expected",
-    [
-        ("Intellectual Property in the United States", True),
-        ("Intellectual property helps incentivize innovation.", False),
-        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
-    ],
-)
-def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
-    monkeypatch.setattr(partition, "word_tokenize", mock_word_tokenize)
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    assert partition.exceeds_cap_ratio(text, threshold=0.3) is expected
-
-
-def test_sentence_count(monkeypatch):
-    monkeypatch.setattr(partition, "sent_tokenize", mock_sent_tokenize)
-    text = "Hi my name is Matt. I work with Crag."
-    assert partition.sentence_count(text) == 2
-
-
-def test_item_titles():
-    text = "ITEM 1(A). THIS IS A TITLE"
-    assert partition.sentence_count(text, 3) < 2
-
-
-def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
-    partition_pdf_response = partition.partition_pdf(filename)
-    assert partition_pdf_response[0]["type"] == "Title"
-    assert (
-        partition_pdf_response[0]["text"]
-        == "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
-    )
diff --git a/test_unstructured/nlp/test_tokenize.py b/test_unstructured/nlp/test_tokenize.py
index 88b8f9686..42ccde3cd 100644
--- a/test_unstructured/nlp/test_tokenize.py
+++ b/test_unstructured/nlp/test_tokenize.py
@@ -2,7 +2,7 @@ from typing import List, Tuple
 
 import unstructured.nlp.tokenize as tokenize
 
-from mock_nltk import mock_sent_tokenize, mock_word_tokenize
+from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 
 
 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py
new file mode 100644
index 000000000..56c3a2c13
--- /dev/null
+++ b/test_unstructured/partition/test_pdf.py
@@ -0,0 +1,10 @@
+import unstructured.partition.pdf as pdf
+
+
+def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
+    partition_pdf_response = pdf.partition_pdf(filename)
+    assert partition_pdf_response[0]["type"] == "Title"
+    assert (
+        partition_pdf_response[0]["text"]
+        == "LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    )
diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py
new file mode 100644
index 000000000..060251c2b
--- /dev/null
+++ b/test_unstructured/partition/test_text_type.py
@@ -0,0 +1,135 @@
+import pytest
+
+import unstructured.partition.text_type as text_type
+
+from test_unstructured.nlp.mock_nltk import mock_pos_tag, mock_sent_tokenize, mock_word_tokenize
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        (
+            "ITEM 5(a).: MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND "
+            "ISSUER PURCHASES OF EQUITY SECURITIES",
+            False,
+        ),
+        (
+            "Item 5(a).: Market For Registrant’s Common Equity, Related Stockholder Matters and "
+            "Issuer Purchases of Equity Securities",
+            False,
+        ),
+        (
+            "There is a market for registrant’s common equity, related stockholder matters and "
+            "issuer purchases of equity securities.",
+            True,
+        ),
+    ],
+)
+def test_headings_are_not_narrative_text(text, expected):
+    assert text_type.is_possible_narrative_text(text) == expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Ask the teacher for an apple.", True),
+        ("Ask Me About Intellectual Property", False),  # Exceeds the cap threshold
+        ("7", False),  # Fails because it is numeric
+        ("intellectual property", False),  # Fails because it does not contain a verb
+        ("", False),  # Fails because it is empty
+    ],
+)
+def test_is_possible_narrative_text(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Intellectual Property", True),  # Passes even though it exceeds the cap threshold
+        (
+            "Ask the teacher for an apple. You might get a gold star.",
+            False,
+        ),  # Too many sentences
+        ("7", False),  # Fails because it is numeric
+        ("", False),  # Fails because it is empty
+        ("ITEM 1A. RISK FACTORS", True),  # Two "sentences", but both are short
+    ],
+)
+def test_is_possible_title(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    has_verb = text_type.is_possible_title(text)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("• This is a fine point!", True),
+        (" • This is a fine point!", True),  # Has an extra space in front of the bullet
+        ("‣ This is a fine point!", True),
+        ("⁃ This is a fine point!", True),
+        ("⁌ This is a fine point!", True),
+        ("⁍ This is a fine point!", True),
+        ("∙ This is a fine point!", True),
+        ("○ This is a fine point!", True),
+        ("● This is a fine point!", True),
+        ("◘ This is a fine point!", True),
+        ("◦ This is a fine point!", True),
+        ("☙ This is a fine point!", True),
+        ("❥ This is a fine point!", True),
+        ("❧ This is a fine point!", True),
+        ("⦾ This is a fine point!", True),
+        ("⦿ This is a fine point!", True),
+        (" This is a fine point!", True),
+        ("* This is a fine point!", True),
+        ("This is NOT a fine point!", False),  # No bullet point
+        ("I love morse code! ● ● ● --- ● ● ●", False),  # Not at the beginning
+    ],
+)
+def test_is_bulletized_text(text, expected):
+    assert text_type.is_bulleted_text(text) is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Ask the teacher for an apple", True),
+        ("Intellectual property", False),
+    ],
+)
+def test_contains_verb(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
+    has_verb = text_type.contains_verb(text)
+    assert has_verb is expected
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("Intellectual Property in the United States", True),
+        ("Intellectual property helps incentivize innovation.", False),
+        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
+    ],
+)
+def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    assert text_type.exceeds_cap_ratio(text, threshold=0.3) is expected
+
+
+def test_sentence_count(monkeypatch):
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    text = "Hi my name is Matt. I work with Crag."
+    assert text_type.sentence_count(text) == 2
+
+
+def test_item_titles():
+    text = "ITEM 1(A). THIS IS A TITLE"
+    assert text_type.sentence_count(text, 3) < 2
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 086e68431..73c410303 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.0-dev1"  # pragma: no cover
+__version__ = "0.3.0-dev2"  # pragma: no cover
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
index 5abe8198c..a8f5091c3 100644
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -15,7 +15,7 @@ from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
 from unstructured.documents.base import Page
 from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
 from unstructured.documents.xml import XMLDocument
-from unstructured.nlp.partition import (
+from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,
     is_possible_title,
diff --git a/unstructured/nlp/partition.py b/unstructured/nlp/partition.py
index 0649452f6..3ffa9919c 100644
--- a/unstructured/nlp/partition.py
+++ b/unstructured/nlp/partition.py
@@ -1,160 +1,7 @@
-"""partition.py implements logic for partitioning plain text documents into sections."""
-import sys
-import requests  # type: ignore
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Final, List, Optional
-else:
-    from typing import Final, List, Optional
-
-from unstructured.cleaners.core import remove_punctuation
-from unstructured.documents.elements import Element, Text
-from unstructured.nlp.patterns import UNICODE_BULLETS_RE
-from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
-from unstructured.logger import get_logger
-
-logger = get_logger()
-
-POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
-
-
-def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool:
-    """Checks to see if the text passes all of the checks for a narrative text section."""
-    if len(text) == 0:
-        logger.debug("Not narrative. Text is empty.")
-        return False
-
-    if text.isnumeric():
-        logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
-        return False
-
-    if exceeds_cap_ratio(text, threshold=cap_threshold):
-        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
-        return False
-
-    if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
-        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
-        return False
-
-    return True
-
-
-def is_possible_title(text: str, sentence_min_length: int = 5) -> bool:
-    """Checks to see if the text passes all of the checks for a valid title."""
-    if len(text) == 0:
-        logger.debug("Not a title. Text is empty.")
-        return False
-
-    if text.isnumeric():
-        logger.debug(f"Not a title. Text is all numeric:\n\n{text}")
-        return False
-
-    # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS"
-    # that sometimes get tokenized as separate sentences due to the period, but are still
-    # valid titles
-    if sentence_count(text, min_length=sentence_min_length) > 1:
-        logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}")
-        return False
-
-    return True
-
-
-def is_bulleted_text(text: str) -> bool:
-    """Checks to see if the section of text is part of a bulleted list."""
-    return UNICODE_BULLETS_RE.match(text.strip()) is not None
-
-
-def contains_verb(text: str) -> bool:
-    """Use a POS tagger to check if a segment contains verbs. If the section does not have verbs,
-    that indicates that it is not narrative text."""
-    pos_tags = pos_tag(text)
-    for _, tag in pos_tags:
-        if tag in POS_VERB_TAGS:
-            return True
-    return False
-
-
-def sentence_count(text: str, min_length: Optional[int] = None) -> int:
-    """Checks the sentence count for a section of text. Titles should not be more than one
-    sentence.
-
-    Parameters
-    ----------
-    text
-        The string of the text to count
-    min_length
-        The min number of words a section needs to be for it to be considered a sentence.
-    """
-    sentences = sent_tokenize(text)
-    count = 0
-    for sentence in sentences:
-        sentence = remove_punctuation(sentence)
-        words = [word for word in word_tokenize(sentence) if word != "."]
-        if min_length and len(words) < min_length:
-            logger.debug(
-                f"Skipping sentence because does not exceed {min_length} word tokens\n"
-                f"{sentence}"
-            )
-            continue
-        count += 1
-    return count
-
-
-def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
-    """Checks the title ratio in a section of text. If a sufficient proportion of the text is
-    capitalized."""
-    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
-    # The assumption is that sections with multiple sentences are not titles.
-    if sentence_count(text, 3) > 1:
-        logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
-        return False
-
-    tokens = word_tokenize(text)
-    capitalized = sum([word.istitle() or word.isupper() for word in tokens])
-    ratio = capitalized / len(tokens)
-    return ratio > threshold
-
-
-def partition_pdf(
-    filename: str = "",
-    file: Optional[bytes] = None,
-    url: str = "https://ml.unstructured.io/",
-    template: Optional[str] = "base-model",
-    token: Optional[str] = None,
-) -> List[Element]:
-    """Calls the document parsing API.
-    Parameters
-    ----------
-    filename
-        A string defining the target filename path.
-    file
-        A file-like object as bytes --> open(filename, "rb").
-    template
-        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
-    url
-        A string endpoint to self-host an inference API, if desired.
-    token
-        A string defining the authentication token for a self-host url.
-    """
-    if not filename and not file:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    healthcheck_response = requests.models.Response()
-    if not token:
-        healthcheck_response = requests.get(url=f"{url}healthcheck")
-
-    if healthcheck_response.status_code != 200:
-        return [Text(text="error: endpoint api healthcheck has failed!")]
-
-    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
-    file_ = (filename, file if file else open(filename, "rb"))
-    response = requests.post(
-        url=url,
-        headers={"Authorization": f"Bearer {token}" if token else ""},
-        files={"file": file_},
-    )
-    if response.status_code == 200:
-        pages = response.json()["pages"]
-        return [element for page in pages for element in page["elements"]]
-    else:
-        return [Text(text=f"error: response status code = {response.status_code}")]
+# flake8: noqa
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.text_type import (
+    is_bulleted_text,
+    is_possible_narrative_text,
+    is_possible_title,
+)
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
new file mode 100644
index 000000000..aa4724e1d
--- /dev/null
+++ b/unstructured/partition/pdf.py
@@ -0,0 +1,55 @@
+import sys
+
+import requests  # type: ignore
+
+if sys.version_info < (3, 8):
+    from typing_extensions import List, Optional
+else:
+    from typing import List, Optional
+
+from unstructured.documents.elements import Element, Text
+
+
+def partition_pdf(
+    filename: str = "",
+    file: Optional[bytes] = None,
+    url: str = "https://ml.unstructured.io/",
+    template: Optional[str] = "base-model",
+    token: Optional[str] = None,
+) -> List[Element]:
+    """Calls the document parsing API.
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object as bytes --> open(filename, "rb").
+    template
+        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
+    url
+        A string endpoint to self-host an inference API, if desired.
+    token
+        A string defining the authentication token for a self-host url.
+ """ + if not filename and not file: + raise FileNotFoundError("No filename nor file were specified") + + healthcheck_response = requests.models.Response() + if not token: + healthcheck_response = requests.get(url=f"{url}healthcheck") + + if healthcheck_response.status_code != 200: + return [Text(text="error: endpoint api healthcheck has failed!")] + + url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}" + file_ = (filename, file if file else open(filename, "rb")) + response = requests.post( + url=url, + headers={"Authorization": f"Bearer {token}" if token else ""}, + files={"file": file_}, + ) + if response.status_code == 200: + pages = response.json()["pages"] + return [element for page in pages for element in page["elements"]] + else: + return [Text(text=f"error: response status code = {response.status_code}")] diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py new file mode 100644 index 000000000..ea88d3889 --- /dev/null +++ b/unstructured/partition/text_type.py @@ -0,0 +1,113 @@ +"""partition.py implements logic for partitioning plain text documents into sections.""" +import sys + +if sys.version_info < (3, 8): + from typing_extensions import Final, List, Optional +else: + from typing import Final, List, Optional + +from unstructured.cleaners.core import remove_punctuation +from unstructured.nlp.patterns import UNICODE_BULLETS_RE +from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize +from unstructured.logger import get_logger + +logger = get_logger() + +POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"] + + +def is_possible_narrative_text(text: str, cap_threshold: float = 0.3) -> bool: + """Checks to see if the text passes all of the checks for a narrative text section.""" + if len(text) == 0: + logger.debug("Not narrative. Text is empty.") + return False + + if text.isnumeric(): + logger.debug(f"Not narrative. Text is all numeric:\n\n{text}") + return False + + if exceeds_cap_ratio(text, threshold=cap_threshold): + logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}") + return False + + if (sentence_count(text, 3) < 2) and (not contains_verb(text)): + logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}") + return False + + return True + + +def is_possible_title(text: str, sentence_min_length: int = 5) -> bool: + """Checks to see if the text passes all of the checks for a valid title.""" + if len(text) == 0: + logger.debug("Not a title. Text is empty.") + return False + + if text.isnumeric(): + logger.debug(f"Not a title. Text is all numeric:\n\n{text}") + return False + + # NOTE(robinson) - The min length is to capture content such as "ITEM 1A. RISK FACTORS" + # that sometimes get tokenized as separate sentences due to the period, but are still + # valid titles + if sentence_count(text, min_length=sentence_min_length) > 1: + logger.debug(f"Not a title. Text is longer than {sentence_min_length} sentences:\n\n{text}") + return False + + return True + + +def is_bulleted_text(text: str) -> bool: + """Checks to see if the section of text is part of a bulleted list.""" + return UNICODE_BULLETS_RE.match(text.strip()) is not None + + +def contains_verb(text: str) -> bool: + """Use a POS tagger to check if a segment contains verbs. 
If the section does not have verbs, + that indicates that it is not narrative text.""" + pos_tags = pos_tag(text) + for _, tag in pos_tags: + if tag in POS_VERB_TAGS: + return True + return False + + +def sentence_count(text: str, min_length: Optional[int] = None) -> int: + """Checks the sentence count for a section of text. Titles should not be more than one + sentence. + + Parameters + ---------- + text + The string of the text to count + min_length + The min number of words a section needs to be for it to be considered a sentence. + """ + sentences = sent_tokenize(text) + count = 0 + for sentence in sentences: + sentence = remove_punctuation(sentence) + words = [word for word in word_tokenize(sentence) if word != "."] + if min_length and len(words) < min_length: + logger.debug( + f"Skipping sentence because does not exceed {min_length} word tokens\n" + f"{sentence}" + ) + continue + count += 1 + return count + + +def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool: + """Checks the title ratio in a section of text. If a sufficient proportion of the text is + capitalized.""" + # NOTE(robinson) - Currently limiting this to only sections of text with one sentence. + # The assumption is that sections with multiple sentences are not titles. + if sentence_count(text, 3) > 1: + logger.debug(f"Text does not contain multiple sentences:\n\n{text}") + return False + + tokens = word_tokenize(text) + capitalized = sum([word.istitle() or word.isupper() for word in tokens]) + ratio = capitalized / len(tokens) + return ratio > threshold
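
For reference, a minimal usage sketch of the module layout this patch introduces. It is a sketch under stated assumptions rather than part of the patch: it assumes the repo's `example-docs` fixture is present and the default hosted inference API is reachable, and the dict-shaped elements (with `"type"`/`"text"` keys) follow the tests above.

```python
# Minimal sketch of the import paths after this reorganization.
from unstructured.partition.pdf import partition_pdf            # new canonical location
from unstructured.partition.text_type import is_possible_title  # relocated text-type heuristics

# The legacy path keeps working through the re-exports this patch
# adds to unstructured/nlp/partition.py:
from unstructured.nlp.partition import partition_pdf as legacy_partition_pdf

assert legacy_partition_pdf is partition_pdf  # same function object, just re-exported

# Per test_partition_pdf above, the first element of the example doc is its title.
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
print(elements[0]["type"], "-", elements[0]["text"])

# The text-type heuristics operate on plain strings (see test_is_possible_title).
print(is_possible_title("ITEM 1A. RISK FACTORS"))  # True
```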