fix: no UserWarning when partition_pdf is called (#179)

This commit is contained in:
Matt Robinson 2023-01-27 12:08:18 -05:00 committed by GitHub
parent 339c133326
commit e6cfde5c4a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 32 additions and 6 deletions

View File

@ -1,4 +1,4 @@
## 0.4.5-dev0
## 0.4.5-dev1
* Loosen the default cap threshold to `0.5`.
* Add a `NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling the cap ratio threshold.
@ -7,6 +7,7 @@
is insufficient to determine that the text is narrative.
* Upper cased text is lower cased before checking for verbs. This helps avoid some missed verbs.
* Adds an `Address` element for capturing elements that only contain an address.
* Suppress the `UserWarning` emitted when detectron is called.
## 0.4.4

View File

@ -1,6 +1,7 @@
import os
import pathlib
import pytest
import warnings
import docx
@ -136,6 +137,9 @@ def test_auto_partition_text_from_file():
elements = partition(file=f)
assert len(elements) > 0
assert elements == EXPECTED_TEXT_OUTPUT
def test_auto_partition_pdf_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
elements = partition(filename=filename)
@ -150,7 +154,23 @@ def test_auto_partition_pdf_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
with open(filename, "rb") as f:
elements = partition(file=f)
assert len(elements) > 0
assert isinstance(elements[0], Title)
assert elements[0].text.startswith("LayoutParser")
assert isinstance(elements[1], NarrativeText)
assert elements[1].text.startswith("Zejiang Shen 1")
def test_partition_pdf_doesnt_raise_warning():
    """Partition the example PDF with every warning escalated to an error."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    # NOTE(robinson): Escalating warnings to errors inside `catch_warnings` is the
    # approach the pytest docs recommend for checking that no warning is emitted.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html#additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        # Any warning that escapes `partition` is raised as an exception here,
        # which fails the test.
        warnings.simplefilter("error")
        partition(filename=pdf_path)
def test_auto_partition_jpg():

View File

@ -1 +1 @@
__version__ = "0.4.5-dev0" # pragma: no cover
__version__ = "0.4.5-dev1" # pragma: no cover

View File

@ -1,4 +1,5 @@
from typing import List, Optional
import warnings
from unstructured.documents.elements import Element
from unstructured.partition import _partition_via_api
@ -54,9 +55,13 @@ def partition_pdf_or_image(
out_template: Optional[str] = template
if route_args[0] == "layout":
out_template = None
layout_elements = _partition_pdf_or_image_local(
filename=filename, file=file, template=out_template, is_image=is_image
)
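# NOTE(robinson): Silences the UserWarning that occurs when detectron is called.
# The "ignore" filter below has no category, so every warning raised inside
# this block is suppressed, not only UserWarning.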
with warnings.catch_warnings():
warnings.simplefilter("ignore")
layout_elements = _partition_pdf_or_image_local(
filename=filename, file=file, template=out_template, is_image=is_image
)
else:
# NOTE(alan): Remove these lines after different models are handled by routing
if template == "checkbox":