mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
fix: no UserWarning when partition_pdf is called (#179)
This commit is contained in:
parent
339c133326
commit
e6cfde5c4a
@ -1,4 +1,4 @@
|
||||
## 0.4.5-dev0
|
||||
## 0.4.5-dev1
|
||||
|
||||
* Loosen the default cap threshold to `0.5`.
|
||||
* Add a `NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling the cap ratio threshold.
|
||||
@ -7,6 +7,7 @@
|
||||
is insufficient to determine that the text is narrative.
|
||||
* Upper cased text is lower cased before checking for verbs. This helps avoid some missed verbs.
|
||||
* Adds an `Address` element for capturing elements that only contain an address.
|
||||
* Suppress the `UserWarning` when detectron is called
|
||||
|
||||
## 0.4.4
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import os
|
||||
import pathlib
|
||||
import pytest
|
||||
import warnings
|
||||
|
||||
import docx
|
||||
|
||||
@ -136,6 +137,9 @@ def test_auto_partition_text_from_file():
|
||||
elements = partition(file=f)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_TEXT_OUTPUT
|
||||
|
||||
|
||||
def test_auto_partition_pdf_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
elements = partition(filename=filename)
|
||||
|
||||
@ -150,7 +154,23 @@ def test_auto_partition_pdf_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f)
|
||||
assert len(elements) > 0
|
||||
|
||||
assert isinstance(elements[0], Title)
|
||||
assert elements[0].text.startswith("LayoutParser")
|
||||
|
||||
assert isinstance(elements[1], NarrativeText)
|
||||
assert elements[1].text.startswith("Zejiang Shen 1")
|
||||
|
||||
|
||||
def test_partition_pdf_doesnt_raise_warning():
    """Partitioning the sample PDF must complete without emitting any warning."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    # NOTE(robinson): Inside this context every warning is promoted to an
    # exception, so the test fails if partitioning emits one. This is the
    # approach the pytest docs recommend for asserting "no warnings".
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(filename=pdf_path)
|
||||
|
||||
|
||||
def test_auto_partition_jpg():
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.5-dev0" # pragma: no cover
|
||||
__version__ = "0.4.5-dev1" # pragma: no cover
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from typing import List, Optional
|
||||
import warnings
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.partition import _partition_via_api
|
||||
@ -54,9 +55,13 @@ def partition_pdf_or_image(
|
||||
out_template: Optional[str] = template
|
||||
if route_args[0] == "layout":
|
||||
out_template = None
|
||||
layout_elements = _partition_pdf_or_image_local(
|
||||
filename=filename, file=file, template=out_template, is_image=is_image
|
||||
)
|
||||
|
||||
# NOTE(robinson): Suppresses all warnings raised inside this block; added to silence a UserWarning that occurs when detectron is called
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
layout_elements = _partition_pdf_or_image_local(
|
||||
filename=filename, file=file, template=out_template, is_image=is_image
|
||||
)
|
||||
else:
|
||||
# NOTE(alan): Remove these lines after different models are handled by routing
|
||||
if template == "checkbox":
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user