mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-24 17:41:15 +00:00

The auto strategy was choosing the fast strategy in cases where the PDF contents were just a flat image, resulting in no output. This PR changes the behavior of auto: the elements that fast can extract are extracted first and given a cursory examination to see whether any of them contain text. If so, those elements are used as the output; otherwise, the fallback strategies come into play.
83 lines
2.3 KiB
Python
83 lines
2.3 KiB
Python
import os
|
|
|
|
import pytest
|
|
|
|
from unstructured.partition import pdf, strategies
|
|
|
|
|
|
def test_validate_strategy_validates():
    """Check that validating the "hi_res" strategy for a "pdf" does not raise."""
    strategy, filetype = "hi_res", "pdf"
    # No assertion needed: the test passes as long as no exception escapes.
    strategies.validate_strategy(strategy, filetype)
|
|
|
|
|
|
def test_validate_strategy_raises_for_bad_filetype():
    """Expect ValueError when the "fast" strategy is validated for an "image"."""
    strategy, filetype = "fast", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(strategy, filetype)
|
|
|
|
|
|
def test_validate_strategy_raises_for_bad_strategy():
    """Expect ValueError when an unrecognized strategy name is validated."""
    strategy, filetype = "totally_guess_the_text", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(strategy, filetype)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, True),
        ("loremipsum-flat.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, True),
        ("loremipsum-flat.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check whether `pdf.extractable_elements` finds anything in a sample PDF.

    Each document is exercised twice: once passed as an open binary file
    object (``from_file=True``) and once passed by path (``from_file=False``).
    ``expected`` is the truthiness the returned value should have.
    """
    # Sample documents are resolved relative to the current working directory.
    path = os.path.join("example-docs", filename)

    if not from_file:
        found = pdf.extractable_elements(filename=path)
    else:
        with open(path, "rb") as stream:
            found = pdf.extractable_elements(file=stream)

    assert bool(found) is expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("infer_table_structure", "expected"),
    [
        (True, "hi_res"),
        (False, "ocr_only"),
    ],
)
def test_determine_image_auto_strategy(infer_table_structure, expected):
    """Check the strategy `_determine_image_auto_strategy` picks for images.

    The parametrized cases expect "hi_res" when table structure inference is
    requested and "ocr_only" otherwise.
    """
    strategy = strategies._determine_image_auto_strategy(
        infer_table_structure=infer_table_structure,
    )
    # Compare by value: identity (`is`) on strings only works by accident of
    # interning and is not guaranteed by the language.
    assert strategy == expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure", "expected"),
    [
        (True, True, "hi_res"),
        (False, True, "hi_res"),
        (True, False, "fast"),
        (False, False, "ocr_only"),
    ],
)
def test_determine_image_pdf_strategy(pdf_text_extractable, infer_table_structure, expected):
    """Check the strategy `_determine_pdf_auto_strategy` picks for PDFs.

    Note that, despite the test's name, the function under test is the PDF
    auto-strategy helper. The parametrized cases expect "hi_res" whenever
    table structure inference is requested; otherwise "fast" when text is
    extractable and "ocr_only" when it is not.
    """
    strategy = strategies._determine_pdf_auto_strategy(
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    # Compare by value: identity (`is`) on strings only works by accident of
    # interning and is not guaranteed by the language.
    assert strategy == expected
|
|
|
|
|
|
def test_determine_pdf_or_image_strategy_fallback_ocr():
    """Requesting "fast" for an image is expected to resolve to "ocr_only"."""
    resolved = strategies.determine_pdf_or_image_strategy(strategy="fast", is_image=True)
    assert resolved == "ocr_only"
|