qued 79f734d3f9
fix: better extractable check (#900)
The auto strategy was choosing the fast strategy in cases where the PDF contents were just a flat image, resulting in no output. This PR changes the behavior of auto: elements that can be extracted by fast are extracted first, and a cursory examination of those elements is made to see whether any of them contain text. If so, those elements are used as the output; otherwise the fallback strategies come into play.
2023-07-07 23:41:37 -05:00

83 lines
2.3 KiB
Python

import os
import pytest
from unstructured.partition import pdf, strategies
def test_validate_strategy_validates():
    """A supported strategy/filetype pair must pass validation silently."""
    strategy, filetype = "hi_res", "pdf"
    # The test passes simply by this call not raising.
    strategies.validate_strategy(strategy, filetype)
def test_validate_strategy_raises_for_bad_filetype():
    """Pairing the "fast" strategy with the "image" filetype must raise ValueError."""
    strategy, filetype = "fast", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(strategy, filetype)
def test_validate_strategy_raises_for_bad_strategy():
    """A made-up strategy name must be rejected with ValueError."""
    bogus_strategy, filetype = "totally_guess_the_text", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(bogus_strategy, filetype)
@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, True),
        ("loremipsum-flat.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, True),
        ("loremipsum-flat.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check the truthiness of ``pdf.extractable_elements`` for sample PDFs.

    Each sample under ``example-docs`` is exercised twice: once passed as an
    open binary file object (``from_file=True``) and once passed by path.
    ``expected`` is the truthiness the returned value must have.
    """
    path = os.path.join("example-docs", filename)
    if not from_file:
        elements = pdf.extractable_elements(filename=path)
    else:
        with open(path, "rb") as stream:
            elements = pdf.extractable_elements(file=stream)
    assert bool(elements) is expected
@pytest.mark.parametrize(
    ("infer_table_structure", "expected"),
    [
        (True, "hi_res"),
        (False, "ocr_only"),
    ],
)
def test_determine_image_auto_strategy(infer_table_structure, expected):
    """Check the auto strategy chosen for images.

    Per the parameter table, requesting table-structure inference should
    select "hi_res"; otherwise "ocr_only" is expected.
    """
    strategy = strategies._determine_image_auto_strategy(
        infer_table_structure=infer_table_structure,
    )
    # Compare by value: identity (`is`) on strings only works by accident of
    # interning and is not guaranteed by the language.
    assert strategy == expected
@pytest.mark.parametrize(
    ("pdf_text_extractable", "infer_table_structure", "expected"),
    [
        (True, True, "hi_res"),
        (False, True, "hi_res"),
        (True, False, "fast"),
        (False, False, "ocr_only"),
    ],
)
def test_determine_image_pdf_strategy(pdf_text_extractable, infer_table_structure, expected):
    """Check the auto strategy chosen for PDFs.

    Despite the test's name, the function under test is
    ``strategies._determine_pdf_auto_strategy``. Per the parameter table,
    table-structure inference always selects "hi_res"; without it the result
    is "fast" when text is extractable and "ocr_only" when it is not.
    """
    strategy = strategies._determine_pdf_auto_strategy(
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
    )
    # Compare by value: identity (`is`) on strings only works by accident of
    # interning and is not guaranteed by the language.
    assert strategy == expected
def test_determine_pdf_or_image_strategy_fallback_ocr():
    """Asking for "fast" on an image input must resolve to "ocr_only"."""
    resolved = strategies.determine_pdf_or_image_strategy(strategy="fast", is_image=True)
    assert resolved == "ocr_only"