Matt Robinson 3d3f3df3ec
enhancement: add "ocr_only" strategy for PDFs (#553)
* add tests for validating strategy

* refactor into determine_pdf_strategy function

* refactor pdf strategies into strategies

* remove commented out code

* remove unreachable code

* add in handling for image types

* a little more refactoring

* import ocr partitioning for images

* catch warnings, partition type for valid strategies

* fallback to ocr_only from fast

* fallback logic for hi_res

* test for fallback to ocr only

* fallback logic for ocr_only

* more tests for fallback logic

* update doc strings

* version and changelog

* linting, linting, linting

* update docs to include notes about strategy

* fix typos

* change back patched filename
2023-05-08 17:21:24 +00:00

42 lines
1.1 KiB
Python

import os
import pytest
from unstructured.partition import strategies
def test_validate_strategy_validates():
    """A valid strategy/filetype pair must pass validation without raising."""
    strategy, filetype = "hi_res", "pdf"
    # The test passes simply by this call returning normally.
    strategies.validate_strategy(strategy, filetype)
def test_validate_strategy_raises_for_bad_filetype():
    """Validating the "fast" strategy against the "image" filetype raises ValueError."""
    strategy, filetype = "fast", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(strategy, filetype)
def test_validate_strategy_raises_for_bad_strategy():
    """Validating a made-up strategy name raises ValueError."""
    bogus_strategy, filetype = "totally_guess_the_text", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(bogus_strategy, filetype)
@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check is_pdf_text_extractable against sample PDFs via both input modes.

    Each sample document is checked twice: once passed as an open binary
    file object (``from_file=True``) and once passed by path
    (``from_file=False``). The result must be exactly the expected boolean.
    """
    # Sample documents are resolved relative to the current working directory.
    pdf_path = os.path.join("example-docs", filename)

    if not from_file:
        result = strategies.is_pdf_text_extractable(filename=pdf_path)
    else:
        with open(pdf_path, "rb") as pdf_file:
            result = strategies.is_pdf_text_extractable(file=pdf_file)

    # Identity comparison: the function must return a real bool, not just
    # something truthy or falsy.
    assert result is expected