mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-24 17:41:15 +00:00

* add tests for validating strategy * refactor into determine_pdf_strategy function * refactor pdf strategies into strategies * remove commented out code * remove unreachable code * add in handling for image types * a little more refactoring * import ocr partitioning for images * catch warnings, partition type for valid strategies * fallback to ocr_only from fast * fallback logic for hi_res * test for fallback to ocr only * fallback logic for ocr_only * more tests for fallback logic * update doc strings * version and changelog * linting, linting, linting * update docs to include notes about strategy * fix typos * change back patched filename
42 lines
1.1 KiB
Python
42 lines
1.1 KiB
Python
import os
|
|
|
|
import pytest
|
|
|
|
from unstructured.partition import strategies
|
|
|
|
|
|
def test_validate_strategy_validates():
    """Check that a valid strategy/filetype pair passes validation."""
    strategy, filetype = "hi_res", "pdf"
    # Returning without an exception is the success condition.
    strategies.validate_strategy(strategy, filetype)
|
|
|
|
|
|
def test_validate_strategy_raises_for_bad_filetype():
    """Check that "fast" is rejected with a ValueError for the "image" filetype."""
    strategy, filetype = "fast", "image"
    with pytest.raises(ValueError):
        strategies.validate_strategy(strategy, filetype)
|
|
|
|
|
|
def test_validate_strategy_raises_for_bad_strategy():
    """Check that an unrecognized strategy name is rejected with a ValueError."""
    unknown_strategy = "totally_guess_the_text"
    with pytest.raises(ValueError):
        strategies.validate_strategy(unknown_strategy, "image")
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "from_file", "expected"),
    [
        # Each document is checked through both input modes: open file and path.
        ("layout-parser-paper-fast.pdf", True, True),
        ("copy-protected.pdf", True, False),
        ("layout-parser-paper-fast.pdf", False, True),
        ("copy-protected.pdf", False, False),
    ],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
    """Check is_pdf_text_extractable against the expected flag for each sample PDF.

    When ``from_file`` is true the PDF is passed as an open binary file object
    via ``file=``; otherwise it is passed by path via ``filename=``.
    """
    pdf_path = os.path.join("example-docs", filename)

    if not from_file:
        extractable = strategies.is_pdf_text_extractable(filename=pdf_path)
    else:
        with open(pdf_path, "rb") as pdf_file:
            extractable = strategies.is_pdf_text_extractable(file=pdf_file)

    # Identity comparison: the result must be the exact bool, not merely truthy/falsy.
    assert extractable is expected
|