test: Add test to ensure languages trickle down to ocr (#1857)

Closes
[#93](https://github.com/Unstructured-IO/unstructured-inference/issues/93).

Adds a test to ensure language parameters are passed all the way from
`partition_pdf` down to the OCR calls.

#### Testing:

CI should pass.
This commit is contained in:
qued 2023-10-24 11:54:19 -05:00 committed by GitHub
parent b530e0a2be
commit 44cef80c82
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1057,3 +1057,35 @@ def test_partition_model_name_default_to_None():
)
except AttributeError:
pytest.fail("partition_pdf() raised AttributeError unexpectedly!")
@pytest.mark.parametrize(
    ("strategy", "ocr_func"),
    [
        ("hi_res", "unstructured_pytesseract.image_to_data"),
        ("ocr_only", "unstructured_pytesseract.run_and_get_multiple_output"),
    ],
)
def test_ocr_language_passes_through(strategy, ocr_func):
    """Verify that `ocr_languages` given to `partition_pdf` reaches the OCR call as `lang`.

    Each parametrized case names a partitioning strategy and the dotted path of
    the OCR function that is patched for that case. The patched function raises
    as soon as it is invoked, so partitioning stops right after the first OCR
    call and the recorded keyword arguments can be inspected.
    """

    class CallException(Exception):
        """Sentinel raised by the mocked OCR function to halt execution."""

    # The spy records how it was called, then aborts via the sentinel exception.
    ocr_spy = mock.Mock(side_effect=CallException("Function called!"))

    # Swap in the spy for the real OCR function; the sentinel is expected to
    # propagate out of partition_pdf.
    with mock.patch(ocr_func, ocr_spy):
        with pytest.raises(CallException):
            pdf.partition_pdf(
                "example-docs/layout-parser-paper-fast.pdf",
                strategy=strategy,
                ocr_languages="kor",
            )

    # The requested language must have arrived as the `lang` keyword argument.
    recorded_kwargs = ocr_spy.call_args.kwargs
    assert "lang" in recorded_kwargs
    assert recorded_kwargs["lang"] == "kor"