OCRmyPDF/tests/plugins/tesseract_badutf8.py
2022-07-28 01:10:07 -07:00

57 lines
1.5 KiB
Python

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract bad utf8
In some cases, some versions of Tesseract can output binary gibberish or data
that is not UTF-8 compatible, so we are forced to check that we can convert it
and present it to the user.
"""
from __future__ import annotations
from contextlib import contextmanager
from subprocess import CalledProcessError
from unittest.mock import patch
from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine
def bad_utf8(*args, **kwargs):
    """Stand-in for the Tesseract runner that always fails with bad output.

    Any positional or keyword arguments are accepted and ignored, so the
    function can be installed as a mock ``side_effect``.

    Raises:
        CalledProcessError: always, with return code 1, command
            ``'tesseract'``, an ``output`` payload that is not valid UTF-8,
            and an empty ``stderr``.
    """
    # "Invalid UTF-8" in Shift JIS
    shift_jis_message = b'\x96\xb3\x8c\xf8\x82\xc8UTF-8\x0a'
    raise CalledProcessError(
        1, 'tesseract', output=shift_jis_message, stderr=b""
    )
@contextmanager
def patch_tesseract_run():
    """Temporarily replace ``ocrmypdf._exec.tesseract.run`` with a failing mock.

    While the context is active, every call to the patched ``run`` is routed
    to ``bad_utf8``. When the ``with`` body finishes without raising, the
    context asserts that the mock was called at least once.

    Note that the assertion sits after the ``yield`` with no ``finally``, so
    it is skipped when the ``with`` body raises.
    """
    # Keyword arguments to patch() are forwarded to the created mock, so the
    # side effect is installed at construction time.
    with patch('ocrmypdf._exec.tesseract.run', side_effect=bad_utf8) as fake_run:
        yield
        fake_run.assert_called()
class BadUtf8OcrEngine(TesseractOcrEngine):
    """Tesseract OCR engine variant that runs with the Tesseract runner patched.

    Both generation entry points delegate to ``TesseractOcrEngine`` with the
    same arguments, inside ``patch_tesseract_run()``.
    """

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Delegate hOCR generation to the stock engine under the patch."""
        with patch_tesseract_run():
            # Arguments are forwarded positionally, unchanged.
            TesseractOcrEngine.generate_hocr(
                input_file,
                output_hocr,
                output_text,
                options,
            )

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Delegate PDF generation to the stock engine under the patch."""
        with patch_tesseract_run():
            # Arguments are forwarded positionally, unchanged.
            TesseractOcrEngine.generate_pdf(
                input_file,
                output_pdf,
                output_text,
                options,
            )
@hookimpl
def get_ocr_engine():
    """Hook implementation returning a new ``BadUtf8OcrEngine`` instance."""
    engine = BadUtf8OcrEngine()
    return engine