mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-02-08 16:16:13 +00:00
57 lines
1.5 KiB
Python
57 lines
1.5 KiB
Python
# SPDX-FileCopyrightText: 2022 James R. Barlow
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
"""Tesseract bad utf8
|
|
|
|
In some cases, some versions of Tesseract can output binary gibberish or data
that is not UTF-8 compatible, so we are forced to check that we can convert it
and present it to the user.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from contextlib import contextmanager
|
|
from subprocess import CalledProcessError
|
|
from unittest.mock import patch
|
|
|
|
from ocrmypdf import hookimpl
|
|
from ocrmypdf.builtin_plugins.tesseract_ocr import TesseractOcrEngine
|
|
|
|
|
|
def bad_utf8(*args, **kwargs):
    """Always fail like a Tesseract run whose output is not valid UTF-8.

    Any positional or keyword arguments are accepted and ignored, so this
    function can be used in place of a callable with an arbitrary signature.

    Raises:
        CalledProcessError: Unconditionally, with return code 1, command
            ``'tesseract'``, an ``output`` payload of Shift JIS bytes that
            cannot be decoded as UTF-8, and empty ``stderr``.
    """
    # "Invalid UTF-8" in Shift JIS
    undecodable_output = b'\x96\xb3\x8c\xf8\x82\xc8UTF-8\x0a'
    failure = CalledProcessError(
        1,
        'tesseract',
        output=undecodable_output,
        stderr=b"",
    )
    raise failure
|
|
|
|
|
|
@contextmanager
def patch_tesseract_run():
    """Temporarily replace ``ocrmypdf._exec.tesseract.run`` with a failing mock.

    While the context is active, every call to the patched target invokes
    ``bad_utf8``, which raises ``CalledProcessError``. The patch is undone
    when the context exits.

    Raises:
        AssertionError: If the managed block finishes normally without the
            mock ever having been called.
    """
    failing_run = patch('ocrmypdf._exec.tesseract.run', side_effect=bad_utf8)
    with failing_run as run_mock:
        yield
        # Only reached when the managed block exits without an exception;
        # an exception raised inside the block propagates from the yield.
        run_mock.assert_called()
|
|
|
|
|
|
class BadUtf8OcrEngine(TesseractOcrEngine):
    """Tesseract OCR engine variant whose Tesseract runs are patched to fail.

    Both generation methods delegate to the ``TesseractOcrEngine``
    implementation while ``patch_tesseract_run()`` is active.
    """

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Call ``TesseractOcrEngine.generate_hocr`` with the run patched.

        The arguments are forwarded unchanged; the parent's return value is
        not propagated.
        """
        parent_generate_hocr = TesseractOcrEngine.generate_hocr
        with patch_tesseract_run():
            parent_generate_hocr(input_file, output_hocr, output_text, options)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Call ``TesseractOcrEngine.generate_pdf`` with the run patched.

        The arguments are forwarded unchanged; the parent's return value is
        not propagated.
        """
        parent_generate_pdf = TesseractOcrEngine.generate_pdf
        with patch_tesseract_run():
            parent_generate_pdf(input_file, output_pdf, output_text, options)
|
|
|
|
|
|
@hookimpl
def get_ocr_engine():
    """Hook implementation returning a new ``BadUtf8OcrEngine`` instance."""
    engine = BadUtf8OcrEngine()
    return engine
|