OCRmyPDF/tests/plugins/tesseract_debug_rotate.py
2024-04-07 00:25:32 -07:00

99 lines
3.0 KiB
Python

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op/fixed rotate plugin.
To quickly run tests where getting OCR output is not necessary and we want to test
the rotation pipeline.
In 'hocr' mode, create a .hocr file that specifies no text found.
In 'pdf' mode, write a PDF containing one blank page sized to match the image.
In orientation check mode, report 0, 90, 180, 270... based on page number.
"""
from __future__ import annotations
import pikepdf
from PIL import Image
from ocrmypdf import OcrEngine, OrientationConfidence, hookimpl
from ocrmypdf.helpers import page_number
# Minimal hOCR document used as the "no text found" output.
# It describes a single page holding one content area, one paragraph, one line
# and one word whose text is a single space. ``{0}`` and ``{1}`` are
# ``str.format`` placeholders; generate_hocr fills them with the input image's
# width and height in pixels, so every bounding box extends to the image's
# far corner.
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 4.1.1' />
<meta name='ocr-capabilities'
content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
<span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}">
<span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
</span>
</p>
</div>
</div>
</body>
</html>'''
class FixedRotateNoopOcrEngine(OcrEngine):
    """Stand-in OCR engine that performs no OCR and reports a fixed rotation.

    Text output is always empty. The orientation reported for a page depends
    only on its page number, so tests can exercise the rotation pipeline
    without running a real OCR engine.
    """

    @staticmethod
    def version():
        """Return the version string this stand-in engine reports."""
        return '4.1.1'

    @staticmethod
    def creator_tag(options):
        """Return the creator tag for the output produced by this engine.

        The tag contains ``-PDF`` when ``options.pdf_renderer`` is
        ``'sandwich'`` and ``-hOCR`` for any other renderer.
        """
        tag = '-PDF' if options.pdf_renderer == 'sandwich' else '-hOCR'
        return f"NO-OP {tag} {FixedRotateNoopOcrEngine.version()}"

    def __str__(self):
        """Return a short name for the engine, including its version."""
        return f"NO-OP {FixedRotateNoopOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the set of supported languages; always ``{'eng'}``."""
        return {'eng'}

    @staticmethod
    def get_orientation(input_file, options):
        """Report a rotation determined solely by the page number.

        The page number is obtained from ``input_file`` via ``page_number``.
        Page 1 reports 0 degrees, page 2 reports 90, page 3 reports 180,
        page 4 reports 270, and the cycle then repeats. The confidence is
        always 99.9.
        """
        page = page_number(input_file)
        angle = ((page - 1) * 90) % 360
        return OrientationConfidence(angle=angle, confidence=99.9)

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Write an hOCR file with no recognized text and an empty text file.

        Args:
            input_file: Image whose pixel dimensions fill in the template.
            output_hocr: Path the hOCR document is written to (UTF-8).
            output_text: Path that receives an empty text file.
            options: Unused.
        """
        with (
            Image.open(input_file) as im,
            open(output_hocr, 'w', encoding='utf-8') as f,
        ):
            w, h = im.size
            f.write(HOCR_TEMPLATE.format(w, h))
        # Opening in 'w' mode creates (or truncates to) an empty file.
        with open(output_text, 'w', encoding='utf-8'):
            pass

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Write a single blank PDF page sized like the image, plus empty text.

        Args:
            input_file: Image providing the pixel size and DPI.
            output_pdf: Path the one-page PDF is saved to.
            output_text: Path that receives an empty text file.
            options: Unused.

        Raises:
            KeyError: If the image carries no ``dpi`` entry in its info.
        """
        with Image.open(input_file) as im:
            dpi = im.info['dpi']
            # Pixels divided by dots-per-inch gives the page size in inches.
            pagesize = im.size[0] / dpi[0], im.size[1] / dpi[1]
        # PDF page sizes are expressed in points, 72 per inch.
        ptsize = pagesize[0] * 72, pagesize[1] * 72
        # Use the Pdf as a context manager so it is closed after saving.
        with pikepdf.new() as pdf:
            pdf.add_blank_page(page_size=ptsize)
            pdf.save(output_pdf, static_id=True)
        # Same approach as generate_hocr, so str paths work here too.
        with open(output_text, 'w', encoding='utf-8'):
            pass
@hookimpl
def get_ocr_engine():
    """Plugin hook: provide the fixed-rotation no-op engine as the OCR engine."""
    engine = FixedRotateNoopOcrEngine()
    return engine