# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op/fixed rotate plugin.

To quickly run tests where getting OCR output is not necessary and we want
to test the rotation pipeline.

In 'hocr' mode, write a .hocr file rendered from HOCR_TEMPLATE with the image
dimensions, plus an empty text file.

In 'pdf' mode, write a single blank PDF page whose size matches the physical
size of the input image, plus an empty text file.

In orientation check mode, report 0, 90, 180, 270... based on page number.
"""

from __future__ import annotations

import pikepdf
from PIL import Image

from ocrmypdf import OcrEngine, OrientationConfidence, hookimpl
from ocrmypdf.helpers import page_number

# Rendered in generate_hocr() via str.format(width, height).
# NOTE(review): the template body is blank in this copy of the file, so the
# rendered .hocr contains only newlines; presumably it should hold an hOCR
# document with {0}/{1} placeholders for the page bbox -- confirm upstream.
HOCR_TEMPLATE = '''

'''

# PDF user space units (points) per inch.
_POINTS_PER_INCH = 72


def _write_empty_text(path) -> None:
    """Create (or truncate) *path* as an empty UTF-8 text file.

    Uses ``open()`` rather than ``Path.write_text`` so that both ``str`` and
    ``os.PathLike`` arguments are accepted.
    """
    with open(path, 'w', encoding='utf-8') as stream:
        stream.write('')


class FixedRotateNoopOcrEngine(OcrEngine):
    """OCR engine stub that finds no text and reports a page-based rotation."""

    @staticmethod
    def version() -> str:
        """Return the fixed version string reported by this stub."""
        return '4.0.0'

    @staticmethod
    def creator_tag(options) -> str:
        """Return the creator tag; includes '-PDF' for the sandwich renderer."""
        tag = '-PDF' if options.pdf_renderer == 'sandwich' else ''
        return f"NO-OP {tag} {FixedRotateNoopOcrEngine.version()}"

    def __str__(self) -> str:
        return f"NO-OP {FixedRotateNoopOcrEngine.version()}"

    @staticmethod
    def languages(options) -> set[str]:
        """Return the set of supported languages (always just 'eng')."""
        return {'eng'}

    @staticmethod
    def get_orientation(input_file, options) -> OrientationConfidence:
        """Report an orientation derived solely from the page number.

        Page 1 yields 0 degrees, page 2 yields 90, page 3 yields 180,
        page 4 yields 270, and the cycle then repeats. The confidence is
        always 99.9.
        """
        page = page_number(input_file)
        angle = ((page - 1) * 90) % 360
        return OrientationConfidence(angle=angle, confidence=99.9)

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options) -> None:
        """Write HOCR_TEMPLATE formatted with the image size, and empty text.

        Args:
            input_file: Image to read the pixel dimensions from.
            output_hocr: Destination for the rendered template (UTF-8).
            output_text: Destination for the (empty) text sidecar.
            options: Unused.
        """
        with Image.open(input_file) as im, open(
            output_hocr, 'w', encoding='utf-8'
        ) as f:
            w, h = im.size
            f.write(HOCR_TEMPLATE.format(str(w), str(h)))
        _write_empty_text(output_text)

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options) -> None:
        """Write a one-page blank PDF sized like the image, and empty text.

        The page size in points is the pixel size divided by the image's
        DPI, multiplied by 72.

        Args:
            input_file: Image whose size and 'dpi' info entry are read.
            output_pdf: Destination for the blank PDF.
            output_text: Destination for the (empty) text sidecar.
            options: Unused.

        Raises:
            KeyError: If the image carries no 'dpi' entry in its info dict.
        """
        with Image.open(input_file) as im:
            dpi = im.info['dpi']
            width_px, height_px = im.size
        ptsize = (
            width_px / dpi[0] * _POINTS_PER_INCH,
            height_px / dpi[1] * _POINTS_PER_INCH,
        )

        # Context-manage the Pdf so it is closed once the file is saved.
        with pikepdf.new() as pdf:
            pdf.add_blank_page(page_size=ptsize)
            pdf.save(output_pdf, static_id=True)
        _write_empty_text(output_text)


@hookimpl
def get_ocr_engine():
    """Plugin hook: provide the no-op, fixed-rotation OCR engine."""
    return FixedRotateNoopOcrEngine()