#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
"""Thin wrappers around the Tesseract command line program.

Provides version/feature detection, page orientation detection, and
hOCR / PDF generation, translating Tesseract's failures and timeouts into
the fallbacks the rest of the package expects.
"""

import sys
import os
import re
import shutil
from functools import lru_cache
from collections import namedtuple
from textwrap import dedent
from subprocess import (
    Popen, PIPE, CalledProcessError, TimeoutExpired, check_output,
    STDOUT, DEVNULL)

from ..exceptions import MissingDependencyError, TesseractConfigError
from ..helpers import page_number
from . import get_program


OrientationConfidence = namedtuple(
    'OrientationConfidence',
    ('angle', 'confidence'))

# Formatted with (width, height) by _generate_null_hocr().
# NOTE(review): the template body is empty in the text under review; a
# usable null hOCR document presumably needs markup containing {0}/{1}
# placeholders -- confirm against the upstream file.
HOCR_TEMPLATE = '''

'''


@lru_cache(maxsize=1)
def version():
    """Return the version string reported by ``tesseract --version``.

    The result is the remainder of the first output line after the word
    "tesseract". It is cached after the first successful call.

    Raises MissingDependencyError if Tesseract cannot be executed, exits
    with an error, or prints output that does not contain a version.
    """
    args_tess = [
        get_program('tesseract'),
        '--version'
    ]
    try:
        versions = check_output(
            args_tess, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except (CalledProcessError, FileNotFoundError) as e:
        # A missing executable surfaces as FileNotFoundError, not as
        # CalledProcessError, so both must be handled here.
        print("Could not find Tesseract executable on system PATH.",
              file=sys.stderr)
        raise MissingDependencyError from e

    match = re.match(r'tesseract\s(.+)', versions)
    if match is None:
        print("Could not determine Tesseract version from its output.",
              file=sys.stderr)
        raise MissingDependencyError
    return match.group(1)


def v4():
    "Is this Tesseract v4.0 or newer?"
    reported = version()
    # Compare the numeric major version; a plain string comparison would
    # misorder two-digit majors and any "v"-prefixed version string.
    major = re.match(r'v?(\d+)', reported)
    if major:
        return int(major.group(1)) >= 4
    return reported >= '4'


def has_textonly_pdf():
    """Return True if this Tesseract supports the textonly_pdf parameter.

    Raises MissingDependencyError if Tesseract's parameter list has to be
    queried and that query fails.
    """
    if version() != '4.00.00alpha':
        return v4()

    # textonly_pdf added during the 4.00.00alpha cycle, so we must test
    # more carefully to see if it is present
    args_tess = [
        get_program('tesseract'),
        '--print-parameters'
    ]
    try:
        params = check_output(
            args_tess, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except CalledProcessError as e:
        print("Could not --print-parameters from tesseract",
              file=sys.stderr)
        raise MissingDependencyError from e
    return 'textonly_pdf' in params


def psm():
    "If Tesseract 4.0, use argument --psm instead of -psm"
    return '--psm' if v4() else '-psm'


@lru_cache(maxsize=1)
def languages():
    """Return the set of language names listed by ``tesseract --list-langs``.

    The first output line is skipped; each remaining line is stripped and
    taken as a language name. The result is cached.

    Raises MissingDependencyError if Tesseract exits with an error.
    """
    args_tess = [
        get_program('tesseract'),
        '--list-langs'
    ]
    try:
        langs = check_output(
            args_tess, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except CalledProcessError as e:
        msg = dedent("""\
            Tesseract failed to report available languages.
            Output from Tesseract:
            -----------
            """)
        msg += e.output
        print(msg, file=sys.stderr)
        raise MissingDependencyError from e
    return {lang.strip() for lang in langs.splitlines()[1:]}


def tess_base_args(languages, engine_mode):
    """Build the leading portion of a Tesseract command line.

    languages -- iterable of language names, joined with '+' for ``-l``;
                 the option is omitted when empty
    engine_mode -- value for ``--oem``; only passed when it is not None
                   and Tesseract is v4 or newer
    """
    args = [
        get_program('tesseract'),
    ]
    if languages:
        args.extend(['-l', '+'.join(languages)])
    if engine_mode is not None and v4():
        args.extend(['--oem', str(engine_mode)])
    return args


def get_orientation(input_file, language: list, engine_mode,
                    timeout: float, log):
    """Ask Tesseract (page segmentation mode 0) for the page orientation.

    Returns an OrientationConfidence whose angle is clockwise degrees.
    A timeout, or a Tesseract failure reporting too few characters or an
    oversized image, yields angle 0 with confidence 0. Any other
    Tesseract failure is logged and re-raised as CalledProcessError.
    """
    args_tesseract = tess_base_args(language, engine_mode) + [
        psm(), '0',
        input_file,
        'stdout'
    ]

    try:
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if ('Too few characters. Skipping this page' in e.output or
                'Image too large' in e.output):
            return OrientationConfidence(0, 0)
        raise

    # Collect "key: value" lines; lines that do not split into exactly
    # two parts are ignored.
    osd = {}
    for line in stdout.splitlines():
        parts = line.strip().split(':', maxsplit=2)
        if len(parts) == 2:
            osd[parts[0].strip()] = parts[1].strip()

    angle = int(osd.get('Orientation in degrees', 0))
    if 'Orientation' in osd:
        # Tesseract < 3.04.01
        # reports "Orientation in degrees" as a counterclockwise angle
        # We keep it clockwise
        assert 'Rotate' not in osd
        angle = -angle % 360
    else:
        # Tesseract == 3.04.01, hopefully also Tesseract > 3.04.01
        # reports "Orientation in degrees" as a clockwise angle
        assert 'Rotate' in osd

    return OrientationConfidence(
        angle=angle,
        confidence=float(osd.get('Orientation confidence', 0)))


def tesseract_log_output(log, stdout, input_file):
    """Forward Tesseract's console output to *log*, line by line.

    Each message is prefixed with the page number derived from
    *input_file*. Banner and pixReadMem warning lines are dropped; a few
    known messages are rewritten as warnings; lines mentioning an error,
    an exception or read_params_file are logged as errors; everything
    else is logged at info level.
    """
    prefix = "{0:4d}: [tesseract] ".format(page_number(input_file))
    for line in stdout.splitlines():
        lowered = line.lower()
        if line.startswith("Tesseract Open Source"):
            continue
        elif line.startswith("Warning in pixReadMem"):
            continue
        elif 'diacritics' in line:
            log.warning(prefix + "lots of diacritics - possibly poor OCR")
        elif line.startswith('OSD: Weak margin'):
            log.warning(prefix + "unsure about page orientation")
        elif 'error' in lowered or 'exception' in lowered:
            log.error(prefix + line.strip())
        elif 'read_params_file' in lowered:
            log.error(prefix + line.strip())
        else:
            log.info(prefix + line.strip())


def page_timedout(log, input_file):
    """Log a warning that OCR of this page exceeded the timeout."""
    prefix = "{0:4d}: [tesseract] ".format(page_number(input_file))
    log.warning(prefix + " took too long to OCR - skipping")


def _generate_null_hocr(output_hocr, image):
    """Produce a .hocr file that reports no text detected on a page that
    is the same size as the input image."""
    from PIL import Image

    # Close the image file as soon as its dimensions have been read.
    with Image.open(image) as im:
        w, h = im.size

    with open(output_hocr, 'w', encoding="utf-8") as f:
        f.write(HOCR_TEMPLATE.format(w, h))


def generate_hocr(input_file, output_hocr, language: list, engine_mode,
                  tessconfig: list,
                  timeout: float, pagesegmode: int, log):
    '''Use Tesseract to produce an hOCR file for one page image.

    input_file -- image to analyze
    output_hocr -- hOCR file to generate
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- page segmentation mode, or None to omit the option
    log -- logger object

    On timeout, or when Tesseract reports the image is too large, a null
    hOCR file is written instead. Raises TesseractConfigError when
    Tesseract rejects a configuration parameter; any other Tesseract
    failure is re-raised as CalledProcessError.
    '''
    badxml = os.path.splitext(output_hocr)[0] + '.badxml'

    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend([psm(), str(pagesegmode)])

    args_tesseract.extend([
        input_file,
        badxml,
        'hocr'
    ] + tessconfig)

    log.debug(args_tesseract)
    try:
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(log, input_file)
        _generate_null_hocr(output_hocr, input_file)
        return
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if 'read_params_file: parameter not found' in e.output:
            raise TesseractConfigError() from e
        if 'Image too large' in e.output:
            _generate_null_hocr(output_hocr, input_file)
            return
        raise

    tesseract_log_output(log, stdout, input_file)

    if os.path.exists(badxml + '.html'):
        # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
        shutil.move(badxml + '.html', badxml)
    elif os.path.exists(badxml + '.hocr'):
        # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
        shutil.move(badxml + '.hocr', badxml)

    # Tesseract 3.03 inserts source filename into hocr file without
    # escaping it, creating invalid XML and breaking the parser.
    # As a workaround, rewrite the hocr file, replacing the filename
    # with a space.  Don't know if Tesseract 3.02 does the same.
    regex_nested_single_quotes = re.compile(
        r"""title='image "([^"]*)";""")
    with open(badxml, mode='r', encoding='utf-8') as f_in, \
            open(output_hocr, mode='w', encoding='utf-8') as f_out:
        for line in f_in:
            line = regex_nested_single_quotes.sub(
                r"""title='image " ";""", line)
            f_out.write(line)


def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
                 engine_mode, text_only: bool,
                 tessconfig: list, timeout: float, pagesegmode: int, log):
    '''Use Tesseract to render a PDF.

    input_image -- image to analyze
    skip_pdf -- if we time out, use this file as output
    output_pdf -- file to generate
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    text_only -- enable tesseract text only mode?
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- page segmentation mode, or None to omit the option
    log -- logger object

    skip_pdf is also copied to output_pdf when Tesseract reports the image
    is too large. Raises TesseractConfigError when Tesseract rejects a
    configuration parameter; any other Tesseract failure is re-raised as
    CalledProcessError.
    '''
    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend([psm(), str(pagesegmode)])

    if text_only:
        args_tesseract.extend(['-c', 'textonly_pdf=1'])

    args_tesseract.extend([
        input_image,
        os.path.splitext(output_pdf)[0],  # Tesseract appends suffix
        'pdf'
    ] + tessconfig)

    log.debug(args_tesseract)
    try:
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        page_timedout(log, input_image)
        shutil.copy(skip_pdf, output_pdf)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_image)
        if 'read_params_file: parameter not found' in e.output:
            raise TesseractConfigError() from e
        if 'Image too large' in e.output:
            shutil.copy(skip_pdf, output_pdf)
            return
        raise
    else:
        tesseract_log_output(log, stdout, input_image)