#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
import sys
import os
import re
import shutil
from functools import lru_cache
from ..exceptions import MissingDependencyError, TesseractConfigError
from ..helpers import page_number
from . import get_program
from collections import namedtuple
from textwrap import dedent
from subprocess import Popen, PIPE, CalledProcessError, \
TimeoutExpired, check_output, STDOUT, DEVNULL
OrientationConfidence = namedtuple(
'OrientationConfidence',
('angle', 'confidence'))
HOCR_TEMPLATE = '''
'''
@lru_cache(maxsize=1)
def version():
args_tess = [
get_program('tesseract'),
'--version'
]
try:
versions = check_output(
args_tess, close_fds=True, universal_newlines=True,
stderr=STDOUT)
except CalledProcessError as e:
print("Could not find Tesseract executable on system PATH.",
file=sys.stderr)
raise MissingDependencyError from e
tesseract_version = re.match(r'tesseract\s(.+)', versions).group(1)
return tesseract_version
def v4():
"Is this Tesseract v4.0?"
return (version() >= '4')
def has_textonly_pdf():
if version() == '4.00.00alpha':
# textonly_pdf added during the 4.00.00alpha cycle, so we must test
# more carefully to see if it is present
args_tess = [
get_program('tesseract'),
'--print-parameters'
]
try:
params = check_output(
args_tess, close_fds=True, universal_newlines=True,
stderr=STDOUT)
except CalledProcessError as e:
print("Could not --print-parameters from tesseract",
file=sys.stderr)
raise MissingDependencyError from e
if 'textonly_pdf' in params:
return True
else:
return v4()
def psm():
"If Tesseract 4.0, use argument --psm instead of -psm"
return '--psm' if v4() else '-psm'
@lru_cache(maxsize=1)
def languages():
args_tess = [
get_program('tesseract'),
'--list-langs'
]
try:
langs = check_output(
args_tess, close_fds=True, universal_newlines=True,
stderr=STDOUT)
except CalledProcessError as e:
msg = dedent("""Tesseract failed to report available languages.
Output from Tesseract:
-----------
""")
msg += e.output
print(msg, file=sys.stderr)
raise MissingDependencyError from e
return set(lang.strip() for lang in langs.splitlines()[1:])
def tess_base_args(languages, engine_mode):
args = [
get_program('tesseract'),
]
if languages:
args.extend(['-l', '+'.join(languages)])
if engine_mode is not None and v4():
args.extend(['--oem', str(engine_mode)])
return args
def get_orientation(input_file, language: list, engine_mode, timeout: float,
log):
args_tesseract = tess_base_args(language, engine_mode) + [
psm(), '0',
input_file,
'stdout'
]
try:
stdout = check_output(
args_tesseract, close_fds=True, stderr=STDOUT,
universal_newlines=True, timeout=timeout)
except TimeoutExpired:
return OrientationConfidence(angle=0, confidence=0.0)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_file)
if ('Too few characters. Skipping this page' in e.output or
'Image too large' in e.output):
return OrientationConfidence(0, 0)
raise e from e
else:
osd = {}
for line in stdout.splitlines():
line = line.strip()
parts = line.split(':', maxsplit=2)
if len(parts) == 2:
osd[parts[0].strip()] = parts[1].strip()
angle = int(osd.get('Orientation in degrees', 0))
if 'Orientation' in osd:
# Tesseract < 3.04.01
# reports "Orientation in degrees" as a counterclockwise angle
# We keep it clockwise
assert 'Rotate' not in osd
angle = -angle % 360
else:
# Tesseract == 3.04.01, hopefully also Tesseract > 3.04.01
# reports "Orientation in degrees" as a clockwise angle
assert 'Rotate' in osd
oc = OrientationConfidence(
angle=angle,
confidence=float(osd.get('Orientation confidence', 0)))
return oc
def tesseract_log_output(log, stdout, input_file):
lines = stdout.splitlines()
prefix = "{0:4d}: [tesseract] ".format(page_number(input_file))
for line in lines:
if line.startswith("Tesseract Open Source"):
continue
elif line.startswith("Warning in pixReadMem"):
continue
elif 'diacritics' in line:
log.warning(prefix + "lots of diacritics - possibly poor OCR")
elif line.startswith('OSD: Weak margin'):
log.warning(prefix + "unsure about page orientation")
elif 'error' in line.lower() or 'exception' in line.lower():
log.error(prefix + line.strip())
elif 'read_params_file' in line.lower():
log.error(prefix + line.strip())
else:
log.info(prefix + line.strip())
def page_timedout(log, input_file):
prefix = "{0:4d}: [tesseract] ".format(page_number(input_file))
log.warning(prefix + " took too long to OCR - skipping")
def _generate_null_hocr(output_hocr, image):
"""Produce a .hocr file that reports no text detected on a page that is
the same size as the input image."""
from PIL import Image
im = Image.open(image)
w, h = im.size
with open(output_hocr, 'w', encoding="utf-8") as f:
f.write(HOCR_TEMPLATE.format(w, h))
def generate_hocr(input_file, output_hocr, language: list, engine_mode,
tessconfig: list,
timeout: float, pagesegmode: int, log):
badxml = os.path.splitext(output_hocr)[0] + '.badxml'
args_tesseract = tess_base_args(language, engine_mode)
if pagesegmode is not None:
args_tesseract.extend([psm(), str(pagesegmode)])
args_tesseract.extend([
input_file,
badxml,
'hocr'
] + tessconfig)
try:
log.debug(args_tesseract)
stdout = check_output(
args_tesseract, close_fds=True, stderr=STDOUT,
universal_newlines=True, timeout=timeout)
except TimeoutExpired:
# Generate a HOCR file with no recognized text if tesseract times out
# Temporary workaround to hocrTransform not being able to function if
# it does not have a valid hOCR file.
page_timedout(log, input_file)
_generate_null_hocr(output_hocr, input_file)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_file)
if 'read_params_file: parameter not found' in e.output:
raise TesseractConfigError() from e
if 'Image too large' in e.output:
_generate_null_hocr(output_hocr, input_file)
return
raise e from e
else:
tesseract_log_output(log, stdout, input_file)
if os.path.exists(badxml + '.html'):
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
shutil.move(badxml + '.html', badxml)
elif os.path.exists(badxml + '.hocr'):
# Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
shutil.move(badxml + '.hocr', badxml)
# Tesseract 3.03 inserts source filename into hocr file without
# escaping it, creating invalid XML and breaking the parser.
# As a workaround, rewrite the hocr file, replacing the filename
# with a space. Don't know if Tesseract 3.02 does the same.
regex_nested_single_quotes = re.compile(
r"""title='image "([^"]*)";""")
with open(badxml, mode='r', encoding='utf-8') as f_in, \
open(output_hocr, mode='w', encoding='utf-8') as f_out:
for line in f_in:
line = regex_nested_single_quotes.sub(
r"""title='image " ";""", line)
f_out.write(line)
def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
engine_mode, text_only: bool,
tessconfig: list, timeout: float, pagesegmode: int, log):
'''Use Tesseract to render a PDF.
input_image -- image to analyze
skip_pdf -- if we time out, use this file as output
output_pdf -- file to generate
language -- list of languages to consider
engine_mode -- engine mode argument for tess v4
text_only -- enable tesseract text only mode?
tessconfig -- tesseract configuration
timeout -- timeout (seconds)
log -- logger object
'''
args_tesseract = tess_base_args(language, engine_mode)
if pagesegmode is not None:
args_tesseract.extend([psm(), str(pagesegmode)])
if text_only:
args_tesseract.extend(['-c', 'textonly_pdf=1'])
args_tesseract.extend([
input_image,
os.path.splitext(output_pdf)[0], # Tesseract appends suffix
'pdf'
] + tessconfig)
try:
log.debug(args_tesseract)
stdout = check_output(
args_tesseract, close_fds=True, stderr=STDOUT,
universal_newlines=True, timeout=timeout)
except TimeoutExpired:
page_timedout(log, input_image)
shutil.copy(skip_pdf, output_pdf)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_image)
if 'read_params_file: parameter not found' in e.output:
raise TesseractConfigError() from e
if 'Image too large' in e.output:
shutil.copy(skip_pdf, output_pdf)
return
raise e from e
else:
tesseract_log_output(log, stdout, input_image)