OCRmyPDF/ocrmypdf/exec/tesseract.py
James R. Barlow 8c17c9918e Add documentation and test cases for --tesseract-config
This parameter has existed for a long time but never really got any
attention.
2017-01-28 22:06:51 -08:00

322 lines
11 KiB
Python

#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
import sys
import os
import re
import shutil
from functools import lru_cache
from ..exceptions import MissingDependencyError, TesseractConfigError
from ..helpers import page_number
from . import get_program
from collections import namedtuple
from textwrap import dedent
from subprocess import Popen, PIPE, CalledProcessError, \
TimeoutExpired, check_output, STDOUT, DEVNULL
# Result type of get_orientation(): `angle` is the page orientation in degrees
# (get_orientation normalizes it to clockwise) and `confidence` is the value
# parsed from Tesseract's "Orientation confidence" output line.
OrientationConfidence = namedtuple(
'OrientationConfidence',
('angle', 'confidence'))
# Skeleton hOCR document containing one page, one line and a single blank
# word. It is filled in with str.format(width, height): {0} and {1} become the
# right/bottom extents of every bbox. _generate_null_hocr() writes it out when
# no real OCR output is available for a page.
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.02.02' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
<span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
</span>
</p>
</div>
</div>
</body>
</html>'''
@lru_cache(maxsize=1)
def version():
    """Return the version string printed by ``tesseract --version``.

    The result is memoized with lru_cache, so later calls reuse the value
    obtained by the first successful call.

    Raises MissingDependencyError (after printing a message to stderr) when
    the command exits with a non-zero status.
    """
    command = [get_program('tesseract'), '--version']
    try:
        banner = check_output(
            command, close_fds=True, universal_newlines=True, stderr=STDOUT)
    except CalledProcessError as e:
        print("Could not find Tesseract executable on system PATH.",
              file=sys.stderr)
        raise MissingDependencyError from e

    # The output is expected to begin with "tesseract <version>"; the
    # version is everything after the separator up to the end of that line.
    found = re.match(r'tesseract\s(.+)', banner)
    return found.group(1)
def v4():
    """Is this Tesseract v4.0 or newer?

    The major version number is compared numerically. A plain string
    comparison against '4' misorders versions such as '10.0' (sorts before
    '4') or 'v3.05' (sorts after '4').
    """
    reported = version()
    # Accept an optional leading "v" (e.g. "v4.0.0-beta") before the digits.
    major = re.match(r'v?(\d+)', reported)
    if major is None:
        # Unrecognized format: keep the historical string comparison.
        return reported >= '4'
    return int(major.group(1)) >= 4
def has_textonly_pdf():
    """Report whether this Tesseract knows the ``textonly_pdf`` parameter.

    For every version other than '4.00.00alpha' the answer is simply v4().
    For '4.00.00alpha' the parameter list printed by
    ``tesseract --print-parameters`` is searched for 'textonly_pdf'.

    Always returns a bool.

    Raises MissingDependencyError (after printing a message to stderr) when
    ``tesseract --print-parameters`` exits with a non-zero status.
    """
    if version() != '4.00.00alpha':
        return v4()

    # textonly_pdf added during the 4.00.00alpha cycle, so we must test
    # more carefully to see if it is present
    args_tess = [
        get_program('tesseract'),
        '--print-parameters'
    ]
    try:
        params = check_output(
            args_tess, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except CalledProcessError as e:
        print("Could not --print-parameters from tesseract",
              file=sys.stderr)
        raise MissingDependencyError from e

    # Return an explicit True/False instead of falling off the end of the
    # function (and returning None) when the parameter is absent.
    return 'textonly_pdf' in params
def psm():
    """Return the page segmentation mode flag for this Tesseract.

    Gives '--psm' when v4() is true and '-psm' otherwise.
    """
    if v4():
        return '--psm'
    return '-psm'
@lru_cache(maxsize=1)
def languages():
    """Return the set of names printed by ``tesseract --list-langs``.

    The result is memoized with lru_cache.

    Raises MissingDependencyError when the command exits with a non-zero
    status; Tesseract's output is echoed to stderr first.
    """
    command = [get_program('tesseract'), '--list-langs']
    try:
        listing = check_output(
            command, close_fds=True, universal_newlines=True, stderr=STDOUT)
    except CalledProcessError as e:
        report = dedent("""Tesseract failed to report available languages.
Output from Tesseract:
-----------
""") + e.output
        print(report, file=sys.stderr)
        raise MissingDependencyError from e

    # Skip the first output line (presumably a header preceding the list)
    # and strip surrounding whitespace from each remaining entry.
    entries = listing.splitlines()[1:]
    return {entry.strip() for entry in entries}
def tess_base_args(languages, engine_mode):
    """Build the leading arguments shared by every Tesseract invocation.

    languages -- iterable of language names; joined with '+' for ``-l``
                 (the option is omitted when this is empty or None)
    engine_mode -- value for ``--oem``; only passed when it is not None and
                   v4() is true

    Returns a new list beginning with the Tesseract program path.
    """
    command = [get_program('tesseract')]
    if languages:
        command += ['-l', '+'.join(languages)]
    if engine_mode is not None and v4():
        command += ['--oem', str(engine_mode)]
    return command
def get_orientation(input_file, language: list, engine_mode, timeout: float,
                    log):
    """Ask Tesseract for the orientation of the page in *input_file*.

    Tesseract is run with page segmentation mode 0 and its "key: value"
    output lines are parsed.

    input_file -- image to analyze
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    timeout -- timeout (seconds)
    log -- logger object

    Returns an OrientationConfidence with a clockwise angle in degrees.
    A timeout, or a Tesseract failure whose output mentions
    'Too few characters. Skipping this page' or 'Image too large', yields
    angle 0 with confidence 0.0. Any other CalledProcessError propagates.
    """
    args_tesseract = tess_base_args(language, engine_mode) + [
        psm(), '0',
        input_file,
        'stdout'
    ]

    try:
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if ('Too few characters. Skipping this page' in e.output or
                'Image too large' in e.output):
            return OrientationConfidence(angle=0, confidence=0.0)
        # Re-raise the active exception as-is; "raise e from e" would make
        # the exception its own __cause__.
        raise

    # Collect "key: value" lines. Lines without exactly one colon are
    # ignored (two or more colons split into three parts).
    osd = {}
    for line in stdout.splitlines():
        parts = line.strip().split(':', maxsplit=2)
        if len(parts) == 2:
            osd[parts[0].strip()] = parts[1].strip()

    angle = int(osd.get('Orientation in degrees', 0))
    if 'Orientation' in osd:
        # Tesseract < 3.04.01
        # reports "Orientation in degrees" as a counterclockwise angle
        # We keep it clockwise
        assert 'Rotate' not in osd
        angle = -angle % 360
    else:
        # Tesseract == 3.04.01, hopefully also Tesseract > 3.04.01
        # reports "Orientation in degrees" as a clockwise angle
        assert 'Rotate' in osd

    return OrientationConfidence(
        angle=angle,
        confidence=float(osd.get('Orientation confidence', 0)))
def tesseract_log_output(log, stdout, input_file):
    """Relay Tesseract's output to *log*, one message per line.

    log -- logger object
    stdout -- text captured from Tesseract
    input_file -- file being processed; its page number prefixes each message

    Banner lines and pixReadMem warnings are dropped, two known notices are
    replaced by short warnings, lines mentioning an error, an exception or
    read_params_file are logged as errors, and the rest as info.
    """
    entries = stdout.splitlines()
    prefix = "{0:4d}: [tesseract] ".format(page_number(input_file))
    ignored_starts = ("Tesseract Open Source", "Warning in pixReadMem")

    for entry in entries:
        if entry.startswith(ignored_starts):
            continue
        if 'diacritics' in entry:
            log.warning(prefix + "lots of diacritics - possibly poor OCR")
            continue
        if entry.startswith('OSD: Weak margin'):
            log.warning(prefix + "unsure about page orientation")
            continue

        lowered = entry.lower()
        message = prefix + entry.strip()
        is_error = ('error' in lowered
                    or 'exception' in lowered
                    or 'read_params_file' in lowered)
        if is_error:
            log.error(message)
        else:
            log.info(message)
def page_timedout(log, input_file):
    """Log a warning that OCR of the page in *input_file* took too long."""
    page = page_number(input_file)
    tag = "{0:4d}: [tesseract] ".format(page)
    log.warning(tag + " took too long to OCR - skipping")
def _generate_null_hocr(output_hocr, image):
    """Produce a .hocr file that reports no text detected on a page that is
    the same size as the input image.

    output_hocr -- path of the hOCR file to write (UTF-8)
    image -- image whose pixel dimensions are used for the page bbox
    """
    from PIL import Image

    # Only the dimensions are needed, so release the image file right away
    # instead of leaving it open until garbage collection.
    with Image.open(image) as im:
        w, h = im.size

    with open(output_hocr, 'w', encoding="utf-8") as f:
        f.write(HOCR_TEMPLATE.format(w, h))
def generate_hocr(input_file, output_hocr, language: list, engine_mode,
                  tessconfig: list,
                  timeout: float, pagesegmode: int, log):
    """Use Tesseract to produce an hOCR file for one page image.

    input_file -- image to analyze
    output_hocr -- hOCR file to generate
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- page segmentation mode; the argument is omitted if None
    log -- logger object

    On timeout, or when Tesseract reports 'Image too large', an hOCR file
    with no recognized text is written instead.

    Raises TesseractConfigError when Tesseract reports
    'read_params_file: parameter not found'; any other CalledProcessError
    propagates.
    """
    # Tesseract writes to an intermediate file first; it is rewritten into
    # output_hocr below.
    badxml = os.path.splitext(output_hocr)[0] + '.badxml'

    args_tesseract = tess_base_args(language, engine_mode)
    if pagesegmode is not None:
        args_tesseract.extend([psm(), str(pagesegmode)])

    args_tesseract.extend([
        input_file,
        badxml,
        'hocr'
    ] + tessconfig)

    try:
        log.debug(args_tesseract)
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(log, input_file)
        _generate_null_hocr(output_hocr, input_file)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if 'read_params_file: parameter not found' in e.output:
            raise TesseractConfigError() from e
        if 'Image too large' in e.output:
            _generate_null_hocr(output_hocr, input_file)
            return
        # Re-raise the active exception as-is; "raise e from e" would make
        # the exception its own __cause__.
        raise
    else:
        tesseract_log_output(log, stdout, input_file)

        if os.path.exists(badxml + '.html'):
            # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
            shutil.move(badxml + '.html', badxml)
        elif os.path.exists(badxml + '.hocr'):
            # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
            shutil.move(badxml + '.hocr', badxml)

        # Tesseract 3.03 inserts source filename into hocr file without
        # escaping it, creating invalid XML and breaking the parser.
        # As a workaround, rewrite the hocr file, replacing the filename
        # with a space.  Don't know if Tesseract 3.02 does the same.

        regex_nested_single_quotes = re.compile(
            r"""title='image "([^"]*)";""")
        with open(badxml, mode='r', encoding='utf-8') as f_in, \
                open(output_hocr, mode='w', encoding='utf-8') as f_out:
            for line in f_in:
                line = regex_nested_single_quotes.sub(
                    r"""title='image " ";""", line)
                f_out.write(line)
def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
                 engine_mode, text_only: bool,
                 tessconfig: list, timeout: float, pagesegmode: int, log):
    '''Use Tesseract to render a PDF.

    input_image -- image to analyze
    skip_pdf -- if we time out, use this file as output
    output_pdf -- file to generate
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    text_only -- enable tesseract text only mode?
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- page segmentation mode; the argument is omitted if None
    log -- logger object

    skip_pdf is also copied to output_pdf when Tesseract reports
    'Image too large'.

    Raises TesseractConfigError when Tesseract reports
    'read_params_file: parameter not found'; any other CalledProcessError
    propagates.
    '''
    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend([psm(), str(pagesegmode)])

    if text_only:
        args_tesseract.extend(['-c', 'textonly_pdf=1'])

    args_tesseract.extend([
        input_image,
        os.path.splitext(output_pdf)[0],  # Tesseract appends suffix
        'pdf'
    ] + tessconfig)

    try:
        log.debug(args_tesseract)
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        page_timedout(log, input_image)
        shutil.copy(skip_pdf, output_pdf)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_image)
        if 'read_params_file: parameter not found' in e.output:
            raise TesseractConfigError() from e
        if 'Image too large' in e.output:
            shutil.copy(skip_pdf, output_pdf)
            return
        # Re-raise the active exception as-is; "raise e from e" would make
        # the exception its own __cause__.
        raise
    else:
        tesseract_log_output(log, stdout, input_image)