mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-10-10 07:27:29 +00:00
322 lines
11 KiB
Python
322 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
import shutil
|
|
from functools import lru_cache
|
|
from ..exceptions import MissingDependencyError, TesseractConfigError
|
|
from ..helpers import page_number
|
|
from . import get_program
|
|
from collections import namedtuple
|
|
from textwrap import dedent
|
|
|
|
from subprocess import Popen, PIPE, CalledProcessError, \
|
|
TimeoutExpired, check_output, STDOUT, DEVNULL
|
|
|
|
|
|
# Result record for page-orientation detection: ``angle`` holds the rotation
# in degrees and ``confidence`` the confidence value parsed from Tesseract's
# orientation/script-detection output (see get_orientation).
OrientationConfidence = namedtuple(
    'OrientationConfidence', ['angle', 'confidence'])
|
|
|
|
# Skeleton hOCR document describing a page with a single blank word.
# ``{0}`` and ``{1}`` are filled in with the page width and height via
# ``str.format`` (see _generate_null_hocr).  The literal must stay free of
# any other ``{``/``}`` characters, and of stray non-XML lines, so that the
# formatted result is a well-formed XML document.
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta name='ocr-system' content='tesseract 3.02.02' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
  <div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
   <div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
    <p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
     <span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
     </span>
    </p>
   </div>
  </div>
 </body>
</html>'''
|
|
|
|
|
|
@lru_cache(maxsize=1)
def version():
    """Return the version string printed by ``tesseract --version``.

    The value is whatever follows ``tesseract`` and one whitespace character
    on the first line of the program's output.  A successful result is
    memoized by ``lru_cache`` so the program is not run again.

    Raises:
        MissingDependencyError: if Tesseract cannot be started, exits with a
            non-zero status, or prints output that does not begin with
            ``tesseract <version>``.
    """
    args_tess = [
        get_program('tesseract'),
        '--version'
    ]
    try:
        versions = check_output(
            args_tess, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except (CalledProcessError, FileNotFoundError) as e:
        # A missing executable surfaces as FileNotFoundError rather than
        # CalledProcessError, so both must be handled to report it properly.
        print("Could not find Tesseract executable on system PATH.",
              file=sys.stderr)
        raise MissingDependencyError from e

    found = re.match(r'tesseract\s(.+)', versions)
    if found is None:
        # Avoid an opaque AttributeError on ``None.group`` when the output
        # is not in the expected format.
        print("Could not determine Tesseract version from its output.",
              file=sys.stderr)
        raise MissingDependencyError
    return found.group(1)
|
|
|
|
|
|
def v4():
    """Return True if the installed Tesseract is version 4 or newer.

    The major version is compared numerically.  A plain string comparison
    against ``'4'`` would misclassify a two-digit major version such as
    ``'10.0'`` as older than 4.  An optional leading ``v`` in the version
    string is tolerated.
    """
    reported = version()
    major = re.match(r'v?(\d+)', reported)
    if major is None:
        # Unrecognised format: keep the historical lexical comparison.
        return reported >= '4'
    return int(major.group(1)) >= 4
|
|
|
|
|
|
def has_textonly_pdf():
    """Return True if the installed Tesseract supports ``textonly_pdf``.

    For every version other than ``4.00.00alpha`` the answer is simply
    whether this is Tesseract 4 or newer.  The parameter was added during
    the 4.00.00alpha cycle, so for that version the output of
    ``tesseract --print-parameters`` is searched for it.

    Raises:
        MissingDependencyError: if ``tesseract --print-parameters`` exits
            with a non-zero status.
    """
    if version() != '4.00.00alpha':
        return v4()

    # textonly_pdf added during the 4.00.00alpha cycle, so we must test
    # more carefully to see if it is present
    args_tess = [
        get_program('tesseract'),
        '--print-parameters'
    ]
    try:
        params = check_output(
            args_tess, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except CalledProcessError as e:
        print("Could not --print-parameters from tesseract",
              file=sys.stderr)
        raise MissingDependencyError from e

    # Always return a real bool instead of falling off the end with None
    # when the parameter is absent.
    return 'textonly_pdf' in params
|
|
|
|
|
|
def psm():
    """Return the page segmentation mode flag for the installed Tesseract.

    Tesseract 4 takes ``--psm``; earlier versions take ``-psm``.
    """
    if v4():
        return '--psm'
    return '-psm'
|
|
|
|
|
|
@lru_cache(maxsize=1)
def languages():
    """Return the set of language names printed by ``tesseract --list-langs``.

    The first line of the program's output is skipped (presumably a heading
    rather than a language name) and every remaining line is stripped of
    surrounding whitespace.  A successful result is memoized.

    Raises:
        MissingDependencyError: if Tesseract exits with a non-zero status;
            its output is echoed to stderr first.
    """
    command = [get_program('tesseract'), '--list-langs']
    try:
        listing = check_output(
            command, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except CalledProcessError as e:
        report = dedent("""\
            Tesseract failed to report available languages.
            Output from Tesseract:
            -----------
            """)
        report += e.output
        print(report, file=sys.stderr)
        raise MissingDependencyError from e

    return {entry.strip() for entry in listing.splitlines()[1:]}
|
|
|
|
|
|
def tess_base_args(languages, engine_mode):
    """Build the leading part of a Tesseract command line.

    languages -- iterable of language names; joined with ``+`` and passed
                 via ``-l`` when non-empty
    engine_mode -- value for ``--oem``; only added when it is not None and
                   Tesseract 4 is in use

    Returns a new list beginning with the Tesseract program path.
    """
    command = [get_program('tesseract')]
    if languages:
        command += ['-l', '+'.join(languages)]
    if engine_mode is not None and v4():
        command += ['--oem', str(engine_mode)]
    return command
|
|
|
|
|
|
def get_orientation(input_file, language: list, engine_mode, timeout: float,
                    log):
    """Ask Tesseract (page segmentation mode 0) for the page orientation.

    input_file -- image to analyze
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    timeout -- timeout (seconds)
    log -- logger object

    Returns an OrientationConfidence.  The angle is kept clockwise.  A zero
    angle with zero confidence is returned when Tesseract times out, or when
    it fails reporting "Too few characters. Skipping this page" or
    "Image too large".

    Raises:
        CalledProcessError: for any other Tesseract failure.
    """
    args_tesseract = tess_base_args(language, engine_mode) + [
        psm(), '0',
        input_file,
        'stdout'
    ]

    try:
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if ('Too few characters. Skipping this page' in e.output or
                'Image too large' in e.output):
            return OrientationConfidence(0, 0)
        # Bare raise keeps the original traceback and avoids making the
        # exception its own __cause__.
        raise

    # Collect the "key: value" lines of the report.  Splitting on the first
    # colon only keeps lines whose value itself contains a colon.
    osd = {}
    for line in stdout.splitlines():
        key, colon, value = line.strip().partition(':')
        if colon:
            osd[key.strip()] = value.strip()

    angle = int(osd.get('Orientation in degrees', 0))
    if 'Orientation' in osd:
        # Tesseract < 3.04.01
        # reports "Orientation in degrees" as a counterclockwise angle
        # We keep it clockwise
        assert 'Rotate' not in osd
        angle = -angle % 360
    else:
        # Tesseract == 3.04.01, hopefully also Tesseract > 3.04.01
        # reports "Orientation in degrees" as a clockwise angle
        assert 'Rotate' in osd

    return OrientationConfidence(
        angle=angle,
        confidence=float(osd.get('Orientation confidence', 0)))
|
|
|
|
|
|
def tesseract_log_output(log, stdout, input_file):
    """Forward Tesseract's console output to ``log``, one line at a time.

    log -- logger object
    stdout -- text captured from Tesseract
    input_file -- file being processed; its page number prefixes each message

    Banner lines and "Warning in pixReadMem" lines are dropped.  Lines about
    diacritics or a weak OSD margin become fixed warnings.  Lines mentioning
    an error, an exception or ``read_params_file`` are logged as errors, and
    everything else is logged at info level.
    """
    lines = stdout.splitlines()
    prefix = "{0:4d}: [tesseract] ".format(page_number(input_file))

    ignored_starts = ("Tesseract Open Source", "Warning in pixReadMem")
    error_markers = ('error', 'exception', 'read_params_file')

    for raw in lines:
        if raw.startswith(ignored_starts):
            continue
        if 'diacritics' in raw:
            log.warning(prefix + "lots of diacritics - possibly poor OCR")
            continue
        if raw.startswith('OSD: Weak margin'):
            log.warning(prefix + "unsure about page orientation")
            continue

        lowered = raw.lower()
        if any(marker in lowered for marker in error_markers):
            log.error(prefix + raw.strip())
        else:
            log.info(prefix + raw.strip())
|
|
|
|
|
|
def page_timedout(log, input_file):
    """Log a warning that OCR of ``input_file`` took too long and was skipped."""
    page = page_number(input_file)
    message = "{0:4d}: [tesseract] ".format(page) + \
        " took too long to OCR - skipping"
    log.warning(message)
|
|
|
|
|
|
def _generate_null_hocr(output_hocr, image):
    """Produce a .hocr file that reports no text detected on a page that is
    the same size as the input image.

    output_hocr -- path of the hOCR file to write (UTF-8)
    image -- image whose pixel dimensions are used for the page size
    """
    from PIL import Image

    # Open the image in a context manager so its file handle is released
    # as soon as the size has been read.
    with Image.open(image) as im:
        w, h = im.size

    with open(output_hocr, 'w', encoding="utf-8") as f:
        f.write(HOCR_TEMPLATE.format(w, h))
|
|
|
|
|
|
def generate_hocr(input_file, output_hocr, language: list, engine_mode,
                  tessconfig: list,
                  timeout: float, pagesegmode: int, log):
    '''Use Tesseract to produce an hOCR file for an image.

    input_file -- image to analyze
    output_hocr -- hOCR file to generate
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- page segmentation mode, or None to use Tesseract's default
    log -- logger object

    Tesseract writes to a temporary ".badxml" file next to output_hocr,
    which is then rewritten into output_hocr with the embedded source
    filename blanked out.  If Tesseract times out or reports
    "Image too large", an hOCR file with no recognized text is written
    instead.

    Raises TesseractConfigError when Tesseract reports an unknown
    configuration parameter, and CalledProcessError for any other failure.
    '''

    badxml = os.path.splitext(output_hocr)[0] + '.badxml'

    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend([psm(), str(pagesegmode)])

    args_tesseract.extend([
        input_file,
        badxml,
        'hocr'
    ] + tessconfig)
    try:
        log.debug(args_tesseract)
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(log, input_file)
        _generate_null_hocr(output_hocr, input_file)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if 'read_params_file: parameter not found' in e.output:
            raise TesseractConfigError() from e
        if 'Image too large' in e.output:
            _generate_null_hocr(output_hocr, input_file)
            return

        # Bare raise keeps the original traceback and avoids making the
        # exception its own __cause__.
        raise
    else:
        # Only post-process when Tesseract actually ran to completion;
        # otherwise there is no .badxml output to rewrite.
        tesseract_log_output(log, stdout, input_file)

        if os.path.exists(badxml + '.html'):
            # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
            shutil.move(badxml + '.html', badxml)
        elif os.path.exists(badxml + '.hocr'):
            # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
            shutil.move(badxml + '.hocr', badxml)

        # Tesseract 3.03 inserts source filename into hocr file without
        # escaping it, creating invalid XML and breaking the parser.
        # As a workaround, rewrite the hocr file, replacing the filename
        # with a space.  Don't know if Tesseract 3.02 does the same.

        regex_nested_single_quotes = re.compile(
            r"""title='image "([^"]*)";""")
        with open(badxml, mode='r', encoding='utf-8') as f_in, \
                open(output_hocr, mode='w', encoding='utf-8') as f_out:
            for line in f_in:
                line = regex_nested_single_quotes.sub(
                    r"""title='image " ";""", line)
                f_out.write(line)
|
|
|
|
|
|
def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
                 engine_mode, text_only: bool,
                 tessconfig: list, timeout: float, pagesegmode: int, log):
    '''Use Tesseract to render a PDF.

    input_image -- image to analyze
    skip_pdf -- if we time out, use this file as output
    output_pdf -- file to generate
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    text_only -- enable tesseract text only mode?
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- page segmentation mode, or None to use Tesseract's default
    log -- logger object

    skip_pdf is also copied to output_pdf when Tesseract reports
    "Image too large".

    Raises TesseractConfigError when Tesseract reports an unknown
    configuration parameter, and CalledProcessError for any other failure.
    '''

    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend([psm(), str(pagesegmode)])

    if text_only:
        args_tesseract.extend(['-c', 'textonly_pdf=1'])

    args_tesseract.extend([
        input_image,
        os.path.splitext(output_pdf)[0],  # Tesseract appends suffix
        'pdf'
    ] + tessconfig)

    try:
        log.debug(args_tesseract)
        stdout = check_output(
            args_tesseract, close_fds=True, stderr=STDOUT,
            universal_newlines=True, timeout=timeout)
    except TimeoutExpired:
        page_timedout(log, input_image)
        shutil.copy(skip_pdf, output_pdf)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_image)
        if 'read_params_file: parameter not found' in e.output:
            raise TesseractConfigError() from e

        if 'Image too large' in e.output:
            shutil.copy(skip_pdf, output_pdf)
            return
        # Bare raise keeps the original traceback and avoids making the
        # exception its own __cause__.
        raise
    else:
        tesseract_log_output(log, stdout, input_image)
|