OCRmyPDF/tests/spoof/tesseract_cache.py
James R. Barlow 8c17c9918e Add documentation and test cases for --tesseract-config
This parameter has existed for a long time but never really got any
attention.
2017-01-28 22:06:51 -08:00

158 lines
4.2 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
import os
import hashlib
import shutil
import subprocess
"""Cache output of tesseract to speed up test suite
The cache is keyed by a hash that includes the tesseract version, some of
the command line, and the binary dump of the input file. The output file,
stdout, and stderr are replicated on a cache hit.
Page orientation checks are also cached (-psm 0 stdout)
Errors and crashes are not cached.
Things not checked:
-changes to tesseract installation that don't affect --version
Will fail on Tesseract 3.02.02 in "hocr" mode because that version does not
produce the expected ".hocr" file extension.
"""
# Absolute path of the "cache" directory that sits beside this file's own
# directory (i.e. one level up from here, then into "cache").
_SPOOF_DIR = os.path.dirname(__file__)
CACHE_PATH = os.path.abspath(os.path.join(_SPOOF_DIR, os.pardir, 'cache'))
def real_tesseract():
    """Hand the invocation over to the real tesseract executable.

    The current process image is replaced via ``os.execvp``, forwarding every
    command line argument after the script name unchanged, so on success this
    function never returns to its caller.
    """
    argv = ['tesseract']
    argv.extend(sys.argv[1:])
    os.execvp("tesseract", argv)
    return  # Not reachable
def _option_value(argv, *flags):
    """Return the argument that follows the first of *flags* found in *argv*.

    Flags are tried in the order given. Returns None when none of the flags
    is present, or when the flag is the final argument and has no value.
    """
    for flag in flags:
        if flag in argv:
            value_index = argv.index(flag) + 1
            return argv[value_index] if value_index < len(argv) else None
    return None


def main():
    """Run tesseract through the on-disk cache, replaying results on a hit.

    The command line is read from ``sys.argv``; its last argument selects the
    operation. Only 'hocr', 'pdf' and 'stdout' (the latter only with page
    segmentation mode '0') are cached; everything else is handed to the real
    tesseract binary. The cache key is a SHA-1 over the tesseract version
    output, this source file, the operation, the language, the page
    segmentation mode and the bytes of the input file.

    Diagnostics from this wrapper (cache name, hit/miss) go to stderr so that
    stdout carries only what tesseract itself produced.

    Returns:
        None after a successful cache miss, or tesseract's nonzero return
        code when the real run fails (failures are never cached). The caller
        should pass the return value to ``sys.exit``. On a cache hit the
        process exits directly with status 0.
    """
    operation = sys.argv[-1]

    # For any unexpected operation, defer to the real tesseract binary.
    # Currently this includes all use of "--tesseract-config".
    if operation not in ('hocr', 'pdf', 'stdout'):
        real_tesseract()
        return  # Not reachable

    os.makedirs(CACHE_PATH, exist_ok=True)

    digest = hashlib.sha1()

    tess_version = subprocess.check_output(
        ['tesseract', '--version'],
        stderr=subprocess.STDOUT)
    if b'4.00.00alpha' in tess_version:
        # Tesseract 4.x alpha is a moving target, don't cache it
        real_tesseract()
        return
    digest.update(tess_version)

    # Insert this source file into the hash function, to ensure that any
    # changes to this file invalidate previous hashes
    with open(__file__, 'rb') as f:
        digest.update(f.read())

    digest.update(operation.encode())

    lang = _option_value(sys.argv, '-l')
    digest.update(lang.encode() if lang is not None else b'default-lang')

    # psm stays None when no page segmentation mode was requested; binding it
    # unconditionally keeps the 'stdout' check below from hitting an unbound
    # local.
    psm = _option_value(sys.argv, '--psm', '-psm')
    digest.update(psm.encode() if psm is not None else b'default-psm')

    # Only the orientation check (psm 0 written to stdout) is cached; any
    # other use of stdout goes straight to the real binary.
    if operation == 'stdout' and psm != '0':
        real_tesseract()
        return

    if operation == 'stdout':
        input_file = sys.argv[-2]
        output_file = 'stdout'
    else:
        input_file = sys.argv[-3]
        # tesseract is given an output base name and appends the extension
        output_file = sys.argv[-2] + ('.hocr' if operation == 'hocr' else '.pdf')

    with open(input_file, 'rb') as f:
        digest.update(f.read())

    cache_name = os.path.join(CACHE_PATH, digest.hexdigest())
    # Diagnostic only: keep it off stdout, which must mirror tesseract's own
    print(cache_name, file=sys.stderr)

    if os.path.exists(cache_name):
        # Cache hit
        print("Tesseract cache hit", file=sys.stderr)
        if operation != 'stdout':
            shutil.copy(cache_name, output_file)

        # Replicate output
        with open(cache_name + '.stdout', 'rb') as f:
            sys.stdout.buffer.write(f.read())
        with open(cache_name + '.stderr', 'rb') as f:
            sys.stderr.buffer.write(f.read())
        sys.exit(0)

    # Cache miss
    print("Tesseract cache miss", file=sys.stderr)

    # Call tesseract
    p = subprocess.Popen(
        ['tesseract'] + sys.argv[1:],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()

    if p.returncode != 0:
        # Do not cache errors or crashes
        print("Tesseract error", file=sys.stderr)
        sys.stdout.buffer.write(stdout)
        sys.stderr.buffer.write(stderr)
        return p.returncode

    # The captured streams are stored before the main cache entry, because
    # the existence of the main entry is what signals a cache hit.
    with open(cache_name + '.stdout', 'wb') as f:
        f.write(stdout)
    with open(cache_name + '.stderr', 'wb') as f:
        f.write(stderr)
    sys.stdout.buffer.write(stdout)
    sys.stderr.buffer.write(stderr)

    # Insert file into cache
    if output_file != 'stdout':
        if os.path.exists(output_file):
            shutil.copy(output_file, cache_name)
        else:
            print("Could not find output file", file=sys.stderr)
    else:
        # Nothing to store for stdout runs; an empty marker records the hit
        with open(cache_name, 'w'):
            pass
if __name__ == '__main__':
    # main() returns tesseract's return code when the real run fails (and
    # None otherwise); propagate it so callers see the failure instead of a
    # spurious exit status of 0.
    sys.exit(main())