OCRmyPDF/tests/spoof/tesseract_cache.py
James R. Barlow a4f07756a5 tesseract caching: don't transcode tesseract's output, hash source file
For sanity's sake, deal with tesseract streams in binary without
transcoding (via universal_newlines, etc.). The only differences are
printing messages regarding spoofing.

Also hash the source file so that changes to the cache mechanism
invalidate old cache automatically. That is probably too aggressive,
but simple and safer than the previous approach.
2016-10-28 16:44:12 -07:00

143 lines
3.7 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
import os
import hashlib
import shutil
import subprocess
"""Cache output of tesseract to speed up test suite
The cache is keyed by a hash that includes the tesseract version, some of
the command line, and the binary dump of the input file. The output file,
stdout, and stderr are replicated on a cache hit.
Page orientation checks are also cached (-psm 0 stdout)
Errors and crashes are not cached.
Things not checked:
-changes to tesseract installation that don't affect --version
Will fail on Tesseract 3.02.02 in "hocr" mode because it doesn't produce
the incorrect file extension.
"""
CACHE_PATH = os.path.abspath(os.path.join(
os.path.dirname(__file__), '..', 'cache'))
def real_tesseract():
tess_args = ['tesseract'] + sys.argv[1:]
os.execvp("tesseract", tess_args)
return # Not reachable
def main():
operation = sys.argv[-1]
# For anything unexpected operation, defer to real tesseract binary
if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
real_tesseract()
return # Not reachable
try:
os.makedirs(CACHE_PATH)
except FileExistsError:
pass
m = hashlib.sha1()
tess_version = subprocess.check_output(
['tesseract', '--version'],
stderr=subprocess.STDOUT)
m.update(tess_version)
# Insert this source file into the hash function, to ensure that any
# changes to this file invalidate previous hashes
with open(__file__, 'rb') as f:
m.update(f.read())
m.update(operation.encode())
try:
lang = sys.argv[sys.argv.index('-l') + 1]
m.update(lang.encode())
except ValueError:
pass
try:
psm = sys.argv[sys.argv.index('-psm') + 1]
m.update(psm.encode())
except ValueError:
pass
if operation == 'stdout' and psm != '0':
real_tesseract()
return
if operation == 'stdout':
input_file = sys.argv[-2]
output_file = 'stdout'
else:
input_file = sys.argv[-3]
output_file = sys.argv[-2]
if operation == 'hocr':
output_file += '.hocr'
elif operation == 'pdf':
output_file += '.pdf'
with open(input_file, 'rb') as f:
m.update(f.read())
cache_name = os.path.join(CACHE_PATH, m.hexdigest())
print(cache_name)
if os.path.exists(cache_name):
# Cache hit
print("Tesseract cache hit", file=sys.stderr)
if operation != 'stdout':
shutil.copy(cache_name, output_file)
# Replicate output
with open(cache_name + '.stdout', 'rb') as f:
sys.stdout.buffer.write(f.read())
with open(cache_name + '.stderr', 'rb') as f:
sys.stderr.buffer.write(f.read())
sys.exit(0)
# Cache miss
print("Tesseract cache miss", file=sys.stderr)
# Call tesseract
p = subprocess.Popen(
['tesseract'] + sys.argv[1:],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
if p.returncode != 0:
# Do not cache errors or crashes
print("Tesseract error", file=sys.stderr)
sys.stdout.buffer.write(stdout)
sys.stderr.buffer.write(stderr)
return p.returncode
with open(cache_name + '.stdout', 'wb') as f:
f.write(stdout)
with open(cache_name + '.stderr', 'wb') as f:
f.write(stderr)
sys.stdout.buffer.write(stdout)
sys.stderr.buffer.write(stderr)
# Insert file into cache
if output_file != 'stdout':
if os.path.exists(output_file):
shutil.copy(output_file, cache_name)
else:
print("Could not find output file", file=sys.stderr)
else:
open(cache_name, 'w').close()
if __name__ == '__main__':
main()