2015-12-17 12:52:12 -08:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import hashlib
|
|
|
|
import shutil
|
|
|
|
import subprocess
|
|
|
|
|
|
|
|
|
2016-08-26 15:04:08 -07:00
|
|
|
"""Cache output of tesseract to speed up test suite
|
|
|
|
|
|
|
|
The cache is keyed by a hash that includes the tesseract version, some of
|
|
|
|
the command line, and the binary dump of the input file. The output file,
|
|
|
|
stdout, and stderr are replicated on a cache hit.
|
|
|
|
|
|
|
|
Page orientation checks are also cached (-psm 0 stdout)
|
|
|
|
|
|
|
|
Errors and crashes are not cached.
|
|
|
|
|
|
|
|
Things not checked:
|
|
|
|
-changes to tesseract installation that don't affect --version
|
|
|
|
|
|
|
|
Will fail on Tesseract 3.02.02 in "hocr" mode because that version does not
produce the expected ".hocr" file extension.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
2015-12-17 12:52:12 -08:00
|
|
|
# Cache directory: "cache", one level above the directory containing this
# script. abspath() resolves the ".." component into a normalized path.
CACHE_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', 'cache')
)
|
|
|
|
|
|
|
|
|
2016-02-08 02:21:56 -08:00
|
|
|
def real_tesseract():
    """Replace this process with the real tesseract, forwarding all arguments.

    Every argument after the program name is passed through unchanged.
    A successful ``os.execvp`` never returns, so neither does this function.
    """
    command = ['tesseract']
    command.extend(sys.argv[1:])
    os.execvp("tesseract", command)
|
|
|
|
|
2015-12-17 12:52:12 -08:00
|
|
|
def main():
    """Serve a tesseract invocation from the cache, running tesseract on a miss.

    The last command-line argument selects the operation: ``hocr``, ``pdf`` or
    ``stdout``. Any other operation, a ``stdout`` run whose ``-psm`` value is
    not ``0`` (or is missing), and a command line too short to hold the
    positional arguments are all handed to the real tesseract unchanged.

    The cache key is the SHA-1 of the ``tesseract --version`` output, the
    operation, the ``-l`` and ``-psm`` values when present, and the bytes of
    the input file.

    Returns:
        None after a successful cache miss, or tesseract's nonzero exit
        status when the real run failed (nothing is cached in that case).

    Raises:
        SystemExit: with status 0 once a cache hit has been replayed.
    """
    operation = sys.argv[-1]

    # For any unexpected operation, defer to the real tesseract binary
    if operation not in ('hocr', 'pdf', 'stdout'):
        real_tesseract()
        return  # Not reachable

    # hocr/pdf need "<input> <outputbase> <operation>" after the program name.
    # With fewer arguments the negative indexes used below would select the
    # wrong items (or raise IndexError), so let tesseract report the problem.
    if operation != 'stdout' and len(sys.argv) < 4:
        real_tesseract()
        return  # Not reachable

    os.makedirs(CACHE_PATH, exist_ok=True)

    m = hashlib.sha1()

    # --version output is captured with stderr merged in, so the key covers
    # the version text regardless of which stream tesseract prints it on.
    version = subprocess.check_output(
        ['tesseract', '--version'],
        stderr=subprocess.STDOUT)
    m.update(version)
    m.update(operation.encode())

    try:
        lang = sys.argv[sys.argv.index('-l') + 1]
        m.update(lang.encode())
    except ValueError:
        pass

    # Default to None so the check below works when -psm was not given.
    psm = None
    try:
        psm = sys.argv[sys.argv.index('-psm') + 1]
        m.update(psm.encode())
    except ValueError:
        pass

    # Only the page orientation check (-psm 0) is cached for stdout output
    if operation == 'stdout' and psm != '0':
        real_tesseract()
        return  # Not reachable

    if operation == 'stdout':
        input_file = sys.argv[-2]
        output_file = 'stdout'
    else:
        input_file = sys.argv[-3]
        output_file = sys.argv[-2]
        if operation == 'hocr':
            output_file += '.hocr'
        elif operation == 'pdf':
            output_file += '.pdf'

    with open(input_file, 'rb') as f:
        m.update(f.read())
    cache_name = os.path.join(CACHE_PATH, m.hexdigest())
    # Diagnostic only: goes to stderr so that stdout carries nothing but the
    # replicated tesseract output.
    print(cache_name, file=sys.stderr)

    if os.path.exists(cache_name):
        # Cache hit
        print("Tesseract cache hit", file=sys.stderr)
        if operation != 'stdout':
            shutil.copy(cache_name, output_file)

        # Replicate output
        with open(cache_name + '.stdout', 'r') as f:
            print(f.read(), end='')
        with open(cache_name + '.stderr', 'r') as f:
            print(f.read(), end='', file=sys.stderr)
        sys.exit(0)

    # Cache miss
    print("Tesseract cache miss", file=sys.stderr)

    # Call tesseract
    p = subprocess.Popen(
        ['tesseract'] + sys.argv[1:],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()

    if p.returncode != 0:
        # Do not cache errors or crashes
        print("Tesseract error", file=sys.stderr)
        print(stdout, end='')
        print(stderr, end='', file=sys.stderr)
        return p.returncode

    # The .stdout/.stderr companions are written before the main cache entry:
    # a hit is detected by the main entry's existence and then reads both.
    with open(cache_name + '.stdout', 'w') as f:
        f.write(stdout)
    with open(cache_name + '.stderr', 'w') as f:
        f.write(stderr)
    print(stdout, end='')
    print(stderr, end='', file=sys.stderr)

    # Insert file into cache
    if operation != 'stdout':
        if os.path.exists(output_file):
            shutil.copy(output_file, cache_name)
        else:
            print("Could not find output file", file=sys.stderr)
    else:
        # Nothing to store for stdout runs; an empty file marks the entry
        open(cache_name, 'w').close()
|
2015-12-17 12:52:12 -08:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # main() returns tesseract's nonzero exit status when the real run fails
    # and None otherwise; pass it on so a failure is not reported as success.
    sys.exit(main())
|