mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-10-16 02:20:12 +00:00

For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach.
143 lines
3.7 KiB
Python
Executable File
143 lines
3.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import sys
|
|
import os
|
|
import hashlib
|
|
import shutil
|
|
import subprocess
|
|
|
|
|
|
"""Cache output of tesseract to speed up test suite
|
|
|
|
The cache is keyed by a hash that includes the tesseract version, some of
|
|
the command line, and the binary dump of the input file. The output file,
|
|
stdout, and stderr are replicated on a cache hit.
|
|
|
|
Page orientation checks are also cached (-psm 0 stdout)
|
|
|
|
Errors and crashes are not cached.
|
|
|
|
Things not checked:
|
|
-changes to tesseract installation that don't affect --version
|
|
|
|
Will fail on Tesseract 3.02.02 in "hocr" mode because the output file
|
|
extension that version produces does not match the ".hocr" expected here.
|
|
|
|
"""
|
|
|
|
|
|
# Cached results are kept in a "cache" directory located one level above the
# directory containing this script, expressed as an absolute path.
_SPOOF_DIR = os.path.dirname(__file__)
CACHE_PATH = os.path.abspath(os.path.join(_SPOOF_DIR, os.pardir, 'cache'))
|
|
|
|
|
|
def real_tesseract():
    """Hand the current invocation over to the genuine tesseract binary.

    Every argument this script received (everything after ``sys.argv[0]``)
    is forwarded unchanged. ``os.execvp`` replaces the running process, so
    on success control never returns to the caller.
    """
    forwarded = ['tesseract', *sys.argv[1:]]
    os.execvp("tesseract", forwarded)
    return  # Not reachable
|
|
|
|
def _replay_output(stdout, stderr):
    """Write captured tesseract output bytes to this process's own streams.

    The text-layer streams are flushed first so that anything already issued
    through ``print()`` is not overtaken by the raw bytes, which are written
    directly to the underlying binary buffers.
    """
    sys.stdout.flush()
    sys.stdout.buffer.write(stdout)
    sys.stderr.flush()
    sys.stderr.buffer.write(stderr)


def main():
    """Run tesseract through an on-disk cache keyed on its inputs.

    The last command line argument selects the operation. Only ``hocr``,
    ``pdf`` and ``stdout`` combined with ``-psm 0`` are cached; every other
    invocation is handed to the real tesseract binary untouched.

    The cache key is a SHA-1 over: the output of ``tesseract --version``,
    the bytes of this source file, the operation, the ``-l`` and ``-psm``
    values when present, and the bytes of the input file.

    On a cache hit the cached output file, stdout and stderr are replayed
    and the process exits with status 0 via ``sys.exit``. On a miss the real
    tesseract is run, its output is passed through, and the results are
    stored unless tesseract returned a non-zero status.

    Returns:
        tesseract's non-zero return code when the real run fails, otherwise
        ``None``.
    """
    operation = sys.argv[-1]
    # For any unexpected operation, defer to the real tesseract binary
    if operation not in ('hocr', 'pdf', 'stdout'):
        real_tesseract()
        return  # Not reachable

    os.makedirs(CACHE_PATH, exist_ok=True)

    m = hashlib.sha1()

    tess_version = subprocess.check_output(
        ['tesseract', '--version'],
        stderr=subprocess.STDOUT)
    m.update(tess_version)

    # Insert this source file into the hash function, to ensure that any
    # changes to this file invalidate previous hashes
    with open(__file__, 'rb') as f:
        m.update(f.read())

    m.update(operation.encode())

    try:
        lang = sys.argv[sys.argv.index('-l') + 1]
        m.update(lang.encode())
    except ValueError:
        pass

    # psm is compared further down, so it must be bound even when -psm was
    # not given on the command line.
    psm = None
    try:
        psm = sys.argv[sys.argv.index('-psm') + 1]
        m.update(psm.encode())
    except ValueError:
        pass

    # Only the orientation check (-psm 0) is cached in stdout mode; any
    # other stdout request, including one without -psm, goes to the real
    # binary.
    if operation == 'stdout' and psm != '0':
        real_tesseract()
        return

    if operation == 'stdout':
        input_file = sys.argv[-2]
        output_file = 'stdout'
    else:
        input_file = sys.argv[-3]
        output_file = sys.argv[-2]

    # The output argument is given without its extension; append the one
    # that matches the operation.
    if operation == 'hocr':
        output_file += '.hocr'
    elif operation == 'pdf':
        output_file += '.pdf'

    with open(input_file, 'rb') as f:
        m.update(f.read())
    cache_name = os.path.join(CACHE_PATH, m.hexdigest())
    # NOTE(review): this diagnostic goes to stdout, ahead of the replayed
    # tesseract stdout - confirm callers tolerate the extra line.
    print(cache_name)

    if os.path.exists(cache_name):
        # Cache hit
        print("Tesseract cache hit", file=sys.stderr)
        if operation != 'stdout':
            shutil.copy(cache_name, output_file)

        # Replicate output
        with open(cache_name + '.stdout', 'rb') as f:
            cached_stdout = f.read()
        with open(cache_name + '.stderr', 'rb') as f:
            cached_stderr = f.read()
        _replay_output(cached_stdout, cached_stderr)
        sys.exit(0)

    # Cache miss
    print("Tesseract cache miss", file=sys.stderr)

    # Call tesseract
    p = subprocess.Popen(
        ['tesseract'] + sys.argv[1:],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()

    if p.returncode != 0:
        # Do not cache errors or crashes
        print("Tesseract error", file=sys.stderr)
        _replay_output(stdout, stderr)
        return p.returncode

    # The sidecar files are written before the main cache entry, because the
    # existence of the main entry is what signals a hit.
    with open(cache_name + '.stdout', 'wb') as f:
        f.write(stdout)
    with open(cache_name + '.stderr', 'wb') as f:
        f.write(stderr)
    _replay_output(stdout, stderr)

    # Insert file into cache
    if operation != 'stdout':
        if os.path.exists(output_file):
            shutil.copy(output_file, cache_name)
        else:
            print("Could not find output file", file=sys.stderr)
    else:
        # No output file in stdout mode; an empty marker records the hit.
        with open(cache_name, 'w'):
            pass
|
|
|
|
|
|
if __name__ == '__main__':
    # main() returns tesseract's return code when the real run fails and
    # None otherwise; pass it to sys.exit so a failure is visible to the
    # calling process instead of always exiting with status 0.
    sys.exit(main())
|