mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-10-10 23:49:31 +00:00
158 lines
4.2 KiB
Python
Executable File
158 lines
4.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import sys
|
|
import os
|
|
import hashlib
|
|
import shutil
|
|
import subprocess
|
|
|
|
|
|
"""Cache output of tesseract to speed up test suite
|
|
|
|
The cache is keyed by a hash that includes the tesseract version, some of
|
|
the command line, and the binary dump of the input file. The output file,
|
|
stdout, and stderr are replicated on a cache hit.
|
|
|
|
Page orientation checks are also cached (-psm 0 stdout)
|
|
|
|
Errors and crashes are not cached.
|
|
|
|
Things not checked:
|
|
-changes to tesseract installation that don't affect --version
|
|
|
|
Will fail on Tesseract 3.02.02 in "hocr" mode because it doesn't produce
|
|
the incorrect file extension.
|
|
|
|
"""
|
|
|
|
|
|
CACHE_PATH = os.path.abspath(os.path.join(
|
|
os.path.dirname(__file__), '..', 'cache'))
|
|
|
|
|
|
def real_tesseract():
|
|
tess_args = ['tesseract'] + sys.argv[1:]
|
|
os.execvp("tesseract", tess_args)
|
|
return # Not reachable
|
|
|
|
def main():
|
|
operation = sys.argv[-1]
|
|
# For anything unexpected operation, defer to real tesseract binary
|
|
# Currently this includes all use of "--tesseract-config"
|
|
if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
|
|
real_tesseract()
|
|
return # Not reachable
|
|
|
|
try:
|
|
os.makedirs(CACHE_PATH)
|
|
except FileExistsError:
|
|
pass
|
|
|
|
m = hashlib.sha1()
|
|
|
|
tess_version = subprocess.check_output(
|
|
['tesseract', '--version'],
|
|
stderr=subprocess.STDOUT)
|
|
|
|
if b'4.00.00alpha' in tess_version:
|
|
# Tesseract 4.x alpha is a moving target, don't cache it
|
|
real_tesseract()
|
|
return
|
|
|
|
m.update(tess_version)
|
|
|
|
# Insert this source file into the hash function, to ensure that any
|
|
# changes to this file invalidate previous hashes
|
|
with open(__file__, 'rb') as f:
|
|
m.update(f.read())
|
|
|
|
m.update(operation.encode())
|
|
|
|
try:
|
|
lang = sys.argv[sys.argv.index('-l') + 1]
|
|
m.update(lang.encode())
|
|
except ValueError:
|
|
m.update(b'default-lang')
|
|
|
|
psm_arg = ''
|
|
if '--psm' in sys.argv:
|
|
psm_arg = '--psm'
|
|
elif '-psm' in sys.argv:
|
|
psm_arg = '-psm'
|
|
if psm_arg:
|
|
try:
|
|
psm = sys.argv[sys.argv.index(psm_arg) + 1]
|
|
m.update(psm.encode())
|
|
except ValueError:
|
|
m.update(b'default-psm')
|
|
else:
|
|
m.update(b'default-psm')
|
|
|
|
if operation == 'stdout' and psm != '0':
|
|
real_tesseract()
|
|
return
|
|
|
|
if operation == 'stdout':
|
|
input_file = sys.argv[-2]
|
|
output_file = 'stdout'
|
|
else:
|
|
input_file = sys.argv[-3]
|
|
output_file = sys.argv[-2]
|
|
|
|
if operation == 'hocr':
|
|
output_file += '.hocr'
|
|
elif operation == 'pdf':
|
|
output_file += '.pdf'
|
|
|
|
with open(input_file, 'rb') as f:
|
|
m.update(f.read())
|
|
cache_name = os.path.join(CACHE_PATH, m.hexdigest())
|
|
print(cache_name)
|
|
if os.path.exists(cache_name):
|
|
# Cache hit
|
|
print("Tesseract cache hit", file=sys.stderr)
|
|
if operation != 'stdout':
|
|
shutil.copy(cache_name, output_file)
|
|
|
|
# Replicate output
|
|
with open(cache_name + '.stdout', 'rb') as f:
|
|
sys.stdout.buffer.write(f.read())
|
|
with open(cache_name + '.stderr', 'rb') as f:
|
|
sys.stderr.buffer.write(f.read())
|
|
sys.exit(0)
|
|
|
|
# Cache miss
|
|
print("Tesseract cache miss", file=sys.stderr)
|
|
|
|
# Call tesseract
|
|
p = subprocess.Popen(
|
|
['tesseract'] + sys.argv[1:],
|
|
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
stdout, stderr = p.communicate()
|
|
|
|
if p.returncode != 0:
|
|
# Do not cache errors or crashes
|
|
print("Tesseract error", file=sys.stderr)
|
|
sys.stdout.buffer.write(stdout)
|
|
sys.stderr.buffer.write(stderr)
|
|
return p.returncode
|
|
|
|
with open(cache_name + '.stdout', 'wb') as f:
|
|
f.write(stdout)
|
|
with open(cache_name + '.stderr', 'wb') as f:
|
|
f.write(stderr)
|
|
sys.stdout.buffer.write(stdout)
|
|
sys.stderr.buffer.write(stderr)
|
|
|
|
# Insert file into cache
|
|
if output_file != 'stdout':
|
|
if os.path.exists(output_file):
|
|
shutil.copy(output_file, cache_name)
|
|
else:
|
|
print("Could not find output file", file=sys.stderr)
|
|
else:
|
|
open(cache_name, 'w').close()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|