OCRmyPDF/tests/spoof/tesseract_cache.py

#!/usr/bin/env python3
# © 2016 James R. Barlow: github.com/jbarlow83
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


import sys
import os
import hashlib
import shutil
import subprocess


"""Cache output of tesseract to speed up test suite

The cache is keyed by a hash that includes the tesseract version, some of
the command line, the binary contents of the input file, and this file itself.
Therefore any update to this file invalidates the cache. Uses SHA-1 because it
is fast and defending against hash collisions is not exactly a priority here. :P

The output files, stdout, and stderr are replicated on a cache hit. The output
files are either a .pdf and .txt or .hocr and .txt.

Page orientation checks are also cached (-psm 0 stdout)

Errors and crashes are not cached. If the arguments don't match a known
caching template then real tesseract is called with the same arguments.

Things not checked:
-changes to tesseract installation that don't affect --version

Assumes Tesseract 3.04 or higher.

Will fail on Tesseract 3.02.02 in "hocr" mode because it doesn't produce
the expected .hocr file extension. Will fail on 3.03 because that has no
sidecar text support. Will fail to replicate a 3.04 bug if the wrong
parameter order is given.

"""


# Absolute path of the "cache" directory located one level above the
# directory that contains this file; cached tesseract results live there.
CACHE_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, 'cache')
)


def real_tesseract():
    """Hand control over to the real tesseract binary.

    The current process is replaced (``os.execvp``) by ``tesseract`` as found
    on ``PATH``, invoked with exactly the arguments this script received.
    A successful exec never returns to the caller.
    """
    forwarded = ['tesseract']
    forwarded.extend(sys.argv[1:])
    os.execvp(forwarded[0], forwarded)

def _option_value(argv, flag):
    """Return the argument that follows *flag* in *argv*, or None.

    None is returned both when *flag* is absent and when it is the final
    argument and therefore has no value.
    """
    try:
        return argv[argv.index(flag) + 1]
    except (ValueError, IndexError):
        return None


def main():
    """Run tesseract through the on-disk cache.

    The command line in ``sys.argv`` is matched against two templates:

    * ``tesseract [--options] ... input stdout`` with a page segmentation
      mode of ``0``
    * ``tesseract [--options] ... input output hocr|pdf txt``

    Anything else is passed straight to the real tesseract binary. For a
    recognized command line, a SHA-1 key is computed from the tesseract
    version, this source file, the operation, the ``-l``, ``-c`` and psm
    option values, and the contents of the input file. On a cache hit the
    stored output file, sidecar, stdout and stderr are replayed and the
    process exits with status 0. On a miss the real tesseract is run and, if
    it succeeds, its results are stored in the cache.

    Returns:
        The non-zero exit status of the real tesseract when it fails (nothing
        is cached in that case); otherwise None.

    Raises:
        KeyError: if the ``_OCRMYPDF_SAVE_PATH`` environment variable is
            not set.
    """
    # Restore the saved PATH so that "tesseract" resolves via that PATH
    os.environ['PATH'] = os.environ['_OCRMYPDF_SAVE_PATH']

    if len(sys.argv) < 2:
        # Not enough arguments to even name an operation
        real_tesseract()
        return  # Not reachable

    operation = sys.argv[-2]
    sidecar = False
    if sys.argv[-1] == 'txt':
        sidecar = True
    elif sys.argv[-1] == 'stdout':
        operation = 'stdout'

    # For any unexpected operation, defer to the real tesseract binary
    # Currently this includes all use of "--tesseract-config"
    if operation not in ('hocr', 'pdf', 'stdout'):
        real_tesseract()
        return  # Not reachable

    # The hocr/pdf template needs: script, input, output, operation and one
    # trailing argument. Anything shorter cannot be matched safely.
    if operation != 'stdout' and len(sys.argv) < 5:
        real_tesseract()
        return  # Not reachable

    os.makedirs(CACHE_PATH, exist_ok=True)

    m = hashlib.sha1()

    tess_version = subprocess.check_output(
        ['tesseract', '--version'],
        stderr=subprocess.STDOUT)
    m.update(tess_version)

    # Insert this source file into the hash function, to ensure that any
    # changes to this file invalidate previous hashes
    with open(__file__, 'rb') as f:
        m.update(f.read())

    m.update(operation.encode())

    lang = _option_value(sys.argv, '-l')
    m.update(lang.encode() if lang is not None else b'default-lang')

    textonly = _option_value(sys.argv, '-c')
    m.update(
        textonly.encode() if textonly is not None else b'textonly_pdf=0')

    # Both spellings of the page segmentation mode flag are accepted;
    # "--psm" takes precedence when both are present. psm stays None when
    # no mode was given, so the comparison below is always well defined.
    psm = None
    for psm_flag in ('--psm', '-psm'):
        if psm_flag in sys.argv:
            psm = _option_value(sys.argv, psm_flag)
            break
    m.update(psm.encode() if psm is not None else b'default-psm')

    # Only "psm 0" output to stdout is cached
    if operation == 'stdout' and psm != '0':
        real_tesseract()
        return  # Not reachable

    if operation == 'stdout':
        # tesseract [--options] ... input stdout
        input_file = sys.argv[-2]
        output_file = 'stdout'
        sidecar_file = ''
    else:
        # tesseract [--options] ... input output hocr|pdf txt
        input_file = sys.argv[-4]
        output_file = sys.argv[-3]
        sidecar_file = sys.argv[-3]

    if operation == 'hocr':
        output_file += '.hocr'
        sidecar_file += '.txt'
    elif operation == 'pdf':
        output_file += '.pdf'
        sidecar_file += '.txt'

    with open(input_file, 'rb') as f:
        m.update(f.read())
    cache_name = os.path.join(CACHE_PATH, m.hexdigest())
    # Report which cache entry this invocation maps to
    print(cache_name)

    if os.path.exists(cache_name):
        # Cache hit
        print("Tesseract cache hit", file=sys.stderr)
        if operation != 'stdout':
            shutil.copy(cache_name, output_file)
            # The sidecar is only stored when tesseract actually produced
            # one, so only replay it when it is present in the cache.
            cached_sidecar = cache_name + '.sidecar'
            if sidecar and os.path.exists(cached_sidecar):
                shutil.copy(cached_sidecar, sidecar_file)

        # Replicate output
        with open(cache_name + '.stdout', 'rb') as f:
            sys.stdout.buffer.write(f.read())
        with open(cache_name + '.stderr', 'rb') as f:
            sys.stderr.buffer.write(f.read())
        sys.exit(0)

    # Cache miss
    print("Tesseract cache miss", file=sys.stderr)

    # Call tesseract
    p = subprocess.Popen(
            ['tesseract'] + sys.argv[1:],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()

    if p.returncode != 0:
        # Do not cache errors or crashes
        print("Tesseract error", file=sys.stderr)
        sys.stdout.buffer.write(stdout)
        sys.stderr.buffer.write(stderr)
        return p.returncode

    # Store the streams first: the presence of cache_name itself is what
    # marks an entry as a hit, so it is created last.
    with open(cache_name + '.stdout', 'wb') as f:
        f.write(stdout)
    with open(cache_name + '.stderr', 'wb') as f:
        f.write(stderr)
    sys.stdout.buffer.write(stdout)
    sys.stderr.buffer.write(stderr)

    # Insert file into cache
    if output_file != 'stdout':
        if os.path.exists(output_file):
            shutil.copy(output_file, cache_name)
        else:
            print("Could not find output file", file=sys.stderr)
        if sidecar and os.path.exists(sidecar_file):
            shutil.copy(sidecar_file, cache_name + '.sidecar')
    else:
        # No output file for stdout mode; an empty marker file records the hit
        with open(cache_name, 'w'):
            pass


if __name__ == '__main__':
    # main() returns tesseract's non-zero exit status when the real binary
    # fails; pass it on as this process's exit status so callers see the
    # failure. A None return value results in exit status 0.
    sys.exit(main())
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`#!/usr/bin/env python3`
Update copyrights 2017-05-14 23:38:28 -07:00			`# © 2016 James R. Barlow: github.com/jbarlow83`
Add license notice to all files Source files to GPL3 Exceptions: -tests/spoof/* to MIT -hocrtransform.py -_unicodefun.py Test resources to CC BY-SA 4.0 except when otherwise noted. Add GPL license. 2018-03-14 14:40:48 -07:00			`#`
			`# Permission is hereby granted, free of charge, to any person obtaining a`
			`# copy of this software and associated documentation files (the`
			`# "Software"), to deal in the Software without restriction, including`
			`# without limitation the rights to use, copy, modify, merge, publish,`
			`# distribute, sublicense, and/or sell copies of the Software, and to`
			`# permit persons to whom the Software is furnished to do so, subject to`
			`# the following conditions:`
			`#`
			`# The above copyright notice and this permission notice shall be included`
			`# in all copies or substantial portions of the Software.`
			`#`
			`# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS`
			`# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF`
			`# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.`
			`# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY`
			`# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,`
			`# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE`
			`# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.`

Update copyrights 2017-05-14 23:38:28 -07:00
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`import sys`
			`import os`
			`import hashlib`
			`import shutil`
			`import subprocess`


Improve some documentation for tests 2016-08-26 15:04:08 -07:00			`"""Cache output of tesseract to speed up test suite`

			`The cache is keyed by a hash that includes the tesseract version, some of`
tesseract_cache: update explanatory notes 2017-05-14 23:54:09 -07:00			`the command line, and the binary dump of the input file, and this file itself.`
			`Therefore any updates to this file invalidate cache. Uses SHA-1 because it is`
			`fast and defeating a hash collision here is not exactly a priority. :P`

			`The output files, stdout, and stderr are replicated on a cache hit. The output`
			`files are either a .pdf and .txt or .hocr and .txt.`
Improve some documentation for tests 2016-08-26 15:04:08 -07:00
			`Page orientation checks are also cached (-psm 0 stdout)`

tesseract_cache: update explanatory notes 2017-05-14 23:54:09 -07:00			`Errors and crashes are not cached. If the arguments don't match a known`
			`caching template then real tesseract is called with the same arguments.`
Improve some documentation for tests 2016-08-26 15:04:08 -07:00
			`Things not checked:`
			`-changes to tesseract installation that don't affect --version`

tesseract_cache: update explanatory notes 2017-05-14 23:54:09 -07:00			`Assumes Tesseract 3.04 or higher.`

Improve some documentation for tests 2016-08-26 15:04:08 -07:00			`Will fail on Tesseract 3.02.02 in "hocr" mode because it doesn't produce`
tesseract_cache: update explanatory notes 2017-05-14 23:54:09 -07:00			`the incorrect file extension. Will fail on 3.03 because that has no sidecar`
			`text support. Will fail to replicate a 3.04 bug if wrong parameter order is`
			`given.`
Improve some documentation for tests 2016-08-26 15:04:08 -07:00
			`"""`


Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`CACHE_PATH = os.path.abspath(os.path.join(`
			`os.path.dirname(__file__), '..', 'cache'))`


Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`def real_tesseract():`
			`tess_args = ['tesseract'] + sys.argv[1:]`
			`os.execvp("tesseract", tess_args)`
			`return # Not reachable`

Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`def main():`
Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`os.environ['PATH'] = os.environ['_OCRMYPDF_SAVE_PATH']`
Fix: Tesseract 3.04 is sensitive to order of configuration commands “txt hocr” is not acceptable and does not produce expected output .txt while “hocr text” works fine, so switch the order everywhere. Should fix #169 2017-05-14 23:27:46 -07:00			`operation = sys.argv[-2]`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`sidecar = False`
Fix: Tesseract 3.04 is sensitive to order of configuration commands “txt hocr” is not acceptable and does not produce expected output .txt while “hocr text” works fine, so switch the order everywhere. Should fix #169 2017-05-14 23:27:46 -07:00			`if sys.argv[-1] == 'txt':`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`sidecar = True`
Fix: Tesseract 3.04 is sensitive to order of configuration commands “txt hocr” is not acceptable and does not produce expected output .txt while “hocr text” works fine, so switch the order everywhere. Should fix #169 2017-05-14 23:27:46 -07:00			`elif sys.argv[-1] == 'stdout':`
			`operation = 'stdout'`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`# For anything unexpected operation, defer to real tesseract binary`
Add documentation and test cases for —tesseract-config This parameter has existed for along time but never really got any attention. 2017-01-28 22:06:51 -08:00			`# Currently this includes all use of "--tesseract-config"`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':`
			`real_tesseract()`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`return # Not reachable`

			`try:`
			`os.makedirs(CACHE_PATH)`
			`except FileExistsError:`
			`pass`

			`m = hashlib.sha1()`

tesseract caching: don't transcode tesseract's output, hash source file For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach. 2016-10-28 16:44:12 -07:00			`tess_version = subprocess.check_output(`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`['tesseract', '--version'],`
			`stderr=subprocess.STDOUT)`

tesseract caching: don't transcode tesseract's output, hash source file For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach. 2016-10-28 16:44:12 -07:00			`m.update(tess_version)`

			`# Insert this source file into the hash function, to ensure that any`
			`# changes to this file invalidate previous hashes`
			`with open(__file__, 'rb') as f:`
			`m.update(f.read())`

Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`m.update(operation.encode())`

			`try:`
			`lang = sys.argv[sys.argv.index('-l') + 1]`
			`m.update(lang.encode())`
			`except ValueError:`
Add documentation and test cases for —tesseract-config This parameter has existed for along time but never really got any attention. 2017-01-28 22:06:51 -08:00			`m.update(b'default-lang')`

Rename “tess4” renderer to “sandwich” and make it default in Tess 3.05.01 Tesseract 3.05.01 backported the textonly_pdf=1 which allows the use of this superior PDF renderer prior to 4.00 alpha. This means that the tess4 name is no longer accurate, so call it a sandwich because of its merge-preserve characteristic. Preserve the tess4 name. Fix the documentation and tests to reflect this. Make it the default, because it’s better. It does not have the issues the “tesseract” renderer does prior to Tess 3.05.00 with rendering PDFs that Ghostscript corrupts, and it produces better output without re-rastering. Deprecate some old stuff to avoid the test suite growing obscenely large. 2017-06-13 13:09:12 -07:00			`try:`
			`textonly = sys.argv[sys.argv.index('-c') + 1]`
			`m.update(textonly.encode())`
			`except ValueError:`
			`m.update(b'textonly_pdf=0')`

Add documentation and test cases for —tesseract-config This parameter has existed for along time but never really got any attention. 2017-01-28 22:06:51 -08:00			`psm_arg = ''`
			`if '--psm' in sys.argv:`
			`psm_arg = '--psm'`
			`elif '-psm' in sys.argv:`
			`psm_arg = '-psm'`
			`if psm_arg:`
			`try:`
			`psm = sys.argv[sys.argv.index(psm_arg) + 1]`
			`m.update(psm.encode())`
			`except ValueError:`
			`m.update(b'default-psm')`
			`else:`
			`m.update(b'default-psm')`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`if operation == 'stdout' and psm != '0':`
			`real_tesseract()`
			`return`

			`if operation == 'stdout':`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`# tesseract [--options] ... input stdout`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`input_file = sys.argv[-2]`
			`output_file = 'stdout'`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`sidecar_file = ''`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`else:`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`# tesseract [--options] ... input output txt hocr\|pdf`
			`input_file = sys.argv[-4]`
			`output_file = sys.argv[-3]`
			`sidecar_file = sys.argv[-3]`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00
			`if operation == 'hocr':`
			`output_file += '.hocr'`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`sidecar_file += '.txt'`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`elif operation == 'pdf':`
			`output_file += '.pdf'`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`sidecar_file += '.txt'`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00
			`with open(input_file, 'rb') as f:`
			`m.update(f.read())`
			`cache_name = os.path.join(CACHE_PATH, m.hexdigest())`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`print(cache_name)`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`if os.path.exists(cache_name):`
			`# Cache hit`
			`print("Tesseract cache hit", file=sys.stderr)`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`if operation != 'stdout':`
			`shutil.copy(cache_name, output_file)`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`if sidecar:`
			`shutil.copy(cache_name + '.sidecar', sidecar_file)`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00
			`# Replicate output`
tesseract caching: don't transcode tesseract's output, hash source file For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach. 2016-10-28 16:44:12 -07:00			`with open(cache_name + '.stdout', 'rb') as f:`
			`sys.stdout.buffer.write(f.read())`
			`with open(cache_name + '.stderr', 'rb') as f:`
			`sys.stderr.buffer.write(f.read())`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`sys.exit(0)`

			`# Cache miss`
			`print("Tesseract cache miss", file=sys.stderr)`

			`# Call tesseract`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`p = subprocess.Popen(`
			`['tesseract'] + sys.argv[1:],`
tesseract caching: don't transcode tesseract's output, hash source file For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach. 2016-10-28 16:44:12 -07:00			`stdout=subprocess.PIPE, stderr=subprocess.PIPE)`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`stdout, stderr = p.communicate()`

Gracefully recover from tesseract's failure to process very large images And test cases to check this 2016-02-20 04:53:02 -08:00			`if p.returncode != 0:`
			`# Do not cache errors or crashes`
			`print("Tesseract error", file=sys.stderr)`
tesseract caching: don't transcode tesseract's output, hash source file For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach. 2016-10-28 16:44:12 -07:00			`sys.stdout.buffer.write(stdout)`
			`sys.stderr.buffer.write(stderr)`
Gracefully recover from tesseract's failure to process very large images And test cases to check this 2016-02-20 04:53:02 -08:00			`return p.returncode`

tesseract caching: don't transcode tesseract's output, hash source file For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach. 2016-10-28 16:44:12 -07:00			`with open(cache_name + '.stdout', 'wb') as f:`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`f.write(stdout)`
tesseract caching: don't transcode tesseract's output, hash source file For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach. 2016-10-28 16:44:12 -07:00			`with open(cache_name + '.stderr', 'wb') as f:`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`f.write(stderr)`
tesseract caching: don't transcode tesseract's output, hash source file For sanity's sake, deal with tesseract streams in binary without transcoding (via universal_newlines, etc.). The only differences are printing messages regarding spoofing. Also hash the source file so that changes to the cache mechanism invalidate old cache automatically. That is probably too aggressive, but simple and safer than the previous approach. 2016-10-28 16:44:12 -07:00			`sys.stdout.buffer.write(stdout)`
			`sys.stderr.buffer.write(stderr)`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00
			`# Insert file into cache`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`if output_file != 'stdout':`
			`if os.path.exists(output_file):`
			`shutil.copy(output_file, cache_name)`
			`else:`
			`print("Could not find output file", file=sys.stderr)`
Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. 2017-05-11 00:17:24 -07:00			`if sidecar and os.path.exists(sidecar_file):`
			`shutil.copy(sidecar_file, cache_name + '.sidecar')`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00			`else:`
Update tesseract spoofing to cache orientation and script detection checks No cache: 269 s With cache: 144 s test_oversample[tesseract] now fails, all others good 2016-02-08 02:21:56 -08:00			`open(cache_name, 'w').close()`
Add tesseract caching to speed up tests 2015-12-17 12:52:12 -08:00

			`if __name__ == '__main__':`
			`main()`