Add documentation and test cases for --tesseract-config

This parameter has existed for a long time but never really got any
attention.
This commit is contained in:
James R. Barlow 2017-01-28 22:06:51 -08:00
parent ea0dd99d0b
commit 8c17c9918e
6 changed files with 110 additions and 9 deletions

View File

@ -59,10 +59,35 @@ In addition to tesseract, OCRmyPDF uses the following external binaries:
In each case OCRmyPDF will check the environment variable ``OCRMYPDF_{program}`` before asking the system to find ``{program}`` on the PATH. For example, you could set ``OCRMYPDF_GS`` to point OCRmyPDF at a different Ghostscript executable.
Changing tesseract configuration variables
""""""""""""""""""""""""""""""""""""""""""
You can override tesseract's default `control parameters <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>`_ with a configuration file.
As an example, this configuration will disable Tesseract's dictionary for the current language. Normally the dictionary is helpful for interpolating words that are unclear, but it may interfere with OCR if the document does not contain many words (for example, a list of part numbers).
Create a file named "no-dict.cfg" with these contents:
::
load_system_dawg 0
language_model_penalty_non_dict_word 0
language_model_penalty_non_freq_dict_word 0
then run ocrmypdf as follows (along with any other desired arguments):
.. code-block:: bash
ocrmypdf --tesseract-config no-dict.cfg input.pdf output.pdf
.. warning::
Some combinations of control parameters will break Tesseract or break assumptions that OCRmyPDF makes about Tesseract's output.
Changing the PDF renderer
-------------------------
rasterizing
Converting a PDF to an image for display.

View File

@ -207,7 +207,7 @@ advanced = parser.add_argument_group(
"Advanced options for power users")
advanced.add_argument(
'--tesseract-config', action='append', metavar='CFG', default=[],
help="additional Tesseract configuration files")
help="additional Tesseract configuration files -- see documentation")
advanced.add_argument(
'--tesseract-pagesegmode', action='store', type=int, metavar='PSM',
choices=range(0, 14),

View File

@ -14,6 +14,7 @@ class ExitCode(IntEnum):
already_done_ocr = 6
child_process_error = 7
encrypted_pdf = 8
invalid_config = 9
other_error = 15
ctrl_c = 130
@ -52,3 +53,7 @@ class SubprocessOutputError(ExitCodeException):
class EncryptedPdfError(ExitCodeException):
    """Exception whose ``exit_code`` is ``ExitCode.encrypted_pdf``."""
    exit_code = ExitCode.encrypted_pdf
class TesseractConfigError(ExitCodeException):
    """Exception whose ``exit_code`` is ``ExitCode.invalid_config``.

    Raised by the Tesseract wrappers when Tesseract's output contains
    'read_params_file: parameter not found', i.e. a file passed with
    --tesseract-config names a parameter Tesseract does not recognize.
    """
    exit_code = ExitCode.invalid_config

View File

@ -6,7 +6,7 @@ import os
import re
import shutil
from functools import lru_cache
from ..exceptions import MissingDependencyError
from ..exceptions import MissingDependencyError, TesseractConfigError
from ..helpers import page_number
from . import get_program
from collections import namedtuple
@ -186,6 +186,8 @@ def tesseract_log_output(log, stdout, input_file):
log.warning(prefix + "unsure about page orientation")
elif 'error' in line.lower() or 'exception' in line.lower():
log.error(prefix + line.strip())
elif 'read_params_file' in line.lower():
log.error(prefix + line.strip())
else:
log.info(prefix + line.strip())
@ -236,6 +238,8 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
_generate_null_hocr(output_hocr, input_file)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_file)
if 'read_params_file: parameter not found' in e.output:
raise TesseractConfigError() from e
if 'Image too large' in e.output:
_generate_null_hocr(output_hocr, input_file)
return
@ -243,6 +247,7 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
raise e from e
else:
tesseract_log_output(log, stdout, input_file)
if os.path.exists(badxml + '.html'):
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
shutil.move(badxml + '.html', badxml)
@ -305,6 +310,9 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
shutil.copy(skip_pdf, output_pdf)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_image)
if 'read_params_file: parameter not found' in e.output:
raise TesseractConfigError() from e
if 'Image too large' in e.output:
shutil.copy(skip_pdf, output_pdf)
return

View File

@ -37,6 +37,7 @@ def real_tesseract():
def main():
operation = sys.argv[-1]
# For any unexpected operation, defer to the real tesseract binary
# Currently this includes all use of "--tesseract-config"
if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
real_tesseract()
return # Not reachable
@ -52,6 +53,11 @@ def main():
['tesseract', '--version'],
stderr=subprocess.STDOUT)
if b'4.00.00alpha' in tess_version:
# Tesseract 4.x alpha is a moving target, don't cache it
real_tesseract()
return
m.update(tess_version)
# Insert this source file into the hash function, to ensure that any
@ -65,12 +71,21 @@ def main():
lang = sys.argv[sys.argv.index('-l') + 1]
m.update(lang.encode())
except ValueError:
pass
try:
psm = sys.argv[sys.argv.index('-psm') + 1]
m.update(psm.encode())
except ValueError:
pass
m.update(b'default-lang')
psm_arg = ''
if '--psm' in sys.argv:
psm_arg = '--psm'
elif '-psm' in sys.argv:
psm_arg = '-psm'
if psm_arg:
try:
psm = sys.argv[sys.argv.index(psm_arg) + 1]
m.update(psm.encode())
except ValueError:
m.update(b'default-psm')
else:
m.update(b'default-psm')
if operation == 'stdout' and psm != '0':
real_tesseract()

View File

@ -690,3 +690,51 @@ def test_destination_not_writable(spoof_tesseract_noop, resources, outdir):
resources / 'jbig2.pdf', protected_file,
env=spoof_tesseract_noop)
assert p.returncode == ExitCode.file_access_error, "Expected error"
def test_tesseract_config_valid(resources, outdir):
    """Run OCRmyPDF with a well-formed file passed via --tesseract-config."""
    # Same control parameters as the documentation example: they switch off
    # the dictionary and the non-dictionary-word penalties.
    config_text = '''\
load_system_dawg 0
language_model_penalty_non_dict_word 0
language_model_penalty_non_freq_dict_word 0
'''
    config_path = outdir / 'test.cfg'
    with config_path.open('w') as handle:
        handle.write(config_text)

    source_pdf = resources / 'ccitt.pdf'
    target_pdf = outdir / 'out.pdf'
    check_ocrmypdf(
        source_pdf, target_pdf,
        '--tesseract-config', str(config_path))
@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_tesseract_config_notfound(renderer, resources, outdir):
    """A config path that does not exist is reported but is not fatal.

    Checked for both PDF renderers: the captured error output must mention
    "Can't open", and the process must still exit with ExitCode.ok.
    """
    # This file is never created, so --tesseract-config points at nothing.
    missing_cfg = outdir / 'nofile.cfg'
    cli_args = [
        '--pdf-renderer', renderer,
        '--tesseract-config', str(missing_cfg),
    ]
    proc, _, err_output = run_ocrmypdf(
        resources / 'ccitt.pdf', outdir / 'out.pdf', *cli_args)

    assert "Can't open" in err_output, \
        "No error message about missing config file"
    assert proc.returncode == ExitCode.ok
@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_tesseract_config_invalid(renderer, resources, outdir):
    """A config file with unparseable content fails with invalid_config.

    Checked for both PDF renderers: the captured error output must contain
    "parameter not found", and the exit code must be
    ExitCode.invalid_config.
    """
    # Deliberately not a "name value" control-parameter line.
    bogus_text = '''\
THIS FILE IS INVALID
'''
    bogus_cfg = outdir / 'test.cfg'
    with bogus_cfg.open('w') as handle:
        handle.write(bogus_text)

    cli_args = [
        '--pdf-renderer', renderer,
        '--tesseract-config', str(bogus_cfg),
    ]
    proc, _, err_output = run_ocrmypdf(
        resources / 'ccitt.pdf', outdir / 'out.pdf', *cli_args)

    assert "parameter not found" in err_output, "No error message"
    assert proc.returncode == ExitCode.invalid_config