From 8c17c9918e2987b9dafab51a26b1a0967b02d5a0 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 28 Jan 2017 22:06:51 -0800 Subject: [PATCH] =?UTF-8?q?Add=20documentation=20and=20test=20cases=20for?= =?UTF-8?q?=20=E2=80=94tesseract-config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This parameter has existed for a long time but never really got any attention. --- docs/advanced.rst | 27 ++++++++++++++++++- ocrmypdf/__main__.py | 2 +- ocrmypdf/exceptions.py | 5 ++++ ocrmypdf/exec/tesseract.py | 10 ++++++- tests/spoof/tesseract_cache.py | 27 ++++++++++++++----- tests/test_main.py | 48 ++++++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 9 deletions(-) diff --git a/docs/advanced.rst b/docs/advanced.rst index 2364a791..7b1c9c95 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -59,10 +59,35 @@ In addition to tesseract, OCRmyPDF uses the following external binaries: In each case OCRmyPDF will check the environment variable ``OCRMYPDF_{program}`` before asking the system to find ``{program}`` on the PATH. For example, you could redirect OCRmyPDF to ``OCRMYPDF_GS`` to override Ghostscript. +Changing tesseract configuration variables +"""""""""""""""""""""""""""""""""""""""""" + +You can override tesseract's default `control parameters `_ with a configuration file. + +As an example, this configuration will disable Tesseract's dictionary for the current language. Normally the dictionary is helpful for interpolating words that are unclear, but it may interfere with OCR if the document does not contain many words (for example, a list of part numbers). + +Create a file named "no-dict.cfg" with these contents: + +:: + + load_system_dawg 0 + language_model_penalty_non_dict_word 0 + language_model_penalty_non_freq_dict_word 0 + +then run ocrmypdf as follows (along with any other desired arguments): + +.. code-block:: bash + + ocrmypdf --tesseract-config no-dict.cfg input.pdf output.pdf + +.. 
warning:: + + Some combinations of control parameters will break Tesseract or break assumptions that OCRmyPDF makes about Tesseract's output. + + Changing the PDF renderer ------------------------- - rasterizing Converting a PDF to an image for display. diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py index 838e9820..3ea968c1 100755 --- a/ocrmypdf/__main__.py +++ b/ocrmypdf/__main__.py @@ -207,7 +207,7 @@ advanced = parser.add_argument_group( "Advanced options for power users") advanced.add_argument( '--tesseract-config', action='append', metavar='CFG', default=[], - help="additional Tesseract configuration files") + help="additional Tesseract configuration files -- see documentation") advanced.add_argument( '--tesseract-pagesegmode', action='store', type=int, metavar='PSM', choices=range(0, 14), diff --git a/ocrmypdf/exceptions.py b/ocrmypdf/exceptions.py index cdf938fe..bfad6108 100644 --- a/ocrmypdf/exceptions.py +++ b/ocrmypdf/exceptions.py @@ -14,6 +14,7 @@ class ExitCode(IntEnum): already_done_ocr = 6 child_process_error = 7 encrypted_pdf = 8 + invalid_config = 9 other_error = 15 ctrl_c = 130 @@ -52,3 +53,7 @@ class SubprocessOutputError(ExitCodeException): class EncryptedPdfError(ExitCodeException): exit_code = ExitCode.encrypted_pdf + + +class TesseractConfigError(ExitCodeException): + exit_code = ExitCode.invalid_config diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py index 6a938c32..3debdc83 100644 --- a/ocrmypdf/exec/tesseract.py +++ b/ocrmypdf/exec/tesseract.py @@ -6,7 +6,7 @@ import os import re import shutil from functools import lru_cache -from ..exceptions import MissingDependencyError +from ..exceptions import MissingDependencyError, TesseractConfigError from ..helpers import page_number from . 
import get_program from collections import namedtuple @@ -186,6 +186,8 @@ def tesseract_log_output(log, stdout, input_file): log.warning(prefix + "unsure about page orientation") elif 'error' in line.lower() or 'exception' in line.lower(): log.error(prefix + line.strip()) + elif 'read_params_file' in line.lower(): + log.error(prefix + line.strip()) else: log.info(prefix + line.strip()) @@ -236,6 +238,8 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode, _generate_null_hocr(output_hocr, input_file) except CalledProcessError as e: tesseract_log_output(log, e.output, input_file) + if 'read_params_file: parameter not found' in e.output: + raise TesseractConfigError() from e if 'Image too large' in e.output: _generate_null_hocr(output_hocr, input_file) return @@ -243,6 +247,7 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode, raise e from e else: tesseract_log_output(log, stdout, input_file) + if os.path.exists(badxml + '.html'): # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html) shutil.move(badxml + '.html', badxml) @@ -305,6 +310,9 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list, shutil.copy(skip_pdf, output_pdf) except CalledProcessError as e: tesseract_log_output(log, e.output, input_image) + if 'read_params_file: parameter not found' in e.output: + raise TesseractConfigError() from e + if 'Image too large' in e.output: shutil.copy(skip_pdf, output_pdf) return diff --git a/tests/spoof/tesseract_cache.py b/tests/spoof/tesseract_cache.py index 0e117015..86b14a28 100755 --- a/tests/spoof/tesseract_cache.py +++ b/tests/spoof/tesseract_cache.py @@ -37,6 +37,7 @@ def real_tesseract(): def main(): operation = sys.argv[-1] # For anything unexpected operation, defer to real tesseract binary + # Currently this includes all use of "--tesseract-config" if operation != 'hocr' and operation != 'pdf' and operation != 'stdout': real_tesseract() return # Not reachable @@ -52,6 +53,11 @@ def main(): 
['tesseract', '--version'], stderr=subprocess.STDOUT) + if b'4.00.00alpha' in tess_version: + # Tesseract 4.x alpha is a moving target, don't cache it + real_tesseract() + return + m.update(tess_version) # Insert this source file into the hash function, to ensure that any @@ -65,12 +71,21 @@ def main(): lang = sys.argv[sys.argv.index('-l') + 1] m.update(lang.encode()) except ValueError: - pass - try: - psm = sys.argv[sys.argv.index('-psm') + 1] - m.update(psm.encode()) - except ValueError: - pass + m.update(b'default-lang') + + psm_arg = '' + if '--psm' in sys.argv: + psm_arg = '--psm' + elif '-psm' in sys.argv: + psm_arg = '-psm' + if psm_arg: + try: + psm = sys.argv[sys.argv.index(psm_arg) + 1] + m.update(psm.encode()) + except ValueError: + m.update(b'default-psm') + else: + m.update(b'default-psm') if operation == 'stdout' and psm != '0': real_tesseract() diff --git a/tests/test_main.py b/tests/test_main.py index 1ce5cc09..c9091e3a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -690,3 +690,51 @@ def test_destination_not_writable(spoof_tesseract_noop, resources, outdir): resources / 'jbig2.pdf', protected_file, env=spoof_tesseract_noop) assert p.returncode == ExitCode.file_access_error, "Expected error" + + +def test_tesseract_config_valid(resources, outdir): + cfg_file = outdir / 'test.cfg' + with cfg_file.open('w') as f: + f.write('''\ +load_system_dawg 0 +language_model_penalty_non_dict_word 0 +language_model_penalty_non_freq_dict_word 0 +''') + + check_ocrmypdf( + resources / 'ccitt.pdf', outdir / 'out.pdf', + '--tesseract-config', str(cfg_file)) + + +@pytest.mark.parametrize('renderer', [ + 'hocr', + 'tesseract', + ]) +def test_tesseract_config_notfound(renderer, resources, outdir): + cfg_file = outdir / 'nofile.cfg' + + p, out, err = run_ocrmypdf( + resources / 'ccitt.pdf', outdir / 'out.pdf', + '--pdf-renderer', renderer, + '--tesseract-config', str(cfg_file)) + assert "Can't open" in err, "No error message about missing config file" + assert 
p.returncode == ExitCode.ok + + +@pytest.mark.parametrize('renderer', [ + 'hocr', + 'tesseract', + ]) +def test_tesseract_config_invalid(renderer, resources, outdir): + cfg_file = outdir / 'test.cfg' + with cfg_file.open('w') as f: + f.write('''\ +THIS FILE IS INVALID +''') + + p, out, err = run_ocrmypdf( + resources / 'ccitt.pdf', outdir / 'out.pdf', + '--pdf-renderer', renderer, + '--tesseract-config', str(cfg_file)) + assert "parameter not found" in err, "No error message" + assert p.returncode == ExitCode.invalid_config