mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-10-09 23:18:59 +00:00
Add documentation and test cases for --tesseract-config
This parameter has existed for a long time but never really got any attention.
This commit is contained in:
parent
ea0dd99d0b
commit
8c17c9918e
@ -59,10 +59,35 @@ In addition to tesseract, OCRmyPDF uses the following external binaries:
|
||||
|
||||
In each case OCRmyPDF will check the environment variable ``OCRMYPDF_{program}`` before asking the system to find ``{program}`` on the PATH. For example, you could set ``OCRMYPDF_GS`` to override the Ghostscript binary that OCRmyPDF uses.
|
||||
|
||||
Changing tesseract configuration variables
|
||||
""""""""""""""""""""""""""""""""""""""""""
|
||||
|
||||
You can override tesseract's default `control parameters <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>`_ with a configuration file.
|
||||
|
||||
As an example, this configuration will disable Tesseract's dictionary for the current language. Normally the dictionary is helpful for interpolating words that are unclear, but it may interfere with OCR if the document does not contain many words (for example, a list of part numbers).
|
||||
|
||||
Create a file named "no-dict.cfg" with these contents:
|
||||
|
||||
::
|
||||
|
||||
load_system_dawg 0
|
||||
language_model_penalty_non_dict_word 0
|
||||
language_model_penalty_non_freq_dict_word 0
|
||||
|
||||
then run ocrmypdf as follows (along with any other desired arguments):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --tesseract-config no-dict.cfg input.pdf output.pdf
|
||||
|
||||
.. warning::
|
||||
|
||||
Some combinations of control parameters will break Tesseract or break assumptions that OCRmyPDF makes about Tesseract's output.
|
||||
|
||||
|
||||
Changing the PDF renderer
|
||||
-------------------------
|
||||
|
||||
|
||||
rasterizing
|
||||
Converting a PDF to an image for display.
|
||||
|
||||
|
@ -207,7 +207,7 @@ advanced = parser.add_argument_group(
|
||||
"Advanced options for power users")
|
||||
advanced.add_argument(
|
||||
'--tesseract-config', action='append', metavar='CFG', default=[],
|
||||
help="additional Tesseract configuration files")
|
||||
help="additional Tesseract configuration files -- see documentation")
|
||||
advanced.add_argument(
|
||||
'--tesseract-pagesegmode', action='store', type=int, metavar='PSM',
|
||||
choices=range(0, 14),
|
||||
|
@ -14,6 +14,7 @@ class ExitCode(IntEnum):
|
||||
already_done_ocr = 6
|
||||
child_process_error = 7
|
||||
encrypted_pdf = 8
|
||||
invalid_config = 9
|
||||
other_error = 15
|
||||
ctrl_c = 130
|
||||
|
||||
@ -52,3 +53,7 @@ class SubprocessOutputError(ExitCodeException):
|
||||
|
||||
class EncryptedPdfError(ExitCodeException):
|
||||
exit_code = ExitCode.encrypted_pdf
|
||||
|
||||
|
||||
class TesseractConfigError(ExitCodeException):
|
||||
exit_code = ExitCode.invalid_config
|
||||
|
@ -6,7 +6,7 @@ import os
|
||||
import re
|
||||
import shutil
|
||||
from functools import lru_cache
|
||||
from ..exceptions import MissingDependencyError
|
||||
from ..exceptions import MissingDependencyError, TesseractConfigError
|
||||
from ..helpers import page_number
|
||||
from . import get_program
|
||||
from collections import namedtuple
|
||||
@ -186,6 +186,8 @@ def tesseract_log_output(log, stdout, input_file):
|
||||
log.warning(prefix + "unsure about page orientation")
|
||||
elif 'error' in line.lower() or 'exception' in line.lower():
|
||||
log.error(prefix + line.strip())
|
||||
elif 'read_params_file' in line.lower():
|
||||
log.error(prefix + line.strip())
|
||||
else:
|
||||
log.info(prefix + line.strip())
|
||||
|
||||
@ -236,6 +238,8 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
_generate_null_hocr(output_hocr, input_file)
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(log, e.output, input_file)
|
||||
if 'read_params_file: parameter not found' in e.output:
|
||||
raise TesseractConfigError() from e
|
||||
if 'Image too large' in e.output:
|
||||
_generate_null_hocr(output_hocr, input_file)
|
||||
return
|
||||
@ -243,6 +247,7 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
raise e from e
|
||||
else:
|
||||
tesseract_log_output(log, stdout, input_file)
|
||||
|
||||
if os.path.exists(badxml + '.html'):
|
||||
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
|
||||
shutil.move(badxml + '.html', badxml)
|
||||
@ -305,6 +310,9 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
|
||||
shutil.copy(skip_pdf, output_pdf)
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(log, e.output, input_image)
|
||||
if 'read_params_file: parameter not found' in e.output:
|
||||
raise TesseractConfigError() from e
|
||||
|
||||
if 'Image too large' in e.output:
|
||||
shutil.copy(skip_pdf, output_pdf)
|
||||
return
|
||||
|
@ -37,6 +37,7 @@ def real_tesseract():
|
||||
def main():
|
||||
operation = sys.argv[-1]
|
||||
# For any unexpected operation, defer to the real tesseract binary
|
||||
# Currently this includes all use of "--tesseract-config"
|
||||
if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
|
||||
real_tesseract()
|
||||
return # Not reachable
|
||||
@ -52,6 +53,11 @@ def main():
|
||||
['tesseract', '--version'],
|
||||
stderr=subprocess.STDOUT)
|
||||
|
||||
if b'4.00.00alpha' in tess_version:
|
||||
# Tesseract 4.x alpha is a moving target, don't cache it
|
||||
real_tesseract()
|
||||
return
|
||||
|
||||
m.update(tess_version)
|
||||
|
||||
# Insert this source file into the hash function, to ensure that any
|
||||
@ -65,12 +71,21 @@ def main():
|
||||
lang = sys.argv[sys.argv.index('-l') + 1]
|
||||
m.update(lang.encode())
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
psm = sys.argv[sys.argv.index('-psm') + 1]
|
||||
m.update(psm.encode())
|
||||
except ValueError:
|
||||
pass
|
||||
m.update(b'default-lang')
|
||||
|
||||
psm_arg = ''
|
||||
if '--psm' in sys.argv:
|
||||
psm_arg = '--psm'
|
||||
elif '-psm' in sys.argv:
|
||||
psm_arg = '-psm'
|
||||
if psm_arg:
|
||||
try:
|
||||
psm = sys.argv[sys.argv.index(psm_arg) + 1]
|
||||
m.update(psm.encode())
|
||||
except ValueError:
|
||||
m.update(b'default-psm')
|
||||
else:
|
||||
m.update(b'default-psm')
|
||||
|
||||
if operation == 'stdout' and psm != '0':
|
||||
real_tesseract()
|
||||
|
@ -690,3 +690,51 @@ def test_destination_not_writable(spoof_tesseract_noop, resources, outdir):
|
||||
resources / 'jbig2.pdf', protected_file,
|
||||
env=spoof_tesseract_noop)
|
||||
assert p.returncode == ExitCode.file_access_error, "Expected error"
|
||||
|
||||
|
||||
def test_tesseract_config_valid(resources, outdir):
|
||||
cfg_file = outdir / 'test.cfg'
|
||||
with cfg_file.open('w') as f:
|
||||
f.write('''\
|
||||
load_system_dawg 0
|
||||
language_model_penalty_non_dict_word 0
|
||||
language_model_penalty_non_freq_dict_word 0
|
||||
''')
|
||||
|
||||
check_ocrmypdf(
|
||||
resources / 'ccitt.pdf', outdir / 'out.pdf',
|
||||
'--tesseract-config', str(cfg_file))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('renderer', [
|
||||
'hocr',
|
||||
'tesseract',
|
||||
])
|
||||
def test_tesseract_config_notfound(renderer, resources, outdir):
|
||||
cfg_file = outdir / 'nofile.cfg'
|
||||
|
||||
p, out, err = run_ocrmypdf(
|
||||
resources / 'ccitt.pdf', outdir / 'out.pdf',
|
||||
'--pdf-renderer', renderer,
|
||||
'--tesseract-config', str(cfg_file))
|
||||
assert "Can't open" in err, "No error message about missing config file"
|
||||
assert p.returncode == ExitCode.ok
|
||||
|
||||
|
||||
@pytest.mark.parametrize('renderer', [
|
||||
'hocr',
|
||||
'tesseract',
|
||||
])
|
||||
def test_tesseract_config_invalid(renderer, resources, outdir):
|
||||
cfg_file = outdir / 'test.cfg'
|
||||
with cfg_file.open('w') as f:
|
||||
f.write('''\
|
||||
THIS FILE IS INVALID
|
||||
''')
|
||||
|
||||
p, out, err = run_ocrmypdf(
|
||||
resources / 'ccitt.pdf', outdir / 'out.pdf',
|
||||
'--pdf-renderer', renderer,
|
||||
'--tesseract-config', str(cfg_file))
|
||||
assert "parameter not found" in err, "No error message"
|
||||
assert p.returncode == ExitCode.invalid_config
|
||||
|
Loading…
x
Reference in New Issue
Block a user