Remove tesseract renderer entirely

Grafting lets us work with older Tesseract versions as if they could use
sandwich, so there is no point in keeping the tesseract renderer. It has been
deprecated for a long time now anyway.
This commit is contained in:
James R. Barlow 2018-05-10 14:06:13 -07:00
parent e0bb898f29
commit b8f3ead541
4 changed files with 8 additions and 53 deletions

View File

@ -105,7 +105,7 @@ rendering
Creating a new PDF from other data (such as an existing PDF).
OCRmyPDF has three PDF renderers: ``sandwich``, ``hocr``, ``tesseract``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` selects ``sandwich`` for Tesseract 3.05.01 or newer, or ``hocr`` for older versions of Tesseract.
OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` selects ``sandwich`` for Tesseract 3.05.01 or newer, or ``hocr`` for older versions of Tesseract.
The ``sandwich`` renderer
"""""""""""""""""""""""""
@ -132,4 +132,4 @@ This works in all versions of Tesseract.
The ``tesseract`` renderer
""""""""""""""""""""""""""
The ``tesseract`` renderer is deprecated, and is now an alias for the ``sandwich`` renderer. The alias will be removed in a future release.
The ``tesseract`` renderer was removed. OCRmyPDF's new approach to text layer grafting makes it functionally equivalent to ``sandwich``.

View File

@ -59,18 +59,3 @@ If you wish to add a single language pack, you could do the following:
.. code-block:: bash
host$ docker commit <container_id> ocrmypdf-tess4-heb
Known limitations
-----------------
As of v4.2, users of ocrmypdf working with languages outside the Latin alphabet should use the following syntax:
.. code-block:: bash
ocrmypdf -l eng+gre --output-type pdf --pdf-renderer tesseract
The reasons for this are:
* The latest version of Ghostscript (9.19 as of this writing) has unfixed bugs in Unicode handling that generate invalid character maps, so Ghostscript cannot be used for PDF/A conversion
* The default "hocr" PDF renderer does not handle Asian fonts properly

View File

@ -248,10 +248,6 @@ ocrsettings.add_argument(
help="Skip OCR on any pages that already contain text, but include the "
"page in final output; useful for PDFs that contain a mix of "
"images, text pages, and/or previously OCRed pages")
# ocrsettings.add_argument(
# '--redo-ocr', action='store_true',
# help="removing any existing OCR text, but otherwise preserve mixed PDF "
# "pages")
ocrsettings.add_argument(
'--skip-big', type=float, metavar='MPixels',
@ -284,15 +280,9 @@ advanced.add_argument(
)
advanced.add_argument(
'--pdf-renderer',
choices=['auto', 'tesseract', 'hocr', 'sandwich'], default='auto',
choices=['auto', 'hocr', 'sandwich'], default='auto',
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
"choose."
"auto - let OCRmyPDF choose; "
"sandwich - default renderer for Tesseract 3.05.01 and newer; "
"hocr - default renderer for older versions of Tesseract; "
"tesseract - gives better results for non-Latin languages and "
"Tesseract older than 3.05.01 but has problems with some versions "
" of Ghostscript; deprecated"
"choose. See documentation for discussion."
)
advanced.add_argument(
'--tesseract-timeout', default=180.0, type=float, metavar='SECONDS',
@ -361,12 +351,6 @@ def check_options_languages(options, _log):
def check_options_output(options, log):
if options.pdf_renderer == 'tesseract':
log.warning(
"--pdf-renderer=tesseract is now the same as "
"--pdf-renderer=sandwich. The 'tesseract' option is deprecated.")
options.pdf_renderer = 'sandwich'
if options.pdf_renderer == 'auto':
if tesseract.version() < '3.05' \
and options.output_type.startswith('pdfa'):
@ -378,7 +362,7 @@ def check_options_output(options, log):
and tesseract.version() < '3.05' \
and options.output_type.startswith('pdfa'):
log.warning(
"For best results use --pdf-renderer=tesseract "
"For best results use --pdf-renderer=sandwich "
"--output-type=pdf to disable PDF/A generation via "
"Ghostscript, which is known to corrupt the OCR text of "
"some PDFs produced your version of Tesseract.")
@ -413,13 +397,6 @@ def check_options_preprocessing(options, log):
raise MissingDependencyError(
"Install the 'unpaper' program to use --clean, --clean-final.")
if options.clean and \
not options.clean_final and \
options.pdf_renderer == 'tesseract':
log.info(
"Tesseract PDF renderer cannot render --clean pages without "
"also performing --clean-final, so --clean-final is assumed.")
def check_options_ocr_behavior(options, log):
if options.force_ocr and options.skip_text:
@ -427,9 +404,6 @@ def check_options_ocr_behavior(options, log):
None,
"Error: --force-ocr and --skip-text are mutually incompatible.")
# if options.redo_ocr and (options.skip_text or options.force_ocr):
# raise argparse.ArgumentError(
# "Error: --redo-ocr and other OCR options are incompatible.")
languages = set(options.language)
if options.pdf_renderer == 'hocr' and \
not languages.issubset(HOCR_OK_LANGS):
@ -442,7 +416,7 @@ def check_options_ocr_behavior(options, log):
"Use --pdf-renderer auto (the default) to avoid this issue.")
else:
msg += (
"Use --pdf-renderer tesseract --output-type pdf to avoid "
"Use --pdf-renderer sandwich --output-type pdf to avoid "
"this issue")
log.warning(msg)
elif ghostscript.version() < '9.20' and \
@ -460,10 +434,6 @@ def check_options_advanced(options, log):
if options.tesseract_oem and not tesseract.v4():
log.warning(
"--tesseract-oem requires Tesseract 4.x -- argument ignored")
if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf():
raise MissingDependencyError(
"--pdf-renderer sandwich requires Tesseract 4.x "
"commit 3d9fb3b or later")
if options.pdfa_image_compression != 'auto' and \
options.output_type.startswith('pdfa'):
log.warning(
@ -596,7 +566,7 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
ocrmypdf cannot automatically correct the problem on its own.
Try using
ocrmypdf --pdf-renderer tesseract [..other args..]
ocrmypdf --pdf-renderer sandwich [..other args..]
"""))
exit_code = ExitCode.input_file
elif exc_name.startswith('ocrmypdf.exceptions.'):

View File

@ -151,7 +151,7 @@ def test_remove_background(spoof_tesseract_noop, resources, outdir):
@pytest.mark.parametrize(
"pdf",
['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf'])
@pytest.mark.parametrize("renderer", ['auto', 'tesseract'])
@pytest.mark.parametrize("renderer", ['sandwich', 'hocr'])
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type,
resources, outdir):