Remove tesseract renderer entirely

Grafting lets us work with older Tesseract versions as if they could use sandwich, so there is no point in keeping the tesseract renderer. It has been deprecated for a long time now anyway.
2026-01-08 05:02:39 +00:00 · 2018-05-10 14:06:13 -07:00 · 2018-05-10 14:06:13 -07:00 · b8f3ead541
commit b8f3ead541
parent e0bb898f29
4 changed files with 8 additions and 53 deletions
--- a/docs/advanced.rst
+++ b/docs/advanced.rst
@ -105,7 +105,7 @@ rendering
  Creating a new PDF from other data (such as an existing PDF).


-OCRmyPDF has three PDF renderers: ``sandwich``, ``hocr``, ``tesseract``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` selects ``sandwich`` for Tesseract 3.05.01 or newer, or ``hocr`` for older versions of Tesseract.
+OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` selects ``sandwich`` for Tesseract 3.05.01 or newer, or ``hocr`` for older versions of Tesseract.

 The ``sandwich`` renderer
 """""""""""""""""""""""""
@ -132,4 +132,4 @@ This works in all versions of Tesseract.
 The ``tesseract`` renderer
 """"""""""""""""""""""""""

-The ``tesseract`` renderer is deprecated, and is now an alias for the ``sandwich`` renderer.  The alias will be removed in a future release.
+The ``tesseract`` renderer was removed. OCRmyPDF's new approach to text layer grafting makes it functionally equivalent to ``sandwich``.
--- a/docs/languages.rst
+++ b/docs/languages.rst
@ -59,18 +59,3 @@ If you wish to add a single language pack, you could do the following:
 .. code-block:: bash

    host$ docker commit <container_id> ocrmypdf-tess4-heb
-
-
-Known limitations
-----------------
-
-As of v4.2, users of ocrmypdf working languages outside the Latin alphabet should use the following syntax:
-
-.. code-block:: bash
-
-	ocrmypdf -l eng+gre --output-type pdf --pdf-renderer tesseract
-
-The reasons for this are:
-
-* The latest version of Ghostscript (9.19 as of this writing) has unfixed bugs in Unicode handling that generate invalid character maps, so Ghostscript cannot be used for PDF/A conversion
-* The default "hocr" PDF renderer does not handle Asian fonts properly
--- a/src/ocrmypdf/main.py
+++ b/src/ocrmypdf/main.py
@ -248,10 +248,6 @@ ocrsettings.add_argument(
    help="Skip OCR on any pages that already contain text, but include the "
         "page in final output; useful for PDFs that contain a mix of "
         "images, text pages, and/or previously OCRed pages")
-# ocrsettings.add_argument(
-#     '--redo-ocr', action='store_true',
-#     help="removing any existing OCR text, but otherwise preserve mixed PDF "
-#          "pages")

 ocrsettings.add_argument(
    '--skip-big', type=float, metavar='MPixels',
@ -284,15 +280,9 @@ advanced.add_argument(
    )
 advanced.add_argument(
    '--pdf-renderer',
-    choices=['auto', 'tesseract', 'hocr', 'sandwich'], default='auto',
+    choices=['auto', 'hocr', 'sandwich'], default='auto',
    help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
-         "choose."
-         "auto - let OCRmyPDF choose; "
-         "sandwich - default renderer for Tesseract 3.05.01 and newer; "
-         "hocr - default renderer for older versions of Tesseract; "
-         "tesseract - gives better results for non-Latin languages and "
-         "Tesseract older than 3.05.01 but has problems with some versions "
-         " of Ghostscript; deprecated"
+         "choose.  See documentation for discussion."
    )
 advanced.add_argument(
    '--tesseract-timeout', default=180.0, type=float, metavar='SECONDS',
@ -361,12 +351,6 @@ def check_options_languages(options, _log):


 def check_options_output(options, log):
-    if options.pdf_renderer == 'tesseract':
-        log.warning(
-            "--pdf-renderer=tesseract is now the same as "
-            "--pdf-renderer=sandwich. The 'tesseract' option is deprecated.")
-        options.pdf_renderer = 'sandwich'
-
    if options.pdf_renderer == 'auto':
        if tesseract.version() < '3.05' \
                and options.output_type.startswith('pdfa'):
@ -378,7 +362,7 @@ def check_options_output(options, log):
            and tesseract.version() < '3.05' \
            and options.output_type.startswith('pdfa'):
        log.warning(
-            "For best results use --pdf-renderer=tesseract "
+            "For best results use --pdf-renderer=sandwich "
            "--output-type=pdf to disable PDF/A generation via "
            "Ghostscript, which is known to corrupt the OCR text of "
            "some PDFs produced your version of Tesseract.")
@ -413,13 +397,6 @@ def check_options_preprocessing(options, log):
            raise MissingDependencyError(
                "Install the 'unpaper' program to use --clean, --clean-final.")

-    if options.clean and \
-            not options.clean_final and \
-            options.pdf_renderer == 'tesseract':
-        log.info(
-            "Tesseract PDF renderer cannot render --clean pages without "
-            "also performing --clean-final, so --clean-final is assumed.")
-

 def check_options_ocr_behavior(options, log):
    if options.force_ocr and options.skip_text:
@ -427,9 +404,6 @@ def check_options_ocr_behavior(options, log):
            None,
            "Error: --force-ocr and --skip-text are mutually incompatible.")

-    # if options.redo_ocr and (options.skip_text or options.force_ocr):
-    #     raise argparse.ArgumentError(
-    #         "Error: --redo-ocr and other OCR options are incompatible.")
    languages = set(options.language)
    if options.pdf_renderer == 'hocr' and \
            not languages.issubset(HOCR_OK_LANGS):
@ -442,7 +416,7 @@ def check_options_ocr_behavior(options, log):
                "Use --pdf-renderer auto (the default) to avoid this issue.")
        else:
            msg += (
-                "Use --pdf-renderer tesseract --output-type pdf to avoid "
+                "Use --pdf-renderer sandwich --output-type pdf to avoid "
                "this issue")
        log.warning(msg)
    elif ghostscript.version() < '9.20' and \
@ -460,10 +434,6 @@ def check_options_advanced(options, log):
    if options.tesseract_oem and not tesseract.v4():
        log.warning(
            "--tesseract-oem requires Tesseract 4.x -- argument ignored")
-    if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf():
-        raise MissingDependencyError(
-            "--pdf-renderer sandwich requires Tesseract 4.x "
-            "commit 3d9fb3b or later")
    if options.pdfa_image_compression != 'auto' and \
            options.output_type.startswith('pdfa'):
        log.warning(
@ -596,7 +566,7 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
            ocrmypdf cannot automatically correct the problem on its own.

            Try using
-                ocrmypdf --pdf-renderer tesseract  [..other args..]
+                ocrmypdf --pdf-renderer sandwich  [..other args..]
            """))
        exit_code = ExitCode.input_file
    elif exc_name.startswith('ocrmypdf.exceptions.'):
--- a/tests/test_main.py
+++ b/tests/test_main.py
@ -151,7 +151,7 @@ def test_remove_background(spoof_tesseract_noop, resources, outdir):
@pytest.mark.parametrize(
    "pdf",
    ['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf'])
-@pytest.mark.parametrize("renderer", ['auto', 'tesseract'])
+@pytest.mark.parametrize("renderer", ['sandwich', 'hocr'])
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
 def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type,
                      resources, outdir):