Remove the old tesseract pdf_renderer

2025-08-19 14:12:38 +00:00 · 2018-05-01 17:31:34 -07:00 · 2018-05-01 17:31:34 -07:00 · a9abe13185
commit a9abe13185
parent 6b315e8315
3 changed files with 23 additions and 32 deletions
--- a/src/ocrmypdf/main.py
+++ b/src/ocrmypdf/main.py
@@ -364,31 +364,27 @@ def check_options_languages(options, _log):
 def check_options_output(options, log):
    if options.pdf_renderer == 'auto':
        if tesseract.has_textonly_pdf():
            options.pdf_renderer = 'sandwich'
        else:
            options.pdf_renderer = 'hocr'
    if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf():
        raise MissingDependencyError(
            "The 'sandwich' renderer requires Tesseract 3.05.01 or newer; "
            "or Tesseract 4.00 alpha newer than February 2017.")
    if options.pdf_renderer == 'tesseract':
-        if tesseract.version() < '3.05' and \
+        log.warning(
-                options.output_type.startswith('pdfa'):
+            "--pdf-renderer=tesseract is now the same as "
            "--pdf-renderer=sandwich. The 'tesseract' option is deprecated.")
        options.pdf_renderer = 'sandwich'
    if options.pdf_renderer == 'auto':
        if tesseract.version() < '3.05' \
                and options.output_type.startswith('pdfa'):
            options.pdf_renderer = 'hocr'
        else:
            options.pdf_renderer = 'sandwich'
    if options.pdf_renderer == 'sandwich' \
            and tesseract.version() < '3.05' \
            and options.output_type.startswith('pdfa'):
        log.warning(
            "For best results use --pdf-renderer=tesseract "
            "--output-type=pdf to disable PDF/A generation via "
            "Ghostscript, which is known to corrupt the OCR text of "
            "some PDFs produced your version of Tesseract.")
        elif tesseract.has_textonly_pdf():
            log.warning(
                "The argument --pdf-renderer=tesseract provides support for "
                "versions of tesseract older than your version. For best "
                "results omit this argument and let OCRmyPDF choose the "
                "best available renderer.")
    if options.debug_rendering and options.pdf_renderer != 'hocr':
        log.info(
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@@ -1265,8 +1265,6 @@ def build_pipeline(options, work_folder, log, context):
        extras=[log, context])
    task_select_image_layer.graphviz(
        fillcolor='"#00cc66"', shape='diamond')
-    task_select_image_layer.active_if(
-        options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')
    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
@@ -1296,8 +1294,7 @@ def build_pipeline(options, work_folder, log, context):
                os.path.join(work_folder, r'\1.text.txt')],
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
-    task_ocr_tesseract_textonly_pdf.active_if(
+    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')
-        options.pdf_renderer == 'tesseract' or options.pdf_renderer == 'sandwich')
    task_weave_layers = main_pipeline.collate(
        task_func=weave_layers,
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -42,9 +42,7 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
 spoof = pytest.helpers.spoof
-RENDERERS = ['hocr', 'tesseract']
+RENDERERS = ['hocr', 'sandwich']
-if tesseract.has_textonly_pdf():
-    RENDERERS.append('sandwich')
@pytest.fixture(scope='session')
@@ -624,7 +622,7 @@ def test_rotated_skew_timeout(resources, outpdf):
    """This document contains an image that is rotated 90 into place with a
    /Rotate tag and intentionally skewed by altering the transformation matrix.
-    This tests for a bug where the combinatino of preprocessing and a tesseract
+    This tests for a bug where the combination of preprocessing and a tesseract
    timeout produced a page whose dimensions did not match the original's.
    """