diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index ad975b16..12b4f2c5 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -364,31 +364,27 @@ def check_options_languages(options, _log): def check_options_output(options, log): - if options.pdf_renderer == 'auto': - if tesseract.has_textonly_pdf(): - options.pdf_renderer = 'sandwich' - else: - options.pdf_renderer = 'hocr' - - if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf(): - raise MissingDependencyError( - "The 'sandwich' renderer requires Tesseract 3.05.01 or newer; " - "or Tesseract 4.00 alpha newer than February 2017.") - if options.pdf_renderer == 'tesseract': - if tesseract.version() < '3.05' and \ - options.output_type.startswith('pdfa'): - log.warning( - "For best results use --pdf-renderer=tesseract " - "--output-type=pdf to disable PDF/A generation via " - "Ghostscript, which is known to corrupt the OCR text of " - "some PDFs produced your version of Tesseract.") - elif tesseract.has_textonly_pdf(): - log.warning( - "The argument --pdf-renderer=tesseract provides support for " - "versions of tesseract older than your version. For best " - "results omit this argument and let OCRmyPDF choose the " - "best available renderer.") + log.warning( + "--pdf-renderer=tesseract is now the same as " + "--pdf-renderer=sandwich. The 'tesseract' option is deprecated.") + options.pdf_renderer = 'sandwich' + + if options.pdf_renderer == 'auto': + if tesseract.version() < '3.05' \ + and options.output_type.startswith('pdfa'): + options.pdf_renderer = 'hocr' + else: + options.pdf_renderer = 'sandwich' + + if options.pdf_renderer == 'sandwich' \ + and tesseract.version() < '3.05' \ + and options.output_type.startswith('pdfa'): + log.warning( + "For best results use --pdf-renderer=tesseract " + "--output-type=pdf to disable PDF/A generation via " + "Ghostscript, which is known to corrupt the OCR text of " + "some PDFs produced your version of Tesseract.") if options.debug_rendering and options.pdf_renderer != 'hocr': log.info( diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py index 644ef109..9116534a 100644 --- a/src/ocrmypdf/pipeline.py +++ b/src/ocrmypdf/pipeline.py @@ -1265,8 +1265,6 @@ def build_pipeline(options, work_folder, log, context): extras=[log, context]) task_select_image_layer.graphviz( fillcolor='"#00cc66"', shape='diamond') - task_select_image_layer.active_if( - options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich') task_render_hocr_page = main_pipeline.transform( task_func=render_hocr_page, @@ -1296,8 +1294,7 @@ def build_pipeline(options, work_folder, log, context): os.path.join(work_folder, r'\1.text.txt')], extras=[log, context]) task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"') - task_ocr_tesseract_textonly_pdf.active_if( - options.pdf_renderer == 'tesseract' or options.pdf_renderer == 'sandwich') + task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich') task_weave_layers = main_pipeline.collate( task_func=weave_layers, diff --git a/tests/test_main.py b/tests/test_main.py index fbad9ffb..e8ff3d42 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -42,9 +42,7 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf spoof = pytest.helpers.spoof -RENDERERS = ['hocr', 'tesseract'] -if tesseract.has_textonly_pdf(): - RENDERERS.append('sandwich') +RENDERERS = ['hocr', 'sandwich'] @pytest.fixture(scope='session') @@ -624,7 +622,7 @@ def test_rotated_skew_timeout(resources, outpdf): """This document contains an image that is rotated 90 into place with a /Rotate tag and intentionally skewed by altering the transformation matrix. - This tests for a bug where the combinatino of preprocessing and a tesseract + This tests for a bug where the combination of preprocessing and a tesseract timeout produced a page whose dimensions did not match the original's. """