Remove the old tesseract pdf_renderer

This commit is contained in:
James R. Barlow 2018-05-01 17:31:34 -07:00
parent 6b315e8315
commit a9abe13185
3 changed files with 23 additions and 32 deletions

View File

@ -364,31 +364,27 @@ def check_options_languages(options, _log):
def check_options_output(options, log):
if options.pdf_renderer == 'auto':
if tesseract.has_textonly_pdf():
options.pdf_renderer = 'sandwich'
else:
options.pdf_renderer = 'hocr'
if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf():
raise MissingDependencyError(
"The 'sandwich' renderer requires Tesseract 3.05.01 or newer; "
"or Tesseract 4.00 alpha newer than February 2017.")
if options.pdf_renderer == 'tesseract':
if tesseract.version() < '3.05' and \
options.output_type.startswith('pdfa'):
log.warning(
"--pdf-renderer=tesseract is now the same as "
"--pdf-renderer=sandwich. The 'tesseract' option is deprecated.")
options.pdf_renderer = 'sandwich'
if options.pdf_renderer == 'auto':
if tesseract.version() < '3.05' \
and options.output_type.startswith('pdfa'):
options.pdf_renderer = 'hocr'
else:
options.pdf_renderer = 'sandwich'
if options.pdf_renderer == 'sandwich' \
and tesseract.version() < '3.05' \
and options.output_type.startswith('pdfa'):
log.warning(
"For best results use --pdf-renderer=tesseract "
"--output-type=pdf to disable PDF/A generation via "
"Ghostscript, which is known to corrupt the OCR text of "
"some PDFs produced your version of Tesseract.")
elif tesseract.has_textonly_pdf():
log.warning(
"The argument --pdf-renderer=tesseract provides support for "
"versions of tesseract older than your version. For best "
"results omit this argument and let OCRmyPDF choose the "
"best available renderer.")
if options.debug_rendering and options.pdf_renderer != 'hocr':
log.info(

View File

@ -1265,8 +1265,6 @@ def build_pipeline(options, work_folder, log, context):
extras=[log, context])
task_select_image_layer.graphviz(
fillcolor='"#00cc66"', shape='diamond')
task_select_image_layer.active_if(
options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')
task_render_hocr_page = main_pipeline.transform(
task_func=render_hocr_page,
@ -1296,8 +1294,7 @@ def build_pipeline(options, work_folder, log, context):
os.path.join(work_folder, r'\1.text.txt')],
extras=[log, context])
task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
task_ocr_tesseract_textonly_pdf.active_if(
options.pdf_renderer == 'tesseract' or options.pdf_renderer == 'sandwich')
task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')
task_weave_layers = main_pipeline.collate(
task_func=weave_layers,

View File

@ -42,9 +42,7 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof
RENDERERS = ['hocr', 'tesseract']
if tesseract.has_textonly_pdf():
RENDERERS.append('sandwich')
RENDERERS = ['hocr', 'sandwich']
@pytest.fixture(scope='session')
@ -624,7 +622,7 @@ def test_rotated_skew_timeout(resources, outpdf):
"""This document contains an image that is rotated 90 into place with a
/Rotate tag and intentionally skewed by altering the transformation matrix.
This tests for a bug where the combinatino of preprocessing and a tesseract
This tests for a bug where the combination of preprocessing and a tesseract
timeout produced a page whose dimensions did not match the original's.
"""