mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-08-19 14:12:38 +00:00
Remove the old tesseract pdf_renderer
This commit is contained in:
parent
6b315e8315
commit
a9abe13185
@ -364,31 +364,27 @@ def check_options_languages(options, _log):
|
||||
|
||||
|
||||
def check_options_output(options, log):
|
||||
if options.pdf_renderer == 'auto':
|
||||
if tesseract.has_textonly_pdf():
|
||||
options.pdf_renderer = 'sandwich'
|
||||
else:
|
||||
options.pdf_renderer = 'hocr'
|
||||
|
||||
if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf():
|
||||
raise MissingDependencyError(
|
||||
"The 'sandwich' renderer requires Tesseract 3.05.01 or newer; "
|
||||
"or Tesseract 4.00 alpha newer than February 2017.")
|
||||
|
||||
if options.pdf_renderer == 'tesseract':
|
||||
if tesseract.version() < '3.05' and \
|
||||
options.output_type.startswith('pdfa'):
|
||||
log.warning(
|
||||
"For best results use --pdf-renderer=tesseract "
|
||||
"--output-type=pdf to disable PDF/A generation via "
|
||||
"Ghostscript, which is known to corrupt the OCR text of "
|
||||
"some PDFs produced your version of Tesseract.")
|
||||
elif tesseract.has_textonly_pdf():
|
||||
log.warning(
|
||||
"The argument --pdf-renderer=tesseract provides support for "
|
||||
"versions of tesseract older than your version. For best "
|
||||
"results omit this argument and let OCRmyPDF choose the "
|
||||
"best available renderer.")
|
||||
log.warning(
|
||||
"--pdf-renderer=tesseract is now the same as "
|
||||
"--pdf-renderer=sandwich. The 'tesseract' option is deprecated.")
|
||||
options.pdf_renderer = 'sandwich'
|
||||
|
||||
if options.pdf_renderer == 'auto':
|
||||
if tesseract.version() < '3.05' \
|
||||
and options.output_type.startswith('pdfa'):
|
||||
options.pdf_renderer = 'hocr'
|
||||
else:
|
||||
options.pdf_renderer = 'sandwich'
|
||||
|
||||
if options.pdf_renderer == 'sandwich' \
|
||||
and tesseract.version() < '3.05' \
|
||||
and options.output_type.startswith('pdfa'):
|
||||
log.warning(
|
||||
"For best results use --pdf-renderer=tesseract "
|
||||
"--output-type=pdf to disable PDF/A generation via "
|
||||
"Ghostscript, which is known to corrupt the OCR text of "
|
||||
"some PDFs produced your version of Tesseract.")
|
||||
|
||||
if options.debug_rendering and options.pdf_renderer != 'hocr':
|
||||
log.info(
|
||||
|
@ -1265,8 +1265,6 @@ def build_pipeline(options, work_folder, log, context):
|
||||
extras=[log, context])
|
||||
task_select_image_layer.graphviz(
|
||||
fillcolor='"#00cc66"', shape='diamond')
|
||||
task_select_image_layer.active_if(
|
||||
options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')
|
||||
|
||||
task_render_hocr_page = main_pipeline.transform(
|
||||
task_func=render_hocr_page,
|
||||
@ -1296,8 +1294,7 @@ def build_pipeline(options, work_folder, log, context):
|
||||
os.path.join(work_folder, r'\1.text.txt')],
|
||||
extras=[log, context])
|
||||
task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
|
||||
task_ocr_tesseract_textonly_pdf.active_if(
|
||||
options.pdf_renderer == 'tesseract' or options.pdf_renderer == 'sandwich')
|
||||
task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')
|
||||
|
||||
task_weave_layers = main_pipeline.collate(
|
||||
task_func=weave_layers,
|
||||
|
@ -42,9 +42,7 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
|
||||
spoof = pytest.helpers.spoof
|
||||
|
||||
|
||||
RENDERERS = ['hocr', 'tesseract']
|
||||
if tesseract.has_textonly_pdf():
|
||||
RENDERERS.append('sandwich')
|
||||
RENDERERS = ['hocr', 'sandwich']
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
@ -624,7 +622,7 @@ def test_rotated_skew_timeout(resources, outpdf):
|
||||
"""This document contains an image that is rotated 90 into place with a
|
||||
/Rotate tag and intentionally skewed by altering the transformation matrix.
|
||||
|
||||
This tests for a bug where the combinatino of preprocessing and a tesseract
|
||||
This tests for a bug where the combination of preprocessing and a tesseract
|
||||
timeout produced a page whose dimensions did not match the original's.
|
||||
"""
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user