mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-08-19 14:12:38 +00:00
Remove the old tesseract pdf_renderer
This commit is contained in:
parent
6b315e8315
commit
a9abe13185
@ -364,31 +364,27 @@ def check_options_languages(options, _log):
|
|||||||
|
|
||||||
|
|
||||||
def check_options_output(options, log):
|
def check_options_output(options, log):
|
||||||
if options.pdf_renderer == 'auto':
|
|
||||||
if tesseract.has_textonly_pdf():
|
|
||||||
options.pdf_renderer = 'sandwich'
|
|
||||||
else:
|
|
||||||
options.pdf_renderer = 'hocr'
|
|
||||||
|
|
||||||
if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf():
|
|
||||||
raise MissingDependencyError(
|
|
||||||
"The 'sandwich' renderer requires Tesseract 3.05.01 or newer; "
|
|
||||||
"or Tesseract 4.00 alpha newer than February 2017.")
|
|
||||||
|
|
||||||
if options.pdf_renderer == 'tesseract':
|
if options.pdf_renderer == 'tesseract':
|
||||||
if tesseract.version() < '3.05' and \
|
log.warning(
|
||||||
options.output_type.startswith('pdfa'):
|
"--pdf-renderer=tesseract is now the same as "
|
||||||
|
"--pdf-renderer=sandwich. The 'tesseract' option is deprecated.")
|
||||||
|
options.pdf_renderer = 'sandwich'
|
||||||
|
|
||||||
|
if options.pdf_renderer == 'auto':
|
||||||
|
if tesseract.version() < '3.05' \
|
||||||
|
and options.output_type.startswith('pdfa'):
|
||||||
|
options.pdf_renderer = 'hocr'
|
||||||
|
else:
|
||||||
|
options.pdf_renderer = 'sandwich'
|
||||||
|
|
||||||
|
if options.pdf_renderer == 'sandwich' \
|
||||||
|
and tesseract.version() < '3.05' \
|
||||||
|
and options.output_type.startswith('pdfa'):
|
||||||
log.warning(
|
log.warning(
|
||||||
"For best results use --pdf-renderer=tesseract "
|
"For best results use --pdf-renderer=tesseract "
|
||||||
"--output-type=pdf to disable PDF/A generation via "
|
"--output-type=pdf to disable PDF/A generation via "
|
||||||
"Ghostscript, which is known to corrupt the OCR text of "
|
"Ghostscript, which is known to corrupt the OCR text of "
|
||||||
"some PDFs produced your version of Tesseract.")
|
"some PDFs produced your version of Tesseract.")
|
||||||
elif tesseract.has_textonly_pdf():
|
|
||||||
log.warning(
|
|
||||||
"The argument --pdf-renderer=tesseract provides support for "
|
|
||||||
"versions of tesseract older than your version. For best "
|
|
||||||
"results omit this argument and let OCRmyPDF choose the "
|
|
||||||
"best available renderer.")
|
|
||||||
|
|
||||||
if options.debug_rendering and options.pdf_renderer != 'hocr':
|
if options.debug_rendering and options.pdf_renderer != 'hocr':
|
||||||
log.info(
|
log.info(
|
||||||
|
@ -1265,8 +1265,6 @@ def build_pipeline(options, work_folder, log, context):
|
|||||||
extras=[log, context])
|
extras=[log, context])
|
||||||
task_select_image_layer.graphviz(
|
task_select_image_layer.graphviz(
|
||||||
fillcolor='"#00cc66"', shape='diamond')
|
fillcolor='"#00cc66"', shape='diamond')
|
||||||
task_select_image_layer.active_if(
|
|
||||||
options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')
|
|
||||||
|
|
||||||
task_render_hocr_page = main_pipeline.transform(
|
task_render_hocr_page = main_pipeline.transform(
|
||||||
task_func=render_hocr_page,
|
task_func=render_hocr_page,
|
||||||
@ -1296,8 +1294,7 @@ def build_pipeline(options, work_folder, log, context):
|
|||||||
os.path.join(work_folder, r'\1.text.txt')],
|
os.path.join(work_folder, r'\1.text.txt')],
|
||||||
extras=[log, context])
|
extras=[log, context])
|
||||||
task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
|
task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
|
||||||
task_ocr_tesseract_textonly_pdf.active_if(
|
task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')
|
||||||
options.pdf_renderer == 'tesseract' or options.pdf_renderer == 'sandwich')
|
|
||||||
|
|
||||||
task_weave_layers = main_pipeline.collate(
|
task_weave_layers = main_pipeline.collate(
|
||||||
task_func=weave_layers,
|
task_func=weave_layers,
|
||||||
|
@ -42,9 +42,7 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
|
|||||||
spoof = pytest.helpers.spoof
|
spoof = pytest.helpers.spoof
|
||||||
|
|
||||||
|
|
||||||
RENDERERS = ['hocr', 'tesseract']
|
RENDERERS = ['hocr', 'sandwich']
|
||||||
if tesseract.has_textonly_pdf():
|
|
||||||
RENDERERS.append('sandwich')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='session')
|
@pytest.fixture(scope='session')
|
||||||
@ -624,7 +622,7 @@ def test_rotated_skew_timeout(resources, outpdf):
|
|||||||
"""This document contains an image that is rotated 90 into place with a
|
"""This document contains an image that is rotated 90 into place with a
|
||||||
/Rotate tag and intentionally skewed by altering the transformation matrix.
|
/Rotate tag and intentionally skewed by altering the transformation matrix.
|
||||||
|
|
||||||
This tests for a bug where the combinatino of preprocessing and a tesseract
|
This tests for a bug where the combination of preprocessing and a tesseract
|
||||||
timeout produced a page whose dimensions did not match the original's.
|
timeout produced a page whose dimensions did not match the original's.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user