mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-01-08 05:02:39 +00:00
Remove tesseract renderer entirely
Grafting lets us work with older Tesseract versions as if they could use sandwich, so there is no point in keeping the tesseract renderer. It's been deprecated for a long time now anyway.
This commit is contained in:
parent
e0bb898f29
commit
b8f3ead541
@ -105,7 +105,7 @@ rendering
|
||||
Creating a new PDF from other data (such as an existing PDF).
|
||||
|
||||
|
||||
OCRmyPDF has three PDF renderers: ``sandwich``, ``hocr``, ``tesseract``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` selects ``sandwich`` for Tesseract 3.05.01 or newer, or ``hocr`` for older versions of Tesseract.
|
||||
OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` selects ``sandwich`` for Tesseract 3.05.01 or newer, or ``hocr`` for older versions of Tesseract.
|
||||
|
||||
The ``sandwich`` renderer
|
||||
"""""""""""""""""""""""""
|
||||
@ -132,4 +132,4 @@ This works in all versions of Tesseract.
|
||||
The ``tesseract`` renderer
|
||||
""""""""""""""""""""""""""
|
||||
|
||||
The ``tesseract`` renderer is deprecated, and is now an alias for the ``sandwich`` renderer. The alias will be removed in a future release.
|
||||
The ``tesseract`` renderer was removed. OCRmyPDF's new approach to text layer grafting makes it functionally equivalent to ``sandwich``.
|
||||
@ -59,18 +59,3 @@ If you wish to add a single language pack, you could do the following:
|
||||
.. code-block:: bash
|
||||
|
||||
host$ docker commit <container_id> ocrmypdf-tess4-heb
|
||||
|
||||
|
||||
Known limitations
|
||||
-----------------
|
||||
|
||||
As of v4.2, users of ocrmypdf working with languages outside the Latin alphabet should use the following syntax:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf -l eng+gre --output-type pdf --pdf-renderer tesseract
|
||||
|
||||
The reasons for this are:
|
||||
|
||||
* The latest version of Ghostscript (9.19 as of this writing) has unfixed bugs in Unicode handling that generate invalid character maps, so Ghostscript cannot be used for PDF/A conversion
|
||||
* The default "hocr" PDF renderer does not handle Asian fonts properly
|
||||
@ -248,10 +248,6 @@ ocrsettings.add_argument(
|
||||
help="Skip OCR on any pages that already contain text, but include the "
|
||||
"page in final output; useful for PDFs that contain a mix of "
|
||||
"images, text pages, and/or previously OCRed pages")
|
||||
# ocrsettings.add_argument(
|
||||
# '--redo-ocr', action='store_true',
|
||||
# help="removing any existing OCR text, but otherwise preserve mixed PDF "
|
||||
# "pages")
|
||||
|
||||
ocrsettings.add_argument(
|
||||
'--skip-big', type=float, metavar='MPixels',
|
||||
@ -284,15 +280,9 @@ advanced.add_argument(
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pdf-renderer',
|
||||
choices=['auto', 'tesseract', 'hocr', 'sandwich'], default='auto',
|
||||
choices=['auto', 'hocr', 'sandwich'], default='auto',
|
||||
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
|
||||
"choose."
|
||||
"auto - let OCRmyPDF choose; "
|
||||
"sandwich - default renderer for Tesseract 3.05.01 and newer; "
|
||||
"hocr - default renderer for older versions of Tesseract; "
|
||||
"tesseract - gives better results for non-Latin languages and "
|
||||
"Tesseract older than 3.05.01 but has problems with some versions "
|
||||
" of Ghostscript; deprecated"
|
||||
"choose. See documentation for discussion."
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-timeout', default=180.0, type=float, metavar='SECONDS',
|
||||
@ -361,12 +351,6 @@ def check_options_languages(options, _log):
|
||||
|
||||
|
||||
def check_options_output(options, log):
|
||||
if options.pdf_renderer == 'tesseract':
|
||||
log.warning(
|
||||
"--pdf-renderer=tesseract is now the same as "
|
||||
"--pdf-renderer=sandwich. The 'tesseract' option is deprecated.")
|
||||
options.pdf_renderer = 'sandwich'
|
||||
|
||||
if options.pdf_renderer == 'auto':
|
||||
if tesseract.version() < '3.05' \
|
||||
and options.output_type.startswith('pdfa'):
|
||||
@ -378,7 +362,7 @@ def check_options_output(options, log):
|
||||
and tesseract.version() < '3.05' \
|
||||
and options.output_type.startswith('pdfa'):
|
||||
log.warning(
|
||||
"For best results use --pdf-renderer=tesseract "
|
||||
"For best results use --pdf-renderer=sandwich "
|
||||
"--output-type=pdf to disable PDF/A generation via "
|
||||
"Ghostscript, which is known to corrupt the OCR text of "
|
||||
"some PDFs produced your version of Tesseract.")
|
||||
@ -413,13 +397,6 @@ def check_options_preprocessing(options, log):
|
||||
raise MissingDependencyError(
|
||||
"Install the 'unpaper' program to use --clean, --clean-final.")
|
||||
|
||||
if options.clean and \
|
||||
not options.clean_final and \
|
||||
options.pdf_renderer == 'tesseract':
|
||||
log.info(
|
||||
"Tesseract PDF renderer cannot render --clean pages without "
|
||||
"also performing --clean-final, so --clean-final is assumed.")
|
||||
|
||||
|
||||
def check_options_ocr_behavior(options, log):
|
||||
if options.force_ocr and options.skip_text:
|
||||
@ -427,9 +404,6 @@ def check_options_ocr_behavior(options, log):
|
||||
None,
|
||||
"Error: --force-ocr and --skip-text are mutually incompatible.")
|
||||
|
||||
# if options.redo_ocr and (options.skip_text or options.force_ocr):
|
||||
# raise argparse.ArgumentError(
|
||||
# "Error: --redo-ocr and other OCR options are incompatible.")
|
||||
languages = set(options.language)
|
||||
if options.pdf_renderer == 'hocr' and \
|
||||
not languages.issubset(HOCR_OK_LANGS):
|
||||
@ -442,7 +416,7 @@ def check_options_ocr_behavior(options, log):
|
||||
"Use --pdf-renderer auto (the default) to avoid this issue.")
|
||||
else:
|
||||
msg += (
|
||||
"Use --pdf-renderer tesseract --output-type pdf to avoid "
|
||||
"Use --pdf-renderer sandwich --output-type pdf to avoid "
|
||||
"this issue")
|
||||
log.warning(msg)
|
||||
elif ghostscript.version() < '9.20' and \
|
||||
@ -460,10 +434,6 @@ def check_options_advanced(options, log):
|
||||
if options.tesseract_oem and not tesseract.v4():
|
||||
log.warning(
|
||||
"--tesseract-oem requires Tesseract 4.x -- argument ignored")
|
||||
if options.pdf_renderer == 'sandwich' and not tesseract.has_textonly_pdf():
|
||||
raise MissingDependencyError(
|
||||
"--pdf-renderer sandwich requires Tesseract 4.x "
|
||||
"commit 3d9fb3b or later")
|
||||
if options.pdfa_image_compression != 'auto' and \
|
||||
options.output_type.startswith('pdfa'):
|
||||
log.warning(
|
||||
@ -596,7 +566,7 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
|
||||
ocrmypdf cannot automatically correct the problem on its own.
|
||||
|
||||
Try using
|
||||
ocrmypdf --pdf-renderer tesseract [..other args..]
|
||||
ocrmypdf --pdf-renderer sandwich [..other args..]
|
||||
"""))
|
||||
exit_code = ExitCode.input_file
|
||||
elif exc_name.startswith('ocrmypdf.exceptions.'):
|
||||
|
||||
@ -151,7 +151,7 @@ def test_remove_background(spoof_tesseract_noop, resources, outdir):
|
||||
@pytest.mark.parametrize(
|
||||
"pdf",
|
||||
['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf'])
|
||||
@pytest.mark.parametrize("renderer", ['auto', 'tesseract'])
|
||||
@pytest.mark.parametrize("renderer", ['sandwich', 'hocr'])
|
||||
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
|
||||
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type,
|
||||
resources, outdir):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user