mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-28 07:30:04 +00:00
Drop support for Tesseract 3
This commit is contained in:
parent
72b920eb16
commit
8b90c45437
@ -47,16 +47,16 @@ Some relevant environment variables that influence Tesseract's behavior include:
|
||||
|
||||
Controls the number of threads Tesseract will use. OCRmyPDF will manage this environment if it is not already set. (Currently, it will set it to 1 because this gives the best results in testing.)
|
||||
|
||||
For example, if you are testing tesseract 4.00 and don't wish to use an existing tesseract 3.04 installation, you can launch OCRmyPDF as follows:
|
||||
For example, if you have a development build of Tesseract and don't wish to use the system installation, you can launch OCRmyPDF as follows:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
env \
|
||||
PATH=/home/user/src/tesseract4/api:$PATH \
|
||||
TESSDATA_PREFIX=/home/user/src/tesseract4 \
|
||||
ocrmypdf --tesseract-oem 2 input.pdf output.pdf
|
||||
PATH=/home/user/src/tesseract/api:$PATH \
|
||||
TESSDATA_PREFIX=/home/user/src/tesseract \
|
||||
ocrmypdf input.pdf output.pdf
|
||||
|
||||
In this example ``TESSDATA_PREFIX`` directs Tesseract 4.0 to use LSTM training data. ``--tesseract-oem 1`` requests tesseract 4.0's new LSTM engine. (Tesseract 4.0 only.)
|
||||
In this example ``TESSDATA_PREFIX`` is required to redirect Tesseract to an alternate folder for its "tessdata" files.
|
||||
|
||||
|
||||
Overriding other support programs
|
||||
@ -107,7 +107,7 @@ rendering
|
||||
Creating a new PDF from other data (such as an existing PDF).
|
||||
|
||||
|
||||
OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` selects ``sandwich`` for Tesseract 3.05.01 or newer, or ``hocr`` for older versions of Tesseract.
|
||||
OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` always selects ``sandwich``.
|
||||
|
||||
The ``sandwich`` renderer
|
||||
"""""""""""""""""""""""""
|
||||
@ -118,8 +118,6 @@ Currently this is the best renderer for most uses, however it is implemented in
|
||||
|
||||
When image preprocessing features like ``--deskew`` are used, the original PDF will be rendered as a full page and the OCR layer will be placed on top.
|
||||
|
||||
If a PDF created with this renderer using Tesseract versions older than 3.05.00 is then passed through Ghostscript's pdfwrite feature, the OCR text *may* be corrupted. The ``--output-type=pdfa`` argument will produce a warning in this situation. For this reason, OCRmyPDF automatically selects the ``hocr`` renderer for older Tesseract versions.
|
||||
|
||||
The ``hocr`` renderer
|
||||
"""""""""""""""""""""
|
||||
|
||||
|
||||
@ -81,7 +81,7 @@ This produces a file named "output.pdf" and a companion text file named "output.
|
||||
OCR images, not PDFs
|
||||
--------------------
|
||||
|
||||
If you are starting with images, you can just use Tesseract 3.04 or later directly to convert images to PDFs:
|
||||
If you are starting with images, you can just use Tesseract directly to convert images to PDFs:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
||||
@ -495,17 +495,15 @@ OCRmyPDF currently requires these external programs and libraries to be installe
|
||||
|
||||
- Python 3.6 or newer
|
||||
- Ghostscript 9.15 or newer
|
||||
- libexempi3 2.2.0 or newer
|
||||
- qpdf 8.1.0 or newer
|
||||
- Tesseract 3.04 or newer
|
||||
- Tesseract 4.0.0-alpha or newer
|
||||
|
||||
As of ocrmypdf 7.2.1, the following versions are recommended:
|
||||
|
||||
- Python 3.7
|
||||
- Ghostscript 9.23 or newer
|
||||
- libexempi3 2.4.5 or newer
|
||||
- qpdf 8.2.1
|
||||
- Tesseract 4.0.0-rc1
|
||||
- Tesseract 4.0.0 or newer
|
||||
- jbig2enc 0.29 or newer
|
||||
- pngquant 2.5 or newer
|
||||
- unpaper 6.1
|
||||
|
||||
2
setup.py
2
setup.py
@ -170,7 +170,7 @@ if not forced and command.startswith('install') or \
|
||||
command in ['check', 'test', 'nosetests', 'easy_install']:
|
||||
check_external_program(
|
||||
program='tesseract',
|
||||
need_version='3.04', # using backport for Travis CI
|
||||
need_version='4.0.0', # using backport for Travis CI
|
||||
package={'darwin': 'tesseract', 'linux': 'tesseract-ocr'}
|
||||
)
|
||||
check_external_program(
|
||||
|
||||
@ -49,8 +49,6 @@ from ._unicodefun import verify_python3_env
|
||||
# -------------
|
||||
# External dependencies
|
||||
|
||||
MINIMUM_TESS_VERSION = '3.04'
|
||||
|
||||
HOCR_OK_LANGS = frozenset([
|
||||
'eng', 'deu', 'spa', 'ita', 'por'
|
||||
])
|
||||
@ -68,11 +66,11 @@ if 'IDE_PROJECT_ROOTS' in os.environ:
|
||||
|
||||
verify_python3_env()
|
||||
|
||||
if tesseract.version() < MINIMUM_TESS_VERSION:
|
||||
if not tesseract.v4:
|
||||
complain(
|
||||
"Please install tesseract {0} or newer "
|
||||
"Please install tesseract 4.0.0 or newer "
|
||||
"(currently installed version is {1})".format(
|
||||
MINIMUM_TESS_VERSION, tesseract.version()))
|
||||
tesseract.version()))
|
||||
sys.exit(ExitCode.missing_dependency)
|
||||
|
||||
# -------------
|
||||
@ -420,9 +418,7 @@ def check_options_languages(options, _log):
|
||||
def check_options_output(options, log):
|
||||
# We have these constraints to check for.
|
||||
# 1. Ghostscript < 9.20 mangles multibyte Unicode
|
||||
# 2. Tesseract < 3.05 embeds an older version of GlyphlessFont with which
|
||||
# no version of Ghostscript handles correctly.
|
||||
# 3. hocr doesn't work on non-Latin languages (so don't select it)
|
||||
# 2. hocr doesn't work on non-Latin languages (so don't select it)
|
||||
|
||||
languages = set(options.language)
|
||||
is_latin = languages.issubset(HOCR_OK_LANGS)
|
||||
@ -448,28 +444,7 @@ def check_options_output(options, log):
|
||||
|
||||
# Decide on what renderer to use
|
||||
if options.pdf_renderer == 'auto':
|
||||
if tesseract.version() < '3.05' \
|
||||
and options.output_type.startswith('pdfa') \
|
||||
and is_latin:
|
||||
options.pdf_renderer = 'hocr'
|
||||
else:
|
||||
options.pdf_renderer = 'sandwich'
|
||||
|
||||
if options.pdf_renderer == 'sandwich' \
|
||||
and tesseract.version() < '3.05':
|
||||
msg = (
|
||||
"Ghostscript will corrupt the OCR text of PDFs produced by "
|
||||
"Tesseract 3.04.xx and older. For best results, upgrade to a "
|
||||
"newer release of Tesseract. "
|
||||
)
|
||||
|
||||
if options.output_type.startswith('pdfa'):
|
||||
msg += (
|
||||
"The argument --output-type=pdfa* requires Ghostscript, so "
|
||||
"the PDF will be invalid. If you cannot upgrade Tesseract, "
|
||||
"use --output-type=pdf.")
|
||||
raise MissingDependencyError(msg)
|
||||
log.warning(msg)
|
||||
options.pdf_renderer = 'sandwich'
|
||||
|
||||
if options.output_type == 'pdfa':
|
||||
options.output_type = 'pdfa-2'
|
||||
|
||||
@ -40,8 +40,8 @@ HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
||||
<meta name='ocr-system' content='tesseract 3.05.00' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
||||
<meta name='ocr-system' content='tesseract 4.0.0' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
|
||||
</head>
|
||||
<body>
|
||||
<div class='ocr_page' id='page_1' title='image "_blank.png"; bbox 0 0 {0} {1}; ppageno 0'>
|
||||
@ -65,7 +65,7 @@ def v4():
|
||||
def has_textonly_pdf():
|
||||
"""Does Tesseract have textonly_pdf capability?
|
||||
|
||||
Available in 3.05.01, and v4.00.00alpha since January 2017. Best to
|
||||
Available in v4.00.00alpha since January 2017. Best to
|
||||
parse the parameter list
|
||||
"""
|
||||
args_tess = [
|
||||
@ -86,11 +86,6 @@ def has_textonly_pdf():
|
||||
return False
|
||||
|
||||
|
||||
def psm():
|
||||
"If Tesseract 4.0, use argument --psm instead of -psm"
|
||||
return '--psm' if v4() else '-psm'
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def languages():
|
||||
def lang_error(output):
|
||||
@ -135,7 +130,7 @@ def tess_base_args(langs, engine_mode):
|
||||
|
||||
def get_orientation(input_file, engine_mode, timeout: float, log):
|
||||
args_tesseract = tess_base_args(['osd'], engine_mode) + [
|
||||
psm(), '0',
|
||||
'--psm', '0',
|
||||
fspath(input_file),
|
||||
'stdout'
|
||||
]
|
||||
@ -160,17 +155,6 @@ def get_orientation(input_file, engine_mode, timeout: float, log):
|
||||
osd[parts[0].strip()] = parts[1].strip()
|
||||
|
||||
angle = int(osd.get('Orientation in degrees', 0))
|
||||
if 'Orientation' in osd:
|
||||
# Tesseract < 3.04.01
|
||||
# reports "Orientation in degrees" as a counterclockwise angle
|
||||
# We keep it clockwise
|
||||
assert 'Rotate' not in osd
|
||||
angle = -angle % 360
|
||||
else:
|
||||
# Tesseract >= 3.04.01
|
||||
# reports "Orientation in degrees" as a clockwise angle
|
||||
assert 'Rotate' in osd
|
||||
|
||||
oc = OrientationConfidence(
|
||||
angle=angle,
|
||||
confidence=float(osd.get('Orientation confidence', 0)))
|
||||
@ -247,7 +231,7 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
|
||||
args_tesseract = tess_base_args(language, engine_mode)
|
||||
|
||||
if pagesegmode is not None:
|
||||
args_tesseract.extend([psm(), str(pagesegmode)])
|
||||
args_tesseract.extend(['--psm', str(pagesegmode)])
|
||||
|
||||
if user_words:
|
||||
args_tesseract.extend(['--user-words', user_words])
|
||||
@ -257,9 +241,6 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
|
||||
|
||||
# Reminder: test suite tesseract spoofers will break after any changes
|
||||
# to the number or order of parameters here
|
||||
# Tesseract 3.04 requires the order here to be "hocr txt" and will fail
|
||||
# on "txt hocr"
|
||||
|
||||
args_tesseract.extend([
|
||||
input_file,
|
||||
prefix,
|
||||
@ -329,7 +310,7 @@ def generate_pdf(*, input_image, skip_pdf=None, output_pdf, output_text,
|
||||
args_tesseract = tess_base_args(language, engine_mode)
|
||||
|
||||
if pagesegmode is not None:
|
||||
args_tesseract.extend([psm(), str(pagesegmode)])
|
||||
args_tesseract.extend(['--psm', str(pagesegmode)])
|
||||
|
||||
if text_only and has_textonly_pdf():
|
||||
args_tesseract.extend(['-c', 'textonly_pdf=1'])
|
||||
|
||||
@ -34,9 +34,12 @@ language pack version mismatches
|
||||
"""
|
||||
|
||||
|
||||
VERSION_STRING = '''tesseract 3.05.01
|
||||
leptonica-1.72
|
||||
libjpeg 8d : libpng 1.6.19 : libtiff 4.0.6 : zlib 1.2.5
|
||||
VERSION_STRING = '''tesseract 4.0.0
|
||||
leptonica-1.77.0
|
||||
libjpeg 9c : libpng 1.6.35 : libtiff 4.0.10 : zlib 1.2.11 : libopenjp2 2.3.0
|
||||
Found AVX2
|
||||
Found AVX
|
||||
Found SSE
|
||||
SPOOFED
|
||||
'''
|
||||
|
||||
|
||||
@ -24,9 +24,12 @@
|
||||
import sys
|
||||
|
||||
|
||||
VERSION_STRING = '''tesseract 3.05.01
|
||||
leptonica-1.74.4
|
||||
libjpeg 9b : libpng 1.6.32 : libtiff 4.0.8 : zlib 1.2.8
|
||||
VERSION_STRING = '''tesseract 4.0.0
|
||||
leptonica-1.77.0
|
||||
libjpeg 9c : libpng 1.6.35 : libtiff 4.0.10 : zlib 1.2.11 : libopenjp2 2.3.0
|
||||
Found AVX2
|
||||
Found AVX
|
||||
Found SSE
|
||||
SPOOFED: return error claiming image too big
|
||||
'''
|
||||
|
||||
|
||||
@ -45,12 +45,7 @@ machines with AVX2, the cache is now bundled.
|
||||
|
||||
Certain operations are not cached and routed to tesseract directly.
|
||||
|
||||
Assumes Tesseract 3.04 or higher.
|
||||
|
||||
Will fail on Tesseract 3.02.02 in "hocr" mode because it doesn't produce
|
||||
the incorrect file extension. Will fail on 3.03 because that has no sidecar
|
||||
text support. Will fail to replicate a 3.04 bug if wrong parameter order is
|
||||
given.
|
||||
Assumes Tesseract 4.0.0-alpha or higher.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
@ -25,10 +25,13 @@ import os
|
||||
import signal
|
||||
|
||||
|
||||
VERSION_STRING = '''tesseract 3.05.01
|
||||
leptonica-1.74.4
|
||||
libjpeg 9b : libpng 1.6.32 : libtiff 4.0.8 : zlib 1.2.8
|
||||
SPOOFED: CRASH ON OCR or -psm 0
|
||||
VERSION_STRING = '''tesseract 4.0.0
|
||||
leptonica-1.77.0
|
||||
libjpeg 9c : libpng 1.6.35 : libtiff 4.0.10 : zlib 1.2.11 : libopenjp2 2.3.0
|
||||
Found AVX2
|
||||
Found AVX
|
||||
Found SSE
|
||||
SPOOFED: CRASH ON OCR or --psm 0
|
||||
'''
|
||||
|
||||
"""Simulates a Tesseract crash when asked to run OCR
|
||||
|
||||
@ -37,9 +37,12 @@ import PyPDF2 as pypdf
|
||||
from PIL import Image
|
||||
|
||||
|
||||
VERSION_STRING = '''tesseract 3.05.01
|
||||
leptonica-1.72
|
||||
libjpeg 8d : libpng 1.6.19 : libtiff 4.0.6 : zlib 1.2.5
|
||||
VERSION_STRING = '''tesseract 4.0.0
|
||||
leptonica-1.77.0
|
||||
libjpeg 9c : libpng 1.6.35 : libtiff 4.0.10 : zlib 1.2.11 : libopenjp2 2.3.0
|
||||
Found AVX2
|
||||
Found AVX
|
||||
Found SSE
|
||||
SPOOFED
|
||||
'''
|
||||
|
||||
@ -50,7 +53,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<meta name='ocr-system' content='tesseract 3.02.02' />
|
||||
<meta name='ocr-system' content='tesseract 4.0.0' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
@ -1,54 +0,0 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import pytest
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
from ocrmypdf.exec import tesseract
|
||||
|
||||
|
||||
# Skip all tests in this file if not tesseract 3
|
||||
pytestmark = pytest.mark.skipif(tesseract.v4(),
|
||||
reason="tesseract 3.x required")
|
||||
|
||||
|
||||
@pytest.mark.skipif(tesseract.has_textonly_pdf(),
|
||||
reason="check that missing dep is reported on old tess3")
|
||||
def test_textonly_pdf_on_older_tess3(resources, no_outpdf):
|
||||
p, _, _ = pytest.helpers.run_ocrmypdf(
|
||||
resources / 'linn.pdf',
|
||||
no_outpdf, '--pdf-renderer', 'sandwich')
|
||||
|
||||
assert p.returncode == ExitCode.missing_dependency
|
||||
|
||||
|
||||
@pytest.mark.skipif(not tesseract.has_textonly_pdf(),
|
||||
reason="check that feature is exercised on new test3")
|
||||
def test_textonly_pdf_on_newer_tess3(resources, no_outpdf):
|
||||
p, _, _ = pytest.helpers.run_ocrmypdf(
|
||||
resources / 'linn.pdf',
|
||||
no_outpdf, '--pdf-renderer', 'sandwich')
|
||||
|
||||
assert p.returncode == ExitCode.ok
|
||||
|
||||
|
||||
def test_oem_on_tess3(resources, no_outpdf):
|
||||
p, _, err = pytest.helpers.run_ocrmypdf(
|
||||
resources / 'aspect.pdf',
|
||||
no_outpdf, '--tesseract-oem', '1')
|
||||
|
||||
assert p.returncode == ExitCode.ok
|
||||
assert 'argument ignored' in err
|
||||
Loading…
x
Reference in New Issue
Block a user