Drop support for Tesseract 3

2025-12-28 07:30:04 +00:00 · 2018-12-30 00:47:12 -08:00 · 2018-12-30 00:47:12 -08:00 · 8b90c45437
commit 8b90c45437
parent 72b920eb16
12 changed files with 48 additions and 143 deletions
--- a/docs/advanced.rst
+++ b/docs/advanced.rst
@ -47,16 +47,16 @@ Some relevant environment variables that influence Tesseract's behavior include:

    Controls the number of threads Tesseract will use. OCRmyPDF will manage this environment if it is not already set. (Currently, it will set it to 1 because this gives the best results in testing.)

-For example, if you are testing tesseract 4.00 and don't wish to use an existing tesseract 3.04 installation, you can launch OCRmyPDF as follows:
+For example, if you have a development build of Tesseract and don't wish to use the system installation, you can launch OCRmyPDF as follows:

 .. code-block:: bash

    env \
-        PATH=/home/user/src/tesseract4/api:$PATH \
-        TESSDATA_PREFIX=/home/user/src/tesseract4 \
-        ocrmypdf --tesseract-oem 2 input.pdf output.pdf
+        PATH=/home/user/src/tesseract/api:$PATH \
+        TESSDATA_PREFIX=/home/user/src/tesseract \
+        ocrmypdf input.pdf output.pdf

-In this example ``TESSDATA_PREFIX`` directs Tesseract 4.0 to use LSTM training data. ``--tesseract-oem 1`` requests tesseract 4.0's new LSTM engine. (Tesseract 4.0 only.)
+In this example ``TESSDATA_PREFIX`` is required to redirect Tesseract to an alternate folder for its "tessdata" files.


 Overriding other support programs
@ -107,7 +107,7 @@ rendering
  Creating a new PDF from other data (such as an existing PDF).


-OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` selects ``sandwich`` for Tesseract 3.05.01 or newer, or ``hocr`` for older versions of Tesseract.
+OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` always selects ``sandwich``.

 The ``sandwich`` renderer
 """""""""""""""""""""""""
@ -118,8 +118,6 @@ Currently this is the best renderer for most uses, however it is implemented in

 When image preprocessing features like ``--deskew`` are used, the original PDF will be rendered as a full page and the OCR layer will be placed on top.

-If a PDF created with this renderer using Tesseract versions older than 3.05.00 is then passed through Ghostscript's pdfwrite feature, the OCR text *may* be corrupted. The ``--output-type=pdfa`` argument will produce a warning in this situation.  For this reason, OCRmyPDF automatically selects the ``hocr`` for older Tesseract versions.
-
 The ``hocr`` renderer
 """""""""""""""""""""

--- a/docs/cookbook.rst
+++ b/docs/cookbook.rst
@ -81,7 +81,7 @@ This produces a file named "output.pdf" and a companion text file named "output.
 OCR images, not PDFs
 --------------------

-If you are starting with images, you can just use Tesseract 3.04 or later directly to convert images to PDFs:
+If you are starting with images, you can just use Tesseract directly to convert images to PDFs:

 .. code-block:: bash

--- a/docs/installation.rst
+++ b/docs/installation.rst
@ -495,17 +495,15 @@ OCRmyPDF currently requires these external programs and libraries to be installe

 - Python 3.6 or newer
 - Ghostscript 9.15 or newer
- libexempi3 2.2.0 or newer
 - qpdf 8.1.0 or newer
- Tesseract 3.04 or newer
+- Tesseract 4.0.0-alpha or newer

 As of ocrmypdf 7.2.1, the following versions are recommended:

 - Python 3.7
 - Ghostscript 9.23 or newer
- libexempi3 2.4.5 or newer
 - qpdf 8.2.1
- Tesseract 4.0.0-rc1
+- Tesseract 4.0.0 or newer
 - jbig2enc 0.29 or newer
 - pngquant 2.5 or newer
 - unpaper 6.1
--- a/setup.py
+++ b/setup.py
@ -170,7 +170,7 @@ if not forced and command.startswith('install') or \
        command in ['check', 'test', 'nosetests', 'easy_install']:
    check_external_program(
        program='tesseract',
-        need_version='3.04',  # using backport for Travis CI
+        need_version='4.0.0',  # using backport for Travis CI
        package={'darwin': 'tesseract', 'linux': 'tesseract-ocr'}
    )
    check_external_program(
--- a/src/ocrmypdf/main.py
+++ b/src/ocrmypdf/main.py
@ -49,8 +49,6 @@ from ._unicodefun import verify_python3_env
 # -------------
 # External dependencies

-MINIMUM_TESS_VERSION = '3.04'
-
 HOCR_OK_LANGS = frozenset([
    'eng', 'deu', 'spa', 'ita', 'por'
 ])
@ -68,11 +66,11 @@ if 'IDE_PROJECT_ROOTS' in os.environ:

 verify_python3_env()

-if tesseract.version() < MINIMUM_TESS_VERSION:
+if not tesseract.v4:
    complain(
-        "Please install tesseract {0} or newer "
+        "Please install tesseract 4.0.0 or newer "
        "(currently installed version is {1})".format(
-            MINIMUM_TESS_VERSION, tesseract.version()))
+            tesseract.version()))
    sys.exit(ExitCode.missing_dependency)

 # -------------
@ -420,9 +418,7 @@ def check_options_languages(options, _log):
 def check_options_output(options, log):
    # We have these constraints to check for.
    # 1. Ghostscript < 9.20 mangles multibyte Unicode
-    # 2. Tesseract < 3.05 embeds an older version of GlyphlessFont with which
-    #    no version of Ghostscript handles correctly.
-    # 3. hocr doesn't work on non-Latin languages (so don't select it)
+    # 2. hocr doesn't work on non-Latin languages (so don't select it)

    languages = set(options.language)
    is_latin = languages.issubset(HOCR_OK_LANGS)
@ -448,28 +444,7 @@ def check_options_output(options, log):

    # Decide on what renderer to use
    if options.pdf_renderer == 'auto':
-        if tesseract.version() < '3.05' \
-                and options.output_type.startswith('pdfa') \
-                and is_latin:
-            options.pdf_renderer = 'hocr'
-        else:
-            options.pdf_renderer = 'sandwich'
-
-    if options.pdf_renderer == 'sandwich' \
-            and tesseract.version() < '3.05':
-        msg = (
-            "Ghostscript will corrupt the OCR text of PDFs produced by "
-            "Tesseract 3.04.xx and older.  For best results, upgrade to a "
-            "newer release of Tesseract. "
-        )
-
-        if options.output_type.startswith('pdfa'):
-            msg += (
-                "The argument --output-type=pdfa* requires Ghostscript, so "
-                "the PDF will be invalid.  If you cannot upgrade Tesseract, "
-                "use --output-type=pdf.")
-            raise MissingDependencyError(msg)
-        log.warning(msg)
+        options.pdf_renderer = 'sandwich'

    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'
--- a/src/ocrmypdf/exec/tesseract.py
+++ b/src/ocrmypdf/exec/tesseract.py
@ -40,8 +40,8 @@ HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
 <head>
  <title></title>
 <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
-  <meta name='ocr-system' content='tesseract 3.05.00' />
-  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
+  <meta name='ocr-system' content='tesseract 4.0.0' />
+  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
 </head>
 <body>
  <div class='ocr_page' id='page_1' title='image "_blank.png"; bbox 0 0 {0} {1}; ppageno 0'>
@ -65,7 +65,7 @@ def v4():
 def has_textonly_pdf():
    """Does Tesseract have textonly_pdf capability?

-    Available in 3.05.01, and v4.00.00alpha since January 2017. Best to
+    Available in v4.00.00alpha since January 2017. Best to
    parse the parameter list
    """
    args_tess = [
@ -86,11 +86,6 @@ def has_textonly_pdf():
    return False


-def psm():
-    "If Tesseract 4.0, use argument --psm instead of -psm"
-    return '--psm' if v4() else '-psm'
-
-
@lru_cache(maxsize=1)
 def languages():
    def lang_error(output):
@ -135,7 +130,7 @@ def tess_base_args(langs, engine_mode):

 def get_orientation(input_file, engine_mode, timeout: float, log):
    args_tesseract = tess_base_args(['osd'], engine_mode) + [
-        psm(), '0',
+        '--psm', '0',
        fspath(input_file),
        'stdout'
    ]
@ -160,17 +155,6 @@ def get_orientation(input_file, engine_mode, timeout: float, log):
                osd[parts[0].strip()] = parts[1].strip()

        angle = int(osd.get('Orientation in degrees', 0))
-        if 'Orientation' in osd:
-            # Tesseract < 3.04.01
-            # reports "Orientation in degrees" as a counterclockwise angle
-            # We keep it clockwise
-            assert 'Rotate' not in osd
-            angle = -angle % 360
-        else:
-            # Tesseract >= 3.04.01
-            # reports "Orientation in degrees" as a clockwise angle
-            assert 'Rotate' in osd
-
        oc = OrientationConfidence(
            angle=angle,
            confidence=float(osd.get('Orientation confidence', 0)))
@ -247,7 +231,7 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
-        args_tesseract.extend([psm(), str(pagesegmode)])
+        args_tesseract.extend(['--psm', str(pagesegmode)])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])
@ -257,9 +241,6 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,

    # Reminder: test suite tesseract spoofers will break after any changes
    # to the number of order parameters here
-    # Tesseract 3.04 requires the order here to be "hocr txt" and will fail
-    # on "txt hocr"
-
    args_tesseract.extend([
        input_file,
        prefix,
@ -329,7 +310,7 @@ def generate_pdf(*, input_image, skip_pdf=None, output_pdf, output_text,
    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
-        args_tesseract.extend([psm(), str(pagesegmode)])
+        args_tesseract.extend(['--psm', str(pagesegmode)])

    if text_only and has_textonly_pdf():
        args_tesseract.extend(['-c', 'textonly_pdf=1'])
--- a/tests/spoof/tesseract_badutf8.py
+++ b/tests/spoof/tesseract_badutf8.py
@ -34,9 +34,12 @@ language pack version mismatches
 """


-VERSION_STRING = '''tesseract 3.05.01
- leptonica-1.72
-  libjpeg 8d : libpng 1.6.19 : libtiff 4.0.6 : zlib 1.2.5
+VERSION_STRING = '''tesseract 4.0.0
+ leptonica-1.77.0
+  libjpeg 9c : libpng 1.6.35 : libtiff 4.0.10 : zlib 1.2.11 : libopenjp2 2.3.0
+ Found AVX2
+ Found AVX
+ Found SSE
 SPOOFED
 '''

--- a/tests/spoof/tesseract_big_image_error.py
+++ b/tests/spoof/tesseract_big_image_error.py
@ -24,9 +24,12 @@
 import sys


-VERSION_STRING = '''tesseract 3.05.01
- leptonica-1.74.4
-  libjpeg 9b : libpng 1.6.32 : libtiff 4.0.8 : zlib 1.2.8
+VERSION_STRING = '''tesseract 4.0.0
+ leptonica-1.77.0
+  libjpeg 9c : libpng 1.6.35 : libtiff 4.0.10 : zlib 1.2.11 : libopenjp2 2.3.0
+ Found AVX2
+ Found AVX
+ Found SSE
 SPOOFED: return error claiming image too big
 '''

--- a/tests/spoof/tesseract_cache.py
+++ b/tests/spoof/tesseract_cache.py
@ -45,12 +45,7 @@ machines with AVX2, the cache is now bundled.

 Certain operations are not cached and routed to tesseract directly.

-Assumes Tesseract 3.04 or higher.
-
-Will fail on Tesseract 3.02.02 in "hocr" mode because it doesn't produce
-the incorrect file extension. Will fail on 3.03 because that has no sidecar
-text support. Will fail to replicate a 3.04 bug if wrong parameter order is
-given.
+Assumes Tesseract 4.0.0-alpha or higher.

 """

--- a/tests/spoof/tesseract_crash.py
+++ b/tests/spoof/tesseract_crash.py
@ -25,10 +25,13 @@ import os
 import signal


-VERSION_STRING = '''tesseract 3.05.01
- leptonica-1.74.4
-  libjpeg 9b : libpng 1.6.32 : libtiff 4.0.8 : zlib 1.2.8
-SPOOFED: CRASH ON OCR or -psm 0
+VERSION_STRING = '''tesseract 4.0.0
+ leptonica-1.77.0
+  libjpeg 9c : libpng 1.6.35 : libtiff 4.0.10 : zlib 1.2.11 : libopenjp2 2.3.0
+ Found AVX2
+ Found AVX
+ Found SSE
+SPOOFED: CRASH ON OCR or --psm 0
 '''

 """Simulates a Tesseract crash when asked to run OCR
--- a/tests/spoof/tesseract_noop.py
+++ b/tests/spoof/tesseract_noop.py
@ -37,9 +37,12 @@ import PyPDF2 as pypdf
 from PIL import Image


-VERSION_STRING = '''tesseract 3.05.01
- leptonica-1.72
-  libjpeg 8d : libpng 1.6.19 : libtiff 4.0.6 : zlib 1.2.5
+VERSION_STRING = '''tesseract 4.0.0
+ leptonica-1.77.0
+  libjpeg 9c : libpng 1.6.35 : libtiff 4.0.10 : zlib 1.2.11 : libopenjp2 2.3.0
+ Found AVX2
+ Found AVX
+ Found SSE
 SPOOFED
 '''

@ -50,7 +53,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  <meta name='ocr-system' content='tesseract 3.02.02' />
+  <meta name='ocr-system' content='tesseract 4.0.0' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
--- a/tests/test_tess3.py
+++ b/tests/test_tess3.py
@ -1,54 +0,0 @@
-# © 2017 James R. Barlow: github.com/jbarlow83
-#
-# This file is part of OCRmyPDF.
-#
-# OCRmyPDF is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# OCRmyPDF is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
-
-import pytest
-from ocrmypdf.exceptions import ExitCode
-from ocrmypdf.exec import tesseract
-
-
-# Skip all tests in this file if not tesseract 3
-pytestmark = pytest.mark.skipif(tesseract.v4(),
-                                reason="tesseract 3.x required")
-
-
-@pytest.mark.skipif(tesseract.has_textonly_pdf(),
-                    reason="check that missing dep is reported on old tess3")
-def test_textonly_pdf_on_older_tess3(resources, no_outpdf):
-    p, _, _ = pytest.helpers.run_ocrmypdf(
-        resources / 'linn.pdf',
-        no_outpdf, '--pdf-renderer', 'sandwich')
-
-    assert p.returncode == ExitCode.missing_dependency
-
-
-@pytest.mark.skipif(not tesseract.has_textonly_pdf(),
-                    reason="check that feature is exercised on new test3")
-def test_textonly_pdf_on_newer_tess3(resources, no_outpdf):
-    p, _, _ = pytest.helpers.run_ocrmypdf(
-        resources / 'linn.pdf',
-        no_outpdf, '--pdf-renderer', 'sandwich')
-
-    assert p.returncode == ExitCode.ok
-
-
-def test_oem_on_tess3(resources, no_outpdf):
-    p, _, err = pytest.helpers.run_ocrmypdf(
-        resources / 'aspect.pdf',
-        no_outpdf, '--tesseract-oem', '1')
-
-    assert p.returncode == ExitCode.ok
-    assert 'argument ignored' in err