Refactor tesseract_env variable into the plugin

Removed all cases except one in api.py, which isn't worth solving because it should be removed anyway. This also fixes a logic error in the OMP_THREAD_LIMIT decision. api.py did not pass kwargs correctly, so they never worked before.
2026-01-05 19:51:07 +00:00 · 2020-05-26 02:13:17 -07:00 · 2020-05-26 02:13:17 -07:00 · aa060db5bc
commit aa060db5bc
parent d43212d30b
7 changed files with 42 additions and 37 deletions
--- a/src/ocrmypdf/_plugin_manager.py
+++ b/src/ocrmypdf/_plugin_manager.py
@ -15,6 +15,7 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

+import argparse
 import importlib
 import importlib.util
 import sys
@ -50,7 +51,9 @@ def get_plugin_manager(plugins: List[str], builtins=True):
    return pm


-def get_parser_options_plugins(args):
+def get_parser_options_plugins(
+    args,
+) -> (argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager):
    pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
    plugin_manager = get_plugin_manager(pre_options.plugins)

--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@ -216,25 +216,6 @@ def exec_concurrent(context):
    if max_workers > 1:
        log.info("Start processing %d pages concurrently", max_workers)

-    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
-    # to manage how many threads it uses to avoid creating total threads than cores.
-    # Performance testing shows we're better off
-    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
-    # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
-    # input file is small, then we allow Tesseract to use threads, subject to the
-    # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
-    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
-    tess_threads = min(3, context.options.jobs // max_workers)
-    if context.options.tesseract_env is None:
-        context.options.tesseract_env = os.environ.copy()
-    context.options.tesseract_env.setdefault('OMP_THREAD_LIMIT', str(tess_threads))
-    try:
-        tess_threads = int(context.options.tesseract_env['OMP_THREAD_LIMIT'])
-    except ValueError:  # OMP_THREAD_LIMIT initialized to non-numeric
-        context.log.error("Environment variable OMP_THREAD_LIMIT is not numeric")
-    if tess_threads > 1:
-        log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
-
    sidecars = [None] * len(context.pdfinfo)
    ocrgraft = OcrGrafter(context)

--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@ -15,6 +15,7 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

+import inspect
 import logging
 import os
 import sys
@ -178,11 +179,6 @@ def create_options(
    options = parser.parse_args(cmdline)
    for keyword, val in deferred:
        setattr(options, keyword, val)
-
-    # If we are running a Tesseract spoof, ensure it knows what the input file is
-    if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
-        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
-
    return options


@ -233,7 +229,6 @@ def ocr(  # pylint: disable=unused-argument
    plugins: Iterable[str] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
-    tesseract_env: Dict[str, str] = None,
    **kwargs,
 ):
    """Run OCRmyPDF on one PDF or image.
@ -245,7 +240,6 @@ def ocr(  # pylint: disable=unused-argument
        use_threads (bool): Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
-        tesseract_env (dict): Override environment variables for Tesseract
    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
@ -274,10 +268,13 @@ def ocr(  # pylint: disable=unused-argument

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
-    _plugin_manager.hook.add_options(parser=parser)
+    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

-    options = create_options(
-        **{k: v for k, v in locals().items() if not k.startswith('_')}
-    )
+    create_options_kwargs = {
+        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
+    }
+    create_options_kwargs.update(kwargs)
+
+    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
--- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
+++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
@ -15,7 +15,9 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

+import argparse
 import logging
+import os

 from ocrmypdf import hookimpl
 from ocrmypdf.cli import numeric
@ -79,6 +81,7 @@ def add_options(parser):
        metavar='FILE',
        help="Specify the location of the Tesseract user patterns file.",
    )
+    tess.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)


@hookimpl
@ -115,6 +118,28 @@ def check_options(options):
        )


+@hookimpl
+def validate(pdfinfo, options):  # Hook: prepares options.tesseract_env (test-spoof input file, OMP_THREAD_LIMIT)
+    # If we are running a Tesseract spoof, ensure it knows what the input file is
+    if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
+        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(options.input_file)
+
+    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
+    # to manage how many threads it uses to avoid creating more total threads than cores.
+    # Performance testing shows we're better off
+    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
+    # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
+    # input file is small, then we allow Tesseract to use threads, subject to the
+    # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
+    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
+    if not options.tesseract_env.get('OMP_THREAD_LIMIT', '').isnumeric():  # NOTE(review): .get() raises AttributeError if tesseract_env is None; the check above only guards a falsy value - confirm it is always a dict here
+        tess_threads = min(3, options.jobs // len(pdfinfo), len(pdfinfo))  # NOTE(review): jobs // len(pdfinfo) is 0 when pages outnumber jobs, and divides by zero for an empty pdfinfo - confirm
+        options.tesseract_env['OMP_THREAD_LIMIT'] = str(tess_threads)
+
+    if tess_threads > 1:  # NOTE(review): tess_threads is bound only inside the branch above; UnboundLocalError when OMP_THREAD_LIMIT is already numeric
+        log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
+
+
 class TesseractOcrEngine(OcrEngine):
    @staticmethod
    def version():
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@ -474,7 +474,6 @@ Online documentation is located at:
        action='store_true',
        help="Keep temporary files (helpful for debugging)",
    )
-    debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
    return parser


--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@ -40,7 +40,7 @@ def add_options(parser: ArgumentParser) -> None:

@hookspec
 def check_options(options: Namespace) -> None:
-    """Called to notify a plugin that a file will be processed.
+    """Called to ask the plugin to check all of its options.

 	The plugin may modify the *options*. All objects that are in options must
 	be picklable so they can be marshalled to child worker processes.
--- a/tests/test_unpaper.py
+++ b/tests/test_unpaper.py
@ -20,7 +20,7 @@ from unittest.mock import patch

 import pytest

-from ocrmypdf._plugin_manager import get_plugin_manager
+from ocrmypdf._plugin_manager import get_parser_options_plugins
 from ocrmypdf._validation import check_options
 from ocrmypdf.cli import get_parser
 from ocrmypdf.exceptions import ExitCode, MissingDependencyError
@ -43,13 +43,13 @@ def spoof_unpaper_oldversion(tmp_path_factory):
 def test_no_unpaper(resources, no_outpdf):
    input_ = fspath(resources / "c02-22.pdf")
    output = fspath(no_outpdf)
-    options = get_parser().parse_args(args=["--clean", input_, output])
-    plugin_manager = get_plugin_manager(options.plugins)
+
+    _parser, options, pm = get_parser_options_plugins(["--clean", input_, output])
    with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
        mock_unpaper_version.side_effect = FileNotFoundError("unpaper")

        with pytest.raises(MissingDependencyError):
-            check_options(options, plugin_manager)
+            check_options(options, pm)


 def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):