Refactor tesseract_env variable into the plugin

Removed all cases except one in api.py, which isn't worth solving because
it should be removed anyway.

This also fixes a logic error in the OMP_THREAD_LIMIT decision: api.py
did not pass kwargs correctly, so they never worked before.
This commit is contained in:
James R. Barlow 2020-05-26 02:13:17 -07:00
parent d43212d30b
commit aa060db5bc
7 changed files with 42 additions and 37 deletions

View File

@ -15,6 +15,7 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import argparse
import importlib
import importlib.util
import sys
@ -50,7 +51,9 @@ def get_plugin_manager(plugins: List[str], builtins=True):
return pm
def get_parser_options_plugins(args):
def get_parser_options_plugins(
args,
) -> (argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager):
pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
plugin_manager = get_plugin_manager(pre_options.plugins)

View File

@ -216,25 +216,6 @@ def exec_concurrent(context):
if max_workers > 1:
log.info("Start processing %d pages concurrently", max_workers)
# Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
# to manage how many threads it uses to avoid creating total threads than cores.
# Performance testing shows we're better off
# parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
# get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
# input file is small, then we allow Tesseract to use threads, subject to the
# constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
# As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
tess_threads = min(3, context.options.jobs // max_workers)
if context.options.tesseract_env is None:
context.options.tesseract_env = os.environ.copy()
context.options.tesseract_env.setdefault('OMP_THREAD_LIMIT', str(tess_threads))
try:
tess_threads = int(context.options.tesseract_env['OMP_THREAD_LIMIT'])
except ValueError: # OMP_THREAD_LIMIT initialized to non-numeric
context.log.error("Environment variable OMP_THREAD_LIMIT is not numeric")
if tess_threads > 1:
log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
sidecars = [None] * len(context.pdfinfo)
ocrgraft = OcrGrafter(context)

View File

@ -15,6 +15,7 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import inspect
import logging
import os
import sys
@ -178,11 +179,6 @@ def create_options(
options = parser.parse_args(cmdline)
for keyword, val in deferred:
setattr(options, keyword, val)
# If we are running a Tesseract spoof, ensure it knows what the input file is
if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
return options
@ -233,7 +229,6 @@ def ocr( # pylint: disable=unused-argument
plugins: Iterable[str] = None,
keep_temporary_files: bool = None,
progress_bar: bool = None,
tesseract_env: Dict[str, str] = None,
**kwargs,
):
"""Run OCRmyPDF on one PDF or image.
@ -245,7 +240,6 @@ def ocr( # pylint: disable=unused-argument
use_threads (bool): Use worker threads instead of processes. This reduces
performance but may make debugging easier since it is easier to set
breakpoints.
tesseract_env (dict): Override environment variables for Tesseract
Raises:
ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
with the OCR layer.
@ -274,10 +268,13 @@ def ocr( # pylint: disable=unused-argument
parser = get_parser()
_plugin_manager = get_plugin_manager(plugins)
_plugin_manager.hook.add_options(parser=parser)
_plugin_manager.hook.add_options(parser=parser) # pylint: disable=no-member
options = create_options(
**{k: v for k, v in locals().items() if not k.startswith('_')}
)
create_options_kwargs = {
k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
}
create_options_kwargs.update(kwargs)
options = create_options(**create_options_kwargs)
check_options(options, _plugin_manager)
return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)

View File

@ -15,7 +15,9 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import argparse
import logging
import os
from ocrmypdf import hookimpl
from ocrmypdf.cli import numeric
@ -79,6 +81,7 @@ def add_options(parser):
metavar='FILE',
help="Specify the location of the Tesseract user patterns file.",
)
tess.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
@hookimpl
@ -115,6 +118,28 @@ def check_options(options):
)
@hookimpl
def validate(pdfinfo, options):
    """Finalize the environment Tesseract will run with for this input.

    Mutates ``options.tesseract_env`` in place: optionally records the input
    file for the test spoof, and makes sure ``OMP_THREAD_LIMIT`` holds a usable
    value.

    Args:
        pdfinfo: Sized collection describing the input; its length is used as
            the page count (presumably one entry per page -- confirm against
            the caller).
        options: Parsed options namespace. Reads ``tesseract_env``, ``jobs``
            and ``input_file``; writes ``tesseract_env``.
    """
    # If we are running a Tesseract spoof, ensure it knows what the input file is
    if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(options.input_file)

    # No environment override was supplied: start from a copy of our own
    # environment so the lookups below never dereference None.
    if options.tesseract_env is None:
        options.tesseract_env = os.environ.copy()
    tesseract_env = options.tesseract_env

    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
    # to manage how many threads it uses to avoid creating more total threads than
    # cores. Performance testing shows we're better off
    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
    # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
    # input file is small, then we allow Tesseract to use threads, subject to the
    # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
    current_limit = str(tesseract_env.get('OMP_THREAD_LIMIT', ''))
    if current_limit.isdecimal():
        # isdecimal() (unlike isnumeric()) guarantees int() can parse the value.
        tess_threads = int(current_limit)
    else:
        page_count = len(pdfinfo)
        if page_count > 0:
            tess_threads = min(3, options.jobs // page_count, page_count)
        else:
            tess_threads = 1  # nothing to divide the jobs among
        # jobs // page_count is 0 whenever pages outnumber jobs; the floor is
        # one thread, i.e. single-threaded Tesseract.
        tess_threads = max(1, tess_threads)
        tesseract_env['OMP_THREAD_LIMIT'] = str(tess_threads)

    if tess_threads > 1:
        log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
class TesseractOcrEngine(OcrEngine):
@staticmethod
def version():

View File

@ -474,7 +474,6 @@ Online documentation is located at:
action='store_true',
help="Keep temporary files (helpful for debugging)",
)
debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
return parser

View File

@ -40,7 +40,7 @@ def add_options(parser: ArgumentParser) -> None:
@hookspec
def check_options(options: Namespace) -> None:
"""Called to notify a plugin that a file will be processed.
"""Called to ask the plugin to check all of its options.
The plugin may modify the *options*. All objects that are in options must
be picklable so they can be marshalled to child worker processes.

View File

@ -20,7 +20,7 @@ from unittest.mock import patch
import pytest
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf._plugin_manager import get_parser_options_plugins
from ocrmypdf._validation import check_options
from ocrmypdf.cli import get_parser
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
@ -43,13 +43,13 @@ def spoof_unpaper_oldversion(tmp_path_factory):
def test_no_unpaper(resources, no_outpdf):
input_ = fspath(resources / "c02-22.pdf")
output = fspath(no_outpdf)
options = get_parser().parse_args(args=["--clean", input_, output])
plugin_manager = get_plugin_manager(options.plugins)
_parser, options, pm = get_parser_options_plugins(["--clean", input_, output])
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
with pytest.raises(MissingDependencyError):
check_options(options, plugin_manager)
check_options(options, pm)
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):