mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-01-05 19:51:07 +00:00
Refactor tesseract_env variable into the plugin
Removed all cases except one in api.py, which isn't worth solving because it should be removed anyway. This also fixes a logic error in the OMP_THREAD_LIMIT decision: api.py did not pass kwargs correctly, so they never worked before.
This commit is contained in:
parent
d43212d30b
commit
aa060db5bc
@ -15,6 +15,7 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
import importlib
|
||||
import importlib.util
|
||||
import sys
|
||||
@ -50,7 +51,9 @@ def get_plugin_manager(plugins: List[str], builtins=True):
|
||||
return pm
|
||||
|
||||
|
||||
def get_parser_options_plugins(args):
|
||||
def get_parser_options_plugins(
|
||||
args,
|
||||
) -> (argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager):
|
||||
pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
|
||||
plugin_manager = get_plugin_manager(pre_options.plugins)
|
||||
|
||||
|
||||
@ -216,25 +216,6 @@ def exec_concurrent(context):
|
||||
if max_workers > 1:
|
||||
log.info("Start processing %d pages concurrently", max_workers)
|
||||
|
||||
# Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
|
||||
# to manage how many threads it uses to avoid creating more total threads than cores.
|
||||
# Performance testing shows we're better off
|
||||
# parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
|
||||
# get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
|
||||
# input file is small, then we allow Tesseract to use threads, subject to the
|
||||
# constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
|
||||
# As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
|
||||
tess_threads = min(3, context.options.jobs // max_workers)
|
||||
if context.options.tesseract_env is None:
|
||||
context.options.tesseract_env = os.environ.copy()
|
||||
context.options.tesseract_env.setdefault('OMP_THREAD_LIMIT', str(tess_threads))
|
||||
try:
|
||||
tess_threads = int(context.options.tesseract_env['OMP_THREAD_LIMIT'])
|
||||
except ValueError: # OMP_THREAD_LIMIT initialized to non-numeric
|
||||
context.log.error("Environment variable OMP_THREAD_LIMIT is not numeric")
|
||||
if tess_threads > 1:
|
||||
log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
|
||||
|
||||
sidecars = [None] * len(context.pdfinfo)
|
||||
ocrgraft = OcrGrafter(context)
|
||||
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@ -178,11 +179,6 @@ def create_options(
|
||||
options = parser.parse_args(cmdline)
|
||||
for keyword, val in deferred:
|
||||
setattr(options, keyword, val)
|
||||
|
||||
# If we are running a Tesseract spoof, ensure it knows what the input file is
|
||||
if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
|
||||
options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
|
||||
|
||||
return options
|
||||
|
||||
|
||||
@ -233,7 +229,6 @@ def ocr( # pylint: disable=unused-argument
|
||||
plugins: Iterable[str] = None,
|
||||
keep_temporary_files: bool = None,
|
||||
progress_bar: bool = None,
|
||||
tesseract_env: Dict[str, str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Run OCRmyPDF on one PDF or image.
|
||||
@ -245,7 +240,6 @@ def ocr( # pylint: disable=unused-argument
|
||||
use_threads (bool): Use worker threads instead of processes. This reduces
|
||||
performance but may make debugging easier since it is easier to set
|
||||
breakpoints.
|
||||
tesseract_env (dict): Override environment variables for Tesseract
|
||||
Raises:
|
||||
ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
|
||||
with the OCR layer.
|
||||
@ -274,10 +268,13 @@ def ocr( # pylint: disable=unused-argument
|
||||
|
||||
parser = get_parser()
|
||||
_plugin_manager = get_plugin_manager(plugins)
|
||||
_plugin_manager.hook.add_options(parser=parser)
|
||||
_plugin_manager.hook.add_options(parser=parser) # pylint: disable=no-member
|
||||
|
||||
options = create_options(
|
||||
**{k: v for k, v in locals().items() if not k.startswith('_')}
|
||||
)
|
||||
create_options_kwargs = {
|
||||
k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
|
||||
}
|
||||
create_options_kwargs.update(kwargs)
|
||||
|
||||
options = create_options(**create_options_kwargs)
|
||||
check_options(options, _plugin_manager)
|
||||
return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
|
||||
|
||||
@ -15,7 +15,9 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
from ocrmypdf.cli import numeric
|
||||
@ -79,6 +81,7 @@ def add_options(parser):
|
||||
metavar='FILE',
|
||||
help="Specify the location of the Tesseract user patterns file.",
|
||||
)
|
||||
tess.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
|
||||
|
||||
|
||||
@hookimpl
|
||||
@ -115,6 +118,28 @@ def check_options(options):
|
||||
)
|
||||
|
||||
|
||||
@hookimpl
def validate(pdfinfo, options):
    """Prepare the environment that will be handed to Tesseract.

    Args:
        pdfinfo: Sized object describing the input file; only ``len(pdfinfo)``
            is used here (presumably one entry per page -- confirm with caller).
        options: Parsed options namespace. ``tesseract_env``, ``jobs`` and
            ``input_file`` are read; ``tesseract_env`` is updated in place and
            is created from ``os.environ`` when it is ``None``.
    """
    # If we are running a Tesseract spoof, ensure it knows what the input file is
    if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(options.input_file)

    # No environment override was supplied: start from a copy of our own
    # environment so the lookups below never dereference None.
    if options.tesseract_env is None:
        options.tesseract_env = os.environ.copy()

    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
    # to manage how many threads it uses to avoid creating more total threads than
    # cores. Performance testing shows we're better off parallelizing ocrmypdf and
    # forcing Tesseract to be single threaded, which we get by setting the envvar
    # OMP_THREAD_LIMIT to 1. But if the page count of the input file is small, then
    # we allow Tesseract to use threads, subject to the constraint:
    # (ocrmypdf workers) * (tesseract threads) <= max_workers.
    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread
    # system.
    omp_limit = options.tesseract_env.get('OMP_THREAD_LIMIT', '')
    if not omp_limit.isnumeric():
        # Guard against an empty pdfinfo (division by zero) and never go below
        # one thread: jobs // pages is 0 whenever there are more pages than jobs.
        page_count = max(1, len(pdfinfo))
        tess_threads = max(1, min(3, options.jobs // page_count, page_count))
        options.tesseract_env['OMP_THREAD_LIMIT'] = str(tess_threads)
    else:
        # Respect the caller's value. isnumeric() also accepts characters that
        # int() rejects (e.g. superscript digits); in that case just skip the
        # log message below rather than fail.
        try:
            tess_threads = int(omp_limit)
        except ValueError:
            tess_threads = 1

    if tess_threads > 1:
        log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
|
||||
|
||||
|
||||
class TesseractOcrEngine(OcrEngine):
|
||||
@staticmethod
|
||||
def version():
|
||||
|
||||
@ -474,7 +474,6 @@ Online documentation is located at:
|
||||
action='store_true',
|
||||
help="Keep temporary files (helpful for debugging)",
|
||||
)
|
||||
debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
|
||||
return parser
|
||||
|
||||
|
||||
|
||||
@ -40,7 +40,7 @@ def add_options(parser: ArgumentParser) -> None:
|
||||
|
||||
@hookspec
|
||||
def check_options(options: Namespace) -> None:
|
||||
"""Called to notify a plugin that a file will be processed.
|
||||
"""Called to ask the plugin to check all of its options.
|
||||
|
||||
The plugin may modify the *options*. All objects that are in options must
|
||||
be picklable so they can be marshalled to child worker processes.
|
||||
|
||||
@ -20,7 +20,7 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from ocrmypdf._plugin_manager import get_plugin_manager
|
||||
from ocrmypdf._plugin_manager import get_parser_options_plugins
|
||||
from ocrmypdf._validation import check_options
|
||||
from ocrmypdf.cli import get_parser
|
||||
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
|
||||
@ -43,13 +43,13 @@ def spoof_unpaper_oldversion(tmp_path_factory):
|
||||
def test_no_unpaper(resources, no_outpdf):
|
||||
input_ = fspath(resources / "c02-22.pdf")
|
||||
output = fspath(no_outpdf)
|
||||
options = get_parser().parse_args(args=["--clean", input_, output])
|
||||
plugin_manager = get_plugin_manager(options.plugins)
|
||||
|
||||
_parser, options, pm = get_parser_options_plugins(["--clean", input_, output])
|
||||
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
|
||||
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
|
||||
|
||||
with pytest.raises(MissingDependencyError):
|
||||
check_options(options, plugin_manager)
|
||||
check_options(options, pm)
|
||||
|
||||
|
||||
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user