mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-29 16:10:06 +00:00
Merge 'feature/unpaper-args'
This commit is contained in:
commit
3bcc6d6121
@ -278,6 +278,13 @@ preprocessing.add_argument(
|
||||
help="Clean page as above, and incorporate the cleaned image in the final "
|
||||
"PDF. Might remove desired content.",
|
||||
)
|
||||
# --unpaper-args: the whole value is received as one raw string (type=str)
# and defaults to None when the option is not given.
preprocessing.add_argument(
    '--unpaper-args',
    default=None,
    type=str,
    help=(
        "A quoted string of arguments to pass to unpaper. Requires --clean. "
        "Example: --unpaper-args '--layout double'."
    ),
)
|
||||
preprocessing.add_argument(
|
||||
'--oversample',
|
||||
metavar='DPI',
|
||||
@ -623,12 +630,23 @@ def _optional_program_recommended(name, version_fn, min_version, for_argument):
|
||||
|
||||
|
||||
def check_options_preprocessing(options, log):
    """Validate the page-cleaning options and normalize ``--unpaper-args``.

    Args:
        options: Parsed command-line namespace. Reads ``clean``,
            ``clean_final`` and ``unpaper_args``. On success
            ``unpaper_args`` is replaced by whatever
            ``unpaper.validate_custom_args`` returns for it.
        log: Logger object; not used by this function.

    Raises:
        argparse.ArgumentError: If ``--unpaper-args`` is given without
            ``--clean``, or if validating the custom unpaper arguments
            fails (the original exception is attached as ``__cause__``).
    """
    if options.unpaper_args and not options.clean:
        raise argparse.ArgumentError(
            None, "--clean is required for --unpaper-args"
        )

    if not (options.clean or options.clean_final):
        # No cleaning requested: unpaper is not needed at all.
        return

    # Imported here so the unpaper wrapper is only loaded when cleaning
    # was actually requested.
    from .exec import unpaper

    _optional_program_required(
        'unpaper', unpaper.version, '6.1', '--clean, --clean-final'
    )

    if not options.unpaper_args:
        return

    try:
        options.unpaper_args = unpaper.validate_custom_args(
            options.unpaper_args
        )
    except Exception as e:
        # Report validation problems as argument errors, but keep the
        # original exception chained so the root cause is not lost.
        raise argparse.ArgumentError(None, str(e)) from e
|
||||
|
||||
|
||||
def check_options_ocr_behavior(options, log):
|
||||
|
||||
@ -566,7 +566,7 @@ def preprocess_clean(input_file, output_file, log, context):
|
||||
pageinfo = get_pageinfo(input_file, context)
|
||||
dpi = get_page_square_dpi(pageinfo, options)
|
||||
|
||||
unpaper.clean(input_file, output_file, dpi, log)
|
||||
unpaper.clean(input_file, output_file, dpi, log, options.unpaper_args)
|
||||
|
||||
|
||||
def select_ocr_image(infiles, output_file, log, context):
|
||||
|
||||
@ -19,13 +19,15 @@
|
||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||
|
||||
import os
|
||||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
from subprocess import STDOUT, CalledProcessError, check_output
|
||||
from tempfile import NamedTemporaryFile
|
||||
from subprocess import PIPE, STDOUT, CalledProcessError
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from . import get_version
|
||||
from ..exceptions import MissingDependencyError
|
||||
from ..exceptions import MissingDependencyError, SubprocessOutputError
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
@ -64,43 +66,64 @@ def run(input_file, output_file, dpi, log, mode_args):
|
||||
im.close()
|
||||
raise MissingDependencyError() from e
|
||||
|
||||
with NamedTemporaryFile(suffix=suffix) as input_pnm, NamedTemporaryFile(
|
||||
suffix=suffix, mode="r+b"
|
||||
) as output_pnm:
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
input_pnm = os.path.join(tmpdir, f'input{suffix}')
|
||||
output_pnm = os.path.join(tmpdir, f'output{suffix}')
|
||||
im.save(input_pnm, format='PPM')
|
||||
im.close()
|
||||
|
||||
os.unlink(output_pnm.name)
|
||||
|
||||
args_unpaper.extend([input_pnm.name, output_pnm.name])
|
||||
# To prevent any shenanigans from accepting arbitrary parameters in
|
||||
# --unpaper-args, we:
|
||||
# 1) run with cwd set to a tmpdir with only unpaper's files
|
||||
# 2) forbid the use of '/' in arguments, to prevent changing paths
|
||||
# 3) append absolute paths for the input and output file
|
||||
# This should ensure that a user cannot clobber some other file with
|
||||
# their unpaper arguments (whether intentionally or otherwise)
|
||||
args_unpaper.extend([input_pnm, output_pnm])
|
||||
try:
|
||||
stdout = check_output(
|
||||
args_unpaper, close_fds=True, universal_newlines=True, stderr=STDOUT
|
||||
proc = subprocess.run(
|
||||
args_unpaper,
|
||||
check=True,
|
||||
close_fds=True,
|
||||
universal_newlines=True,
|
||||
stderr=STDOUT,
|
||||
cwd=tmpdir,
|
||||
stdout=PIPE,
|
||||
)
|
||||
except CalledProcessError as e:
|
||||
log.debug(e.output)
|
||||
raise e from e
|
||||
else:
|
||||
log.debug(stdout)
|
||||
# unpaper sets dpi to 72
|
||||
Image.open(output_pnm.name).save(output_file, dpi=(dpi, dpi))
|
||||
log.debug(proc.stdout)
|
||||
# unpaper sets dpi to 72; fix this
|
||||
try:
|
||||
Image.open(output_pnm).save(output_file, dpi=(dpi, dpi))
|
||||
except (FileNotFoundError, OSError):
|
||||
raise SubprocessOutputError(
|
||||
"unpaper: failed to produce the expected output file. Called with: "
|
||||
+ str(args_unpaper)
|
||||
) from None
|
||||
|
||||
|
||||
def clean(input_file, output_file, dpi, log):
|
||||
run(
|
||||
input_file,
|
||||
output_file,
|
||||
dpi,
|
||||
log,
|
||||
[
|
||||
'--layout',
|
||||
'none',
|
||||
'--mask-scan-size',
|
||||
'100', # don't blank out narrow columns
|
||||
'--no-border-align', # don't align visible content to borders
|
||||
'--no-mask-center', # don't center visible content within page
|
||||
'--no-grayfilter', # don't remove light gray areas
|
||||
'--no-blackfilter', # don't remove solid black areas
|
||||
'--no-deskew', # don't deskew
|
||||
],
|
||||
)
|
||||
def validate_custom_args(args: str):
    """Split a user-supplied unpaper argument string and reject paths.

    The string is tokenized with ``shlex.split``. Any token containing a
    ``/`` is refused, so the caller cannot pass filenames through to
    unpaper.

    Args:
        args: The raw value given to ``--unpaper-args``.

    Returns:
        The list of individual argument tokens.

    Raises:
        ValueError: If any token contains ``/`` (or if ``shlex.split``
            itself rejects the string).
    """
    tokens = shlex.split(args)
    for token in tokens:
        if '/' in token:
            raise ValueError('No filenames allowed in --unpaper-args')
    return tokens
|
||||
|
||||
|
||||
def clean(input_file, output_file, dpi, log, unpaper_args=None):
    """Invoke ``run`` with custom unpaper arguments or the built-in defaults.

    Args:
        input_file: Forwarded unchanged to ``run``.
        output_file: Forwarded unchanged to ``run``.
        dpi: Forwarded unchanged to ``run``.
        log: Forwarded unchanged to ``run``.
        unpaper_args: Optional list of unpaper arguments. When it is
            ``None`` or empty, the default argument list below is used
            instead of (not in addition to) the caller's arguments.
    """
    if unpaper_args:
        mode_args = unpaper_args
    else:
        mode_args = [
            '--layout',
            'none',
            '--mask-scan-size',
            '100',  # don't blank out narrow columns
            '--no-border-align',  # don't align visible content to borders
            '--no-mask-center',  # don't center visible content within page
            '--no-grayfilter',  # don't remove light gray areas
            '--no-blackfilter',  # don't remove solid black areas
            '--no-deskew',  # don't deskew
        ]
    run(input_file, output_file, dpi, log, mode_args)
|
||||
|
||||
@ -147,7 +147,7 @@ def no_outpdf(tmpdir):
|
||||
|
||||
@pytest.helpers.register
|
||||
def check_ocrmypdf(input_file, output_file, *args, env=None):
|
||||
"Run ocrmypdf and confirmed that a valid file was created"
|
||||
"""Run ocrmypdf and confirmed that a valid file was created"""
|
||||
|
||||
p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
|
||||
# ensure py.test collects the output, use -s to view
|
||||
|
||||
@ -15,11 +15,16 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
from os import fspath
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from ocrmypdf import __main__ as main
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
from ocrmypdf.exec import unpaper
|
||||
|
||||
# pytest.helpers is dynamic
|
||||
# pylint: disable=no-member
|
||||
@ -30,26 +35,79 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
|
||||
spoof = pytest.helpers.spoof
|
||||
|
||||
|
||||
def have_unpaper():
    """Report whether ``unpaper.version()`` can be called successfully.

    This is deliberately a plain function rather than a pytest fixture:
    it is called directly in ``pytest.mark.skipif(not have_unpaper(), ...)``
    conditions, and pytest does not allow fixture functions to be called
    directly.

    Returns:
        bool: True if ``unpaper.version()`` returns without raising,
        False if it raises any exception.
    """
    try:
        unpaper.version()
    except Exception:
        # Best-effort probe: any failure simply means the tests that
        # need unpaper get skipped.
        return False
    return True
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def spoof_unpaper_oldversion(tmpdir_factory):
|
||||
return spoof(tmpdir_factory, unpaper='unpaper_oldversion.py')
|
||||
return spoof(tmpdir_factory, unpaper="unpaper_oldversion.py")
|
||||
|
||||
|
||||
@pytest.mark.skipif(True, reason="needs new fixture implementation")
|
||||
def test_no_unpaper(resources, no_outpdf):
|
||||
# <disable unpaper here>
|
||||
p, out, err = run_ocrmypdf(
|
||||
resources / 'c02-22.pdf', no_outpdf, '--clean', env=os.environ
|
||||
)
|
||||
assert p.returncode == ExitCode.missing_dependency
|
||||
input_ = fspath(resources / "c02-22.pdf")
|
||||
output = fspath(no_outpdf)
|
||||
options = main.parser.parse_args(args=["--clean", input_, output])
|
||||
|
||||
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
|
||||
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
|
||||
with pytest.raises(SystemExit):
|
||||
main.check_options(options, log=MagicMock())
|
||||
|
||||
|
||||
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
|
||||
p, out, err = run_ocrmypdf(
|
||||
resources / 'c02-22.pdf', no_outpdf, '--clean', env=spoof_unpaper_oldversion
|
||||
resources / "c02-22.pdf", no_outpdf, "--clean", env=spoof_unpaper_oldversion
|
||||
)
|
||||
assert p.returncode == ExitCode.missing_dependency
|
||||
|
||||
|
||||
@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
|
||||
def test_clean(spoof_tesseract_noop, resources, outpdf):
|
||||
check_ocrmypdf(resources / 'skew.pdf', outpdf, '-c', env=spoof_tesseract_noop)
|
||||
check_ocrmypdf(resources / "skew.pdf", outpdf, "-c", env=spoof_tesseract_noop)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_unpaper_args_valid(spoof_tesseract_noop, resources, outpdf):
    """Run ocrmypdf with -c and a two-word --unpaper-args value."""
    # Option name and value travel as one argv entry, so the space between
    # them is required here.
    custom_unpaper_args = "--layout double"
    source_pdf = resources / "skew.pdf"
    check_ocrmypdf(
        source_pdf,
        outpdf,
        "-c",
        "--unpaper-args",
        custom_unpaper_args,
        env=spoof_tesseract_noop,
    )
|
||||
|
||||
|
||||
@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_unpaper_args_invalid_filename(spoof_tesseract_noop, resources, outpdf):
    """A path passed via --unpaper-args must be reported as a bad argument."""
    cli_flags = ("-c", "--unpaper-args", "/etc/passwd")
    p, out, err = run_ocrmypdf(
        resources / "skew.pdf", outpdf, *cli_flags, env=spoof_tesseract_noop
    )
    assert "No filenames allowed" in err
    assert p.returncode == ExitCode.bad_args
|
||||
|
||||
|
||||
@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_unpaper_args_invalid(spoof_tesseract_noop, resources, outpdf):
    """Nonsense --unpaper-args must end in a child-process error exit code."""
    nonsense_args = "unpaper is not going to like these arguments"
    p, out, err = run_ocrmypdf(
        resources / "skew.pdf",
        outpdf,
        "-c",
        "--unpaper-args",
        nonsense_args,
        env=spoof_tesseract_noop,
    )
    # unpaper rejecting its arguments cannot be told apart from any other
    # unpaper failure, so only the generic child-process exit code is checked.
    assert p.returncode == ExitCode.child_process_error
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user