Merge 'feature/unpaper-args'

This commit is contained in:
James R. Barlow 2019-02-07 17:06:28 -08:00
commit 3bcc6d6121
5 changed files with 143 additions and 44 deletions

View File

@ -278,6 +278,13 @@ preprocessing.add_argument(
help="Clean page as above, and incorporate the cleaned image in the final "
"PDF. Might remove desired content.",
)
# Extra command-line arguments forwarded to unpaper, given as one quoted string.
# check_options_preprocessing later requires --clean for this option and
# replaces the raw string with the list from unpaper.validate_custom_args.
preprocessing.add_argument(
'--unpaper-args',
type=str,
default=None,
help="A quoted string of arguments to pass to unpaper. Requires --clean. "
"Example: --unpaper-args '--layout double'.",
)
preprocessing.add_argument(
'--oversample',
metavar='DPI',
@ -623,12 +630,23 @@ def _optional_program_recommended(name, version_fn, min_version, for_argument):
def check_options_preprocessing(options, log):
    """Validate the preprocessing options that depend on unpaper.

    Args:
        options: Parsed command-line namespace. Reads ``clean``,
            ``clean_final`` and ``unpaper_args``. On success a non-empty
            ``unpaper_args`` string is replaced by the list returned from
            ``unpaper.validate_custom_args``.
        log: Logger supplied by the caller; not used by this check.

    Raises:
        argparse.ArgumentError: If ``--unpaper-args`` is given without
            ``--clean``, or if the custom arguments fail validation.
    """
    if options.unpaper_args and not options.clean:
        raise argparse.ArgumentError(
            None, "--clean is required for --unpaper-args"
        )

    if not any((options.clean, options.clean_final)):
        return

    # Imported lazily so unpaper is only touched when cleaning was requested
    from .exec import unpaper

    _optional_program_required(
        'unpaper', unpaper.version, '6.1', '--clean, --clean-final'
    )

    if not options.unpaper_args:
        return

    try:
        options.unpaper_args = unpaper.validate_custom_args(options.unpaper_args)
    except Exception as e:
        # Report as a command-line error, but keep the original exception
        # chained as the cause so the real failure remains visible
        raise argparse.ArgumentError(None, str(e)) from e
def check_options_ocr_behavior(options, log):

View File

@ -566,7 +566,7 @@ def preprocess_clean(input_file, output_file, log, context):
pageinfo = get_pageinfo(input_file, context)
dpi = get_page_square_dpi(pageinfo, options)
unpaper.clean(input_file, output_file, dpi, log)
unpaper.clean(input_file, output_file, dpi, log, options.unpaper_args)
def select_ocr_image(infiles, output_file, log, context):

View File

@ -19,13 +19,15 @@
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
import os
import shlex
import subprocess
import sys
from functools import lru_cache
from subprocess import STDOUT, CalledProcessError, check_output
from tempfile import NamedTemporaryFile
from subprocess import PIPE, STDOUT, CalledProcessError
from tempfile import TemporaryDirectory
from . import get_version
from ..exceptions import MissingDependencyError
from ..exceptions import MissingDependencyError, SubprocessOutputError
try:
from PIL import Image
@ -64,43 +66,64 @@ def run(input_file, output_file, dpi, log, mode_args):
im.close()
raise MissingDependencyError() from e
with NamedTemporaryFile(suffix=suffix) as input_pnm, NamedTemporaryFile(
suffix=suffix, mode="r+b"
) as output_pnm:
with TemporaryDirectory() as tmpdir:
input_pnm = os.path.join(tmpdir, f'input{suffix}')
output_pnm = os.path.join(tmpdir, f'output{suffix}')
im.save(input_pnm, format='PPM')
im.close()
os.unlink(output_pnm.name)
args_unpaper.extend([input_pnm.name, output_pnm.name])
# To prevent any shenanigans from accepting arbitrary parameters in
# --unpaper-args, we:
# 1) run with cwd set to a tmpdir with only unpaper's files
# 2) forbid the use of '/' in arguments, to prevent changing paths
# 3) append absolute paths for the input and output file
# This should ensure that a user cannot clobber some other file with
# their unpaper arguments (whether intentionally or otherwise)
args_unpaper.extend([input_pnm, output_pnm])
try:
stdout = check_output(
args_unpaper, close_fds=True, universal_newlines=True, stderr=STDOUT
proc = subprocess.run(
args_unpaper,
check=True,
close_fds=True,
universal_newlines=True,
stderr=STDOUT,
cwd=tmpdir,
stdout=PIPE,
)
except CalledProcessError as e:
log.debug(e.output)
raise e from e
else:
log.debug(stdout)
# unpaper sets dpi to 72
Image.open(output_pnm.name).save(output_file, dpi=(dpi, dpi))
log.debug(proc.stdout)
# unpaper sets dpi to 72; fix this
try:
Image.open(output_pnm).save(output_file, dpi=(dpi, dpi))
except (FileNotFoundError, OSError):
raise SubprocessOutputError(
"unpaper: failed to produce the expected output file. Called with: "
+ str(args_unpaper)
) from None
# Calls run() on input_file/output_file with a fixed list of unpaper options.
# NOTE(review): this four-parameter definition appears to be the pre-merge
# side of the diff; a later `clean` in this file takes an extra unpaper_args
# parameter and would shadow this one -- confirm which definition is current.
def clean(input_file, output_file, dpi, log):
run(
input_file,
output_file,
dpi,
log,
[
'--layout',
'none',
'--mask-scan-size',
'100', # don't blank out narrow columns
'--no-border-align', # don't align visible content to borders
'--no-mask-center', # don't center visible content within page
'--no-grayfilter', # don't remove light gray areas
'--no-blackfilter', # don't remove solid black areas
'--no-deskew', # don't deskew
],
)
def validate_custom_args(args: str) -> list:
    """Split a user-supplied unpaper argument string and reject path-like items.

    The string is tokenized with ``shlex.split`` (shell-style quoting rules).
    Any token containing ``/``, or equal to ``.`` or ``..``, is refused so
    that custom arguments cannot point unpaper at a location other than the
    input and output files appended by the caller.

    Args:
        args: The raw value of ``--unpaper-args``.

    Returns:
        The list of individual arguments.

    Raises:
        ValueError: If a token looks like a path, or if ``shlex.split``
            cannot parse the string (for example unbalanced quotes).
    """
    unpaper_args = shlex.split(args)
    for arg in unpaper_args:
        # '.' and '..' name directories without containing a slash, so the
        # '/' test alone would let them through
        if '/' in arg or arg in ('.', '..'):
            raise ValueError('No filenames allowed in --unpaper-args')
    return unpaper_args
def clean(input_file, output_file, dpi, log, unpaper_args=None):
    """Run unpaper on a page image, optionally with caller-supplied arguments.

    When ``unpaper_args`` is ``None`` or empty, a built-in list of options is
    passed to ``run``; otherwise the supplied arguments are passed instead of
    (not in addition to) that list.
    """
    if unpaper_args:
        selected_args = unpaper_args
    else:
        selected_args = [
            '--layout',
            'none',
            '--mask-scan-size',
            '100',  # don't blank out narrow columns
            '--no-border-align',  # don't align visible content to borders
            '--no-mask-center',  # don't center visible content within page
            '--no-grayfilter',  # don't remove light gray areas
            '--no-blackfilter',  # don't remove solid black areas
            '--no-deskew',  # don't deskew
        ]
    run(input_file, output_file, dpi, log, selected_args)

View File

@ -147,7 +147,7 @@ def no_outpdf(tmpdir):
@pytest.helpers.register
def check_ocrmypdf(input_file, output_file, *args, env=None):
"Run ocrmypdf and confirm that a valid file was created"
"""Run ocrmypdf and confirm that a valid file was created"""
p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
# ensure py.test collects the output, use -s to view

View File

@ -15,11 +15,16 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import argparse
from os import fspath
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from ocrmypdf import __main__ as main
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.exec import unpaper
# pytest.helpers is dynamic
# pylint: disable=no-member
@ -30,26 +35,79 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof
def have_unpaper():
    """Return True if ``unpaper.version()`` succeeds, False if it raises.

    This is a plain function rather than a fixture because the ``skipif``
    marks in this module call it directly at import time.
    """
    try:
        unpaper.version()
    except Exception:
        # Any failure to query the version is treated as "not available"
        return False
    return True
@pytest.fixture(scope="session")
def spoof_unpaper_oldversion(tmpdir_factory):
    """Session fixture: result of ``spoof`` with unpaper mapped to unpaper_oldversion.py."""
    return spoof(tmpdir_factory, unpaper="unpaper_oldversion.py")
def test_no_unpaper(resources, no_outpdf):
    """check_options exits when unpaper's version lookup raises FileNotFoundError."""
    input_ = fspath(resources / "c02-22.pdf")
    output = fspath(no_outpdf)
    options = main.parser.parse_args(args=["--clean", input_, output])

    # Simulate a missing unpaper binary by making its version lookup fail
    with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
        mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
        with pytest.raises(SystemExit):
            main.check_options(options, log=MagicMock())
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
    """With the old-version unpaper spoof, --clean exits with missing_dependency."""
    p, out, err = run_ocrmypdf(
        resources / "c02-22.pdf", no_outpdf, "--clean", env=spoof_unpaper_oldversion
    )
    assert p.returncode == ExitCode.missing_dependency
@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_clean(spoof_tesseract_noop, resources, outpdf):
    """Run ocrmypdf with -c on skew.pdf and check that it succeeds."""
    check_ocrmypdf(resources / "skew.pdf", outpdf, "-c", env=spoof_tesseract_noop)
@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_unpaper_args_valid(spoof_tesseract_noop, resources, outpdf):
    """Run ocrmypdf with -c and --unpaper-args '--layout double' and check it succeeds."""
    source_pdf = resources / "skew.pdf"
    custom_args = "--layout double"  # Spaces required here
    cli_flags = ("-c", "--unpaper-args", custom_args)
    check_ocrmypdf(source_pdf, outpdf, *cli_flags, env=spoof_tesseract_noop)
@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_unpaper_args_invalid_filename(spoof_tesseract_noop, resources, outpdf):
    """A path passed through --unpaper-args is refused with a bad_args exit code."""
    cli_flags = ["-c", "--unpaper-args", "/etc/passwd"]
    result = run_ocrmypdf(
        resources / "skew.pdf", outpdf, *cli_flags, env=spoof_tesseract_noop
    )
    proc, _stdout, stderr = result
    assert "No filenames allowed" in stderr
    assert proc.returncode == ExitCode.bad_args
@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_unpaper_args_invalid(spoof_tesseract_noop, resources, outpdf):
    """Arguments unpaper does not accept end in a child_process_error exit code."""
    bogus_args = "unpaper is not going to like these arguments"
    result = run_ocrmypdf(
        resources / "skew.pdf",
        outpdf,
        "-c",
        "--unpaper-args",
        bogus_args,
        env=spoof_tesseract_noop,
    )
    proc = result[0]
    # Can't tell difference between unpaper choking on bad arguments or some
    # other unpaper failure
    assert proc.returncode == ExitCode.child_process_error