2022-07-28 01:06:46 -07:00
|
|
|
# SPDX-FileCopyrightText: 2022 James R. Barlow
|
|
|
|
# SPDX-License-Identifier: MPL-2.0
|
2018-03-24 15:21:44 -07:00
|
|
|
|
2022-07-23 00:39:24 -07:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2022-01-11 10:44:38 -08:00
|
|
|
import logging
|
2019-02-07 16:21:02 -08:00
|
|
|
from os import fspath
|
2019-05-11 12:44:17 -07:00
|
|
|
from unittest.mock import patch
|
2018-12-30 01:28:15 -08:00
|
|
|
|
2018-03-24 15:21:44 -07:00
|
|
|
import pytest
|
2019-05-20 18:01:17 -07:00
|
|
|
|
2022-01-11 10:44:38 -08:00
|
|
|
from ocrmypdf._exec import unpaper
|
2020-05-26 02:13:17 -07:00
|
|
|
from ocrmypdf._plugin_manager import get_parser_options_plugins
|
2019-03-28 20:16:10 +01:00
|
|
|
from ocrmypdf._validation import check_options
|
2019-05-20 18:01:17 -07:00
|
|
|
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
|
2018-03-24 15:21:44 -07:00
|
|
|
|
2023-04-14 00:38:34 -07:00
|
|
|
from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf
|
2018-03-24 15:21:44 -07:00
|
|
|
|
2021-04-07 01:56:51 -07:00
|
|
|
# pylint: disable=redefined-outer-name
|
2019-02-07 16:21:02 -08:00
|
|
|
|
2022-01-11 10:44:38 -08:00
|
|
|
# Decorator for tests that exercise the real unpaper binary: the test is
# skipped whenever have_unpaper() (imported from conftest) returns a falsy value.
needs_unpaper = pytest.mark.skipif(
    not have_unpaper(),
    reason="requires unpaper",
)
|
|
|
|
|
2019-02-07 16:21:02 -08:00
|
|
|
|
2018-03-24 15:21:44 -07:00
|
|
|
def test_no_unpaper(resources, no_outpdf):
    """Check that ``--clean`` is rejected when the unpaper version probe fails.

    ``ocrmypdf._exec.unpaper.version`` is patched to raise
    ``FileNotFoundError``; ``check_options`` is then expected to raise
    ``MissingDependencyError`` after having called the patched probe.
    """
    source_pdf = fspath(resources / "c02-22.pdf")
    target_pdf = fspath(no_outpdf)
    cli = ["--clean", source_pdf, target_pdf]

    _parser, options, pm = get_parser_options_plugins(cli)

    with patch(
        "ocrmypdf._exec.unpaper.version",
        side_effect=FileNotFoundError("unpaper"),
    ) as version_probe:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        # The failure must come from actually probing for unpaper.
        version_probe.assert_called()
|
2018-03-24 15:21:44 -07:00
|
|
|
|
|
|
|
|
2020-06-02 02:42:14 -07:00
|
|
|
def test_old_unpaper(resources, no_outpdf):
    """Check that ``--clean`` is rejected when unpaper reports version 0.5.

    ``ocrmypdf._exec.unpaper.version`` is patched to return ``'0.5'``;
    ``check_options`` is then expected to raise ``MissingDependencyError``
    after having called the patched probe.
    """
    source_pdf = fspath(resources / "c02-22.pdf")
    target_pdf = fspath(no_outpdf)
    cli = ["--clean", source_pdf, target_pdf]

    _parser, options, pm = get_parser_options_plugins(cli)

    with patch(
        "ocrmypdf._exec.unpaper.version",
        return_value='0.5',
    ) as version_probe:
        with pytest.raises(MissingDependencyError):
            check_options(options, pm)
        # The rejection must be based on the reported version string.
        version_probe.assert_called()
|
2018-03-24 15:21:44 -07:00
|
|
|
|
|
|
|
|
2022-01-11 10:44:38 -08:00
|
|
|
@needs_unpaper
def test_clean(resources, outpdf):
    """Run OCRmyPDF on ``skew.pdf`` with ``-c`` and the no-op Tesseract plugin.

    Pass/fail is delegated entirely to the ``check_ocrmypdf`` helper.
    """
    cli_flags = [
        "-c",
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / "skew.pdf", outpdf, *cli_flags)
|
2019-02-07 16:21:02 -08:00
|
|
|
|
|
|
|
|
2022-01-11 10:44:38 -08:00
|
|
|
@needs_unpaper
def test_unpaper_args_valid(resources, outpdf):
    """Run OCRmyPDF with ``-c`` and a well-formed ``--unpaper-args`` value.

    Pass/fail is delegated entirely to the ``check_ocrmypdf`` helper.
    """
    # Spaces required here: the whole unpaper option string travels as a
    # single command-line argument.
    unpaper_options = "--layout double"
    cli_flags = [
        "-c",
        "--unpaper-args",
        unpaper_options,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / "skew.pdf", outpdf, *cli_flags)
|
|
|
|
|
|
|
|
|
2022-01-11 10:44:38 -08:00
|
|
|
@needs_unpaper
def test_unpaper_args_invalid_filename(resources, outpdf):
    """Check that a file path passed via ``--unpaper-args`` is refused.

    The run is expected to report "No filenames allowed" on stderr and to
    exit with ``ExitCode.bad_args``.
    """
    cli_flags = [
        "-c",
        "--unpaper-args",
        "/etc/passwd",
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    result = run_ocrmypdf(resources / "skew.pdf", outpdf, *cli_flags)

    assert "No filenames allowed" in result.stderr
    assert result.returncode == ExitCode.bad_args
|
|
|
|
|
|
|
|
|
2022-01-11 10:44:38 -08:00
|
|
|
@needs_unpaper
def test_unpaper_args_invalid(resources, outpdf):
    """Check the exit code when ``--unpaper-args`` carries nonsense options.

    The run is expected to exit with ``ExitCode.child_process_error``.
    """
    cli_flags = [
        "-c",
        "--unpaper-args",
        "unpaper is not going to like these arguments",
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    result = run_ocrmypdf(resources / "skew.pdf", outpdf, *cli_flags)

    # Can't tell difference between unpaper choking on bad arguments or some
    # other unpaper failure
    assert result.returncode == ExitCode.child_process_error
|
2022-01-11 10:44:38 -08:00
|
|
|
|
|
|
|
|
|
|
|
@needs_unpaper
def test_unpaper_image_too_big(resources, outdir, caplog):
    """Check how ``unpaper.clean`` handles an image over the pixel limit.

    ``UNPAPER_IMAGE_PIXEL_LIMIT`` is patched down to 42 so that ``crom.png``
    exceeds it. The call is expected to hand back the input path and to log
    a WARNING containing "too large for cleaning".
    """
    with patch('ocrmypdf._exec.unpaper.UNPAPER_IMAGE_PIXEL_LIMIT', 42):
        infile = resources / 'crom.png'
        # This comparison used to be a bare expression whose result was
        # discarded, so the return value was never actually verified.
        # NOTE(review): assumes clean() returns the input path when it
        # declines to clean - confirm against ocrmypdf._exec.unpaper.clean.
        assert unpaper.clean(infile, outdir / 'out.png', dpi=300) == infile

        assert any(
            'too large for cleaning' in rec.message
            for rec in caplog.get_records('call')
            if rec.levelno == logging.WARNING
        )
|