2017-01-26 13:22:01 -08:00
|
|
|
# © 2017 James R. Barlow: github.com/jbarlow83
|
2018-03-14 14:40:48 -07:00
|
|
|
#
|
2020-08-05 00:44:42 -07:00
|
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
|
2017-01-26 13:22:01 -08:00
|
|
|
|
|
|
|
import os
|
|
|
|
import platform
|
2018-12-30 01:28:15 -08:00
|
|
|
import sys
|
2018-11-02 00:31:50 -07:00
|
|
|
from pathlib import Path
|
2019-03-05 22:25:22 -08:00
|
|
|
from subprocess import PIPE, run
|
2017-01-26 13:22:01 -08:00
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2020-06-09 15:27:14 -07:00
|
|
|
from ocrmypdf import api, pdfinfo
|
2020-06-09 14:55:54 -07:00
|
|
|
from ocrmypdf._exec import unpaper
|
2020-05-16 03:24:31 -07:00
|
|
|
from ocrmypdf._plugin_manager import get_parser_options_plugins
|
2019-12-19 15:29:56 -08:00
|
|
|
|
2019-06-05 03:07:48 -07:00
|
|
|
# Abort early on interpreters older than Python 3.5: report the requirement
# on stdout and exit with status 1.
if sys.version_info[:2] < (3, 5):
    print("Requires Python 3.5+")
    sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
def is_linux():
    """Return True when ``platform.system()`` reports ``'Linux'``."""
    system_name = platform.system()
    return system_name == 'Linux'
|
|
|
|
|
|
|
|
|
2017-03-02 22:27:06 -08:00
|
|
|
def is_macos():
    """Return True when ``platform.system()`` reports ``'Darwin'``."""
    system_name = platform.system()
    return system_name == 'Darwin'
|
|
|
|
|
|
|
|
|
2017-01-26 13:22:01 -08:00
|
|
|
def running_in_docker():
    """Return True if either Docker marker file exists at the filesystem root.

    Docker creates /.dockerenv (newer versions) or /.dockerinit (older
    versions). This is undocumented behavior, not an official test.
    """
    marker_files = ('/.dockerenv', '/.dockerinit')
    # any() stops at the first marker found, just like a chained ``or``
    return any(Path(marker).exists() for marker in marker_files)
|
2017-01-26 16:38:59 -08:00
|
|
|
|
|
|
|
|
2019-03-05 22:33:13 -08:00
|
|
|
def have_unpaper():
    """Return True if ``unpaper.version()`` completes without raising.

    Any ``Exception`` raised by the call is treated as "not available" and
    yields False.
    """
    try:
        unpaper.version()
    except Exception:  # pylint: disable=broad-except
        available = False
    else:
        available = True
    return available
|
|
|
|
|
|
|
|
|
2020-05-06 02:53:47 -07:00
|
|
|
# Resolved directory containing this file; PROJECT_ROOT is bound to the
# same Path object.
PROJECT_ROOT = TESTS_ROOT = Path(__file__).parent.resolve()

# argv prefix that runs the ocrmypdf module under the current interpreter
OCRMYPDF = [sys.executable, '-m', 'ocrmypdf']
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def resources():
    """Path of the ``resources`` directory located under ``TESTS_ROOT``."""
    return Path(TESTS_ROOT).joinpath('resources')
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def ocrmypdf_exec():
    """The module-level ``OCRMYPDF`` command list (the same object, not a copy)."""
    command = OCRMYPDF
    return command
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="function")
def outdir(tmp_path):
    """Per-test output directory: the ``tmp_path`` value, returned unchanged."""
    output_directory = tmp_path
    return output_directory
|
2017-01-26 16:38:59 -08:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="function")
def outpdf(tmp_path):
    """Path ``out.pdf`` inside ``tmp_path``; the file itself is not created here."""
    return tmp_path.joinpath('out.pdf')
|
2017-01-26 16:38:59 -08:00
|
|
|
|
|
|
|
|
2021-09-26 01:07:34 -07:00
|
|
|
@pytest.fixture(scope="function")
def outtxt(tmp_path):
    """Path ``out.txt`` inside ``tmp_path``; the file itself is not created here."""
    return tmp_path.joinpath('out.txt')
|
|
|
|
|
|
|
|
|
2017-01-26 16:38:59 -08:00
|
|
|
@pytest.fixture(scope="function")
def no_outpdf(tmp_path):
    """Path ``no_output.pdf`` inside ``tmp_path``, for tests expecting no output.

    The fixture only documents that the test is not expected to produce an
    output file. It performs no check itself, because an assertion failure
    inside a fixture is reported as an error rather than a test failure.
    The test must confirm on its own that no output file was created.
    """
    return tmp_path.joinpath('no_output.pdf')
|
2017-01-26 16:38:59 -08:00
|
|
|
|
|
|
|
|
2020-06-09 00:30:13 -07:00
|
|
|
def check_ocrmypdf(input_file, output_file, *args):
    """Run ocrmypdf through the API and assert that an output file was written.

    Args:
        input_file: Input path; passed to the argument parser as ``str(...)``.
        output_file: Output path. Must provide ``exists()`` and ``stat()``
            (e.g. a ``pathlib.Path``), since both are called on it afterwards.
        *args: Extra command line arguments. ``None`` entries are dropped and
            the remainder are converted with ``str``.

    Returns:
        ``output_file``, unchanged.

    Raises:
        AssertionError: If the pipeline result is not 0, the output file does
            not exist, or it is 100 bytes or smaller.
    """
    cli_args = [str(input_file), str(output_file)]
    cli_args.extend(str(extra) for extra in args if extra is not None)

    _, options, plugin_manager = get_parser_options_plugins(args=cli_args)
    api.check_options(options, plugin_manager)
    exit_code = api.run_pipeline(options, plugin_manager=plugin_manager, api=True)

    assert exit_code == 0
    assert output_file.exists(), "Output file not created"
    assert output_file.stat().st_size > 100, "PDF too small or empty"

    return output_file
|
|
|
|
|
|
|
|
|
2020-06-09 00:30:13 -07:00
|
|
|
def run_ocrmypdf_api(input_file, output_file, *args):
    """Run ocrmypdf via API and let the caller deal with the result.

    Does not currently have a way to manipulate the PATH except for Tesseract.

    Args:
        input_file: Input path; passed to the argument parser as ``str(...)``.
        output_file: Output path; passed to the argument parser as ``str(...)``.
        *args: Extra command line arguments. ``None`` entries are dropped and
            the remainder are converted with ``str``.

    Returns:
        Whatever ``api.run_pipeline`` returns for the parsed options.
    """
    cli_args = [str(input_file), str(output_file)]
    cli_args.extend(str(extra) for extra in args if extra is not None)

    _, options, plugin_manager = get_parser_options_plugins(args=cli_args)
    api.check_options(options, plugin_manager)

    # The parsed plugin manager is used for option checking only; the
    # pipeline itself is invoked with plugin_manager=None and api=False.
    pipeline_result = api.run_pipeline(options, plugin_manager=None, api=False)
    return pipeline_result
|
2019-11-28 14:03:18 -08:00
|
|
|
|
|
|
|
|
2020-11-07 00:48:08 -08:00
|
|
|
def run_ocrmypdf(input_file, output_file, *args, text=True):
    """Run ocrmypdf as a subprocess and let the caller deal with the result.

    The command line is ``OCRMYPDF``, followed by the extra arguments, then
    the input and output paths.

    Args:
        input_file: Input path; converted with ``str``.
        output_file: Output path; converted with ``str``.
        *args: Extra command line arguments. ``None`` entries are dropped and
            the remainder are converted with ``str``.
        text: Forwarded to ``subprocess.run`` as its ``text`` argument.

    Returns:
        A tuple ``(completed_process, stdout, stderr)``.
    """
    command = list(OCRMYPDF)
    command.extend(str(option) for option in args if option is not None)
    command.extend([str(input_file), str(output_file)])

    completed = run(
        command,
        stdout=PIPE,
        stderr=PIPE,
        text=text,
        env=os.environ.copy(),
        check=False,  # a non-zero exit status is left for the caller to inspect
    )
    return completed, completed.stdout, completed.stderr
|
2017-03-24 13:23:03 -07:00
|
|
|
|
|
|
|
|
|
|
|
def first_page_dimensions(pdf):
    """Return ``(width_inches, height_inches)`` of the first page of ``pdf``.

    The values are read from entry 0 of ``pdfinfo.PdfInfo(pdf)``.
    """
    first_page = pdfinfo.PdfInfo(pdf)[0]
    return first_page.width_inches, first_page.height_inches
|
2018-08-03 00:42:59 -07:00
|
|
|
|
|
|
|
|
|
|
|
def pytest_addoption(parser):
    """Register the ``--runslow`` flag (store_true, default False) on ``parser``."""
    runslow_help = (
        "run slow tests only useful for development (unlikely to be "
        "useful for downstream packagers)"
    )
    parser.addoption(
        "--runslow", action="store_true", default=False, help=runslow_help
    )
|
|
|
|
|
|
|
|
|
|
|
|
def pytest_collection_modifyitems(config, items):
    """Add a skip marker to items tagged ``slow`` unless ``--runslow`` is set.

    When ``config.getoption("--runslow")`` is truthy, nothing is modified.
    Otherwise every item whose ``keywords`` contains ``"slow"`` receives one
    shared skip marker.
    """
    if not config.getoption("--runslow"):
        skip_slow = pytest.mark.skip(reason="need --runslow option to run")
        for collected in items:
            if "slow" in collected.keywords:
                collected.add_marker(skip_slow)
|