OCRmyPDF/tests/conftest.py

# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

import os
import platform
import sys
from contextlib import contextmanager
from pathlib import Path
from subprocess import PIPE, run
from ocrmypdf import api, cli

import pytest

# Ask pytest to load the 'helpers_namespace' plugin. The helpers in this file
# are registered through pytest.helpers, which is presumably provided by that
# plugin -- confirm against the plugin's documentation.
pytest_plugins = ['helpers_namespace']

# Optional integration with pytest-cov: when the package is importable, call
# its cleanup_on_sigterm() hook once at import time; when it is not installed,
# silently continue without it.
# NOTE(review): the hook presumably lets coverage data be saved when a process
# receives SIGTERM -- confirm against the pytest-cov documentation.
try:
    from pytest_cov.embed import cleanup_on_sigterm
except ImportError:
    pass
else:
    cleanup_on_sigterm()

# pylint: disable=E1101
# pytest.helpers is dynamic so it confuses pylint

# Refuse to run on interpreters older than Python 3.5: print a short message
# and exit with status 1 as soon as this conftest is imported.
if sys.version_info < (3, 5):
    print("Requires Python 3.5+")
    sys.exit(1)


@pytest.helpers.register
def is_linux():
    """Return True when platform.system() reports 'Linux'."""
    system_name = platform.system()
    return system_name == 'Linux'


@pytest.helpers.register
def is_macos():
    """Return True when platform.system() reports 'Darwin' (macOS)."""
    system_name = platform.system()
    return system_name == 'Darwin'


@pytest.helpers.register
def running_in_docker():
    """Guess whether the tests are executing inside a Docker container.

    Docker creates a marker file at the filesystem root: /.dockerenv in newer
    versions, /.dockerinit in older ones. This is undocumented behaviour, not
    an official test, so treat the answer as a heuristic.
    """
    return any(os.path.exists(marker) for marker in ('/.dockerenv', '/.dockerinit'))


@pytest.helpers.register
def running_in_travis():
    """Return True when the TRAVIS environment variable is exactly 'true'."""
    travis_flag = os.environ.get('TRAVIS')
    return travis_flag == 'true'


@pytest.helpers.register
def have_unpaper():
    """Report whether unpaper is usable from this environment.

    Returns True only if ocrmypdf.exec.unpaper can be imported and its
    version() call completes. Any exception at all is taken to mean that
    unpaper is unavailable, so this probe never raises.
    """
    try:
        from ocrmypdf.exec import unpaper

        unpaper.version()
    except Exception:
        available = False
    else:
        available = True
    return available


# Absolute path of the directory that contains this conftest.py.
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
# Directory holding the replacement executables that spoof() links onto PATH.
SPOOF_PATH = os.path.join(TESTS_ROOT, 'spoof')
# Parent directory of the tests directory.
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
# Command prefix that runs ocrmypdf as a module under the current interpreter.
OCRMYPDF = [sys.executable, '-m', 'ocrmypdf']


@pytest.helpers.register
def spoof(tmp_path_factory, **kwargs):
    """Modify PATH to override subprocess executables

    spoof(program1='replacement', ...)

    Creates a temporary directory containing one symlink per keyword argument:
    the link is named after the program to replace and points at the file of
    the given name inside SPOOF_PATH. Each target is made executable
    (mode 0o755) before it is linked.

    Args:
        tmp_path_factory: pytest's ``tmp_path_factory`` fixture, used to
            create the directory that holds the symlinks.
        **kwargs: maps the program name to override to the file name of its
            replacement inside SPOOF_PATH.

    Returns:
        A copy of ``os.environ`` in which PATH is prefixed with the symlink
        directory and the previous PATH is saved as ``_OCRMYPDF_SAVE_PATH``.
        The current process environment itself is not modified.
    """
    env = os.environ.copy()
    slug = '-'.join(v.replace('.py', '') for v in sorted(kwargs.values()))
    spoofer_base = tmp_path_factory.mktemp('spoofers')
    tmpdir = Path(spoofer_base / slug)
    # With no replacements the slug is empty and tmpdir is the directory that
    # mktemp() just created, so an already existing directory must be allowed.
    tmpdir.mkdir(parents=True, exist_ok=True)

    for replace_program, with_spoof in kwargs.items():
        spoofer = Path(SPOOF_PATH) / with_spoof
        spoofer.chmod(0o755)
        (tmpdir / replace_program).symlink_to(spoofer)

    env['_OCRMYPDF_SAVE_PATH'] = env['PATH']
    # os.pathsep is ':' on POSIX and ';' on Windows; a hard-coded ':' would
    # build a PATH that Windows cannot parse.
    env['PATH'] = str(tmpdir) + os.pathsep + env['PATH']

    return env


@pytest.helpers.register
@contextmanager
def os_environ(new_env):
    """Temporarily apply *new_env* on top of ``os.environ``.

    Every key of *new_env* is written into ``os.environ`` for the duration of
    the ``with`` block. On exit, keys that did not exist beforehand are
    removed and every pre-existing key gets its original value back.

    ``PYTEST_CURRENT_TEST`` is never written, removed or restored, because
    pytest maintains that variable itself and gets confused if it changes.

    Args:
        new_env: mapping of variable names to values, or None for no changes.

    The environment is restored even if the ``with`` body raises, so a
    failing test cannot leak variables into the tests that run after it.
    """
    old_env = os.environ.copy()
    if new_env is None:
        new_env = {}

    try:
        # Applying the overrides happens inside the try block so that a
        # failure part-way through (e.g. a non-string value) is rolled back.
        for k, v in new_env.items():
            if k != 'PYTEST_CURRENT_TEST':
                os.environ[k] = v
        yield
    finally:
        # Drop variables that appeared while the block was active...
        new_keys = set(os.environ) - set(old_env)
        for k in new_keys:
            if k != 'PYTEST_CURRENT_TEST':
                del os.environ[k]
        # ...and put back every value that existed before, including keys
        # that the block deleted.
        for k in old_env:
            if k != 'PYTEST_CURRENT_TEST':
                os.environ[k] = old_env[k]

    # Sanity check on the normal exit path: the environment matches the
    # snapshot again. Kept outside ``finally`` so that it can never mask an
    # exception raised by the ``with`` body.
    for k, v in os.environ.copy().items():
        if k != 'PYTEST_CURRENT_TEST':
            assert v == old_env[k]


@pytest.fixture(scope='session')
def spoof_tesseract_noop(tmp_path_factory):
    """Session-scoped environment in which ``tesseract`` is spoofed.

    Returns the environment built by spoof() with ``tesseract`` replaced by
    the ``tesseract_noop.py`` script.
    """
    replacements = {'tesseract': 'tesseract_noop.py'}
    return spoof(tmp_path_factory, **replacements)


@pytest.fixture(scope='session')
def spoof_tesseract_cache(tmp_path_factory):
    """Session-scoped environment that routes ``tesseract`` to the cache spoof.

    Outside Docker this is the environment built by spoof() with ``tesseract``
    replaced by ``tesseract_cache.py``. Inside Docker no spoofing is done and
    a plain copy of the current environment is returned instead.
    """
    if not running_in_docker():
        return spoof(tmp_path_factory, tesseract="tesseract_cache.py")
    return os.environ.copy()


@pytest.fixture
def resources():
    """Return the ``resources`` directory under TESTS_ROOT as a Path."""
    return Path(TESTS_ROOT, 'resources')


@pytest.fixture
def ocrmypdf_exec():
    """Return the module-level OCRMYPDF command list.

    The list object itself is returned, not a copy, so callers should build a
    new list (e.g. ``ocrmypdf_exec + [...]``) rather than mutate it in place.
    """
    return OCRMYPDF


@pytest.fixture(scope="function")
def outdir(tmp_path):
    """Return the per-test ``tmp_path`` directory for use as an output directory."""
    return tmp_path


@pytest.fixture(scope="function")
def outpdf(tmp_path):
    """Return the path ``out.pdf`` inside the per-test temporary directory.

    Only the path is computed; the file is not created here.
    """
    return tmp_path.joinpath('out.pdf')


@pytest.fixture(scope="function")
def no_outpdf(tmp_path):
    """Return a path for output that a test expects NOT to be produced.

    This fixture only documents that expectation. Nothing is checked here,
    because an assertion failure inside a fixture is reported as an error
    rather than as a test failure. The test itself must confirm that no
    output file was created.
    """
    return tmp_path.joinpath('no_output.pdf')


@pytest.helpers.register
def check_ocrmypdf(input_file, output_file, *args, env=None):
    """Run ocrmypdf through its API and confirm that a valid file was created.

    Args:
        input_file: path of the input file; converted with str().
        output_file: path of the output file; converted with str().
        *args: additional command line arguments. Each one is converted with
            str(); arguments that are None are treated as omitted.
        env: optional environment mapping for tesseract. If non-empty, a copy
            of it, extended with ``_OCRMYPDF_TEST_INFILE`` set to the input
            path, is stored in ``options.tesseract_env``.

    Returns:
        *output_file*, unchanged.

    Raises:
        AssertionError: if the pipeline result is not 0, the output file does
            not exist, or it is 100 bytes or smaller.
    """

    options = cli.parser.parse_args(
        [str(input_file), str(output_file)]
        + [str(arg) for arg in args if arg is not None]
    )
    api.check_options(options)
    if env:
        # Work on a copy: the mappings passed in are typically session-scoped
        # fixtures shared by many tests, and writing the per-call input file
        # into them would leak state from one test into the next.
        options.tesseract_env = dict(env)
        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
    result = api.run_pipeline(options, api=True)

    assert result == 0
    assert os.path.exists(str(output_file)), "Output file not created"
    assert os.stat(str(output_file)).st_size > 100, "PDF too small or empty"

    return output_file


@pytest.helpers.register
def run_ocrmypdf(input_file, output_file, *args, env=None, universal_newlines=True):
    """Run ocrmypdf in a subprocess and let the caller deal with the results.

    The command is OCRMYPDF, followed by every non-None extra argument
    (converted with str()), followed by the input and output paths.

    Args:
        input_file: path of the input file.
        output_file: path of the output file.
        *args: additional command line arguments; None entries are skipped.
        env: environment for the child process; defaults to ``os.environ``.
        universal_newlines: forwarded to subprocess.run().

    Returns:
        A tuple ``(completed_process, stdout, stderr)``. No exit code check
        is made here.
    """
    child_env = os.environ if env is None else env

    command = list(OCRMYPDF)
    command.extend(str(arg) for arg in args if arg is not None)
    command.extend([str(input_file), str(output_file)])

    proc = run(
        command,
        stdout=PIPE,
        stderr=PIPE,
        universal_newlines=universal_newlines,
        env=child_env,
    )
    return proc, proc.stdout, proc.stderr


@pytest.helpers.register
def first_page_dimensions(pdf):
    """Return ``(width_inches, height_inches)`` for the first page of *pdf*.

    The values are read from the first entry of ``ocrmypdf.pdfinfo.PdfInfo``
    built for the given file.
    """
    from ocrmypdf import pdfinfo

    first_page = pdfinfo.PdfInfo(pdf)[0]
    return first_page.width_inches, first_page.height_inches


def pytest_addoption(parser):
    """Register the ``--runslow`` command line flag.

    The flag is a boolean switch that defaults to False. It is read back in
    pytest_collection_modifyitems to decide whether slow tests are skipped.
    """
    help_text = (
        "run slow tests only useful for development (unlikely to be "
        "useful for downstream packagers)"
    )
    parser.addoption("--runslow", action="store_true", default=False, help=help_text)


def pytest_collection_modifyitems(config, items):
    """Skip tests marked ``slow`` unless ``--runslow`` was given.

    When the option is set the collected items are left untouched. Otherwise
    every item whose keywords contain ``"slow"`` receives a skip marker.
    """
    run_slow = config.getoption("--runslow")
    if not run_slow:
        skip_marker = pytest.mark.skip(reason="need --runslow option to run")
        for slow_item in (entry for entry in items if "slow" in entry.keywords):
            slow_item.add_marker(skip_marker)
Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00			`# © 2017 James R. Barlow: github.com/jbarlow83`
Add license notice to all files Source files to GPL3 Exceptions: -tests/spoof/* to MIT -hocrtransform.py -_unicodefun.py Test resources to CC BY-SA 4.0 except when otherwise noted. Add GPL license. 2018-03-14 14:40:48 -07:00			`#`
			`# This file is part of OCRmyPDF.`
			`#`
			`# OCRmyPDF is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# OCRmyPDF is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.`
Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00
			`import os`
			`import platform`
Sort imports with isort 2018-12-30 01:28:15 -08:00			`import sys`
Convert one test to use API 2019-05-22 23:53:48 -07:00			`from contextlib import contextmanager`
Set up code coverage (it works with multiprocessing now!) 2018-11-02 00:31:50 -07:00			`from pathlib import Path`
Convert most uses of subprocess.Popen to subprocess.run in test suite 2019-03-05 22:25:22 -08:00			`from subprocess import PIPE, run`
Change most tests to use ocrmypdf API instead of subprocess The main benefit of this is code coverage gains can actually follow it. Also removes most ugly os.environ hacks. 2019-06-03 01:45:27 -07:00			`from ocrmypdf import api, cli`
Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00
			`import pytest`

Sort imports with isort 2018-12-30 01:28:15 -08:00			`pytest_plugins = ['helpers_namespace']`

Set up code coverage (it works with multiprocessing now!) 2018-11-02 00:31:50 -07:00			`try:`
			`from pytest_cov.embed import cleanup_on_sigterm`
			`except ImportError:`
			`pass`
			`else:`
			`cleanup_on_sigterm()`
Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00
Fix several pylint errors and warnings 2018-06-23 00:54:22 -07:00			`# pylint: disable=E1101`
			`# pytest.helpers is dynamic so it confuses pylint`

Fix some error messages that printed directly to sys.stderr instead of logging 2019-06-05 03:07:48 -07:00			`if sys.version_info < (3, 5):`
			`print("Requires Python 3.5+")`
Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00			`sys.exit(1)`


			`@pytest.helpers.register`
			`def is_linux():`
			`return platform.system() == 'Linux'`


Improvements to macOS test and work on homebrew tap autobrew Squashed commits: [3f06c1e] Try setting up homebrew tap autobuilding [01532f1] Strict mode error in brew 2017-03-02 22:27:06 -08:00			`@pytest.helpers.register`
			`def is_macos():`
			`return platform.system() == 'Darwin'`


Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00			`@pytest.helpers.register`
			`def running_in_docker():`
Fix running_in_docker() check failing on newer Docker This test has to work to ensure spoof/tesseract_cache.py has a writable directory to put cache into. Otherwise those tests fail. 2017-02-13 02:16:06 -08:00			`# Docker creates a file named /.dockerenv (newer versions) or`
			`# /.dockerinit (older) -- this is undocumented, not an offical test`
			`return os.path.exists('/.dockerenv') or os.path.exists('/.dockerinit')`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00

Improvements to macOS test and work on homebrew tap autobrew Squashed commits: [3f06c1e] Try setting up homebrew tap autobuilding [01532f1] Strict mode error in brew 2017-03-02 22:27:06 -08:00			`@pytest.helpers.register`
			`def running_in_travis():`
			`return os.environ.get('TRAVIS') == 'true'`


Fix test suite so --clean is not requested when unpaper is not installed 2019-03-05 22:33:13 -08:00			`@pytest.helpers.register`
			`def have_unpaper():`
			`try:`
			`from ocrmypdf.exec import unpaper`

			`unpaper.version()`
			`except Exception:`
			`return False`
			`return True`


Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))`
			`SPOOF_PATH = os.path.join(TESTS_ROOT, 'spoof')`
			`PROJECT_ROOT = os.path.dirname(TESTS_ROOT)`
			`OCRMYPDF = [sys.executable, '-m', 'ocrmypdf']`


			`@pytest.helpers.register`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def spoof(tmp_path_factory, **kwargs):`
Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`"""Modify PATH to override subprocess executables`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00
			`spoof(program1='replacement', ...)`

Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`Creates temporary directory with symlinks to targets.`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00
			`"""`
			`env = os.environ.copy()`
Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`slug = '-'.join(v.replace('.py', '') for v in sorted(kwargs.values()))`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`spoofer_base = tmp_path_factory.mktemp('spoofers')`
			`tmpdir = Path(spoofer_base / slug)`
Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`tmpdir.mkdir(parents=True)`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00
			`for replace_program, with_spoof in kwargs.items():`
Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`spoofer = Path(SPOOF_PATH) / with_spoof`
			`spoofer.chmod(0o755)`
			`(tmpdir / replace_program).symlink_to(spoofer)`

			`env['_OCRMYPDF_SAVE_PATH'] = env['PATH']`
			`env['PATH'] = str(tmpdir) + ":" + env['PATH']`
Fix several pylint errors and warnings 2018-06-23 00:54:22 -07:00
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`return env`


Convert one test to use API 2019-05-22 23:53:48 -07:00			`@pytest.helpers.register`
			`@contextmanager`
			`def os_environ(new_env):`
			`old_env = os.environ.copy()`
conftest: don't modify PYTEST_CURRENT_TEST when manipulating os.environ It confuses pytest. 2019-06-01 01:41:39 -07:00			`if new_env is None:`
			`new_env = {}`
Convert one test to use API 2019-05-22 23:53:48 -07:00
			`for k, v in new_env.items():`
conftest: don't modify PYTEST_CURRENT_TEST when manipulating os.environ It confuses pytest. 2019-06-01 01:41:39 -07:00			`if k != 'PYTEST_CURRENT_TEST':`
			`os.environ[k] = v`
Convert one test to use API 2019-05-22 23:53:48 -07:00			`yield`
			`new_keys = set(os.environ.copy()) - set(old_env)`
			`for k in new_keys:`
conftest: don't modify PYTEST_CURRENT_TEST when manipulating os.environ It confuses pytest. 2019-06-01 01:41:39 -07:00			`if k != 'PYTEST_CURRENT_TEST':`
			`del os.environ[k]`
Convert one test to use API 2019-05-22 23:53:48 -07:00			`for k in old_env:`
conftest: don't modify PYTEST_CURRENT_TEST when manipulating os.environ It confuses pytest. 2019-06-01 01:41:39 -07:00			`if k != 'PYTEST_CURRENT_TEST':`
			`os.environ[k] = old_env[k]`

			`for k, v in os.environ.copy().items():`
			`if k != 'PYTEST_CURRENT_TEST':`
			`assert v == old_env[k]`
Convert one test to use API 2019-05-22 23:53:48 -07:00

Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`@pytest.fixture(scope='session')`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def spoof_tesseract_noop(tmp_path_factory):`
			`return spoof(tmp_path_factory, tesseract='tesseract_noop.py')`
Refactor common test fixtures 2017-05-29 12:47:55 -07:00

Remove the OCRMYPDF_program environment variables Really, this was just replicating the functionality of the PATH environment variable, and users probably do that anyway. 2018-03-24 15:07:02 -07:00			`@pytest.fixture(scope='session')`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def spoof_tesseract_cache(tmp_path_factory):`
Refactor common test fixtures 2017-05-29 12:47:55 -07:00			`if running_in_docker():`
			`return os.environ.copy()`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`return spoof(tmp_path_factory, tesseract="tesseract_cache.py")`
Refactor common test fixtures 2017-05-29 12:47:55 -07:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`@pytest.fixture`
			`def resources():`
			`return Path(TESTS_ROOT) / 'resources'`


			`@pytest.fixture`
			`def ocrmypdf_exec():`
			`return OCRMYPDF`


			`@pytest.fixture(scope="function")`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def outdir(tmp_path):`
			`return tmp_path`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00

			`@pytest.fixture(scope="function")`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def outpdf(tmp_path):`
			`return tmp_path / 'out.pdf'`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00

			`@pytest.fixture(scope="function")`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def no_outpdf(tmp_path):`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`"""This just documents the fact that a test is not expected to produce`
			`output. Unfortunately an assertion failure inside a test fixture produces`
			`an error rather than a test failure, so no testing is done. It's up to`
			`the test to confirm that no output file was created."""`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`return tmp_path / 'no_output.pdf'`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00

			`@pytest.helpers.register`
			`def check_ocrmypdf(input_file, output_file, *args, env=None):`
unpaper-args: add test case and harden feature 2019-02-07 16:21:02 -08:00			`"""Run ocrmypdf and confirmed that a valid file was created"""`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00
Change most tests to use ocrmypdf API instead of subprocess The main benefit of this is code coverage gains can actually follow it. Also removes most ugly os.environ hacks. 2019-06-03 01:45:27 -07:00			`options = cli.parser.parse_args(`
tests: fix interpretation of None as omitted argument 2019-08-11 16:16:10 -07:00			`[str(input_file), str(output_file)]`
			`+ [str(arg) for arg in args if arg is not None]`
Change most tests to use ocrmypdf API instead of subprocess The main benefit of this is code coverage gains can actually follow it. Also removes most ugly os.environ hacks. 2019-06-03 01:45:27 -07:00			`)`
			`api.check_options(options)`
			`if env:`
			`options.tesseract_env = env`
Fix TypeError "environment can only contain strings" Apparently Windows Python doesn't coerce pathlib.Path to str. 2019-11-19 18:01:10 -08:00			`options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)`
Change most tests to use ocrmypdf API instead of subprocess The main benefit of this is code coverage gains can actually follow it. Also removes most ugly os.environ hacks. 2019-06-03 01:45:27 -07:00			`result = api.run_pipeline(options, api=True)`

			`assert result == 0`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`assert os.path.exists(str(output_file)), "Output file not created"`
			`assert os.stat(str(output_file)).st_size > 100, "PDF too small or empty"`
Change most tests to use ocrmypdf API instead of subprocess The main benefit of this is code coverage gains can actually follow it. Also removes most ugly os.environ hacks. 2019-06-03 01:45:27 -07:00
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`return output_file`


			`@pytest.helpers.register`
Reformat with black 2018-12-30 01:27:49 -08:00			`def run_ocrmypdf(input_file, output_file, *args, env=None, universal_newlines=True):`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`"Run ocrmypdf and let caller deal with results"`

			`if env is None:`
			`env = os.environ`

Convert most uses of subprocess.Popen to subprocess.run in test suite 2019-03-05 22:25:22 -08:00			`p_args = (`
			`OCRMYPDF`
			`+ [str(arg) for arg in args if arg is not None]`
			`+ [str(input_file), str(output_file)]`
Reformat with black 2018-12-30 01:27:49 -08:00			`)`
Convert most uses of subprocess.Popen to subprocess.run in test suite 2019-03-05 22:25:22 -08:00			`p = run(`
			`p_args, stdout=PIPE, stderr=PIPE, universal_newlines=universal_newlines, env=env`
			`)`
			`# print(p.stderr)`
			`return p, p.stdout, p.stderr`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00

			`@pytest.helpers.register`
			`def first_page_dimensions(pdf):`
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`from ocrmypdf import pdfinfo`
Reformat with black 2018-12-30 01:27:49 -08:00
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`info = pdfinfo.PdfInfo(pdf)`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00			`page0 = info[0]`
pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`return (page0.width_inches, page0.height_inches)`
Add intensive (optional) rotation test 2018-08-03 00:42:59 -07:00

			`def pytest_addoption(parser):`
			`parser.addoption(`
Reformat with black 2018-12-30 01:27:49 -08:00			`"--runslow",`
			`action="store_true",`
			`default=False,`
			`help=(`
			`"run slow tests only useful for development (unlikely to be "`
			`"useful for downstream packagers)"`
			`),`
Add intensive (optional) rotation test 2018-08-03 00:42:59 -07:00			`)`


			`def pytest_collection_modifyitems(config, items):`
			`if config.getoption("--runslow"):`
			`# --runslow given in cli: do not skip slow tests`
			`return`
			`skip_slow = pytest.mark.skip(reason="need --runslow option to run")`
			`for item in items:`
			`if "slow" in item.keywords:`
			`item.add_marker(skip_slow)`