OCRmyPDF/tests/conftest.py

# © 2017 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


import os
import platform
import sys
from pathlib import Path
from subprocess import PIPE, run

import pytest

from ocrmypdf import api, pdfinfo
from ocrmypdf._exec import unpaper
from ocrmypdf._plugin_manager import get_parser_options_plugins

# Abort early on interpreters that are too old for this test suite.
# run_ocrmypdf() in this file passes ``text=`` to subprocess.run, a parameter
# that only exists on Python 3.7 and newer, so 3.7 is the effective floor.
if sys.version_info < (3, 7):
    # sys.exit() with a string prints it to stderr and exits with status 1.
    sys.exit("Requires Python 3.7+")


def is_linux():
    """Return True when ``platform.system()`` reports ``'Linux'``."""
    system_name = platform.system()
    return system_name == 'Linux'


def is_macos():
    """Return True when ``platform.system()`` reports ``'Darwin'``."""
    system_name = platform.system()
    return system_name == 'Darwin'


def running_in_docker():
    """Guess whether the process is running inside a Docker container.

    Docker creates a file named /.dockerenv (newer versions) or /.dockerinit
    (older versions). This is undocumented behaviour, not an official test.

    Returns True if either marker file exists, otherwise False.
    """
    marker_files = ('/.dockerenv', '/.dockerinit')
    return any(Path(marker).exists() for marker in marker_files)


def have_unpaper():
    """Report whether unpaper can be queried for its version.

    Returns True when ``unpaper.version()`` completes without raising and
    False when it raises any ``Exception``.
    """
    try:
        unpaper.version()
    except Exception:  # pylint: disable=broad-except
        available = False
    else:
        available = True
    return available


# Directory that contains this conftest.py, made absolute with symlinks
# resolved.
TESTS_ROOT = Path(__file__).parent.resolve()
# NOTE(review): PROJECT_ROOT is currently the same path as TESTS_ROOT, not its
# parent directory -- confirm this is intended.
PROJECT_ROOT = TESTS_ROOT
# argv prefix that runs the ocrmypdf module with the interpreter executing the
# tests; run_ocrmypdf() prepends it to the arguments it is given.
OCRMYPDF = [sys.executable, '-m', 'ocrmypdf']


@pytest.fixture
def resources():
    """Path of the ``resources`` directory located under TESTS_ROOT."""
    return Path(TESTS_ROOT, 'resources')


@pytest.fixture
def ocrmypdf_exec():
    """Provide the module-level OCRMYPDF command list.

    The shared list object itself is returned, not a copy.
    """
    command = OCRMYPDF
    return command


@pytest.fixture(scope="function")
def outdir(tmp_path):
    """Per-test output directory; this is pytest's ``tmp_path`` unchanged."""
    output_directory = tmp_path
    return output_directory


@pytest.fixture(scope="function")
def outpdf(tmp_path):
    """Path ``out.pdf`` inside the test's ``tmp_path`` (the file is not created)."""
    return tmp_path.joinpath('out.pdf')


@pytest.fixture(scope="function")
def outtxt(tmp_path):
    """Path ``out.txt`` inside the test's ``tmp_path`` (the file is not created)."""
    return tmp_path.joinpath('out.txt')


@pytest.fixture(scope="function")
def no_outpdf(tmp_path):
    """Path for an output PDF that the test expects NOT to be produced.

    This fixture only documents that expectation. It performs no check
    itself, because an assertion failing inside a fixture is reported as an
    error rather than as a test failure. The test is responsible for
    confirming that no output file was created.
    """
    return tmp_path.joinpath('no_output.pdf')


def check_ocrmypdf(input_file, output_file, *args):
    """Run ocrmypdf in-process and confirm that a plausible output file exists.

    Args:
        input_file: Input path; converted with ``str()`` for the argument list.
        output_file: Output path. Must offer ``exists()`` and ``stat()``
            (e.g. a ``pathlib.Path``), since it is inspected after the run.
        *args: Extra command-line arguments. ``None`` entries are dropped and
            everything else is converted with ``str()``.

    Returns:
        ``output_file``, unchanged.

    Raises:
        AssertionError: If the pipeline result is not 0, the output file is
            missing, or it is 100 bytes or smaller.
    """
    cli_args = [
        str(input_file),
        str(output_file),
        *(str(extra) for extra in args if extra is not None),
    ]

    _, options, plugin_manager = get_parser_options_plugins(args=cli_args)
    api.check_options(options, plugin_manager)
    exit_code = api.run_pipeline(
        options, plugin_manager=plugin_manager, api=True
    )

    assert exit_code == 0
    assert output_file.exists(), "Output file not created"
    assert output_file.stat().st_size > 100, "PDF too small or empty"

    return output_file


def run_ocrmypdf_api(input_file, output_file, *args):
    """Run ocrmypdf through the API and return the pipeline result unchecked.

    Args:
        input_file: Input path; converted with ``str()``.
        output_file: Output path; converted with ``str()``.
        *args: Extra command-line arguments. ``None`` entries are dropped and
            everything else is converted with ``str()``.

    Returns:
        Whatever ``api.run_pipeline`` returns; the caller interprets it.

    NOTE(review): the plugin manager obtained from parsing is used only for
    ``api.check_options``; ``run_pipeline`` is deliberately called with
    ``plugin_manager=None`` and ``api=False`` -- presumably so it builds its
    own; confirm. An earlier note on this helper said it has no way to
    manipulate the PATH except for Tesseract; nothing here touches PATH.
    """
    cli_args = [
        str(input_file),
        str(output_file),
        *(str(extra) for extra in args if extra is not None),
    ]
    _, options, plugin_manager = get_parser_options_plugins(args=cli_args)

    api.check_options(options, plugin_manager)
    pipeline_result = api.run_pipeline(options, plugin_manager=None, api=False)
    return pipeline_result


def run_ocrmypdf(input_file, output_file, *args, text=True):
    """Run ocrmypdf in a child process and let the caller judge the outcome.

    The command is the module-level ``OCRMYPDF`` prefix, then the extra
    arguments, then the input and output paths.

    Args:
        input_file: Input path; converted with ``str()``.
        output_file: Output path; converted with ``str()``.
        *args: Extra command-line arguments placed before the two paths.
            ``None`` entries are dropped; the rest are converted with ``str()``.
        text: Forwarded to ``subprocess.run`` as ``text=``.

    Returns:
        A tuple ``(completed_process, stdout, stderr)``. The exit status is
        not checked here (``check=False``).
    """
    command = [
        *OCRMYPDF,
        *(str(arg) for arg in args if arg is not None),
        str(input_file),
        str(output_file),
    ]

    # The child receives a copy of the current environment.
    completed = run(
        command,
        stdout=PIPE,
        stderr=PIPE,
        text=text,
        env=os.environ.copy(),
        check=False,
    )
    return completed, completed.stdout, completed.stderr


def first_page_dimensions(pdf):
    """Return ``(width_inches, height_inches)`` of the first page of ``pdf``.

    The values are read from element 0 of ``pdfinfo.PdfInfo(pdf)``.
    """
    first_page = pdfinfo.PdfInfo(pdf)[0]
    width, height = first_page.width_inches, first_page.height_inches
    return (width, height)


def pytest_addoption(parser):
    """Register the ``--runslow`` command-line flag on ``parser``.

    The flag is a boolean switch (``store_true``) that defaults to False.
    """
    help_text = (
        "run slow tests only useful for development (unlikely to be "
        "useful for downstream packagers)"
    )
    parser.addoption(
        "--runslow", action="store_true", default=False, help=help_text
    )


def pytest_collection_modifyitems(config, items):
    """Mark tests carrying the ``slow`` keyword as skipped unless ``--runslow``.

    When ``config.getoption("--runslow")`` is truthy nothing is changed.
    Otherwise every item whose ``keywords`` contain ``"slow"`` receives a
    skip marker.
    """
    if not config.getoption("--runslow"):
        # Without --runslow on the command line, slow tests are skipped.
        slow_skip_marker = pytest.mark.skip(reason="need --runslow option to run")
        for slow_item in (entry for entry in items if "slow" in entry.keywords):
            slow_item.add_marker(slow_skip_marker)
Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00			`# © 2017 James R. Barlow: github.com/jbarlow83`
Add license notice to all files Source files to GPL3 Exceptions: -tests/spoof/* to MIT -hocrtransform.py -_unicodefun.py Test resources to CC BY-SA 4.0 except when otherwise noted. Add GPL license. 2018-03-14 14:40:48 -07:00			`#`
Change license of all GPLv3 files to MPL-2.0 https://github.com/jbarlow83/OCRmyPDF/issues/600 2020-08-05 00:44:42 -07:00			`# This Source Code Form is subject to the terms of the Mozilla Public`
			`# License, v. 2.0. If a copy of the MPL was not distributed with this`
			`# file, You can obtain one at http://mozilla.org/MPL/2.0/.`

Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00
			`import os`
			`import platform`
Sort imports with isort 2018-12-30 01:28:15 -08:00			`import sys`
Set up code coverage (it works with multiprocessing now!) 2018-11-02 00:31:50 -07:00			`from pathlib import Path`
Convert most uses of subprocess.Popen to subprocess.run in test suite 2019-03-05 22:25:22 -08:00			`from subprocess import PIPE, run`
Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00
			`import pytest`

Pre-release delinting 2020-06-09 15:27:14 -07:00			`from ocrmypdf import api, pdfinfo`
Rename ocrmypdf.exec -> ocrmypdf._exec 2020-06-09 14:55:54 -07:00			`from ocrmypdf._exec import unpaper`
Move Tesseract specific arguments to plugin 2020-05-16 03:24:31 -07:00			`from ocrmypdf._plugin_manager import get_parser_options_plugins`
Sort imports 2019-12-19 15:29:56 -08:00
Fix some error messages that printed directly to sys.stderr instead of logging 2019-06-05 03:07:48 -07:00			`if sys.version_info < (3, 5):`
			`print("Requires Python 3.5+")`
Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00			`sys.exit(1)`


			`def is_linux():`
			`return platform.system() == 'Linux'`


Improvements to macOS test and work on homebrew tap autobrew Squashed commits: [3f06c1e] Try setting up homebrew tap autobuilding [01532f1] Strict mode error in brew 2017-03-02 22:27:06 -08:00			`def is_macos():`
			`return platform.system() == 'Darwin'`


Move duplicate test code into common namespace 2017-01-26 13:22:01 -08:00			`def running_in_docker():`
Fix running_in_docker() check failing on newer Docker This test has to work to ensure spoof/tesseract_cache.py has a writable directory to put cache into. Otherwise those tests fail. 2017-02-13 02:16:06 -08:00			`# Docker creates a file named /.dockerenv (newer versions) or`
			`# /.dockerinit (older) -- this is undocumented, not an offical test`
Convert many uses of str paths to Path 2020-05-06 02:53:47 -07:00			`return Path('/.dockerenv').exists() or Path('/.dockerinit').exists()`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00

Fix test suite so --clean is not requested when unpaper is not installed 2019-03-05 22:33:13 -08:00			`def have_unpaper():`
			`try:`
			`unpaper.version()`
Delinting 2020-05-03 00:51:17 -07:00			`except Exception: # pylint: disable=broad-except`
Fix test suite so --clean is not requested when unpaper is not installed 2019-03-05 22:33:13 -08:00			`return False`
			`return True`


Convert many uses of str paths to Path 2020-05-06 02:53:47 -07:00			`TESTS_ROOT = Path(__file__).parent.resolve()`
			`PROJECT_ROOT = TESTS_ROOT`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`OCRMYPDF = [sys.executable, '-m', 'ocrmypdf']`


			`@pytest.fixture`
			`def resources():`
			`return Path(TESTS_ROOT) / 'resources'`


			`@pytest.fixture`
			`def ocrmypdf_exec():`
			`return OCRMYPDF`


			`@pytest.fixture(scope="function")`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def outdir(tmp_path):`
			`return tmp_path`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00

			`@pytest.fixture(scope="function")`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def outpdf(tmp_path):`
			`return tmp_path / 'out.pdf'`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00

Implement --output-type=none to skip producing the PDF and use only the sidecar Closes #787 2021-09-26 01:07:34 -07:00			`@pytest.fixture(scope="function")`
			`def outtxt(tmp_path):`
			`return tmp_path / 'out.txt'`


Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`@pytest.fixture(scope="function")`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def no_outpdf(tmp_path):`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`"""This just documents the fact that a test is not expected to produce`
			`output. Unfortunately an assertion failure inside a test fixture produces`
			`an error rather than a test failure, so no testing is done. It's up to`
			`the test to confirm that no output file was created."""`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`return tmp_path / 'no_output.pdf'`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00

Remove _OCRMYPDF_TEST_PATH environment variable 2020-06-09 00:30:13 -07:00			`def check_ocrmypdf(input_file, output_file, *args):`
unpaper-args: add test case and harden feature 2019-02-07 16:21:02 -08:00			`"""Run ocrmypdf and confirmed that a valid file was created"""`
Move Tesseract specific arguments to plugin 2020-05-16 03:24:31 -07:00			`args = [str(input_file), str(output_file)] + [`
			`str(arg) for arg in args if arg is not None`
			`]`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00
Move Tesseract specific arguments to plugin 2020-05-16 03:24:31 -07:00			`_parser, options, plugin_manager = get_parser_options_plugins(args=args)`
Compare requested languages to OCR engine instead of tesseract directly Also refactoring to facilitating validation needing the plugin manager. 2020-05-16 01:50:37 -07:00			`api.check_options(options, plugin_manager)`
			`result = api.run_pipeline(options, plugin_manager=plugin_manager, api=True)`
Change most tests to use ocrmypdf API instead of subprocess The main benefit of this is code coverage gains can actually follow it. Also removes most ugly os.environ hacks. 2019-06-03 01:45:27 -07:00
			`assert result == 0`
Convert many uses of str paths to Path 2020-05-06 02:53:47 -07:00			`assert output_file.exists(), "Output file not created"`
			`assert output_file.stat().st_size > 100, "PDF too small or empty"`
Change most tests to use ocrmypdf API instead of subprocess The main benefit of this is code coverage gains can actually follow it. Also removes most ugly os.environ hacks. 2019-06-03 01:45:27 -07:00
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`return output_file`


Remove _OCRMYPDF_TEST_PATH environment variable 2020-06-09 00:30:13 -07:00			`def run_ocrmypdf_api(input_file, output_file, *args):`
test: environment warnings/cleanup 2019-12-30 22:38:50 -08:00			`"""Run ocrmypdf via API and let caller deal with results`

			`Does not currently have a way to manipulate the PATH except for Tesseract.`
			`"""`
test: Replace many instances of run_ocrmypdf in subprocess with inline 2019-11-28 14:03:18 -08:00
Move Tesseract specific arguments to plugin 2020-05-16 03:24:31 -07:00			`args = [str(input_file), str(output_file)] + [`
			`str(arg) for arg in args if arg is not None`
			`]`
			`_parser, options, plugin_manager = get_parser_options_plugins(args=args)`
test: Replace many instances of run_ocrmypdf in subprocess with inline 2019-11-28 14:03:18 -08:00
Compare requested languages to OCR engine instead of tesseract directly Also refactoring to facilitating validation needing the plugin manager. 2020-05-16 01:50:37 -07:00			`api.check_options(options, plugin_manager)`
Support plugin invocation with API 2020-05-02 03:34:31 -07:00			`return api.run_pipeline(options, plugin_manager=None, api=False)`
test: Replace many instances of run_ocrmypdf in subprocess with inline 2019-11-28 14:03:18 -08:00

Replace most uses of universal_newlines with text The parameters are equivalent but the latter is better named. Since Python 3.6 doesn't support text= we use our wrapper to add it in that place. This is for subprocess.run. 2020-11-07 00:48:08 -08:00			`def run_ocrmypdf(input_file, output_file, *args, text=True):`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`"Run ocrmypdf and let caller deal with results"`

Convert most uses of subprocess.Popen to subprocess.run in test suite 2019-03-05 22:25:22 -08:00			`p_args = (`
			`OCRMYPDF`
			`+ [str(arg) for arg in args if arg is not None]`
			`+ [str(input_file), str(output_file)]`
Reformat with black 2018-12-30 01:27:49 -08:00			`)`
Try to set up subprocess coverage better 2019-12-31 15:39:45 -08:00
Remove _OCRMYPDF_TEST_PATH environment variable 2020-06-09 00:30:13 -07:00			`env = os.environ.copy()`
Convert most uses of subprocess.Popen to subprocess.run in test suite 2019-03-05 22:25:22 -08:00			`p = run(`
Delinting 2020-05-03 00:51:17 -07:00			`p_args,`
			`stdout=PIPE,`
			`stderr=PIPE,`
Remove most Python 3.6 special casing 2021-11-13 00:24:01 -08:00			`text=text,`
Delinting 2020-05-03 00:51:17 -07:00			`env=env,`
			`check=False,`
Convert most uses of subprocess.Popen to subprocess.run in test suite 2019-03-05 22:25:22 -08:00			`)`
			`# print(p.stderr)`
			`return p, p.stdout, p.stderr`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00

			`def first_page_dimensions(pdf):`
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`info = pdfinfo.PdfInfo(pdf)`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00			`page0 = info[0]`
pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`return (page0.width_inches, page0.height_inches)`
Add intensive (optional) rotation test 2018-08-03 00:42:59 -07:00

			`def pytest_addoption(parser):`
			`parser.addoption(`
Reformat with black 2018-12-30 01:27:49 -08:00			`"--runslow",`
			`action="store_true",`
			`default=False,`
			`help=(`
			`"run slow tests only useful for development (unlikely to be "`
			`"useful for downstream packagers)"`
			`),`
Add intensive (optional) rotation test 2018-08-03 00:42:59 -07:00			`)`


			`def pytest_collection_modifyitems(config, items):`
			`if config.getoption("--runslow"):`
			`# --runslow given in cli: do not skip slow tests`
			`return`
			`skip_slow = pytest.mark.skip(reason="need --runslow option to run")`
			`for item in items:`
			`if "slow" in item.keywords:`
			`item.add_marker(skip_slow)`