OCRmyPDF/tests/test_stdio.py

# © 2019 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
from pathlib import Path
from subprocess import DEVNULL, PIPE, run, Popen

import pytest

from ocrmypdf.exceptions import ExitCode
from ocrmypdf.exec import qpdf

# pytest.helpers is dynamic
# pylint: disable=no-member,redefined-outer-name

run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof


@pytest.fixture(scope='session')
def spoof_tess_bad_utf8(tmp_path_factory):
    return spoof(tmp_path_factory, tesseract='tesseract_badutf8.py')


def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    # Runs: ocrmypdf - output.pdf < testfile.pdf
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + ['-', output_file]
        p = run(
            p_args,
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            env=spoof_tesseract_noop,
        )
        assert p.returncode == ExitCode.ok


def test_stdout(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    # Runs: ocrmypdf francais.pdf - > test_stdout.pdf
    with open(output_file, 'wb') as output_stream:
        p_args = ocrmypdf_exec + [input_file, '-']
        p = run(
            p_args,
            stdout=output_stream,
            stderr=PIPE,
            stdin=DEVNULL,
            env=spoof_tesseract_noop,
        )
        assert p.returncode == ExitCode.ok

    assert qpdf.check(output_file, log=None)


@pytest.mark.skipif(
    sys.version_info[0:3] >= (3, 6, 4), reason="issue fixed in Python 3.6.4"
)
def test_closed_streams(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    def evil_closer():
        os.close(0)
        os.close(1)

    p_args = ocrmypdf_exec + [input_file, output_file]
    p = Popen(  # pylint: disable=subprocess-popen-preexec-fn
        p_args,
        close_fds=True,
        stdout=None,
        stderr=PIPE,
        stdin=None,
        env=spoof_tesseract_noop,
        preexec_fn=evil_closer,
    )
    out, err = p.communicate()
    print(err.decode())
    assert p.returncode == ExitCode.ok


@pytest.mark.skipif(sys.version_info >= (3, 7, 0), reason='better utf-8')
@pytest.mark.skipif(
    Path('/etc/alpine-release').exists(), reason="invalid test on alpine"
)
def test_bad_locale():
    env = os.environ.copy()
    env['LC_ALL'] = 'C'

    p, out, err = run_ocrmypdf('a', 'b', env=env)
    assert out == '', "stdout not clean"
    assert p.returncode != 0
    assert 'configured to use ASCII as encoding' in err, "should whine"


@pytest.mark.parametrize('renderer', ['hocr', 'sandwich'])
def test_bad_utf8(spoof_tess_bad_utf8, renderer, resources, no_outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'ccitt.pdf',
        no_outpdf,
        '--pdf-renderer',
        renderer,
        env=spoof_tess_bad_utf8,
    )

    assert out == '', "stdout not clean"
    assert p.returncode != 0
    assert 'not utf-8' in err, "should whine about utf-8"
    assert '\\x96' in err, 'should repeat backslash encoded output'


def test_dev_null(spoof_tesseract_noop, resources):
    p, out, err = run_ocrmypdf(
        resources / 'trivial.pdf', os.devnull, '--force-ocr', env=spoof_tesseract_noop
    )
    assert p.returncode == 0, "could not send output to /dev/null"
    assert len(out) == 0, "wrote to stdout"
tests: split out stdin/stdout tests 2019-08-09 01:23:49 -07:00			`# © 2019 James R. Barlow: github.com/jbarlow83`
			`#`
			`# This file is part of OCRmyPDF.`
			`#`
			`# OCRmyPDF is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# OCRmyPDF is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.`

			`import os`
			`import sys`
			`from pathlib import Path`
			`from subprocess import DEVNULL, PIPE, run, Popen`

			`import pytest`

			`from ocrmypdf.exceptions import ExitCode`
			`from ocrmypdf.exec import qpdf`

			`# pytest.helpers is dynamic`
			`# pylint: disable=no-member,redefined-outer-name`

			`run_ocrmypdf = pytest.helpers.run_ocrmypdf`
			`spoof = pytest.helpers.spoof`


			`@pytest.fixture(scope='session')`
			`def spoof_tess_bad_utf8(tmp_path_factory):`
			`return spoof(tmp_path_factory, tesseract='tesseract_badutf8.py')`


			`def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):`
			`input_file = str(resources / 'francais.pdf')`
			`output_file = str(outpdf)`

			`# Runs: ocrmypdf - output.pdf < testfile.pdf`
			`with open(input_file, 'rb') as input_stream:`
			`p_args = ocrmypdf_exec + ['-', output_file]`
			`p = run(`
			`p_args,`
			`stdout=PIPE,`
			`stderr=PIPE,`
			`stdin=input_stream,`
			`env=spoof_tesseract_noop,`
			`)`
			`assert p.returncode == ExitCode.ok`


			`def test_stdout(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):`
			`input_file = str(resources / 'francais.pdf')`
			`output_file = str(outpdf)`

			`# Runs: ocrmypdf francais.pdf - > test_stdout.pdf`
			`with open(output_file, 'wb') as output_stream:`
			`p_args = ocrmypdf_exec + [input_file, '-']`
			`p = run(`
			`p_args,`
			`stdout=output_stream,`
			`stderr=PIPE,`
			`stdin=DEVNULL,`
			`env=spoof_tesseract_noop,`
			`)`
			`assert p.returncode == ExitCode.ok`

			`assert qpdf.check(output_file, log=None)`


			`@pytest.mark.skipif(`
			`sys.version_info[0:3] >= (3, 6, 4), reason="issue fixed in Python 3.6.4"`
			`)`
			`def test_closed_streams(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):`
			`input_file = str(resources / 'francais.pdf')`
			`output_file = str(outpdf)`

			`def evil_closer():`
			`os.close(0)`
			`os.close(1)`

			`p_args = ocrmypdf_exec + [input_file, output_file]`
			`p = Popen( # pylint: disable=subprocess-popen-preexec-fn`
			`p_args,`
			`close_fds=True,`
			`stdout=None,`
			`stderr=PIPE,`
			`stdin=None,`
			`env=spoof_tesseract_noop,`
			`preexec_fn=evil_closer,`
			`)`
			`out, err = p.communicate()`
			`print(err.decode())`
			`assert p.returncode == ExitCode.ok`


			`@pytest.mark.skipif(sys.version_info >= (3, 7, 0), reason='better utf-8')`
			`@pytest.mark.skipif(`
			`Path('/etc/alpine-release').exists(), reason="invalid test on alpine"`
			`)`
			`def test_bad_locale():`
			`env = os.environ.copy()`
			`env['LC_ALL'] = 'C'`

			`p, out, err = run_ocrmypdf('a', 'b', env=env)`
			`assert out == '', "stdout not clean"`
			`assert p.returncode != 0`
			`assert 'configured to use ASCII as encoding' in err, "should whine"`


			`@pytest.mark.parametrize('renderer', ['hocr', 'sandwich'])`
			`def test_bad_utf8(spoof_tess_bad_utf8, renderer, resources, no_outpdf):`
			`p, out, err = run_ocrmypdf(`
			`resources / 'ccitt.pdf',`
			`no_outpdf,`
			`'--pdf-renderer',`
			`renderer,`
			`env=spoof_tess_bad_utf8,`
			`)`

			`assert out == '', "stdout not clean"`
			`assert p.returncode != 0`
			`assert 'not utf-8' in err, "should whine about utf-8"`
			`assert '\\x96' in err, 'should repeat backslash encoded output'`


			`def test_dev_null(spoof_tesseract_noop, resources):`
			`p, out, err = run_ocrmypdf(`
			`resources / 'trivial.pdf', os.devnull, '--force-ocr', env=spoof_tesseract_noop`
			`)`
			`assert p.returncode == 0, "could not send output to /dev/null"`
			`assert len(out) == 0, "wrote to stdout"`