tests: split out stdin/stdout tests

This commit is contained in:
James R. Barlow 2019-08-09 01:23:49 -07:00
parent 7bfcd0a9d5
commit a1a7b973e9
2 changed files with 139 additions and 108 deletions

View File

@ -1,4 +1,4 @@
# © 2015-17 James R. Barlow: github.com/jbarlow83
# © 2015-19 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
@ -18,10 +18,9 @@
import logging
import os
import shutil
import sys
from math import isclose
from pathlib import Path
from subprocess import DEVNULL, PIPE, run, Popen
from subprocess import PIPE, run
import PIL
import pytest
@ -84,11 +83,6 @@ def spoof_no_tess_gs_raster_fail(tmp_path_factory):
)
@pytest.fixture(scope='session')
def spoof_tess_bad_utf8(tmp_path_factory):
    """Session-scoped fixture: ``spoof`` with ``tesseract_badutf8.py`` as tesseract.

    The value returned by ``spoof`` is handed to tests, which pass it on as
    the ``env`` of the ocrmypdf run.
    """
    bad_utf8_env = spoof(tmp_path_factory, tesseract='tesseract_badutf8.py')
    return bad_utf8_env
def test_quick(spoof_tesseract_cache, resources, outpdf):
    """Run ``check_ocrmypdf`` on ``ccitt.pdf`` using the cached tesseract spoof."""
    source_pdf = resources / 'ccitt.pdf'
    check_ocrmypdf(source_pdf, outpdf, env=spoof_tesseract_cache)
@ -550,68 +544,6 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
assert out_pageinfo[0].images[0].enc == Encoding.jbig2
def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    """Check that ocrmypdf can read its input PDF from standard input.

    Equivalent to running ``ocrmypdf - output.pdf < francais.pdf``.
    """
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    with open(input_file, 'rb') as input_stream:
        p = run(
            ocrmypdf_exec + ['-', output_file],
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            env=spoof_tesseract_noop,
        )

    # stderr was captured by the pipe; echo it so that pytest shows the
    # child's diagnostics when the assertion below fails.
    print(p.stderr.decode(errors='replace'))
    assert p.returncode == ExitCode.ok
def test_stdout(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    """Check that ocrmypdf can write its output PDF to standard output.

    Equivalent to running ``ocrmypdf francais.pdf - > test_stdout.pdf``; the
    redirected file is then validated with ``qpdf.check``.
    """
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    with open(output_file, 'wb') as output_stream:
        p = run(
            ocrmypdf_exec + [input_file, '-'],
            stdout=output_stream,
            stderr=PIPE,
            stdin=DEVNULL,
            env=spoof_tesseract_noop,
        )

    # stderr was captured by the pipe; echo it so that pytest shows the
    # child's diagnostics when an assertion below fails.
    print(p.stderr.decode(errors='replace'))
    assert p.returncode == ExitCode.ok
    # The output file is closed by now, so qpdf inspects the complete PDF.
    assert qpdf.check(output_file, log=None)
@pytest.mark.skipif(
    sys.version_info[:3] >= (3, 6, 4), reason="issue fixed in Python 3.6.4"
)
def test_closed_streams(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    """Run ocrmypdf in a child whose fds 0 and 1 are closed before exec.

    The child's stderr is captured and printed; the run must still exit
    with ``ExitCode.ok``.
    """

    def evil_closer():
        # Runs in the child just before exec: drop stdin (0) and stdout (1).
        for fd in (0, 1):
            os.close(fd)

    command = ocrmypdf_exec + [str(resources / 'francais.pdf'), str(outpdf)]
    p = Popen(  # pylint: disable=subprocess-popen-preexec-fn
        command,
        close_fds=True,
        stdout=None,
        stderr=PIPE,
        stdin=None,
        env=spoof_tesseract_noop,
        preexec_fn=evil_closer,
    )
    _, captured_err = p.communicate()
    print(captured_err.decode())
    assert p.returncode == ExitCode.ok
def test_masks(spoof_tesseract_noop, resources, outpdf):
assert (
ocrmypdf.ocr(
@ -993,36 +925,6 @@ def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
assert pdfa_info['conformance'] == f'PDF/A-{pdfa_level}B'
@pytest.mark.skipif(sys.version_info >= (3, 7, 0), reason='better utf-8')
@pytest.mark.skipif(
    Path('/etc/alpine-release').exists(), reason="invalid test on alpine"
)
def test_bad_locale():
    """Run ocrmypdf with ``LC_ALL=C`` and expect a failure that names the ASCII encoding."""
    # Copy of the current environment with only the locale overridden.
    env = dict(os.environ, LC_ALL='C')

    p, out, err = run_ocrmypdf('a', 'b', env=env)

    assert out == '', "stdout not clean"
    assert p.returncode != 0
    assert 'configured to use ASCII as encoding' in err, "should whine"
@pytest.mark.parametrize('renderer', RENDERERS)
def test_bad_utf8(spoof_tess_bad_utf8, renderer, resources, no_outpdf):
    """Run ocrmypdf against the bad-UTF-8 tesseract spoof for each renderer.

    The run must fail, leave stdout empty, and report the offending bytes
    in backslash-escaped form on stderr.
    """
    cli_args = [resources / 'ccitt.pdf', no_outpdf, '--pdf-renderer', renderer]
    p, out, err = run_ocrmypdf(*cli_args, env=spoof_tess_bad_utf8)

    assert out == '', "stdout not clean"
    assert p.returncode != 0
    assert 'not utf-8' in err, "should whine about utf-8"
    assert '\\x96' in err, 'should repeat backslash encoded output'
@pytest.mark.skipif(
PIL.__version__ < '5.0.0', reason="Pillow < 5.0.0 doesn't raise the exception"
)
@ -1051,14 +953,6 @@ def test_text_curves(spoof_tesseract_noop, resources, outpdf):
assert len(info.pages[0].images) != 0, "force did not rasterize"
def test_dev_null(spoof_tesseract_noop, resources):
    """Send the output of a ``--force-ocr`` run to ``os.devnull`` and expect success."""
    source_pdf = resources / 'trivial.pdf'
    p, out, _ = run_ocrmypdf(
        source_pdf, os.devnull, '--force-ocr', env=spoof_tesseract_noop
    )

    assert p.returncode == 0, "could not send output to /dev/null"
    assert len(out) == 0, "wrote to stdout"
def test_output_is_dir(spoof_tesseract_noop, resources, outdir):
p, out, err = run_ocrmypdf(
resources / 'trivial.pdf', outdir, '--force-ocr', env=spoof_tesseract_noop

137
tests/test_stdio.py Normal file
View File

@ -0,0 +1,137 @@
# © 2019 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
from pathlib import Path
from subprocess import DEVNULL, PIPE, run, Popen
import pytest
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.exec import qpdf
# pytest.helpers is populated dynamically, so pylint cannot see its members;
# fixture names are also reused as test parameters throughout this module.
# pylint: disable=no-member,redefined-outer-name
# Module-level aliases for the shared helpers used by the tests below.
run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof
@pytest.fixture(scope='session')
def spoof_tess_bad_utf8(tmp_path_factory):
    """Session-scoped fixture: ``spoof`` with ``tesseract_badutf8.py`` as tesseract.

    The value returned by ``spoof`` is handed to tests, which pass it on as
    the ``env`` of the ocrmypdf run.
    """
    bad_utf8_env = spoof(tmp_path_factory, tesseract='tesseract_badutf8.py')
    return bad_utf8_env
def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    """Check that ocrmypdf can read its input PDF from standard input.

    Equivalent to running ``ocrmypdf - output.pdf < francais.pdf``.
    """
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    with open(input_file, 'rb') as input_stream:
        p = run(
            ocrmypdf_exec + ['-', output_file],
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            env=spoof_tesseract_noop,
        )

    # stderr was captured by the pipe; echo it so that pytest shows the
    # child's diagnostics when the assertion below fails.
    print(p.stderr.decode(errors='replace'))
    assert p.returncode == ExitCode.ok
def test_stdout(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    """Check that ocrmypdf can write its output PDF to standard output.

    Equivalent to running ``ocrmypdf francais.pdf - > test_stdout.pdf``; the
    redirected file is then validated with ``qpdf.check``.
    """
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    with open(output_file, 'wb') as output_stream:
        p = run(
            ocrmypdf_exec + [input_file, '-'],
            stdout=output_stream,
            stderr=PIPE,
            stdin=DEVNULL,
            env=spoof_tesseract_noop,
        )

    # stderr was captured by the pipe; echo it so that pytest shows the
    # child's diagnostics when an assertion below fails.
    print(p.stderr.decode(errors='replace'))
    assert p.returncode == ExitCode.ok
    # The output file is closed by now, so qpdf inspects the complete PDF.
    assert qpdf.check(output_file, log=None)
@pytest.mark.skipif(
    sys.version_info[:3] >= (3, 6, 4), reason="issue fixed in Python 3.6.4"
)
def test_closed_streams(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    """Run ocrmypdf in a child whose fds 0 and 1 are closed before exec.

    The child's stderr is captured and printed; the run must still exit
    with ``ExitCode.ok``.
    """

    def evil_closer():
        # Runs in the child just before exec: drop stdin (0) and stdout (1).
        for fd in (0, 1):
            os.close(fd)

    command = ocrmypdf_exec + [str(resources / 'francais.pdf'), str(outpdf)]
    p = Popen(  # pylint: disable=subprocess-popen-preexec-fn
        command,
        close_fds=True,
        stdout=None,
        stderr=PIPE,
        stdin=None,
        env=spoof_tesseract_noop,
        preexec_fn=evil_closer,
    )
    _, captured_err = p.communicate()
    print(captured_err.decode())
    assert p.returncode == ExitCode.ok
@pytest.mark.skipif(sys.version_info >= (3, 7, 0), reason='better utf-8')
@pytest.mark.skipif(
    Path('/etc/alpine-release').exists(), reason="invalid test on alpine"
)
def test_bad_locale():
    """Run ocrmypdf with ``LC_ALL=C`` and expect a failure that names the ASCII encoding."""
    # Copy of the current environment with only the locale overridden.
    env = dict(os.environ, LC_ALL='C')

    p, out, err = run_ocrmypdf('a', 'b', env=env)

    assert out == '', "stdout not clean"
    assert p.returncode != 0
    assert 'configured to use ASCII as encoding' in err, "should whine"
@pytest.mark.parametrize('renderer', ['hocr', 'sandwich'])
def test_bad_utf8(spoof_tess_bad_utf8, renderer, resources, no_outpdf):
    """Run ocrmypdf against the bad-UTF-8 tesseract spoof for each renderer.

    The run must fail, leave stdout empty, and report the offending bytes
    in backslash-escaped form on stderr.
    """
    cli_args = [resources / 'ccitt.pdf', no_outpdf, '--pdf-renderer', renderer]
    p, out, err = run_ocrmypdf(*cli_args, env=spoof_tess_bad_utf8)

    assert out == '', "stdout not clean"
    assert p.returncode != 0
    assert 'not utf-8' in err, "should whine about utf-8"
    assert '\\x96' in err, 'should repeat backslash encoded output'
def test_dev_null(spoof_tesseract_noop, resources):
    """Send the output of a ``--force-ocr`` run to ``os.devnull`` and expect success."""
    source_pdf = resources / 'trivial.pdf'
    p, out, _ = run_ocrmypdf(
        source_pdf, os.devnull, '--force-ocr', env=spoof_tesseract_noop
    )

    assert p.returncode == 0, "could not send output to /dev/null"
    assert len(out) == 0, "wrote to stdout"