test: Replace many instances of run_ocrmypdf in subprocess with inline

This commit is contained in:
James R. Barlow 2019-11-28 14:03:18 -08:00
parent a3726e4ce3
commit fde550f9a7
4 changed files with 57 additions and 37 deletions

View File

@ -193,6 +193,22 @@ def check_ocrmypdf(input_file, output_file, *args, env=None):
return output_file
@pytest.helpers.register
def run_ocrmypdf_api(input_file, output_file, *args, env=None):
"Run ocrmypdf and let caller deal with results"
options = cli.parser.parse_args(
[str(input_file), str(output_file)]
+ [str(arg) for arg in args if arg is not None]
)
api.check_options(options)
if env:
options.tesseract_env = env
options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
return api.run_pipeline(options, api=False)
@pytest.helpers.register
def run_ocrmypdf(input_file, output_file, *args, env=None, universal_newlines=True):
"Run ocrmypdf and let caller deal with results"

View File

@ -39,6 +39,7 @@ from ocrmypdf.pdfinfo import Colorspace, Encoding, PdfInfo
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf
run_ocrmypdf_api = pytest.helpers.run_ocrmypdf_api
spoof = pytest.helpers.spoof
@ -197,8 +198,8 @@ def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
def test_repeat_ocr(resources, no_outpdf):
p, _, _ = run_ocrmypdf(resources / 'graph_ocred.pdf', no_outpdf)
assert p.returncode != 0
result = run_ocrmypdf_api(resources / 'graph_ocred.pdf', no_outpdf)
assert result == ExitCode.already_done_ocr
def test_force_ocr(spoof_tesseract_cache, resources, outpdf):
@ -300,34 +301,34 @@ def test_maximum_options(
)
def test_tesseract_missing_tessdata(resources, no_outpdf):
def test_tesseract_missing_tessdata(resources, no_outpdf, tmpdir):
env = os.environ.copy()
env['TESSDATA_PREFIX'] = '/tmp'
env['TESSDATA_PREFIX'] = tmpdir
p, _, err = run_ocrmypdf(
resources / 'graph_ocred.pdf', no_outpdf, '-v', '1', '--skip-text', env=env
returncode = run_ocrmypdf_api(
resources / 'graph.pdf', no_outpdf, '-v', '1', '--skip-text', env=env
)
assert p.returncode == ExitCode.missing_dependency, err
assert returncode == ExitCode.missing_dependency
def test_invalid_input_pdf(resources, no_outpdf):
p, out, err = run_ocrmypdf(resources / 'invalid.pdf', no_outpdf)
assert p.returncode == ExitCode.input_file, err
result = run_ocrmypdf_api(resources / 'invalid.pdf', no_outpdf)
assert result == ExitCode.input_file
def test_blank_input_pdf(resources, outpdf):
p, out, err = run_ocrmypdf(resources / 'blank.pdf', outpdf)
assert p.returncode == ExitCode.ok
result = run_ocrmypdf_api(resources / 'blank.pdf', outpdf)
assert result == ExitCode.ok
def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash, resources, no_outpdf):
# As a correctness test, make sure that --force-ocr on a PDF with no
# content still triggers tesseract. If tesseract crashes, then it was
# called.
p, _, err = run_ocrmypdf(
result = run_ocrmypdf_api(
resources / 'blank.pdf', no_outpdf, '--force-ocr', env=spoof_tesseract_crash
)
assert p.returncode == ExitCode.child_process_error, err
assert result == ExitCode.child_process_error
assert not os.path.exists(no_outpdf)
@ -340,7 +341,7 @@ def test_german(spoof_tesseract_cache, resources, outdir):
# properly. It is fine that we are testing -l deu on a French file because
# we are exercising the functionality not going for accuracy.
sidecar = outdir / 'francais.txt'
p, out, err = run_ocrmypdf(
result = run_ocrmypdf_api(
resources / 'francais.pdf',
outdir / 'francais.pdf',
'-l',
@ -351,16 +352,16 @@ def test_german(spoof_tesseract_cache, resources, outdir):
)
if 'deu' not in tesseract.languages():
pytest.xfail(reason="tesseract-deu language pack not installed")
assert p.returncode == ExitCode.ok, "Requires tesseract deu language pack"
assert result == ExitCode.ok, "Requires tesseract deu language pack"
def test_klingon(resources, outpdf):
p, out, err = run_ocrmypdf(resources / 'francais.pdf', outpdf, '-l', 'klz')
p, _, _ = run_ocrmypdf(resources / 'francais.pdf', outpdf, '-l', 'klz')
assert p.returncode == ExitCode.missing_dependency
def test_missing_docinfo(spoof_tesseract_noop, resources, outpdf):
p, out, err = run_ocrmypdf(
result = run_ocrmypdf_api(
resources / 'missing_docinfo.pdf',
outpdf,
'-l',
@ -368,7 +369,7 @@ def test_missing_docinfo(spoof_tesseract_noop, resources, outpdf):
'--skip-text',
env=spoof_tesseract_noop,
)
assert p.returncode == ExitCode.ok, err
assert result == ExitCode.ok
def test_uppercase_extension(spoof_tesseract_noop, resources, outdir):
@ -379,24 +380,24 @@ def test_uppercase_extension(spoof_tesseract_noop, resources, outdir):
)
def test_input_file_not_found(no_outpdf):
def test_input_file_not_found(caplog, no_outpdf):
input_file = "does not exist.pdf"
p, out, err = run_ocrmypdf(input_file, no_outpdf)
assert p.returncode == ExitCode.input_file
assert input_file in out or input_file in err
result = run_ocrmypdf_api(input_file, no_outpdf)
assert result == ExitCode.input_file
assert input_file in caplog.text
def test_input_file_not_a_pdf(no_outpdf):
def test_input_file_not_a_pdf(caplog, no_outpdf):
input_file = __file__ # Try to OCR this file
p, out, err = run_ocrmypdf(input_file, no_outpdf)
assert p.returncode == ExitCode.input_file
assert input_file in out or input_file in err
result = run_ocrmypdf_api(input_file, no_outpdf)
assert result == ExitCode.input_file
assert input_file in caplog.text
def test_encrypted(resources, no_outpdf):
p, out, err = run_ocrmypdf(resources / 'skew-encrypted.pdf', no_outpdf)
assert p.returncode == ExitCode.encrypted_pdf
assert out.find('encrypted')
def test_encrypted(resources, caplog, no_outpdf):
result = run_ocrmypdf_api(resources / 'skew-encrypted.pdf', no_outpdf)
assert result == ExitCode.encrypted_pdf
assert 'encryption must be removed' in caplog.text
@pytest.mark.parametrize('renderer', RENDERERS)
@ -415,8 +416,8 @@ def test_pagesegmode(renderer, spoof_tesseract_cache, resources, outpdf):
@pytest.mark.parametrize('renderer', RENDERERS)
def test_tesseract_crash(renderer, spoof_tesseract_crash, resources, no_outpdf):
p, out, err = run_ocrmypdf(
def test_tesseract_crash(renderer, spoof_tesseract_crash, resources, no_outpdf, caplog):
result = run_ocrmypdf_api(
resources / 'ccitt.pdf',
no_outpdf,
'-v',
@ -425,9 +426,9 @@ def test_tesseract_crash(renderer, spoof_tesseract_crash, resources, no_outpdf):
renderer,
env=spoof_tesseract_crash,
)
assert p.returncode == ExitCode.child_process_error
assert result == ExitCode.child_process_error
assert not os.path.exists(no_outpdf)
assert "ERROR" in err
assert "SubprocessOutputError" in caplog.text
def test_tesseract_crash_autorotate(spoof_tesseract_crash, resources, no_outpdf):

View File

@ -29,6 +29,7 @@ from ocrmypdf.exec import qpdf
# pylint: disable=no-member,redefined-outer-name
run_ocrmypdf = pytest.helpers.run_ocrmypdf
run_ocrmypdf_api = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof

View File

@ -24,6 +24,7 @@ from ocrmypdf.pdfinfo import PdfInfo
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf
run_ocrmypdf_api = pytest.helpers.run_ocrmypdf_api
spoof = pytest.helpers.spoof
@ -32,9 +33,10 @@ def poster(resources):
return resources / 'poster.pdf'
def test_userunit_ghostscript_fails(poster, no_outpdf):
p, out, err = run_ocrmypdf(poster, no_outpdf, '--output-type=pdfa')
assert p.returncode == ExitCode.input_file
def test_userunit_ghostscript_fails(poster, no_outpdf, caplog):
result = run_ocrmypdf_api(poster, no_outpdf, '--output-type=pdfa')
assert result == ExitCode.input_file
assert 'not supported by Ghostscript' in caplog.text
def test_userunit_qpdf_passes(spoof_tesseract_cache, poster, outpdf):