OCRmyPDF/tests/test_main.py
James R. Barlow bd534c3313 main.py -> __main__.py
Executing a package with python -m packagename will check for
__main__.py inside the package.  In other words main.py should have
always been named __main__.py.

In the unlikely event that someone depends on "import ocrmypdf.main"
being meaningful, main.py continues to exist and replicates the
behavior of __main__.  (It's unlikely because import ocrmypdf.main does
unpythonic ruffus-related things at import time, essentially
configuring itself to work with sys.argv.  To fix another day.)

This should solve the problem of Debian needing to run test suites
before installation and afterwards for continuous integration without
having to patch either file, as python -m ocrmypdf will follow import
order.  That is, if the current directory contains "ocrmypdf/" (e.g.
staging a new version) then that will be tested, else sys.path will
be checked.
2016-08-31 17:01:42 -07:00

639 lines
20 KiB
Python

#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
from __future__ import print_function
from subprocess import Popen, PIPE, check_output, check_call, DEVNULL
import os
import shutil
from contextlib import suppress
import sys
import pytest
from ocrmypdf.pageinfo import pdf_get_all_pageinfo
import PyPDF2 as pypdf
from ocrmypdf import ExitCode
from ocrmypdf import leptonica
from ocrmypdf.pdfa import file_claims_pdfa
import platform
# Fail fast with a clear message on unsupported interpreters.  The message
# promises 3.4+, so compare the (major, minor) pair instead of only the
# major version.
if sys.version_info < (3, 4):
    print("Requires Python 3.4+")
    sys.exit(1)
# Directory containing this test module, and the spoof programs beneath it
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
SPOOF_PATH = os.path.join(TESTS_ROOT, 'spoof')

# The project root is the parent of the tests directory
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')

# Output location; the OCRMYPDF_TEST_OUTPUT environment variable overrides it
TEST_OUTPUT = os.environ.get(
    'OCRMYPDF_TEST_OUTPUT',
    os.path.join(PROJECT_ROOT, 'tests', 'output', 'main'))

# Command prefix that runs the ocrmypdf package with the current interpreter
OCRMYPDF = [sys.executable, '-m', 'ocrmypdf']
def running_in_docker():
    """Return True when the tests appear to be running inside Docker.

    Older Docker releases create a file named /.dockerinit in the container
    root; newer releases create /.dockerenv instead, so either marker is
    accepted.
    """
    return any(os.path.exists(marker)
               for marker in ('/.dockerinit', '/.dockerenv'))
def is_linux():
    """Report whether platform.system() identifies the host as Linux."""
    system_name = platform.system()
    return system_name == 'Linux'
def setup_module():
    """Give this module's tests a fresh output directory."""
    # Remove whatever an earlier run left behind, if anything
    try:
        shutil.rmtree(TEST_OUTPUT)
    except FileNotFoundError:
        pass

    # Recreate the directory; tolerate it already being present
    try:
        os.makedirs(TEST_OUTPUT)
    except FileExistsError:
        pass
def _infile(input_basename):
    """Return the path of *input_basename* joined onto TEST_RESOURCES."""
    resource_path = os.path.join(TEST_RESOURCES, input_basename)
    return resource_path
def _outfile(output_basename):
    """Return a path in TEST_OUTPUT using only the file name component."""
    file_name = os.path.basename(output_basename)
    return os.path.join(TEST_OUTPUT, file_name)
def check_ocrmypdf(input_basename, output_basename, *args, env=None):
    """Run ocrmypdf and confirm that a valid output file was created.

    All arguments are forwarded to run_ocrmypdf().  When the process exits
    with a nonzero status, its captured stdout and stderr are printed before
    the assertion fails.

    Returns the path of the output file.
    """
    output_file = _outfile(output_basename)

    p, out, err = run_ocrmypdf(input_basename, output_basename, *args, env=env)
    if p.returncode != 0:
        print('stdout\n======')
        print(out)
        print('stderr\n======')
        print(err)
    assert p.returncode == 0
    assert os.path.exists(output_file), "Output file not created"
    assert os.stat(output_file).st_size > 100, "PDF too small or empty"
    return output_file
def run_ocrmypdf(input_basename, output_basename, *args, env=None):
    """Run ocrmypdf as a subprocess and leave the verdict to the caller.

    Extra positional *args* are placed before the input and output paths on
    the command line.  Returns a (process, stdout, stderr) tuple with the
    output captured as text.
    """
    command = OCRMYPDF + list(args)
    command.append(_infile(input_basename))
    command.append(_outfile(output_basename))

    # Without an explicit environment the child uses the current one
    child_env = os.environ if env is None else env

    p = Popen(
        command,
        close_fds=True,
        stdout=PIPE,
        stderr=PIPE,
        universal_newlines=True,
        env=child_env)
    out, err = p.communicate()
    return p, out, err
def spoof(replace_program, with_spoof):
    """Return an environment copy that overrides one subprocess executable

    Before running any executable, ocrmypdf checks the environment variable
    OCRMYPDF_PROGRAMNAME to override default program name/location, e.g.
    OCRMYPDF_GS redirects from the system path Ghostscript ("gs") to elsewhere.

    The spoof program is looked up in SPOOF_PATH and made executable if it
    is not already.
    """
    spoofer = os.path.join(SPOOF_PATH, with_spoof)
    already_executable = os.access(spoofer, os.X_OK)
    if not already_executable:
        os.chmod(spoofer, 0o755)

    variable_name = 'OCRMYPDF_' + replace_program.upper()
    env = os.environ.copy()
    env[variable_name] = spoofer
    return env
@pytest.fixture
def spoof_tesseract_noop():
    """Environment in which tesseract is replaced by tesseract_noop.py."""
    env = spoof('tesseract', 'tesseract_noop.py')
    return env
@pytest.fixture
def spoof_tesseract_cache():
    """Environment using tesseract_cache.py, except when inside Docker."""
    if not running_in_docker():
        return spoof('tesseract', "tesseract_cache.py")
    # Inside Docker the environment is passed through without a spoof
    return os.environ.copy()
@pytest.fixture
def spoof_tesseract_crash():
    """Environment in which tesseract is replaced by tesseract_crash.py."""
    env = spoof('tesseract', 'tesseract_crash.py')
    return env
@pytest.fixture
def spoof_tesseract_big_image_error():
    """Environment replacing tesseract with tesseract_big_image_error.py."""
    env = spoof('tesseract', 'tesseract_big_image_error.py')
    return env
def test_quick(spoof_tesseract_noop):
    """Run ocrmypdf on c02-22.pdf with no extra options."""
    env = spoof_tesseract_noop
    check_ocrmypdf('c02-22.pdf', 'test_quick.pdf', env=env)
def test_deskew(spoof_tesseract_noop):
    """Deskew skew.pdf, then measure the residual skew of the output."""
    # Run with deskew
    options = ['-d', '-v', '1']
    deskewed_pdf = check_ocrmypdf(
        'skew.pdf', 'test_deskew.pdf', *options, env=spoof_tesseract_noop)

    # Now render as an image again and use Leptonica to find the skew angle
    # to confirm that it was deskewed
    from ocrmypdf.ghostscript import rasterize_pdf
    import logging
    log = logging.getLogger()

    deskewed_png = _outfile('deskewed.png')
    rasterize_pdf(
        deskewed_pdf, deskewed_png,
        xres=150, yres=150,
        raster_device='pngmono', log=log)

    from ocrmypdf.leptonica import Pix
    angle, _confidence = Pix.read(deskewed_png).find_skew()
    print(angle)
    assert abs(angle) < 0.5, "Deskewing failed"
def test_clean(spoof_tesseract_noop):
    """Run ocrmypdf on skew.pdf with the -c option."""
    env = spoof_tesseract_noop
    check_ocrmypdf('skew.pdf', 'test_clean.pdf', '-c', env=env)
# The three parametrizations multiply out to 5 * 2 * 2 = 20 test cases
@pytest.mark.parametrize(
    "pdf",
    ['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf'])
@pytest.mark.parametrize("renderer", ['hocr', 'tesseract'])
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type):
    """Process each sample PDF with every renderer/output-type pairing."""
    output_name = 'test_{0}_{1}.pdf'.format(pdf, renderer)
    options = [
        '-dc',
        '-v', '1',
        '--output-type', output_type,
        '--pdf-renderer', renderer,
    ]
    check_ocrmypdf(pdf, output_name, *options, env=spoof_tesseract_cache)
@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
])
def test_preserve_metadata(spoof_tesseract_noop, output_type):
    """Title and author of graph.pdf must carry over to the output file."""
    pdf_before = pypdf.PdfFileReader(_infile('graph.pdf'))

    options = ['--output-type', output_type]
    output = check_ocrmypdf(
        'graph.pdf', 'test_metadata_preserve.pdf', *options,
        env=spoof_tesseract_noop)

    pdf_after = pypdf.PdfFileReader(output)

    preserved_keys = ('/Title', '/Author')
    for key in preserved_keys:
        assert pdf_before.documentInfo[key] == pdf_after.documentInfo[key]

    # The output must also claim the requested output type
    claimed = file_claims_pdfa(output)
    assert claimed['output'] == output_type
@pytest.mark.skipif(
    is_linux() and not running_in_docker(),
    reason="likely to fail if Linux locale is not configured correctly")
@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
])
def test_override_metadata(spoof_tesseract_noop, output_type):
    """Check that --title, --author and --subject set the output metadata.

    The values contain non-ASCII text and are read back from the output file
    with the external ``pdfinfo`` program.
    """
    output_basename = 'test_override_metadata.pdf'
    output_file = _outfile(output_basename)

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
    high_unicode = 'U+1030C is: 𐌌'

    # run_ocrmypdf() resolves basenames against the resource and output
    # directories itself, so hand it basenames rather than full paths.
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', output_basename,
        '--title', german,
        '--author', chinese,
        '--subject', high_unicode,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

    out_pdfinfo = check_output(
        ['pdfinfo', output_file], universal_newlines=True)

    pdfinfo = {}
    for line in out_pdfinfo.splitlines():
        # partition() copes with lines that have no "key: value" shape,
        # where unpacking the result of split() would raise ValueError
        key, separator, value = line.partition(':')
        if not separator:
            continue
        pdfinfo[key.strip()] = value.strip()

    assert pdfinfo['Title'] == german
    assert pdfinfo['Author'] == chinese
    assert pdfinfo['Subject'] == high_unicode
    assert pdfinfo.get('Keywords', '') == ''

    pdfa_info = file_claims_pdfa(output_file)
    assert pdfa_info['output'] == output_type
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
])
def test_oversample(spoof_tesseract_cache, renderer):
    """With --oversample 350 the first page must report about 350 xres."""
    output_name = 'test_oversample_%s.pdf' % renderer
    options = ['--oversample', '350', '-f', '--pdf-renderer', renderer]
    oversampled_pdf = check_ocrmypdf(
        'skew.pdf', output_name, *options, env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(oversampled_pdf)[0]
    print(first_page['xres'])
    assert abs(first_page['xres'] - 350) < 1
def test_repeat_ocr():
    """Running on graph_ocred.pdf without extra options must fail."""
    result = run_ocrmypdf('graph_ocred.pdf', 'wontwork.pdf')
    process = result[0]
    assert process.returncode != 0
def test_force_ocr(spoof_tesseract_cache):
    """With -f, graph_ocred.pdf is processed and the output has text."""
    env = spoof_tesseract_cache
    out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f', env=env)
    first_page = pdf_get_all_pageinfo(out)[0]
    assert first_page['has_text']
def test_skip_ocr(spoof_tesseract_cache):
    """With -s, graph_ocred.pdf is processed successfully."""
    env = spoof_tesseract_cache
    check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s', env=env)
def test_argsfile(spoof_tesseract_noop):
    """Pass options through an @-prefixed arguments file."""
    argsfile_path = _outfile('test_argsfile.txt')
    # One argument per line, each line newline-terminated
    arguments = ['--title', 'ArgsFile Test', '--author', 'Test Cases']
    with open(argsfile_path, 'w') as argsfile:
        argsfile.write('\n'.join(arguments) + '\n')

    check_ocrmypdf(
        'graph.pdf', 'test_argsfile.pdf',
        '@' + _outfile('test_argsfile.txt'),
        env=spoof_tesseract_noop)
def check_monochrome_correlation(
        reference_pdf, reference_pageno, test_pdf, test_pageno):
    """Return the binary correlation between one page of each PDF.

    Both pages are rasterized to monochrome PNG files in the test output
    directory.  A PNG that already exists is not rendered again; its path
    is printed instead.
    """
    import ocrmypdf.ghostscript as ghostscript
    import logging

    gslog = logging.getLogger()

    reference_name = '{}.ref{:04d}.png'.format(reference_pdf, reference_pageno)
    reference_png = _outfile(reference_name)
    test_name = '{}.test{:04d}.png'.format(test_pdf, test_pageno)
    test_png = _outfile(test_name)

    def render_unless_present(pdf, pageno, png):
        if not os.path.exists(png):
            ghostscript.rasterize_pdf(
                pdf, png,
                xres=100, yres=100,
                raster_device='pngmono', log=gslog, pageno=pageno)
        else:
            print(png)

    render_unless_present(reference_pdf, reference_pageno, reference_png)
    render_unless_present(test_pdf, test_pageno, test_png)

    pix_ref = leptonica.Pix.read(reference_png)
    pix_test = leptonica.Pix.read(test_png)
    return leptonica.Pix.correlation_binary(pix_ref, pix_test)
def test_monochrome_correlation():
    """Sanity-check the correlation helper on pages of cardinal.pdf."""
    cardinal = _infile('cardinal.pdf')

    # Verify leptonica: check that an incorrect rotated image has poor
    # correlation with reference
    mismatched = check_monochrome_correlation(
        reference_pdf=cardinal,
        reference_pageno=1,  # north facing page
        test_pdf=cardinal,
        test_pageno=3,  # south facing page
    )
    assert mismatched < 0.10

    # A page compared with itself must correlate strongly
    identical = check_monochrome_correlation(
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
    )
    assert identical > 0.90
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
])
def test_autorotate(spoof_tesseract_cache, renderer):
    """With -r, every page of cardinal.pdf should end up matching page 1."""
    # cardinal.pdf contains four copies of an image rotated in each cardinal
    # direction - these ones are "burned in" not tagged with /Rotate
    #
    # The renderer must be passed on the command line; otherwise both
    # parametrized cases run the same command and differ only in file name.
    out = check_ocrmypdf('cardinal.pdf', 'test_autorotate_%s.pdf' % renderer,
                         '-r', '-v', '1',
                         '--pdf-renderer', renderer,
                         env=spoof_tesseract_cache)
    for n in range(1, 4 + 1):
        correlation = check_monochrome_correlation(
            reference_pdf=_infile('cardinal.pdf'),
            reference_pageno=1,
            test_pdf=out,
            test_pageno=n)
        assert correlation > 0.80
def test_autorotate_threshold_low(spoof_tesseract_cache):
    """A rotation threshold of 1 should cause page 3 to be rotated."""
    options = ['--rotate-pages-threshold', '1', '-r', '-v', '1']
    out = check_ocrmypdf(
        'cardinal.pdf', 'test_autorotate_threshold_low.pdf', *options,
        env=spoof_tesseract_cache)

    # Low threshold -> always rotate -> expect high correlation between
    # reference page and test page
    reference = _infile('cardinal.pdf')
    correlation = check_monochrome_correlation(
        reference_pdf=reference, reference_pageno=1,
        test_pdf=out, test_pageno=3)
    assert correlation > 0.80
def test_autorotate_threshold_high(spoof_tesseract_cache):
    """A rotation threshold of 99 should leave page 3 unrotated."""
    options = ['--rotate-pages-threshold', '99', '-r', '-v', '1']
    out = check_ocrmypdf(
        'cardinal.pdf', 'test_autorotate_threshold_high.pdf', *options,
        env=spoof_tesseract_cache)

    # High threshold -> never rotate -> expect low correlation since
    # test page will not be rotated
    reference = _infile('cardinal.pdf')
    correlation = check_monochrome_correlation(
        reference_pdf=reference, reference_pageno=1,
        test_pdf=out, test_pageno=3)
    assert correlation < 0.10
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
])
def test_ocr_timeout(renderer):
    """With a 1.0 second tesseract timeout the output must have no text."""
    # Pass the renderer on the command line; without it the two
    # parametrized cases would run the same command.
    out = check_ocrmypdf('skew.pdf', 'test_timeout_%s.pdf' % renderer,
                         '--tesseract-timeout', '1.0',
                         '--pdf-renderer', renderer)
    pdfinfo = pdf_get_all_pageinfo(out)
    assert not pdfinfo[0]['has_text']
def test_skip_big(spoof_tesseract_cache):
    """With --skip-big 10, enormous.pdf must come out without text."""
    options = ['--skip-big', '10']
    out = check_ocrmypdf(
        'enormous.pdf', 'test_enormous.pdf', *options,
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(out)[0]
    assert not first_page['has_text']
@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_maximum_options(spoof_tesseract_cache, renderer, output_type):
    """Run multipage.pdf with a large set of options enabled at once."""
    output_name = 'test_multipage%s.pdf' % renderer
    options = [
        '-d', '-c', '-i', '-g', '-f', '-k',
        '--oversample', '300',
        '--skip-big', '10',
        '--title', 'Too Many Weird Files',
        '--author', 'py.test',
        '--pdf-renderer', renderer,
        '--output-type', output_type,
    ]
    check_ocrmypdf(
        'multipage.pdf', output_name, *options, env=spoof_tesseract_cache)
def test_tesseract_missing_tessdata():
    """Pointing TESSDATA_PREFIX at /tmp must yield missing_dependency."""
    env = os.environ.copy()
    env.update({'TESSDATA_PREFIX': '/tmp'})

    options = ['-v', '1', '--skip-text']
    p, _, err = run_ocrmypdf(
        'graph_ocred.pdf', 'not_a_pdfa.pdf', *options, env=env)
    assert p.returncode == ExitCode.missing_dependency, err
def test_invalid_input_pdf():
    """invalid.pdf must be rejected with the input_file exit code."""
    p, _out, err = run_ocrmypdf('invalid.pdf', 'wont_be_created.pdf')
    assert p.returncode == ExitCode.input_file, err
def test_blank_input_pdf():
    """blank.pdf must be processed with the ok exit code."""
    result = run_ocrmypdf('blank.pdf', 'still_blank.pdf')
    process = result[0]
    assert process.returncode == ExitCode.ok
def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash):
    """--force-ocr must invoke tesseract even for a PDF with no content."""
    # As a correctness test, make sure that --force-ocr on a PDF with no
    # content still triggers tesseract. If tesseract crashes, then it was
    # called.
    output_basename = 'wont_be_created.pdf'
    p, _, err = run_ocrmypdf(
        'blank.pdf', output_basename, '--force-ocr',
        env=spoof_tesseract_crash)
    assert p.returncode == ExitCode.child_process_error, err
    # Check the file this run was asked to write, not an unrelated name
    assert not os.path.exists(_outfile(output_basename))
def test_french(spoof_tesseract_cache):
    """Process francais.pdf with the 'fra' language selected."""
    options = ['-l', 'fra']
    p, _out, _err = run_ocrmypdf(
        'francais.pdf', 'francais.pdf', *options, env=spoof_tesseract_cache)
    hint = "This test may fail if Tesseract language packs are missing"
    assert p.returncode == ExitCode.ok, hint
def test_klingon():
    """Selecting the language 'klz' must be rejected as bad arguments."""
    options = ['-l', 'klz']
    p, _out, _err = run_ocrmypdf('francais.pdf', 'francais.pdf', *options)
    assert p.returncode == ExitCode.bad_args
def test_missing_docinfo(spoof_tesseract_noop):
    """missing_docinfo.pdf must be processed with the ok exit code."""
    options = ['-l', 'eng', '-c']
    p, _out, err = run_ocrmypdf(
        'missing_docinfo.pdf', 'missing_docinfo.pdf', *options,
        env=spoof_tesseract_noop)
    assert p.returncode == ExitCode.ok, err
@pytest.mark.skipif(running_in_docker(),
                    reason="writes to tests/resources")
def test_uppercase_extension(spoof_tesseract_noop):
    """Input and output names with an upper-case .PDF extension must work."""
    uppercase_input = _infile("UPPERCASE.PDF")
    shutil.copy(_infile("skew.pdf"), uppercase_input)
    try:
        check_ocrmypdf(
            "UPPERCASE.PDF", "UPPERCASE_OUT.PDF", env=spoof_tesseract_noop)
    finally:
        # Always remove the temporary copy from the resources directory
        os.unlink(uppercase_input)
def test_input_file_not_found():
    """A nonexistent input must give input_file and be named in the output."""
    input_file = "does not exist.pdf"
    # run_ocrmypdf() applies _infile()/_outfile() itself, so pass plain
    # basenames instead of wrapping them a second time.
    p, out, err = run_ocrmypdf(input_file, "will not happen.pdf")
    assert p.returncode == ExitCode.input_file
    assert (input_file in out or input_file in err)
def test_input_file_not_a_pdf():
    """A non-PDF input must give input_file and be named in the output."""
    # Try to OCR this file.  Make the path absolute first: run_ocrmypdf()
    # joins its input onto the resources directory, and only an absolute
    # path survives that join unchanged.
    input_file = os.path.abspath(__file__)
    p, out, err = run_ocrmypdf(input_file, "will not happen.pdf")
    assert p.returncode == ExitCode.input_file
    assert (input_file in out or input_file in err)
def test_qpdf_repair_fails():
    """With qpdf replaced by qpdf_dummy_return2.py, expect input_file."""
    # Locate the spoof through the shared helper so the test does not depend
    # on the current working directory.
    env = spoof('qpdf', 'qpdf_dummy_return2.py')
    # The input and output basenames must come first; extra options follow.
    # Previously '-v' and '1' occupied the basename positions.
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf',
        '-v', '1',
        env=env)
    print(out)
    print(err)
    assert p.returncode == ExitCode.input_file
def test_encrypted():
    """An encrypted input must be rejected with a message about passwords."""
    p, out, err = run_ocrmypdf('skew-encrypted.pdf', 'wont_be_created.pdf')
    assert p.returncode == ExitCode.input_file
    # str.find() returns -1 (truthy) when the text is absent, so test
    # membership instead.  Either stream may carry the message.
    messages = (out + err).lower()
    assert 'password' in messages
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
])
def test_pagesegmode(renderer, spoof_tesseract_cache):
    """Run skew.pdf with --tesseract-pagesegmode 7 under each renderer."""
    output_name = 'test_psm_%s.pdf' % renderer
    options = [
        '--tesseract-pagesegmode', '7',
        '-v', '1',
        '--pdf-renderer', renderer,
    ]
    check_ocrmypdf(
        'skew.pdf', output_name, *options, env=spoof_tesseract_cache)
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
])
def test_tesseract_crash(renderer, spoof_tesseract_crash):
    """A crashing tesseract must give child_process_error and no output."""
    options = ['-v', '1', '--pdf-renderer', renderer]
    p, _out, err = run_ocrmypdf(
        'ccitt.pdf', 'wontwork.pdf', *options, env=spoof_tesseract_crash)
    assert p.returncode == ExitCode.child_process_error
    output_exists = os.path.exists(_outfile('wontwork.pdf'))
    assert not output_exists
    assert "ERROR" in err
def test_tesseract_crash_autorotate(spoof_tesseract_crash):
    """With -r, a crashing tesseract must give child_process_error."""
    p, out, err = run_ocrmypdf(
        'ccitt.pdf', 'wontwork.pdf',
        '-r', env=spoof_tesseract_crash)
    # Print before asserting; after a failed assert these lines never run
    print(out)
    print(err)
    assert p.returncode == ExitCode.child_process_error
    assert not os.path.exists(_outfile('wontwork.pdf'))
    assert "ERROR" in err
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
])
def test_tesseract_image_too_big(renderer, spoof_tesseract_big_image_error):
    """hugemono.pdf must still succeed with the big-image-error spoof."""
    output_name = 'hugemono_%s.pdf' % renderer
    options = ['-r', '--pdf-renderer', renderer]
    check_ocrmypdf(
        'hugemono.pdf', output_name, *options,
        env=spoof_tesseract_big_image_error)
def test_no_unpaper():
    """--clean must report missing_dependency when unpaper is unavailable."""
    env = os.environ.copy()
    # Anchor the path at SPOOF_PATH so the test does not depend on the
    # current working directory.
    # NOTE(review): presumably no_unpaper_here.py is meant not to exist;
    # confirm against tests/spoof.
    env['OCRMYPDF_UNPAPER'] = os.path.join(SPOOF_PATH, 'no_unpaper_here.py')
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf', '--clean', env=env)
    assert p.returncode == ExitCode.missing_dependency
def test_old_unpaper():
    """--clean must report missing_dependency for unpaper_oldversion.py."""
    # Locate the spoof through the shared helper so the test does not depend
    # on the current working directory.
    env = spoof('unpaper', 'unpaper_oldversion.py')
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf', '--clean', env=env)
    assert p.returncode == ExitCode.missing_dependency
def test_algo4():
    """encrypted_algo4.pdf must be rejected with encrypted_pdf."""
    result = run_ocrmypdf('encrypted_algo4.pdf', 'wontwork.pdf')
    process = result[0]
    assert process.returncode == ExitCode.encrypted_pdf
@pytest.mark.parametrize('renderer', [
    'hocr'])  # tesseract cannot pass this test - resamples to square image
def test_non_square_resolution(renderer, spoof_tesseract_cache):
    """The differing x and y resolution of aspect.pdf must be preserved."""
    # Confirm input image is non-square resolution
    page_in = pdf_get_all_pageinfo(_infile('aspect.pdf'))[0]
    assert page_in['xres'] != page_in['yres']

    out = 'aspect_%s.pdf' % renderer
    options = ['--pdf-renderer', renderer]
    check_ocrmypdf('aspect.pdf', out, *options, env=spoof_tesseract_cache)

    page_out = pdf_get_all_pageinfo(_outfile(out))[0]

    # Confirm resolution was kept the same
    assert page_in['xres'] == page_out['xres']
    assert page_in['yres'] == page_out['yres']
def test_image_to_pdf(spoof_tesseract_noop):
    """Accept LinnSequencer.jpg as input when --image-dpi 200 is given."""
    options = ['--image-dpi', '200']
    check_ocrmypdf(
        'LinnSequencer.jpg', 'image_to_pdf.pdf', *options,
        env=spoof_tesseract_noop)
def test_jbig2_passthrough(spoof_tesseract_cache):
    """The first image of the output must still be encoded as jbig2."""
    options = ['--output-type', 'pdf', '--pdf-renderer', 'hocr']
    out = check_ocrmypdf(
        'jbig2.pdf', 'jbig2_out.pdf', *options, env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(out)[0]
    first_image = first_page['images'][0]
    assert first_image['enc'] == 'jbig2'
def test_stdin(spoof_tesseract_noop):
    """Feed the input PDF to ocrmypdf through a pipe on standard input."""
    input_file = _infile('francais.pdf')
    output_file = _outfile('test_stdin.pdf')

    # Runs: cat testfile.pdf | ocrmypdf - output.pdf
    p1_args = ['cat', input_file]
    p1 = Popen(p1_args, close_fds=True, stdin=DEVNULL, stdout=PIPE)

    p2_args = OCRMYPDF + ['-', output_file]
    p2 = Popen(
        p2_args, close_fds=True, stdout=PIPE, stderr=PIPE,
        stdin=p1.stdout, env=spoof_tesseract_noop)
    # Close our copy of the pipe so only ocrmypdf holds the read end
    p1.stdout.close()
    out, err = p2.communicate()
    # Reap cat as well, so it is not left behind as a zombie process
    p1.wait()

    assert p2.returncode == ExitCode.ok, err
def test_masks(spoof_tesseract_noop):
    """Run ocrmypdf on masks.pdf with no extra options."""
    env = spoof_tesseract_noop
    check_ocrmypdf('masks.pdf', 'test_masks.pdf', env=env)
def test_linearized_pdf_and_indirect_object(spoof_tesseract_noop):
    """Run ocrmypdf on milk.pdf with no extra options."""
    env = spoof_tesseract_noop
    check_ocrmypdf('milk.pdf', 'test_milk.pdf', env=env)