mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-07-24 17:33:47 +00:00

Executing a package with python -m packagename will check for __main__.py inside the package. In other words, main.py should always have been named __main__.py. In the unlikely event that someone depends on "import ocrmypdf.main" being meaningful, main.py continues to exist and replicates the behavior of __main__. (It's unlikely because import ocrmypdf.main does unpythonic ruffus-related things at import time, essentially configuring itself to work with sys.argv. To fix another day.) This should solve the problem of Debian needing to run test suites before installation and afterwards for continuous integration without having to patch either file, as python -m ocrmypdf will follow import order. That is, if the current directory contains "ocrmypdf/" (e.g. staging a new version) then that will be tested, else sys.path will be checked.
639 lines
20 KiB
Python
639 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
|
|
|
from __future__ import print_function
|
|
from subprocess import Popen, PIPE, check_output, check_call, DEVNULL
|
|
import os
|
|
import shutil
|
|
from contextlib import suppress
|
|
import sys
|
|
import pytest
|
|
from ocrmypdf.pageinfo import pdf_get_all_pageinfo
|
|
import PyPDF2 as pypdf
|
|
from ocrmypdf import ExitCode
|
|
from ocrmypdf import leptonica
|
|
from ocrmypdf.pdfa import file_claims_pdfa
|
|
import platform
|
|
|
|
|
|
# Refuse to run on interpreters older than the version the message names.
# The previous check only looked at the major version, so it disagreed with
# its own "3.4+" message.
if sys.version_info < (3, 4):
    print("Requires Python 3.4+")
    sys.exit(1)

# Directory layout, all derived from the location of this file.
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
SPOOF_PATH = os.path.join(TESTS_ROOT, 'spoof')
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')
# The output directory can be redirected with OCRMYPDF_TEST_OUTPUT.
TEST_OUTPUT = os.environ.get(
    'OCRMYPDF_TEST_OUTPUT',
    default=os.path.join(PROJECT_ROOT, 'tests', 'output', 'main'))
# Command prefix: run the package with the interpreter running the tests.
OCRMYPDF = [sys.executable, '-m', 'ocrmypdf']
|
|
|
|
|
|
def running_in_docker():
    """Return True when the tests appear to be running inside Docker.

    Older Docker releases create /.dockerinit in the container root; newer
    releases create /.dockerenv instead, so either marker file counts.
    """
    return any(os.path.exists(marker)
               for marker in ('/.dockerinit', '/.dockerenv'))
|
|
|
|
|
|
def is_linux():
    """Report whether platform.system() identifies the host as Linux."""
    system_name = platform.system()
    return system_name == 'Linux'
|
|
|
|
|
|
def setup_module():
    """Remove any previous output tree and recreate the output directory."""
    # A missing directory simply means there is nothing to clear.
    try:
        shutil.rmtree(TEST_OUTPUT)
    except FileNotFoundError:
        pass
    # Tolerate the directory already being present.
    try:
        os.makedirs(TEST_OUTPUT)
    except FileExistsError:
        pass
|
|
|
|
|
|
def _infile(input_basename):
    """Return *input_basename* joined onto the TEST_RESOURCES directory."""
    resource_path = os.path.join(TEST_RESOURCES, input_basename)
    return resource_path
|
|
|
|
|
|
def _outfile(output_basename):
    """Return a path inside TEST_OUTPUT named after the argument's basename."""
    filename = os.path.basename(output_basename)
    return os.path.join(TEST_OUTPUT, filename)
|
|
|
|
|
|
def check_ocrmypdf(input_basename, output_basename, *args, env=None):
    """Run ocrmypdf and confirm that a plausible output file was created.

    All arguments are forwarded unchanged to run_ocrmypdf(). When the process
    exits with a non-zero status, its captured stdout and stderr are printed
    before the assertion fails so they are available for diagnosis.

    Returns the path of the output file.
    """
    output_file = _outfile(output_basename)

    p, out, err = run_ocrmypdf(input_basename, output_basename, *args, env=env)
    if p.returncode != 0:
        print('stdout\n======')
        print(out)
        print('stderr\n======')
        print(err)
    assert p.returncode == 0
    assert os.path.exists(output_file), "Output file not created"
    # Anything this small cannot be a meaningful PDF.
    assert os.stat(output_file).st_size > 100, "PDF too small or empty"
    return output_file
|
|
|
|
|
|
def run_ocrmypdf(input_basename, output_basename, *args, env=None):
    """Launch ocrmypdf as a subprocess and leave the verdict to the caller.

    The command line is OCRMYPDF, then *args*, then the resolved input and
    output paths. When *env* is None the current process environment is used.

    Returns a ``(process, stdout, stderr)`` tuple with both streams captured
    as text.
    """
    child_env = os.environ if env is None else env
    command = OCRMYPDF + list(args) + [
        _infile(input_basename),
        _outfile(output_basename),
    ]
    proc = Popen(
        command, close_fds=True, stdout=PIPE, stderr=PIPE,
        universal_newlines=True, env=child_env)
    stdout_text, stderr_text = proc.communicate()
    return proc, stdout_text, stderr_text
|
|
|
|
|
|
def spoof(replace_program, with_spoof):
    """Build an environment that redirects one external program to a spoof.

    Before running any executable, ocrmypdf checks the environment variable
    OCRMYPDF_PROGRAMNAME to override default program name/location, e.g.
    OCRMYPDF_GS redirects from the system path Ghostscript ("gs") to elsewhere.

    Returns a copy of os.environ with that override set to the script named
    *with_spoof* inside SPOOF_PATH.
    """
    spoof_script = os.path.join(SPOOF_PATH, with_spoof)
    # Make the script executable if it is not already.
    if not os.access(spoof_script, os.X_OK):
        os.chmod(spoof_script, 0o755)
    variable = 'OCRMYPDF_' + replace_program.upper()
    patched_env = os.environ.copy()
    patched_env[variable] = spoof_script
    return patched_env
|
|
|
|
|
|
@pytest.fixture
def spoof_tesseract_noop():
    """Environment whose tesseract override is the tesseract_noop.py spoof."""
    noop_env = spoof('tesseract', 'tesseract_noop.py')
    return noop_env
|
|
|
|
|
|
@pytest.fixture
def spoof_tesseract_cache():
    """Environment using the tesseract_cache.py spoof, except inside Docker.

    When running_in_docker() is true, an unmodified copy of the current
    environment is returned instead.
    """
    if not running_in_docker():
        return spoof('tesseract', "tesseract_cache.py")
    return os.environ.copy()
|
|
|
|
|
|
@pytest.fixture
def spoof_tesseract_crash():
    """Environment whose tesseract override is the tesseract_crash.py spoof."""
    crash_env = spoof('tesseract', 'tesseract_crash.py')
    return crash_env
|
|
|
|
|
|
@pytest.fixture
def spoof_tesseract_big_image_error():
    """Environment overriding tesseract with tesseract_big_image_error.py."""
    big_image_env = spoof('tesseract', 'tesseract_big_image_error.py')
    return big_image_env
|
|
|
|
|
|
def test_quick(spoof_tesseract_noop):
    """Smoke test: c02-22.pdf is processed with no extra options."""
    source_pdf, result_pdf = 'c02-22.pdf', 'test_quick.pdf'
    check_ocrmypdf(source_pdf, result_pdf, env=spoof_tesseract_noop)
|
|
|
|
|
|
def test_deskew(spoof_tesseract_noop):
    """Deskew skew.pdf and verify the measured skew angle is within +/-0.5."""
    # Run with deskew
    straightened_pdf = check_ocrmypdf(
        'skew.pdf', 'test_deskew.pdf', '-d', '-v', '1',
        env=spoof_tesseract_noop)

    # Now render as an image again and use Leptonica to find the skew angle
    # to confirm that it was deskewed
    import logging
    from ocrmypdf.ghostscript import rasterize_pdf
    from ocrmypdf.leptonica import Pix

    raster_path = _outfile('deskewed.png')
    rasterize_pdf(
        straightened_pdf,
        raster_path,
        xres=150, yres=150,
        raster_device='pngmono',
        log=logging.getLogger())

    angle, _confidence = Pix.read(raster_path).find_skew()

    print(angle)
    assert -0.5 < angle < 0.5, "Deskewing failed"
|
|
|
|
|
|
def test_clean(spoof_tesseract_noop):
    """skew.pdf is processed successfully with the -c option."""
    options = ('-c',)
    check_ocrmypdf('skew.pdf', 'test_clean.pdf', *options,
                   env=spoof_tesseract_noop)
|
|
|
|
|
|
# 5 input files * 2 renderers * 2 output types = 20 test cases
@pytest.mark.parametrize(
    "pdf",
    ['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf'])
@pytest.mark.parametrize("renderer", ['hocr', 'tesseract'])
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type):
    """Run each listed sample PDF through every renderer/output-type pair."""
    # NOTE(review): the output name omits output_type, so the 'pdf' and
    # 'pdfa' runs for the same input and renderer write to the same file.
    target_name = 'test_{0}_{1}.pdf'.format(pdf, renderer)
    cli_options = [
        '-dc',
        '-v', '1',
        '--output-type', output_type,
        '--pdf-renderer', renderer,
    ]
    check_ocrmypdf(pdf, target_name, *cli_options, env=spoof_tesseract_cache)
|
|
|
|
|
|
@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
    ])
def test_preserve_metadata(spoof_tesseract_noop, output_type):
    """/Title and /Author of graph.pdf carry over to the output file."""
    reader_before = pypdf.PdfFileReader(_infile('graph.pdf'))

    output = check_ocrmypdf(
        'graph.pdf', 'test_metadata_preserve.pdf',
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    reader_after = pypdf.PdfFileReader(output)

    for key in ('/Title', '/Author'):
        before = reader_before.documentInfo[key]
        after = reader_after.documentInfo[key]
        assert before == after

    # The output must also report the requested output type.
    assert file_claims_pdfa(output)['output'] == output_type
|
|
|
|
|
|
@pytest.mark.skipif(
    is_linux() and not running_in_docker(),
    reason="likely to fail if Linux locale is not configured correctly")
@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
    ])
def test_override_metadata(spoof_tesseract_noop, output_type):
    """Non-ASCII --title/--author/--subject values reach the output metadata.

    The result is read back with the external ``pdfinfo`` program.
    """
    input_file = _infile('c02-22.pdf')
    output_file = _outfile('test_override_metadata.pdf')

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
    high_unicode = 'U+1030C is: 𐌌'

    metadata_options = [
        '--title', german,
        '--author', chinese,
        '--subject', high_unicode,
        '--output-type', output_type,
    ]
    proc, _stdout, _stderr = run_ocrmypdf(
        input_file, output_file, *metadata_options,
        env=spoof_tesseract_noop)

    assert proc.returncode == ExitCode.ok

    # Parse the "Key: value" report lines into a dictionary.
    report = check_output(['pdfinfo', output_file], universal_newlines=True)
    pairs = (row.strip().split(':', maxsplit=1) for row in report.splitlines())
    pdfinfo = {key.strip(): value.strip() for key, value in pairs}

    assert pdfinfo['Title'] == german
    assert pdfinfo['Author'] == chinese
    assert pdfinfo['Subject'] == high_unicode
    assert pdfinfo.get('Keywords', '') == ''

    assert file_claims_pdfa(output_file)['output'] == output_type
|
|
|
|
|
|
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_oversample(spoof_tesseract_cache, renderer):
    """--oversample 350 yields a first page whose xres is within 1 of 350."""
    result_pdf = check_ocrmypdf(
        'skew.pdf', 'test_oversample_{}.pdf'.format(renderer),
        '--oversample', '350',
        '-f',
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(result_pdf)[0]

    print(first_page['xres'])
    assert abs(first_page['xres'] - 350) < 1
|
|
|
|
|
|
def test_repeat_ocr():
    """Processing graph_ocred.pdf with default options exits non-zero."""
    proc, _stdout, _stderr = run_ocrmypdf('graph_ocred.pdf', 'wontwork.pdf')
    assert proc.returncode != 0
|
|
|
|
|
|
def test_force_ocr(spoof_tesseract_cache):
    """With -f, graph_ocred.pdf is processed and page one reports text."""
    forced_pdf = check_ocrmypdf(
        'graph_ocred.pdf', 'test_force.pdf', '-f',
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(forced_pdf)[0]
    assert first_page['has_text']
|
|
|
|
|
|
def test_skip_ocr(spoof_tesseract_cache):
    """With -s, graph_ocred.pdf is processed successfully."""
    options = ('-s',)
    check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', *options,
                   env=spoof_tesseract_cache)
|
|
|
|
|
|
def test_argsfile(spoof_tesseract_noop):
    """Options can be supplied through an '@file' argument."""
    argsfile_path = _outfile('test_argsfile.txt')
    arg_lines = ['--title', 'ArgsFile Test', '--author', 'Test Cases']
    # One argument per line, newline-terminated; the file is closed before
    # ocrmypdf is started.
    with open(argsfile_path, 'w') as argsfile:
        argsfile.write('\n'.join(arg_lines) + '\n')
    check_ocrmypdf('graph.pdf', 'test_argsfile.pdf',
                   '@' + argsfile_path,
                   env=spoof_tesseract_noop)
|
|
|
|
|
|
def check_monochrome_correlation(
        reference_pdf, reference_pageno, test_pdf, test_pageno):
    """Return the binary correlation between one page from each of two PDFs.

    Each page is rasterized to a monochrome PNG in the test output directory
    (a PNG that already exists under the same name is reused) and the two
    images are compared with leptonica.Pix.correlation_binary.
    """
    import ocrmypdf.ghostscript as ghostscript
    import logging

    gslog = logging.getLogger()

    reference_png = _outfile('{}.ref{:04d}.png'.format(
        reference_pdf, reference_pageno))
    test_png = _outfile('{}.test{:04d}.png'.format(
        test_pdf, test_pageno))

    def rasterize(pdf, pageno, png):
        if not os.path.exists(png):
            ghostscript.rasterize_pdf(
                pdf, png,
                xres=100, yres=100,
                raster_device='pngmono', log=gslog, pageno=pageno)
        else:
            # Reuse the image left behind by an earlier call.
            print(png)

    jobs = [
        (reference_pdf, reference_pageno, reference_png),
        (test_pdf, test_pageno, test_png),
    ]
    for pdf, pageno, png in jobs:
        rasterize(pdf, pageno, png)

    pix_ref = leptonica.Pix.read(reference_png)
    pix_test = leptonica.Pix.read(test_png)

    return leptonica.Pix.correlation_binary(pix_ref, pix_test)
|
|
|
|
|
|
def test_monochrome_correlation():
    """Sanity-check the correlation helper on pages of cardinal.pdf."""
    cardinal = _infile('cardinal.pdf')

    # Verify leptonica: check that an incorrect rotated image has poor
    # correlation with reference
    mismatched = check_monochrome_correlation(
        reference_pdf=cardinal,
        reference_pageno=1,  # north facing page
        test_pdf=cardinal,
        test_pageno=3,  # south facing page
        )
    assert mismatched < 0.10

    # The same page compared with itself must correlate strongly.
    identical = check_monochrome_correlation(
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
        )
    assert identical > 0.90
|
|
|
|
|
|
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_autorotate(spoof_tesseract_cache, renderer):
    """With -r, every output page correlates with the reference page."""
    # cardinal.pdf contains four copies of an image rotated in each cardinal
    # direction - these ones are "burned in" not tagged with /Rotate
    # NOTE(review): renderer only affects the output file name here; it is
    # not passed to ocrmypdf via --pdf-renderer. Confirm whether intended.
    rotated_pdf = check_ocrmypdf(
        'cardinal.pdf', 'test_autorotate_{}.pdf'.format(renderer),
        '-r', '-v', '1',
        env=spoof_tesseract_cache)
    reference = _infile('cardinal.pdf')
    for page_number in (1, 2, 3, 4):
        correlation = check_monochrome_correlation(
            reference_pdf=reference,
            reference_pageno=1,
            test_pdf=rotated_pdf,
            test_pageno=page_number)
        assert correlation > 0.80
|
|
|
|
|
|
def test_autorotate_threshold_low(spoof_tesseract_cache):
    """--rotate-pages-threshold 1: page 3 ends up matching the reference."""
    cli_options = ['--rotate-pages-threshold', '1', '-r', '-v', '1']
    rotated_pdf = check_ocrmypdf(
        'cardinal.pdf', 'test_autorotate_threshold_low.pdf', *cli_options,
        env=spoof_tesseract_cache)

    # Low threshold -> always rotate -> expect high correlation between
    # reference page and test page
    score = check_monochrome_correlation(
        reference_pdf=_infile('cardinal.pdf'),
        reference_pageno=1,
        test_pdf=rotated_pdf,
        test_pageno=3)
    assert score > 0.80
|
|
|
|
|
|
def test_autorotate_threshold_high(spoof_tesseract_cache):
    """--rotate-pages-threshold 99: page 3 stays unlike the reference."""
    cli_options = ['--rotate-pages-threshold', '99', '-r', '-v', '1']
    unrotated_pdf = check_ocrmypdf(
        'cardinal.pdf', 'test_autorotate_threshold_high.pdf', *cli_options,
        env=spoof_tesseract_cache)

    # High threshold -> never rotate -> expect low correlation since
    # test page will not be rotated
    score = check_monochrome_correlation(
        reference_pdf=_infile('cardinal.pdf'),
        reference_pageno=1,
        test_pdf=unrotated_pdf,
        test_pageno=3)
    assert score < 0.10
|
|
|
|
|
|
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_ocr_timeout(renderer):
    """With --tesseract-timeout 1.0, page one of the output has no text."""
    # NOTE(review): renderer only affects the output file name here; it is
    # not passed to ocrmypdf via --pdf-renderer. Confirm whether intended.
    timed_out_pdf = check_ocrmypdf(
        'skew.pdf', 'test_timeout_{}.pdf'.format(renderer),
        '--tesseract-timeout', '1.0')
    first_page = pdf_get_all_pageinfo(timed_out_pdf)[0]
    assert not first_page['has_text']
|
|
|
|
|
|
def test_skip_big(spoof_tesseract_cache):
    """With --skip-big 10, page one of enormous.pdf gets no text."""
    result_pdf = check_ocrmypdf(
        'enormous.pdf', 'test_enormous.pdf',
        '--skip-big', '10',
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert not first_page['has_text']
|
|
|
|
|
|
@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_maximum_options(spoof_tesseract_cache, renderer, output_type):
    """multipage.pdf is processed with many options enabled at once."""
    cli_options = [
        '-d', '-c', '-i', '-g', '-f', '-k',
        '--oversample', '300',
        '--skip-big', '10',
        '--title', 'Too Many Weird Files',
        '--author', 'py.test',
        '--pdf-renderer', renderer,
        '--output-type', output_type,
    ]
    check_ocrmypdf(
        'multipage.pdf', 'test_multipage{}.pdf'.format(renderer),
        *cli_options,
        env=spoof_tesseract_cache)
|
|
|
|
|
|
def test_tesseract_missing_tessdata():
    """TESSDATA_PREFIX=/tmp makes ocrmypdf exit with missing_dependency."""
    env = dict(os.environ, TESSDATA_PREFIX='/tmp')

    proc, _stdout, stderr_text = run_ocrmypdf(
        'graph_ocred.pdf', 'not_a_pdfa.pdf',
        '-v', '1', '--skip-text',
        env=env)
    assert proc.returncode == ExitCode.missing_dependency, stderr_text
|
|
|
|
|
|
def test_invalid_input_pdf():
    """invalid.pdf is rejected with ExitCode.input_file."""
    proc, _stdout, stderr_text = run_ocrmypdf(
        'invalid.pdf', 'wont_be_created.pdf')
    assert proc.returncode == ExitCode.input_file, stderr_text
|
|
|
|
|
|
def test_blank_input_pdf():
    """blank.pdf is processed with exit status ExitCode.ok."""
    proc, _stdout, _stderr = run_ocrmypdf('blank.pdf', 'still_blank.pdf')
    assert proc.returncode == ExitCode.ok
|
|
|
|
|
|
def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash):
    """--force-ocr on a blank PDF still invokes tesseract.

    As a correctness test, make sure that --force-ocr on a PDF with no
    content still triggers tesseract. If tesseract crashes, then it was
    called.
    """
    output_name = 'wont_be_created.pdf'
    p, _, err = run_ocrmypdf(
        'blank.pdf', output_name, '--force-ocr',
        env=spoof_tesseract_crash)
    assert p.returncode == ExitCode.child_process_error, err
    # Check the file this run was asked to write; the previous assertion
    # looked at 'wontwork.pdf', which this test never produces.
    assert not os.path.exists(_outfile(output_name))
|
|
|
|
|
|
def test_french(spoof_tesseract_cache):
    """francais.pdf is processed with the 'fra' language selected."""
    proc, _stdout, _stderr = run_ocrmypdf(
        'francais.pdf', 'francais.pdf',
        '-l', 'fra',
        env=spoof_tesseract_cache)
    hint = "This test may fail if Tesseract language packs are missing"
    assert proc.returncode == ExitCode.ok, hint
|
|
|
|
|
|
def test_klingon():
    """Requesting the language 'klz' exits with ExitCode.bad_args."""
    language_options = ('-l', 'klz')
    proc, _stdout, _stderr = run_ocrmypdf(
        'francais.pdf', 'francais.pdf', *language_options)
    assert proc.returncode == ExitCode.bad_args
|
|
|
|
|
|
def test_missing_docinfo(spoof_tesseract_noop):
    """missing_docinfo.pdf is processed with exit status ExitCode.ok."""
    proc, _stdout, stderr_text = run_ocrmypdf(
        'missing_docinfo.pdf', 'missing_docinfo.pdf',
        '-l', 'eng', '-c',
        env=spoof_tesseract_noop)
    assert proc.returncode == ExitCode.ok, stderr_text
|
|
|
|
|
|
@pytest.mark.skipif(running_in_docker(),
                    reason="writes to tests/resources")
def test_uppercase_extension(spoof_tesseract_noop):
    """Input and output names ending in '.PDF' are accepted."""
    # Stage a temporary upper-case copy of skew.pdf among the resources and
    # remove it again whether or not the run succeeds.
    staged_copy = _infile("UPPERCASE.PDF")
    shutil.copy(_infile("skew.pdf"), staged_copy)
    try:
        check_ocrmypdf("UPPERCASE.PDF", "UPPERCASE_OUT.PDF",
                       env=spoof_tesseract_noop)
    finally:
        os.unlink(staged_copy)
|
|
|
|
|
|
def test_input_file_not_found():
    """A missing input gives ExitCode.input_file and the name is reported."""
    missing_name = "does not exist.pdf"
    proc, stdout_text, stderr_text = run_ocrmypdf(
        _infile(missing_name),
        _outfile("will not happen.pdf"))
    assert proc.returncode == ExitCode.input_file
    # The file name must show up in at least one of the two streams.
    assert any(missing_name in stream
               for stream in (stdout_text, stderr_text))
|
|
|
|
|
|
def test_input_file_not_a_pdf():
    """A non-PDF input gives ExitCode.input_file and the name is reported."""
    this_script = __file__  # Try to OCR this file
    proc, stdout_text, stderr_text = run_ocrmypdf(
        _infile(this_script),
        _outfile("will not happen.pdf"))
    assert proc.returncode == ExitCode.input_file
    # The file name must show up in at least one of the two streams.
    assert any(this_script in stream
               for stream in (stdout_text, stderr_text))
|
|
|
|
|
|
def test_qpdf_repair_fails():
    """A qpdf override that fails makes ocrmypdf exit with input_file.

    Two defects in the earlier version are corrected here:

    * ``'-v', '1'`` were passed in the input/output positions of
      run_ocrmypdf(), so the real file names were sent as extra arguments
      and '-v' and '1' were treated as file names.
    * the spoof script was located relative to the current working
      directory instead of SPOOF_PATH.
    """
    # spoof() resolves the script inside SPOOF_PATH and makes it executable.
    env = spoof('qpdf', 'qpdf_dummy_return2.py')
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf',
        '-v', '1',
        env=env)
    print(out)
    print(err)
    assert p.returncode == ExitCode.input_file
|
|
|
|
|
|
def test_encrypted():
    """An encrypted input is rejected and the output mentions 'password'."""
    p, out, err = run_ocrmypdf('skew-encrypted.pdf', 'wont_be_created.pdf')
    assert p.returncode == ExitCode.input_file
    # str.find() returns -1 (truthy) on a miss, so the previous
    # ``assert out.find('password')`` could not detect a missing message.
    # Both streams are checked because which one carries the message is not
    # visible from this file.
    assert 'password' in out or 'password' in err, err
|
|
|
|
|
|
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_pagesegmode(renderer, spoof_tesseract_cache):
    """--tesseract-pagesegmode 7 is accepted with either renderer."""
    cli_options = [
        '--tesseract-pagesegmode', '7',
        '-v', '1',
        '--pdf-renderer', renderer,
    ]
    check_ocrmypdf(
        'skew.pdf', 'test_psm_{}.pdf'.format(renderer), *cli_options,
        env=spoof_tesseract_cache)
|
|
|
|
|
|
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_tesseract_crash(renderer, spoof_tesseract_crash):
    """With the crash spoof: child_process_error, no output, ERROR logged."""
    output_name = 'wontwork.pdf'
    proc, _stdout, stderr_text = run_ocrmypdf(
        'ccitt.pdf', output_name,
        '-v', '1',
        '--pdf-renderer', renderer,
        env=spoof_tesseract_crash)
    assert proc.returncode == ExitCode.child_process_error
    assert not os.path.exists(_outfile(output_name))
    assert "ERROR" in stderr_text
|
|
|
|
|
|
def test_tesseract_crash_autorotate(spoof_tesseract_crash):
    """With the crash spoof and -r: child_process_error and no output."""
    output_name = 'wontwork.pdf'
    proc, stdout_text, stderr_text = run_ocrmypdf(
        'ccitt.pdf', output_name,
        '-r',
        env=spoof_tesseract_crash)
    assert proc.returncode == ExitCode.child_process_error
    assert not os.path.exists(_outfile(output_name))
    assert "ERROR" in stderr_text
    # Only reached when the assertions above hold.
    for stream in (stdout_text, stderr_text):
        print(stream)
|
|
|
|
|
|
@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_tesseract_image_too_big(renderer, spoof_tesseract_big_image_error):
    """hugemono.pdf still succeeds under the big-image-error spoof."""
    cli_options = ['-r', '--pdf-renderer', renderer]
    check_ocrmypdf(
        'hugemono.pdf', 'hugemono_{}.pdf'.format(renderer), *cli_options,
        env=spoof_tesseract_big_image_error)
|
|
|
|
|
|
def test_no_unpaper():
    """--clean with an unusable unpaper override reports missing_dependency."""
    env = os.environ.copy()
    # Anchor the override on SPOOF_PATH rather than the current working
    # directory so the test does not depend on where pytest is started.
    # spoof() is not used here: it would try to chmod the script, and the
    # name suggests the file may deliberately not exist - TODO confirm.
    env['OCRMYPDF_UNPAPER'] = os.path.join(SPOOF_PATH, 'no_unpaper_here.py')
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf', '--clean', env=env)
    assert p.returncode == ExitCode.missing_dependency
|
|
|
|
|
|
def test_old_unpaper():
    """--clean with the unpaper_oldversion.py spoof: missing_dependency."""
    # Use the shared helper so the spoof is located via SPOOF_PATH (not the
    # current working directory) and is made executable, like the other
    # spoofs in this module.
    env = spoof('unpaper', 'unpaper_oldversion.py')
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf', '--clean', env=env)
    assert p.returncode == ExitCode.missing_dependency
|
|
|
|
|
|
def test_algo4():
    """encrypted_algo4.pdf is rejected with ExitCode.encrypted_pdf."""
    proc, _stdout, _stderr = run_ocrmypdf('encrypted_algo4.pdf', 'wontwork.pdf')
    assert proc.returncode == ExitCode.encrypted_pdf
|
|
|
|
|
|
@pytest.mark.parametrize('renderer', [
    'hocr'])  # tesseract cannot pass this test - resamples to square image
def test_non_square_resolution(renderer, spoof_tesseract_cache):
    """Differing x and y resolutions of aspect.pdf survive processing."""
    # Confirm input image is non-square resolution
    page_before = pdf_get_all_pageinfo(_infile('aspect.pdf'))[0]
    assert page_before['xres'] != page_before['yres']

    output_name = 'aspect_{}.pdf'.format(renderer)
    check_ocrmypdf(
        'aspect.pdf', output_name,
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    page_after = pdf_get_all_pageinfo(_outfile(output_name))[0]

    # Confirm resolution was kept the same
    for axis in ('xres', 'yres'):
        assert page_before[axis] == page_after[axis]
|
|
|
|
|
|
def test_image_to_pdf(spoof_tesseract_noop):
    """A JPEG input with --image-dpi 200 is converted successfully."""
    dpi_options = ('--image-dpi', '200')
    check_ocrmypdf(
        'LinnSequencer.jpg', 'image_to_pdf.pdf', *dpi_options,
        env=spoof_tesseract_noop)
|
|
|
|
|
|
def test_jbig2_passthrough(spoof_tesseract_cache):
    """The first image on page one of the output is still 'jbig2' encoded."""
    cli_options = [
        '--output-type', 'pdf',
        '--pdf-renderer', 'hocr',
    ]
    result_pdf = check_ocrmypdf(
        'jbig2.pdf', 'jbig2_out.pdf', *cli_options,
        env=spoof_tesseract_cache)

    first_image = pdf_get_all_pageinfo(result_pdf)[0]['images'][0]
    assert first_image['enc'] == 'jbig2'
|
|
|
|
|
|
def test_stdin(spoof_tesseract_noop):
    """ocrmypdf accepts its input on stdin when the input argument is '-'.

    Runs the equivalent of: cat testfile.pdf | ocrmypdf - output.pdf
    """
    input_file = _infile('francais.pdf')
    output_file = _outfile('test_stdin.pdf')

    p1_args = ['cat', input_file]
    p1 = Popen(p1_args, close_fds=True, stdin=DEVNULL, stdout=PIPE)

    p2_args = OCRMYPDF + ['-', output_file]
    p2 = Popen(
        p2_args, close_fds=True, stdout=PIPE, stderr=PIPE,
        stdin=p1.stdout, env=spoof_tesseract_noop)
    # Drop this process's copy of the pipe so p2 is its only reader.
    p1.stdout.close()
    out, err = p2.communicate()
    # Reap the producer too; previously it was never waited on.
    p1.wait()

    assert p2.returncode == ExitCode.ok, err
|
|
|
|
|
|
def test_masks(spoof_tesseract_noop):
    """masks.pdf is processed successfully with default options."""
    source_pdf, result_pdf = 'masks.pdf', 'test_masks.pdf'
    check_ocrmypdf(source_pdf, result_pdf, env=spoof_tesseract_noop)
|
|
|
|
|
|
def test_linearized_pdf_and_indirect_object(spoof_tesseract_noop):
    """milk.pdf is processed successfully with default options."""
    source_pdf, result_pdf = 'milk.pdf', 'test_milk.pdf'
    check_ocrmypdf(source_pdf, result_pdf, env=spoof_tesseract_noop)
|