OCRmyPDF/tests/test_main.py

#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83

from __future__ import print_function
from subprocess import Popen, PIPE, check_output, check_call, DEVNULL
import os
import shutil
from contextlib import suppress
import sys
import pytest
from ocrmypdf.pageinfo import pdf_get_all_pageinfo
import PyPDF2 as pypdf
from ocrmypdf import ExitCode
from ocrmypdf import leptonica
from ocrmypdf.pdfa import file_claims_pdfa
import platform


# contextlib.suppress (imported above) only exists from Python 3.4 onward,
# so compare against the full (major, minor) pair promised by the message
# instead of the major version alone.
if sys.version_info < (3, 4):
    print("Requires Python 3.4+")
    sys.exit(1)

# Directory that contains this test module
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
# Directory holding the stand-in executables used by spoof()
SPOOF_PATH = os.path.join(TESTS_ROOT, 'spoof')
# Parent directory of the tests directory
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
# Input files for the tests
TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')
# Where test artifacts are written; the OCRMYPDF_TEST_OUTPUT environment
# variable overrides the default location
TEST_OUTPUT = os.environ.get(
    'OCRMYPDF_TEST_OUTPUT',
    default=os.path.join(PROJECT_ROOT, 'tests', 'output', 'main'))
# Command prefix that runs the ocrmypdf module with the current interpreter
OCRMYPDF = [sys.executable, '-m', 'ocrmypdf']


def running_in_docker():
    """Return True if the tests appear to be running inside Docker.

    Older Docker releases create a file named /.dockerinit in the container
    root; newer releases create /.dockerenv instead. The presence of either
    marker file is treated as evidence of a Docker container.
    """
    docker_markers = ('/.dockerinit', '/.dockerenv')
    return any(os.path.exists(marker) for marker in docker_markers)


def is_linux():
    """Tell whether platform.system() reports 'Linux'."""
    system_name = platform.system()
    return system_name == 'Linux'


def setup_module():
    """Remove any existing TEST_OUTPUT tree, then create the directory."""
    try:
        shutil.rmtree(TEST_OUTPUT)
    except FileNotFoundError:
        # Nothing to clean up: the directory was not there
        pass

    try:
        os.makedirs(TEST_OUTPUT)
    except FileExistsError:
        pass


def _infile(input_basename):
    """Return *input_basename* joined onto TEST_RESOURCES.

    Because os.path.join is used, an absolute *input_basename* is returned
    unchanged.
    """
    resource_path = os.path.join(TEST_RESOURCES, input_basename)
    return resource_path


def _outfile(output_basename):
    """Return the path in TEST_OUTPUT for the final component of the name.

    Any directory part of *output_basename* is discarded first.
    """
    leaf_name = os.path.basename(output_basename)
    return os.path.join(TEST_OUTPUT, leaf_name)


def check_ocrmypdf(input_basename, output_basename, *args, env=None):
    """Run ocrmypdf and confirm that a plausible output file was created.

    The arguments are forwarded unchanged to run_ocrmypdf(). The run must
    exit with status 0 and leave an output file larger than 100 bytes;
    otherwise an assertion fails. On a nonzero exit the captured stdout and
    stderr are printed first so they show up in the test report.

    Returns the path of the output file.
    """
    output_file = _outfile(output_basename)

    p, out, err = run_ocrmypdf(input_basename, output_basename, *args, env=env)
    if p.returncode != 0:
        print('stdout\n======')
        print(out)
        print('stderr\n======')
        print(err)
    assert p.returncode == 0, \
        "ocrmypdf exited with code {}".format(p.returncode)
    assert os.path.exists(output_file), "Output file not created"
    assert os.stat(output_file).st_size > 100, "PDF too small or empty"
    return output_file


def run_ocrmypdf(input_basename, output_basename, *args, env=None):
    """Run ocrmypdf in a subprocess and hand the raw results to the caller.

    The command line is OCRMYPDF, then *args*, then the input path (via
    _infile) and the output path (via _outfile). When *env* is None the
    current os.environ is used.

    Returns a tuple (process, stdout_text, stderr_text).
    """
    child_env = os.environ if env is None else env

    command = list(OCRMYPDF)
    command.extend(args)
    command.append(_infile(input_basename))
    command.append(_outfile(output_basename))

    process = Popen(
        command,
        close_fds=True,
        stdout=PIPE,
        stderr=PIPE,
        universal_newlines=True,
        env=child_env)
    captured_out, captured_err = process.communicate()
    return process, captured_out, captured_err


def spoof(replace_program, with_spoof):
    """Build an environment that swaps a subprocess executable for a spoof

    ocrmypdf consults the environment variable OCRMYPDF_PROGRAMNAME before
    running an executable, and uses its value in place of the default
    program name/location; for example OCRMYPDF_GS points it away from the
    Ghostscript ("gs") found on the system path.

    The spoof script *with_spoof* is looked up in SPOOF_PATH and made
    executable (mode 0o755) if it is not already. Returns a copy of
    os.environ with the override variable for *replace_program* set.
    """
    spoofer = os.path.join(SPOOF_PATH, with_spoof)
    already_executable = os.access(spoofer, os.X_OK)
    if not already_executable:
        os.chmod(spoofer, 0o755)

    variable_name = 'OCRMYPDF_' + replace_program.upper()
    env = os.environ.copy()
    env[variable_name] = spoofer
    return env


@pytest.fixture
def spoof_tesseract_noop():
    """Environment in which tesseract is replaced by tesseract_noop.py."""
    env = spoof('tesseract', 'tesseract_noop.py')
    return env


@pytest.fixture
def spoof_tesseract_cache():
    """Environment in which tesseract is replaced by tesseract_cache.py.

    Inside Docker no spoof is installed; a plain copy of os.environ is
    returned instead.
    """
    if not running_in_docker():
        return spoof('tesseract', "tesseract_cache.py")
    return os.environ.copy()


@pytest.fixture
def spoof_tesseract_crash():
    """Environment in which tesseract is replaced by tesseract_crash.py."""
    env = spoof('tesseract', 'tesseract_crash.py')
    return env


@pytest.fixture
def spoof_tesseract_big_image_error():
    """Environment with tesseract replaced by tesseract_big_image_error.py."""
    env = spoof('tesseract', 'tesseract_big_image_error.py')
    return env


def test_quick(spoof_tesseract_noop):
    """Run c02-22.pdf through ocrmypdf with the tesseract_noop spoof."""
    source_name, result_name = 'c02-22.pdf', 'test_quick.pdf'
    check_ocrmypdf(source_name, result_name, env=spoof_tesseract_noop)


def test_deskew(spoof_tesseract_noop):
    """Deskew skew.pdf and verify the result measures as nearly straight."""
    # Produce the deskewed PDF
    deskewed_pdf = check_ocrmypdf(
        'skew.pdf', 'test_deskew.pdf',
        '-d',
        '-v', '1',
        env=spoof_tesseract_noop)

    from ocrmypdf.ghostscript import rasterize_pdf
    import logging
    from ocrmypdf.leptonica import Pix

    # Render the output back to a monochrome PNG so Leptonica can measure
    # whatever skew remains
    deskewed_png = _outfile('deskewed.png')
    root_log = logging.getLogger()
    rasterize_pdf(
        deskewed_pdf, deskewed_png,
        xres=150, yres=150,
        raster_device='pngmono',
        log=root_log)

    rendered = Pix.read(deskewed_png)
    skew_angle, _ = rendered.find_skew()

    print(skew_angle)
    assert -0.5 < skew_angle < 0.5, "Deskewing failed"


def test_clean(spoof_tesseract_noop):
    """Run skew.pdf through ocrmypdf with the -c option."""
    options = ['-c']
    check_ocrmypdf(
        'skew.pdf', 'test_clean.pdf', *options, env=spoof_tesseract_noop)


# This will run 5 * 2 * 2 = 20 test cases
@pytest.mark.parametrize(
    "pdf",
    ['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf'])
@pytest.mark.parametrize("renderer", ['hocr', 'tesseract'])
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type):
    """Process each input PDF with every renderer / output type pairing."""
    # Include output_type in the name so the 'pdf' and 'pdfa' cases do not
    # overwrite each other; otherwise a file left by the other case could
    # satisfy check_ocrmypdf's "output exists" assertion.
    output_name = 'test_{0}_{1}_{2}.pdf'.format(pdf, renderer, output_type)
    check_ocrmypdf(
        pdf,
        output_name,
        '-dc',
        '-v', '1',
        '--output-type', output_type,
        '--pdf-renderer', renderer, env=spoof_tesseract_cache)


@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
    ])
def test_preserve_metadata(spoof_tesseract_noop, output_type):
    """Check that /Title and /Author of graph.pdf survive processing."""
    pdf_before = pypdf.PdfFileReader(_infile('graph.pdf'))

    # One output file per output_type so the two parametrized cases do not
    # overwrite (or accidentally reuse) each other's result.
    output = check_ocrmypdf(
        'graph.pdf', 'test_metadata_preserve_%s.pdf' % output_type,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    pdf_after = pypdf.PdfFileReader(output)

    for key in ('/Title', '/Author'):
        assert pdf_before.documentInfo[key] == pdf_after.documentInfo[key]

    # The output must also report the requested output type
    pdfa_info = file_claims_pdfa(output)
    assert pdfa_info['output'] == output_type


@pytest.mark.skipif(
    is_linux() and not running_in_docker(),
    reason="likely to fail if Linux locale is not configured correctly")
@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
    ])
def test_override_metadata(spoof_tesseract_noop, output_type):
    """Set title/author/subject to non-ASCII text and read them back.

    The values are read back from the output with the external ``pdfinfo``
    program and compared with what was passed on the command line.
    """
    # One output file per output_type so the parametrized cases do not
    # overwrite each other.
    output_name = 'test_override_metadata_%s.pdf' % output_type
    output_file = _outfile(output_name)

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
    high_unicode = 'U+1030C is: 𐌌'

    # run_ocrmypdf() resolves the basenames itself
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', output_name,
        '--title', german,
        '--author', chinese,
        '--subject', high_unicode,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

    out_pdfinfo = check_output(
        ['pdfinfo', output_file], universal_newlines=True)
    pdfinfo = {}
    for line in out_pdfinfo.splitlines():
        # Split on the first colon only; skip lines that have none instead
        # of failing on tuple unpacking.
        key, separator, value = line.strip().partition(':')
        if not separator:
            continue
        pdfinfo[key.strip()] = value.strip()

    assert pdfinfo['Title'] == german
    assert pdfinfo['Author'] == chinese
    assert pdfinfo['Subject'] == high_unicode
    assert pdfinfo.get('Keywords', '') == ''

    pdfa_info = file_claims_pdfa(output_file)
    assert pdfa_info['output'] == output_type


@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_oversample(spoof_tesseract_cache, renderer):
    """With --oversample 350, the first page's xres must be within 1 of 350."""
    output_name = 'test_oversample_%s.pdf' % renderer
    oversampled_pdf = check_ocrmypdf(
        'skew.pdf', output_name,
        '--oversample', '350',
        '-f',
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(oversampled_pdf)[0]
    xres = first_page['xres']

    print(xres)
    assert abs(xres - 350) < 1


def test_repeat_ocr():
    """Processing graph_ocred.pdf with no options must exit nonzero."""
    result = run_ocrmypdf('graph_ocred.pdf', 'wontwork.pdf')
    process = result[0]
    assert process.returncode != 0


def test_force_ocr(spoof_tesseract_cache):
    """With -f, graph_ocred.pdf is processed and page one has text."""
    forced_pdf = check_ocrmypdf(
        'graph_ocred.pdf', 'test_force.pdf',
        '-f',
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(forced_pdf)[0]
    assert first_page['has_text']


def test_skip_ocr(spoof_tesseract_cache):
    """With -s, processing graph_ocred.pdf must succeed."""
    options = ['-s']
    check_ocrmypdf(
        'graph_ocred.pdf', 'test_skip.pdf', *options,
        env=spoof_tesseract_cache)


def test_argsfile(spoof_tesseract_noop):
    """Pass options through an '@file' argument, one token per line."""
    args_path = _outfile('test_argsfile.txt')
    tokens = ['--title', 'ArgsFile Test', '--author', 'Test Cases']
    with open(args_path, 'w') as argsfile:
        # Each token on its own line, with a trailing newline
        argsfile.write('\n'.join(tokens) + '\n')

    check_ocrmypdf(
        'graph.pdf', 'test_argsfile.pdf',
        '@' + args_path,
        env=spoof_tesseract_noop)


def check_monochrome_correlation(
        reference_pdf, reference_pageno, test_pdf, test_pageno):
    """Rasterize one page of each PDF and return their binary correlation.

    Both pages are rendered with the 'pngmono' device at xres=yres=100 into
    TEST_OUTPUT. A PNG that already exists is reused (its path is printed)
    rather than rendered again. The two images are then compared with
    leptonica.Pix.correlation_binary and that result is returned.
    """
    import logging
    import ocrmypdf.ghostscript as ghostscript

    gslog = logging.getLogger()

    reference_png = _outfile(
        '{}.ref{:04d}.png'.format(reference_pdf, reference_pageno))
    test_png = _outfile(
        '{}.test{:04d}.png'.format(test_pdf, test_pageno))

    render_jobs = (
        (reference_pdf, reference_pageno, reference_png),
        (test_pdf, test_pageno, test_png),
    )
    for pdf, pageno, png in render_jobs:
        if os.path.exists(png):
            # Already rendered earlier; reuse it
            print(png)
            continue
        ghostscript.rasterize_pdf(
            pdf, png,
            xres=100, yres=100,
            raster_device='pngmono', log=gslog, pageno=pageno)

    pix_ref = leptonica.Pix.read(reference_png)
    pix_test = leptonica.Pix.read(test_png)
    return leptonica.Pix.correlation_binary(pix_ref, pix_test)


def test_monochrome_correlation():
    """Sanity-check the correlation helper on pages of cardinal.pdf."""
    cardinal = _infile('cardinal.pdf')

    # Verify leptonica: a page compared with an incorrectly rotated page
    # (north facing page 1 vs south facing page 3) must correlate poorly
    mismatched = check_monochrome_correlation(
        reference_pdf=cardinal, reference_pageno=1,
        test_pdf=cardinal, test_pageno=3)
    assert mismatched < 0.10

    # A page compared with itself must correlate strongly
    identical = check_monochrome_correlation(
        reference_pdf=cardinal, reference_pageno=2,
        test_pdf=cardinal, test_pageno=2)
    assert identical > 0.90


@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_autorotate(spoof_tesseract_cache, renderer):
    """After -r, every page of cardinal.pdf should match the upright page."""
    # cardinal.pdf contains four copies of an image rotated in each cardinal
    # direction - these ones are "burned in" not tagged with /Rotate
    #
    # The renderer must be passed on the command line; otherwise both
    # parametrized cases run exactly the same command.
    out = check_ocrmypdf('cardinal.pdf', 'test_autorotate_%s.pdf' % renderer,
                         '-r', '-v', '1',
                         '--pdf-renderer', renderer,
                         env=spoof_tesseract_cache)
    for n in range(1, 4+1):
        correlation = check_monochrome_correlation(
            reference_pdf=_infile('cardinal.pdf'),
            reference_pageno=1,
            test_pdf=out,
            test_pageno=n)
        assert correlation > 0.80


def test_autorotate_threshold_low(spoof_tesseract_cache):
    """With --rotate-pages-threshold 1, page 3 should match page 1."""
    rotated_pdf = check_ocrmypdf(
        'cardinal.pdf', 'test_autorotate_threshold_low.pdf',
        '--rotate-pages-threshold', '1',
        '-r',
        '-v', '1',
        env=spoof_tesseract_cache)

    # Low threshold -> always rotate -> expect high correlation between
    # reference page and test page
    reference = _infile('cardinal.pdf')
    score = check_monochrome_correlation(
        reference_pdf=reference, reference_pageno=1,
        test_pdf=rotated_pdf, test_pageno=3)
    assert score > 0.80


def test_autorotate_threshold_high(spoof_tesseract_cache):
    """With --rotate-pages-threshold 99, page 3 should not match page 1."""
    unrotated_pdf = check_ocrmypdf(
        'cardinal.pdf', 'test_autorotate_threshold_high.pdf',
        '--rotate-pages-threshold', '99',
        '-r',
        '-v', '1',
        env=spoof_tesseract_cache)

    # High threshold -> never rotate -> expect low correlation since
    # test page will not be rotated
    reference = _infile('cardinal.pdf')
    score = check_monochrome_correlation(
        reference_pdf=reference, reference_pageno=1,
        test_pdf=unrotated_pdf, test_pageno=3)
    assert score < 0.10


@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_ocr_timeout(renderer):
    """With a 1.0 tesseract timeout, the output's first page has no text."""
    # Pass the renderer through; without it both parametrized cases would
    # run exactly the same command.
    out = check_ocrmypdf('skew.pdf', 'test_timeout_%s.pdf' % renderer,
                         '--tesseract-timeout', '1.0',
                         '--pdf-renderer', renderer)
    pdfinfo = pdf_get_all_pageinfo(out)
    assert not pdfinfo[0]['has_text']


def test_skip_big(spoof_tesseract_cache):
    """With --skip-big 10, enormous.pdf's first page ends up without text."""
    result_pdf = check_ocrmypdf(
        'enormous.pdf', 'test_enormous.pdf',
        '--skip-big', '10',
        env=spoof_tesseract_cache)
    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert not first_page['has_text']


@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_maximum_options(spoof_tesseract_cache, renderer, output_type):
    """Run multipage.pdf with a large combination of options at once."""
    # Name the output after both parameters so the four parametrized cases
    # each write their own file instead of overwriting one another.
    output_name = 'test_multipage%s_%s.pdf' % (renderer, output_type)
    check_ocrmypdf(
        'multipage.pdf', output_name,
        '-d', '-c', '-i', '-g', '-f', '-k', '--oversample', '300',
        '--skip-big', '10', '--title', 'Too Many Weird Files',
        '--author', 'py.test', '--pdf-renderer', renderer,
        '--output-type', output_type,
        env=spoof_tesseract_cache)


def test_tesseract_missing_tessdata():
    """With TESSDATA_PREFIX=/tmp the run must exit as missing_dependency."""
    env = dict(os.environ, TESSDATA_PREFIX='/tmp')

    options = ('-v', '1', '--skip-text')
    process, _, err = run_ocrmypdf(
        'graph_ocred.pdf', 'not_a_pdfa.pdf', *options, env=env)
    assert process.returncode == ExitCode.missing_dependency, err


def test_invalid_input_pdf():
    """invalid.pdf must be rejected with the input_file exit code."""
    process, _out, err = run_ocrmypdf('invalid.pdf', 'wont_be_created.pdf')
    exit_code = process.returncode
    assert exit_code == ExitCode.input_file, err


def test_blank_input_pdf():
    """blank.pdf must be processed with the ok exit code."""
    process, _out, _err = run_ocrmypdf('blank.pdf', 'still_blank.pdf')
    exit_code = process.returncode
    assert exit_code == ExitCode.ok


def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash):
    """--force-ocr on blank.pdf must still invoke tesseract."""
    # As a correctness test, make sure that --force-ocr on a PDF with no
    # content still triggers tesseract. If tesseract crashes, then it was
    # called.
    output_name = 'wont_be_created.pdf'
    p, _, err = run_ocrmypdf(
        'blank.pdf', output_name, '--force-ocr',
        env=spoof_tesseract_crash)
    assert p.returncode == ExitCode.child_process_error, err
    # Check the file this run was asked to write, not an unrelated name
    assert not os.path.exists(_outfile(output_name))


def test_french(spoof_tesseract_cache):
    """Process francais.pdf with '-l fra' and expect the ok exit code."""
    language_options = ('-l', 'fra')
    process, _out, _err = run_ocrmypdf(
        'francais.pdf', 'francais.pdf', *language_options,
        env=spoof_tesseract_cache)
    assert process.returncode == ExitCode.ok, \
        "This test may fail if Tesseract language packs are missing"


def test_klingon():
    """Requesting language 'klz' must exit with the bad_args code."""
    language_options = ('-l', 'klz')
    process, _out, _err = run_ocrmypdf(
        'francais.pdf', 'francais.pdf', *language_options)
    assert process.returncode == ExitCode.bad_args


def test_missing_docinfo(spoof_tesseract_noop):
    """missing_docinfo.pdf must process with the ok exit code."""
    options = ('-l', 'eng', '-c')
    process, _out, err = run_ocrmypdf(
        'missing_docinfo.pdf', 'missing_docinfo.pdf', *options,
        env=spoof_tesseract_noop)
    assert process.returncode == ExitCode.ok, err


@pytest.mark.skipif(running_in_docker(),
                    reason="writes to tests/resources")
def test_uppercase_extension(spoof_tesseract_noop):
    """Input and output names with an uppercase .PDF extension must work."""
    uppercase_input = _infile("UPPERCASE.PDF")
    # Work on a temporary copy of skew.pdf placed in the resources directory
    shutil.copy(_infile("skew.pdf"), uppercase_input)
    try:
        check_ocrmypdf(
            "UPPERCASE.PDF", "UPPERCASE_OUT.PDF",
            env=spoof_tesseract_noop)
    finally:
        # Always remove the temporary copy again
        os.unlink(uppercase_input)


def test_input_file_not_found():
    """A nonexistent input exits as input_file and is named in the output."""
    input_file = "does not exist.pdf"
    source_path = _infile(input_file)
    target_path = _outfile("will not happen.pdf")

    process, out, err = run_ocrmypdf(source_path, target_path)

    assert process.returncode == ExitCode.input_file
    # The name must be reported on at least one of the two streams
    assert any(input_file in stream for stream in (out, err))


def test_input_file_not_a_pdf():
    """A non-PDF input exits as input_file and is named in the output."""
    input_file = __file__  # Try to OCR this file
    source_path = _infile(input_file)
    target_path = _outfile("will not happen.pdf")

    process, out, err = run_ocrmypdf(source_path, target_path)

    assert process.returncode == ExitCode.input_file
    # The name must be reported on at least one of the two streams
    assert any(input_file in stream for stream in (out, err))


def test_qpdf_repair_fails():
    """With qpdf replaced by qpdf_dummy_return2.py, expect input_file."""
    # Locate the spoof through spoof() so the test does not depend on the
    # current working directory; this sets OCRMYPDF_QPDF in the returned env.
    env = spoof('qpdf', 'qpdf_dummy_return2.py')
    # run_ocrmypdf() takes the input and output names first and the extra
    # options afterwards; passing '-v', '1' first would bind them to the
    # input/output parameters and scramble the command line.
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf',
        '-v', '1',
        env=env)
    print(out)
    print(err)
    assert p.returncode == ExitCode.input_file


def test_encrypted():
    """skew-encrypted.pdf must be rejected with a message about a password."""
    p, out, err = run_ocrmypdf('skew-encrypted.pdf', 'wont_be_created.pdf')
    assert p.returncode == ExitCode.input_file
    # str.find() returns -1 (truthy) when the text is absent and 0 (falsy)
    # when it is at the very start, so it cannot be asserted on directly.
    # NOTE(review): either stream is accepted because it is not visible
    # from here which one ocrmypdf writes this message to.
    assert 'password' in out or 'password' in err


@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_pagesegmode(renderer, spoof_tesseract_cache):
    """Process skew.pdf with --tesseract-pagesegmode 7 for each renderer."""
    output_name = 'test_psm_%s.pdf' % renderer
    options = [
        '--tesseract-pagesegmode', '7',
        '-v', '1',
        '--pdf-renderer', renderer,
    ]
    check_ocrmypdf(
        'skew.pdf', output_name, *options, env=spoof_tesseract_cache)


@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_tesseract_crash(renderer, spoof_tesseract_crash):
    """A crashing tesseract gives child_process_error and no output file."""
    options = ('-v', '1', '--pdf-renderer', renderer)
    process, _out, err = run_ocrmypdf(
        'ccitt.pdf', 'wontwork.pdf', *options,
        env=spoof_tesseract_crash)

    assert process.returncode == ExitCode.child_process_error
    output_path = _outfile('wontwork.pdf')
    assert not os.path.exists(output_path)
    assert "ERROR" in err


def test_tesseract_crash_autorotate(spoof_tesseract_crash):
    """A crashing tesseract with -r gives child_process_error, no output."""
    p, out, err = run_ocrmypdf(
        'ccitt.pdf', 'wontwork.pdf',
        '-r', env=spoof_tesseract_crash)
    # Print the captured streams before asserting so they are part of the
    # report when an assertion fails; printed afterwards they would never
    # be reached on failure.
    print(out)
    print(err)
    assert p.returncode == ExitCode.child_process_error
    assert not os.path.exists(_outfile('wontwork.pdf'))
    assert "ERROR" in err


@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_tesseract_image_too_big(renderer, spoof_tesseract_big_image_error):
    """hugemono.pdf must still process with the big-image-error spoof."""
    output_name = 'hugemono_%s.pdf' % renderer
    options = ['-r', '--pdf-renderer', renderer]
    check_ocrmypdf(
        'hugemono.pdf', output_name, *options,
        env=spoof_tesseract_big_image_error)


def test_no_unpaper():
    """--clean with OCRMYPDF_UNPAPER set to no_unpaper_here.py must fail.

    The expected exit code is missing_dependency.
    """
    unpaper_path = os.path.abspath('./spoof/no_unpaper_here.py')
    env = dict(os.environ, OCRMYPDF_UNPAPER=unpaper_path)

    process, _out, _err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf', '--clean', env=env)
    assert process.returncode == ExitCode.missing_dependency


def test_old_unpaper():
    """--clean with unpaper_oldversion.py as unpaper must fail.

    The expected exit code is missing_dependency.
    """
    # Resolve the spoof through spoof() (relative to SPOOF_PATH) instead of
    # the current working directory. With a cwd-relative path the script may
    # simply not be found when pytest is started from another directory, and
    # the test would not actually exercise the old-version spoof.
    env = spoof('unpaper', 'unpaper_oldversion.py')
    p, out, err = run_ocrmypdf(
        'c02-22.pdf', 'wont_be_created.pdf', '--clean', env=env)
    assert p.returncode == ExitCode.missing_dependency


def test_algo4():
    """encrypted_algo4.pdf must exit with the encrypted_pdf code."""
    result = run_ocrmypdf('encrypted_algo4.pdf', 'wontwork.pdf')
    process = result[0]
    assert process.returncode == ExitCode.encrypted_pdf


@pytest.mark.parametrize('renderer', [
    'hocr'])  # tesseract cannot pass this test - resamples to square image
def test_non_square_resolution(renderer, spoof_tesseract_cache):
    """The differing xres/yres of aspect.pdf must be kept in the output."""
    # Confirm input image is non-square resolution
    source_page = pdf_get_all_pageinfo(_infile('aspect.pdf'))[0]
    assert source_page['xres'] != source_page['yres']

    output_name = 'aspect_%s.pdf' % renderer
    check_ocrmypdf(
        'aspect.pdf', output_name,
        '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    result_page = pdf_get_all_pageinfo(_outfile(output_name))[0]

    # Confirm resolution was kept the same
    assert source_page['xres'] == result_page['xres']
    assert source_page['yres'] == result_page['yres']


def test_image_to_pdf(spoof_tesseract_noop):
    """Convert LinnSequencer.jpg to a PDF using --image-dpi 200."""
    options = ['--image-dpi', '200']
    check_ocrmypdf(
        'LinnSequencer.jpg', 'image_to_pdf.pdf', *options,
        env=spoof_tesseract_noop)


def test_jbig2_passthrough(spoof_tesseract_cache):
    """The first image of the processed jbig2.pdf must still be 'jbig2'."""
    options = [
        '--output-type', 'pdf',
        '--pdf-renderer', 'hocr',
    ]
    result_pdf = check_ocrmypdf(
        'jbig2.pdf', 'jbig2_out.pdf', *options,
        env=spoof_tesseract_cache)

    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    first_image = first_page['images'][0]
    assert first_image['enc'] == 'jbig2'


def test_stdin(spoof_tesseract_noop):
    """Feed the input PDF to ocrmypdf on standard input ('-')."""
    input_file = _infile('francais.pdf')
    output_file = _outfile('test_stdin.pdf')

    # Runs: cat testfile.pdf | ocrmypdf - output.pdf

    p1_args = ['cat', input_file]
    p1 = Popen(p1_args, close_fds=True, stdin=DEVNULL, stdout=PIPE)

    p2_args = OCRMYPDF + ['-', output_file]
    p2 = Popen(
        p2_args, close_fds=True, stdout=PIPE, stderr=PIPE,
        stdin=p1.stdout, env=spoof_tesseract_noop)
    # Close the parent's copy of the pipe so that ocrmypdf holds the only
    # read end and cat is not left blocked if ocrmypdf exits early.
    p1.stdout.close()
    out, err = p2.communicate()
    # Reap the cat process so it does not linger as a zombie
    p1.wait()

    assert p2.returncode == ExitCode.ok, err


def test_masks(spoof_tesseract_noop):
    """Run masks.pdf through ocrmypdf with the tesseract_noop spoof."""
    source_name, result_name = 'masks.pdf', 'test_masks.pdf'
    check_ocrmypdf(source_name, result_name, env=spoof_tesseract_noop)


@pytest.mark.skipif(not os.path.isfile(_infile('milk.pdf')),
                    reason="fair use restricted test resource does not exist")
def test_linearized_pdf_and_indirect_object(spoof_tesseract_noop):
    """Run milk.pdf through ocrmypdf; skipped when the file is absent."""
    source_name, result_name = 'milk.pdf', 'test_milk.pdf'
    check_ocrmypdf(source_name, result_name, env=spoof_tesseract_noop)