OCRmyPDF/tests/test_main.py

#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83

from subprocess import Popen, PIPE, check_output, check_call, DEVNULL
import os
import shutil
from contextlib import suppress
import sys
import pytest
from ocrmypdf.pageinfo import pdf_get_all_pageinfo
import PyPDF2 as pypdf
from ocrmypdf.exceptions import ExitCode
from ocrmypdf import leptonica
from ocrmypdf.pdfa import file_claims_pdfa


check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof

@pytest.fixture
def spoof_tesseract_noop():
    return spoof(tesseract='tesseract_noop.py')


@pytest.fixture
def spoof_tesseract_cache():
    if pytest.helpers.running_in_docker():
        return os.environ.copy()
    return spoof(tesseract="tesseract_cache.py")


@pytest.fixture
def spoof_tesseract_crash():
    return spoof(tesseract='tesseract_crash.py')


@pytest.fixture
def spoof_tesseract_big_image_error():
    return spoof(tesseract='tesseract_big_image_error.py')


@pytest.fixture
def spoof_no_tess_no_pdfa():
    return spoof(tesseract='tesseract_noop.py', gs='gs_pdfa_failure.py')


@pytest.fixture
def spoof_no_tess_pdfa_warning():
    return spoof(tesseract='tesseract_noop.py', gs='gs_feature_elision.py')


@pytest.fixture
def spoof_qpdf_always_error():
    return spoof(qpdf='qpdf_dummy_return2.py')


def test_quick(spoof_tesseract_cache, resources, outpdf):
    check_ocrmypdf(resources / 'ccitt.pdf', outpdf, env=spoof_tesseract_cache)


def test_deskew(spoof_tesseract_noop, resources, outdir):
    # Run with deskew
    deskewed_pdf = check_ocrmypdf(
        resources / 'skew.pdf', outdir / 'skew.pdf', '-d', '-v', '1',
        env=spoof_tesseract_noop)

    # Now render as an image again and use Leptonica to find the skew angle
    # to confirm that it was deskewed
    from ocrmypdf.exec.ghostscript import rasterize_pdf
    import logging
    log = logging.getLogger()

    deskewed_png = outdir / 'deskewed.png'

    rasterize_pdf(
        str(deskewed_pdf),
        str(deskewed_png),
        xres=150,
        yres=150,
        raster_device='pngmono',
        log=log)

    from ocrmypdf.leptonica import Pix
    pix = Pix.read(str(deskewed_png))
    skew_angle, skew_confidence = pix.find_skew()

    print(skew_angle)
    assert -0.5 < skew_angle < 0.5, "Deskewing failed"


def test_clean(spoof_tesseract_noop, resources, outpdf):
    check_ocrmypdf(resources / 'skew.pdf', outpdf, '-c',
                   env=spoof_tesseract_noop)


def test_remove_background(spoof_tesseract_noop, resources, outdir):
    from PIL import Image

    # Ensure the input image does not contain pure white/black
    im = Image.open(resources / 'congress.jpg')
    assert im.getextrema() != ((0, 255), (0, 255), (0, 255))

    output_pdf = check_ocrmypdf(
        resources / 'congress.jpg',
        outdir / 'test_remove_bg.pdf',
        '--remove-background',
        '--image-dpi', '150',
        env=spoof_tesseract_noop)

    from ocrmypdf.exec.ghostscript import rasterize_pdf
    import logging
    log = logging.getLogger()

    output_png = outdir / 'remove_bg.png'

    rasterize_pdf(
        str(output_pdf),
        str(output_png),
        xres=100,
        yres=100,
        raster_device='png16m',
        log=log)


    # The output image should contain pure white and black
    im = Image.open(output_png)
    assert im.getextrema() == ((0, 255), (0, 255), (0, 255))


# This will run 5 * 2 * 2 = 20 test cases
@pytest.mark.parametrize(
    "pdf",
    ['palette.pdf', 'cmyk.pdf', 'ccitt.pdf', 'jbig2.pdf', 'lichtenstein.pdf'])
@pytest.mark.parametrize("renderer", ['hocr', 'tesseract'])
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type,
                      resources, outdir):
    check_ocrmypdf(
        resources / pdf,
        outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer),
        '-dc',
        '-v', '1',
        '--output-type', output_type,
        '--pdf-renderer', renderer, env=spoof_tesseract_cache)


@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
    ])
def test_preserve_metadata(spoof_tesseract_noop, output_type,
                           resources, outpdf):
    pdf_before = pypdf.PdfFileReader(str(resources / 'graph.pdf'))

    output = check_ocrmypdf(
            resources / 'graph.pdf', outpdf,
            '--output-type', output_type,
            env=spoof_tesseract_noop)

    pdf_after = pypdf.PdfFileReader(str(output))

    for key in ('/Title', '/Author'):
        assert pdf_before.documentInfo[key] == pdf_after.documentInfo[key]

    pdfa_info = file_claims_pdfa(str(output))
    assert pdfa_info['output'] == output_type


@pytest.mark.skipif(
    pytest.helpers.is_linux() and not pytest.helpers.running_in_docker(),
    reason="likely to fail if Linux locale is not configured correctly")
@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
    ])
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    input_file = resources / 'c02-22.pdf'

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
    high_unicode = 'U+1030C is: 𐌌'

    p, out, err = run_ocrmypdf(
        input_file, outpdf,
        '--title', german,
        '--author', chinese,
        '--subject', high_unicode,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok

    pdf = str(outpdf)

    out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)
    lines_pdfinfo = out_pdfinfo.splitlines()
    pdfinfo = {}
    for line in lines_pdfinfo:
        k, v = line.strip().split(':', maxsplit=1)
        pdfinfo[k.strip()] = v.strip()

    assert pdfinfo['Title'] == german
    assert pdfinfo['Author'] == chinese
    assert pdfinfo['Subject'] == high_unicode
    assert pdfinfo.get('Keywords', '') == ''

    pdfa_info = file_claims_pdfa(pdf)
    assert pdfa_info['output'] == output_type


@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
    oversampled_pdf = check_ocrmypdf(
        resources / 'skew.pdf', outpdf, '--oversample', '350',
        '-f',
        '--pdf-renderer', renderer, env=spoof_tesseract_cache)

    pdfinfo = pdf_get_all_pageinfo(str(oversampled_pdf))

    print(pdfinfo[0]['xres'])
    assert abs(pdfinfo[0]['xres'] - 350) < 1


def test_repeat_ocr(resources, no_outpdf):
    p, _, _ = run_ocrmypdf(resources / 'graph_ocred.pdf', no_outpdf)
    assert p.returncode != 0


def test_force_ocr(spoof_tesseract_cache, resources, outpdf):
    out = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, '-f',
                         env=spoof_tesseract_cache)
    pdfinfo = pdf_get_all_pageinfo(out)
    assert pdfinfo[0]['has_text']


def test_skip_ocr(spoof_tesseract_cache, resources, outpdf):
    check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, '-s',
                   env=spoof_tesseract_cache)


def test_argsfile(spoof_tesseract_noop, resources, outdir):
    with open(outdir / 'test_argsfile.txt', 'w') as argsfile:
        print('--title', 'ArgsFile Test', '--author', 'Test Cases',
              sep='\n', end='\n', file=argsfile)
    check_ocrmypdf(resources / 'graph.pdf', outdir / 'args.pdf',
                   '@' + str(outdir / 'test_argsfile.txt'),
                   env=spoof_tesseract_noop)


def check_monochrome_correlation(
        outdir,
        reference_pdf, reference_pageno, test_pdf, test_pageno):

    import ocrmypdf.exec.ghostscript as ghostscript
    import logging

    gslog = logging.getLogger()

    reference_png = outdir / '{}.ref{:04d}.png'.format(
        reference_pdf, reference_pageno)
    test_png = outdir / '{}.test{:04d}.png'.format(
        test_pdf, test_pageno)

    def rasterize(pdf, pageno, png):
        if png.exists():
            print(png)
            return
        ghostscript.rasterize_pdf(
            str(pdf),
            str(png),
            xres=100, yres=100,
            raster_device='pngmono', log=gslog, pageno=pageno)

    rasterize(reference_pdf, reference_pageno, reference_png)
    rasterize(test_pdf, test_pageno, test_png)

    pix_ref = leptonica.Pix.read(str(reference_png))
    pix_test = leptonica.Pix.read(str(test_png))

    return leptonica.Pix.correlation_binary(pix_ref, pix_test)


def test_monochrome_correlation(resources, outdir):
    # Verify leptonica: check that an incorrect rotated image has poor
    # correlation with reference
    corr = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,  # north facing page
        test_pdf=resources / 'cardinal.pdf',
        test_pageno=3,  # south facing page
        )
    assert corr < 0.10
    corr = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=2,
        test_pdf=resources / 'cardinal.pdf',
        test_pageno=2,
        )
    assert corr > 0.90


@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_autorotate(spoof_tesseract_cache, renderer, resources, outdir):
    # cardinal.pdf contains four copies of an image rotated in each cardinal
    # direction - these ones are "burned in" not tagged with /Rotate
    out = check_ocrmypdf(resources / 'cardinal.pdf', outdir / 'out.pdf',
                         '-r', '-v', '1', env=spoof_tesseract_cache)
    for n in range(1, 4+1):
        correlation = check_monochrome_correlation(
            outdir,
            reference_pdf=resources / 'cardinal.pdf',
            reference_pageno=1,
            test_pdf=outdir / 'out.pdf',
            test_pageno=n)
        assert correlation > 0.80


def test_autorotate_threshold_low(spoof_tesseract_cache, resources, outdir):
    out = check_ocrmypdf(resources / 'cardinal.pdf', outdir / 'out.pdf',
                         '--rotate-pages-threshold', '1',
                         '-r', '-v', '1', env=spoof_tesseract_cache)

    # Low threshold -> always rotate -> expect high correlation between
    # reference page and test page
    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,
        test_pdf=out,
        test_pageno=3)
    assert correlation > 0.80


def test_autorotate_threshold_high(spoof_tesseract_cache, resources, outdir):
    out = check_ocrmypdf(resources / 'cardinal.pdf', outdir / 'out.pdf',
                         '--rotate-pages-threshold', '99',
                         '-r', '-v', '1', env=spoof_tesseract_cache)

    # High threshold -> never rotate -> expect low correlation since
    # test page will not be rotated
    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,
        test_pdf=out,
        test_pageno=3)
    assert correlation < 0.10


@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_ocr_timeout(renderer, resources, outpdf):
    out = check_ocrmypdf(resources / 'skew.pdf', outpdf,
                         '--tesseract-timeout', '1.0')
    pdfinfo = pdf_get_all_pageinfo(str(out))
    assert not pdfinfo[0]['has_text']


def test_skip_big(spoof_tesseract_cache, resources, outpdf):
    out = check_ocrmypdf(resources / 'enormous.pdf', outpdf,
                         '--skip-big', '10', env=spoof_tesseract_cache)
    pdfinfo = pdf_get_all_pageinfo(str(out))
    assert not pdfinfo[0]['has_text']


@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_maximum_options(spoof_tesseract_cache, renderer, output_type,
                         resources, outpdf):
    check_ocrmypdf(
        resources / 'multipage.pdf', outpdf,
        '-d', '-c', '-i', '-g', '-f', '-k', '--oversample', '300',
        '--remove-background',
        '--skip-big', '10', '--title', 'Too Many Weird Files',
        '--author', 'py.test', '--pdf-renderer', renderer,
        '--output-type', output_type,
        env=spoof_tesseract_cache)


def test_tesseract_missing_tessdata(resources, no_outpdf):
    env = os.environ.copy()
    env['TESSDATA_PREFIX'] = '/tmp'

    p, _, err = run_ocrmypdf(
        resources / 'graph_ocred.pdf', no_outpdf,
        '-v', '1', '--skip-text', env=env)
    assert p.returncode == ExitCode.missing_dependency, err


def test_invalid_input_pdf(resources, no_outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'invalid.pdf', no_outpdf)
    assert p.returncode == ExitCode.input_file, err


def test_blank_input_pdf(resources, outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'blank.pdf', outpdf)
    assert p.returncode == ExitCode.ok


def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash, resources,
                                         no_outpdf):
    # As a correctness test, make sure that --force-ocr on a PDF with no
    # content still triggers tesseract. If tesseract crashes, then it was
    # called.
    p, _, err = run_ocrmypdf(
        resources / 'blank.pdf', no_outpdf, '--force-ocr',
        env=spoof_tesseract_crash)
    assert p.returncode == ExitCode.child_process_error, err
    assert not os.path.exists(no_outpdf)


def test_french(spoof_tesseract_cache, resources, outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'francais.pdf', outpdf, '-l', 'fra',
        env=spoof_tesseract_cache)
    assert p.returncode == ExitCode.ok, \
        "This test may fail if Tesseract language packs are missing"


def test_klingon(resources, outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'francais.pdf', outpdf, '-l', 'klz')
    assert p.returncode == ExitCode.bad_args


def test_missing_docinfo(spoof_tesseract_noop, resources, outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'missing_docinfo.pdf', outpdf, '-l', 'eng', '-c',
        env=spoof_tesseract_noop)
    assert p.returncode == ExitCode.ok, err


@pytest.mark.skipif(pytest.helpers.running_in_docker(),
                    reason="<no longer true> writes to tests/resources")
def test_uppercase_extension(spoof_tesseract_noop, resources, outdir):
    shutil.copy(
        str(resources / "skew.pdf"),
        str(outdir / "UPPERCASE.PDF"))

    check_ocrmypdf(outdir / "UPPERCASE.PDF", outdir / "UPPERCASE_OUT.PDF",
                   env=spoof_tesseract_noop)


def test_input_file_not_found(no_outpdf):
    input_file = "does not exist.pdf"
    p, out, err = run_ocrmypdf(
        input_file,
        no_outpdf)
    assert p.returncode == ExitCode.input_file
    assert (input_file in out or input_file in err)


def test_input_file_not_a_pdf(no_outpdf):
    input_file = __file__  # Try to OCR this file
    p, out, err = run_ocrmypdf(
        input_file,
        no_outpdf)
    assert p.returncode == ExitCode.input_file
    assert (input_file in out or input_file in err)


def test_qpdf_repair_fails(spoof_qpdf_always_error, resources, no_outpdf):
    p, out, err = run_ocrmypdf(
        '-v', '1',
        resources / 'c02-22.pdf', no_outpdf, env=spoof_qpdf_always_error)
    print(out)
    print(err)
    assert p.returncode == ExitCode.input_file


def test_encrypted(resources, no_outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'skew-encrypted.pdf', no_outpdf)
    assert p.returncode == ExitCode.encrypted_pdf
    assert out.find('password')


@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_pagesegmode(renderer, spoof_tesseract_cache, resources, outpdf):
    check_ocrmypdf(
        resources / 'skew.pdf', outpdf,
        '--tesseract-pagesegmode', '7',
        '-v', '1',
        '--pdf-renderer', renderer, env=spoof_tesseract_cache)


@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_tesseract_crash(renderer, spoof_tesseract_crash,
                         resources, no_outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'ccitt.pdf', no_outpdf, '-v', '1',
        '--pdf-renderer', renderer, env=spoof_tesseract_crash)
    assert p.returncode == ExitCode.child_process_error
    assert not os.path.exists(no_outpdf)
    assert "ERROR" in err


def test_tesseract_crash_autorotate(spoof_tesseract_crash,
                                    resources, no_outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'ccitt.pdf', no_outpdf,
        '-r', env=spoof_tesseract_crash)
    assert p.returncode == ExitCode.child_process_error
    assert not os.path.exists(no_outpdf)
    assert "ERROR" in err
    print(out)
    print(err)


@pytest.mark.parametrize('renderer', [
    'hocr',
    'tesseract',
    ])
def test_tesseract_image_too_big(renderer, spoof_tesseract_big_image_error,
                                 resources, outpdf):
    check_ocrmypdf(
        resources / 'hugemono.pdf', outpdf, '-r',
        '--pdf-renderer', renderer, env=spoof_tesseract_big_image_error)


def test_no_unpaper(resources, no_outpdf):
    env = os.environ.copy()
    env['OCRMYPDF_UNPAPER'] = os.path.abspath('./spoof/no_unpaper_here.py')
    p, out, err = run_ocrmypdf(
        resources / 'c02-22.pdf', no_outpdf, '--clean', env=env)
    assert p.returncode == ExitCode.missing_dependency


def test_old_unpaper(resources, no_outpdf):
    env = os.environ.copy()
    env['OCRMYPDF_UNPAPER'] = os.path.abspath('./spoof/unpaper_oldversion.py')
    p, out, err = run_ocrmypdf(
        resources / 'c02-22.pdf', no_outpdf, '--clean', env=env)
    assert p.returncode == ExitCode.missing_dependency


def test_algo4(resources, no_outpdf):
    p, _, _ = run_ocrmypdf(resources / 'encrypted_algo4.pdf', no_outpdf)
    assert p.returncode == ExitCode.encrypted_pdf


@pytest.mark.parametrize('renderer', [
    'hocr'])  # tesseract cannot pass this test - resamples to square image
def test_non_square_resolution(renderer, spoof_tesseract_cache,
                               resources, outpdf):
    # Confirm input image is non-square resolution
    in_pageinfo = pdf_get_all_pageinfo(str(resources / 'aspect.pdf'))
    assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']

    check_ocrmypdf(
        resources / 'aspect.pdf', outpdf,
        '--pdf-renderer', renderer, env=spoof_tesseract_cache)

    out_pageinfo = pdf_get_all_pageinfo(str(outpdf))

    # Confirm resolution was kept the same
    assert in_pageinfo[0]['xres'] == out_pageinfo[0]['xres']
    assert in_pageinfo[0]['yres'] == out_pageinfo[0]['yres']


def test_image_to_pdf(spoof_tesseract_noop, resources, outpdf):
    check_ocrmypdf(
        resources / 'LinnSequencer.jpg', outpdf, '--image-dpi', '200',
        env=spoof_tesseract_noop)


def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
    out = check_ocrmypdf(
        resources / 'jbig2.pdf', outpdf,
        '--output-type', 'pdf',
        '--pdf-renderer', 'hocr',
        env=spoof_tesseract_cache)

    out_pageinfo = pdf_get_all_pageinfo(str(out))
    assert out_pageinfo[0]['images'][0]['enc'] == 'jbig2'


def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    # Runs: ocrmypdf - output.pdf < testfile.pdf
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + ['-', output_file]
        p = Popen(
            p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
            stdin=input_stream, env=spoof_tesseract_noop)
        out, err = p.communicate()

        assert p.returncode == ExitCode.ok


def test_stdout(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
    input_file = str(resources / 'francais.pdf')
    output_file = str(outpdf)

    # Runs: ocrmypdf francais.pdf - > test_stdout.pdf
    with open(output_file, 'wb') as output_stream:
        p_args = ocrmypdf_exec + [input_file, '-']
        p = Popen(
            p_args, close_fds=True, stdout=output_stream, stderr=PIPE,
            stdin=DEVNULL, env=spoof_tesseract_noop)
        out, err = p.communicate()

        assert p.returncode == ExitCode.ok

    from ocrmypdf.exec import qpdf
    assert qpdf.check(output_file, log=None)


def test_masks(spoof_tesseract_noop, resources, outpdf):
    check_ocrmypdf(resources / 'masks.pdf', outpdf, env=spoof_tesseract_noop)


def test_linearized_pdf_and_indirect_object(spoof_tesseract_noop,
        resources, outpdf):
    check_ocrmypdf(
        resources / 'epson.pdf', outpdf,
        env=spoof_tesseract_noop)


def test_rotated_skew_timeout(resources, outpdf):
    """This document contains an image that is rotated 90 into place with a
    /Rotate tag and intentionally skewed by altering the transformation matrix.

    This tests for a bug where the combinatino of preprocessing and a tesseract
    timeout produced a page whose dimensions did not match the original's.
    """

    input_file = str(resources / 'rotated_skew.pdf')
    in_pageinfo = pdf_get_all_pageinfo(input_file)[0]

    assert in_pageinfo['height_pixels'] < in_pageinfo['width_pixels'], \
        "Expected the input page to be landscape"
    assert in_pageinfo['rotate'] == 90, "Expected a rotated page"

    out = check_ocrmypdf(
        input_file, outpdf,
        '--pdf-renderer', 'hocr',
        '--deskew', '--tesseract-timeout', '0')

    out_pageinfo = pdf_get_all_pageinfo(str(out))[0]

    assert out_pageinfo['height_pixels'] > out_pageinfo['width_pixels'], \
        "Expected the output page to be portrait"

    assert out_pageinfo['rotate'] == 0, \
        "Expected no page rotation for output"

    assert in_pageinfo['width_pixels'] == out_pageinfo['height_pixels'] and \
        in_pageinfo['height_pixels'] == out_pageinfo['width_pixels'], \
        "Expected page rotation to be baked in"


def test_ghostscript_pdfa_failure(spoof_no_tess_no_pdfa, resources, outpdf):
    p, out, err = run_ocrmypdf(
        resources / 'ccitt.pdf', outpdf,
        env=spoof_no_tess_no_pdfa)
    assert p.returncode == 4, "Expected return code 4 when PDF/A fails"


def test_ghostscript_feature_elision(spoof_no_tess_pdfa_warning,
        resources, outpdf):
    check_ocrmypdf(resources / 'ccitt.pdf', outpdf,
                   env=spoof_no_tess_pdfa_warning)


def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf):
    "Checks for a Decimal quantize error with high DPI, etc"
    check_ocrmypdf(resources / '2400dpi.pdf', outpdf,
                   env=spoof_tesseract_cache)