OCRmyPDF/tests/test_main.py

#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83

from __future__ import print_function
from subprocess import Popen, PIPE, check_output
import os
import shutil
from contextlib import suppress
import sys
from unittest.mock import patch, create_autospec
import pytest
from ocrmypdf.pageinfo import pdf_get_all_pageinfo
import PyPDF2 as pypdf
from ocrmypdf import ExitCode


# The message promises 3.4+, so compare the (major, minor) pair instead of
# the major version alone; 3.0-3.3 would otherwise slip past this guard.
if sys.version_info < (3, 4):
    print("Requires Python 3.4+")
    sys.exit(1)

# Directory that contains this test module.
TESTS_ROOT = os.path.dirname(os.path.abspath(__file__))
# Parent of the tests directory.
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
# Shell script that run_ocrmypdf_sh executes through `sh`.
OCRMYPDF = os.path.join(PROJECT_ROOT, 'OCRmyPDF.sh')
# Input fixtures, and the directory that receives generated files.
TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')
TEST_OUTPUT = os.path.join(PROJECT_ROOT, 'tests', 'output')
# Directory for symlinks that shadow real binaries on PATH.
TEST_BINARY_PATH = os.path.join(TEST_OUTPUT, 'bin')


def setup_module():
    """Module-level setup hook: start from an empty output directory.

    Any output tree left by an earlier run is deleted, then the directory
    is created again. A missing tree, or a directory that already exists,
    is not treated as an error.
    """
    try:
        shutil.rmtree(TEST_OUTPUT)
    except FileNotFoundError:
        pass  # nothing left over to delete
    try:
        os.mkdir(TEST_OUTPUT)
    except FileExistsError:
        pass  # directory is already there


def run_ocrmypdf_sh(input_file, output_file, *args):
    """Run the OCRmyPDF.sh script through ``sh`` and wait for it to finish.

    Extra ``args`` go on the command line ahead of the input and output
    paths, which are passed through exactly as given.

    Returns a ``(process, stdout, stderr)`` tuple: the finished ``Popen``
    object and its captured output as text.
    """
    command = ['sh', OCRMYPDF]
    command.extend(args)
    command.extend((input_file, output_file))
    proc = Popen(
        command,
        close_fds=True,
        stdout=PIPE,
        stderr=PIPE,
        universal_newlines=True,
    )
    stdout_text, stderr_text = proc.communicate()
    return proc, stdout_text, stderr_text


def _make_input(input_basename):
    """Return the path of ``input_basename`` inside the resources directory."""
    resource_path = os.path.join(TEST_RESOURCES, input_basename)
    return resource_path


def _make_output(output_basename):
    """Return the path of ``output_basename`` inside the output directory."""
    result_path = os.path.join(TEST_OUTPUT, output_basename)
    return result_path


def check_ocrmypdf(input_basename, output_basename, *args):
    """Run OCRmyPDF.sh on a resource file and check that it wrote a PDF.

    ``input_basename`` is resolved against the resources directory and
    ``output_basename`` against the output directory; ``args`` are passed
    to the script unchanged.

    Asserts that the script exited with status 0 (its stderr is the
    failure message), that the output file exists, and that it is larger
    than 100 bytes. Returns the output path.
    """
    source_path = _make_input(input_basename)
    target_path = _make_output(output_basename)

    proc, _, stderr_text = run_ocrmypdf_sh(source_path, target_path, *args)
    assert proc.returncode == 0, stderr_text
    assert os.path.exists(target_path), "Output file not created"
    assert os.stat(target_path).st_size > 100, "PDF too small or empty"
    return target_path


def run_ocrmypdf_env(input_basename, output_basename, env, *args):
    """Run the ``ocrmypdf`` command with an explicit environment mapping.

    The basenames are resolved against the resources and output
    directories; ``args`` precede the two paths on the command line and
    ``env`` is handed to ``Popen`` as the child environment.

    Returns a ``(process, stdout, stderr)`` tuple with the captured output
    as text.
    """
    source_path = _make_input(input_basename)
    target_path = _make_output(output_basename)

    command = ['ocrmypdf']
    command.extend(args)
    command.extend((source_path, target_path))
    proc = Popen(
        command,
        close_fds=True,
        stdout=PIPE,
        stderr=PIPE,
        universal_newlines=True,
        env=env,
    )
    stdout_text, stderr_text = proc.communicate()
    return proc, stdout_text, stderr_text


def test_quick():
    """Smoke test: a run with no extra options on c02-22.pdf succeeds."""
    input_name, output_name = 'c02-22.pdf', 'test_quick.pdf'
    check_ocrmypdf(input_name, output_name)


def test_deskew():
    """Check that a ``-d`` run leaves no measurable skew on the page.

    The output PDF is rasterized to a ``pngmono`` image with xres/yres of
    150, and the skew angle reported by Leptonica for that image must lie
    strictly between -0.5 and 0.5.
    """
    # Produce the deskewed PDF first.
    output_pdf = check_ocrmypdf('skew.pdf', 'test_deskew.pdf', '-d')

    # Render the result back to an image so the remaining skew can be
    # measured.
    from ocrmypdf.ghostscript import rasterize_pdf
    import logging
    root_logger = logging.getLogger()

    raster_png = _make_output('deskewed.png')

    rasterize_pdf(output_pdf, raster_png, xres=150, yres=150,
                  raster_device='pngmono', log=root_logger)

    from ocrmypdf.leptonica import pixRead, pixDestroy, pixFindSkew
    image = pixRead(raster_png)
    angle, _confidence = pixFindSkew(image)
    # Rebind the name to whatever pixDestroy returns so the old handle is
    # no longer referenced here.
    image = pixDestroy(image)

    print(angle)
    assert -0.5 < angle < 0.5, "Deskewing failed"


def test_clean():
    """A run on skew.pdf with the ``-c`` option succeeds and writes output."""
    input_name, output_name = 'skew.pdf', 'test_clean.pdf'
    check_ocrmypdf(input_name, output_name, '-c')


def test_preserve_metadata():
    """The /Title and /Author of graph.pdf must be unchanged in the output."""
    original = pypdf.PdfFileReader(_make_input('graph.pdf'))

    result_path = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf')

    processed = pypdf.PdfFileReader(result_path)

    for field in ('/Title', '/Author'):
        expected = original.documentInfo[field]
        actual = processed.documentInfo[field]
        assert expected == actual


def test_override_metadata():
    """Metadata given on the command line must appear in the output PDF.

    Title, author and subject are set to non-ASCII strings (including a
    character outside the Basic Multilingual Plane) and read back from the
    ``pdfinfo`` report. Keywords were not supplied, so they must be absent
    or empty.
    """
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
    high_unicode = 'U+1030C is: 𐌌'

    pdf = check_ocrmypdf(
        'c02-22.pdf', 'test_metadata.pdf',
        '--title', german,
        '--author', chinese,
        '--subject', high_unicode)

    out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)
    pdfinfo = {}
    for line in out_pdfinfo.splitlines():
        # Split on the first colon only, since values may contain colons
        # themselves. Lines with no colon at all (blank lines, stray
        # notices) are skipped instead of aborting the test with an
        # unrelated unpacking error.
        key, colon, value = line.partition(':')
        if not colon:
            continue
        pdfinfo[key.strip()] = value.strip()

    assert pdfinfo['Title'] == german
    assert pdfinfo['Author'] == chinese
    assert pdfinfo['Subject'] == high_unicode
    assert pdfinfo.get('Keywords', '') == ''


def check_oversample(renderer):
    """Run skew.pdf with ``--oversample 300`` through the given renderer.

    The ``xres`` reported for the first page of the output must be within
    1 of 300.
    """
    output_name = 'test_oversample_%s.pdf' % renderer
    oversampled_pdf = check_ocrmypdf(
        'skew.pdf', output_name,
        '--oversample', '300',
        '--pdf-renderer', renderer)

    first_page = pdf_get_all_pageinfo(oversampled_pdf)[0]

    xres = first_page['xres']
    print(xres)
    assert abs(xres - 300) < 1


def test_oversample():
    """Run the oversample check once for each PDF renderer.

    Yield-style test generators were removed in pytest 4, which left these
    checks unexecuted. Each renderer is therefore checked with a direct
    call, so the test keeps its zero-argument signature and runs on any
    pytest version.
    """
    for renderer in ('hocr', 'tesseract'):
        check_oversample(renderer)


def test_repeat_ocr():
    """Running on graph_ocred.pdf with no extra options must fail.

    run_ocrmypdf_sh takes complete paths, so the basenames are resolved
    here. With bare basenames the script fails simply because the input
    file cannot be found, and the test would pass for the wrong reason.
    """
    sh, _, _ = run_ocrmypdf_sh(
        _make_input('graph_ocred.pdf'), _make_output('wontwork.pdf'))
    assert sh.returncode != 0


def test_force_ocr():
    """A ``-f`` run on graph_ocred.pdf succeeds and page 1 has text."""
    forced_pdf = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')
    first_page = pdf_get_all_pageinfo(forced_pdf)[0]
    assert first_page['has_text']


def test_skip_ocr():
    """A ``-s`` run on graph_ocred.pdf succeeds and writes output."""
    input_name, output_name = 'graph_ocred.pdf', 'test_skip.pdf'
    check_ocrmypdf(input_name, output_name, '-s')


def test_argsfile():
    """Options read from an ``@file`` argument are accepted.

    The arguments file holds one token per line and is passed to the
    script as ``@<path>``.
    """
    args_path = _make_output('test_argsfile.txt')
    tokens = ('--title', 'ArgsFile Test', '--author', 'Test Cases')
    with open(args_path, 'w') as argsfile:
        argsfile.write('\n'.join(tokens) + '\n')
    check_ocrmypdf('graph.pdf', 'test_argsfile.pdf', '@' + args_path)


def check_ocr_timeout(renderer):
    """Run skew.pdf with a 1.0 ``--tesseract-timeout`` via ``renderer``.

    The first page of the output must be reported as having no text.

    The renderer is passed on the command line, as the other per-renderer
    checks do. Without it, every renderer case ran the identical command
    and only the output filename differed.
    """
    out = check_ocrmypdf('skew.pdf', 'test_timeout_%s.pdf' % renderer,
                         '--tesseract-timeout', '1.0',
                         '--pdf-renderer', renderer)
    pdfinfo = pdf_get_all_pageinfo(out)
    assert not pdfinfo[0]['has_text']


def test_ocr_timeout():
    """Run the OCR timeout check once for each PDF renderer.

    Yield-style test generators were removed in pytest 4, which left these
    checks unexecuted. Each renderer is therefore checked with a direct
    call, so the test keeps its zero-argument signature and runs on any
    pytest version.
    """
    for renderer in ('hocr', 'tesseract'):
        check_ocr_timeout(renderer)


def test_skip_big():
    """With ``--skip-big 10``, page 1 of enormous.pdf ends up without text."""
    result_pdf = check_ocrmypdf(
        'enormous.pdf', 'test_enormous.pdf', '--skip-big', '10')
    first_page = pdf_get_all_pageinfo(result_pdf)[0]
    assert first_page['has_text'] == False


def check_maximum_options(renderer):
    """Run multipage.pdf with many options enabled at once.

    Only a successful run and a plausible output file are required; see
    check_ocrmypdf for the exact assertions.
    """
    output_name = 'test_multipage%s.pdf' % renderer
    options = [
        '-d', '-c', '-i', '-g', '-f', '-k',
        '--oversample', '300',
        '--skip-big', '10',
        '--title', 'Too Many Weird Files',
        '--author', 'py.test',
        '--pdf-renderer', renderer,
    ]
    check_ocrmypdf('multipage.pdf', output_name, *options)


def test_maximum_options():
    """Run the maximum-options check once for each PDF renderer.

    Yield-style test generators were removed in pytest 4, which left these
    checks unexecuted. Each renderer is therefore checked with a direct
    call, so the test keeps its zero-argument signature and runs on any
    pytest version.
    """
    for renderer in ('hocr', 'tesseract'):
        check_maximum_options(renderer)


def override_binary(binary, replacement):
    """Shadow ``binary`` with a script from the tests directory.

    A symlink named ``binary`` is created in the test binary directory,
    pointing at ``replacement`` (a file name relative to the tests
    directory). The replacement file itself is given mode 0o755.

    Asserts that nothing already exists at the symlink location.

    Returns a PATH string with the test binary directory placed ahead of
    the current ``PATH``.
    """
    try:
        os.makedirs(TEST_BINARY_PATH)
    except FileExistsError:
        pass  # directory left from an earlier override

    target = os.path.abspath(os.path.join(TESTS_ROOT, replacement))
    link = os.path.abspath(os.path.join(TEST_BINARY_PATH, binary))
    assert not os.path.lexists(link)
    print("symlink %s -> %s" % (target, link))
    os.symlink(target, link)

    os.chmod(target, 0o755)

    shadow_dir = os.path.dirname(link)
    return shadow_dir + os.pathsep + os.environ["PATH"]


@pytest.fixture
def break_ghostscript_pdfa():
    """Shadow ``gs`` with replace_ghostscript_nopdfa.py; return the new PATH."""
    patched_path = override_binary('gs', 'replace_ghostscript_nopdfa.py')
    return patched_path


def test_ghostscript_pdfa_fails(break_ghostscript_pdfa):
    """With ``gs`` shadowed, ocrmypdf must exit with invalid_output_pdfa.

    ``break_ghostscript_pdfa`` supplies the PATH value that puts the
    replacement ``gs`` first.
    """
    # Work on a copy: assigning into os.environ itself would leave the
    # overridden PATH in place for every test that runs afterwards.
    env = os.environ.copy()
    env['PATH'] = break_ghostscript_pdfa

    p, _, err = run_ocrmypdf_env(
        'graph_ocred.pdf', 'not_a_pdfa.pdf', env, '-v', '1', '--skip-text')
    assert p.returncode == ExitCode.invalid_output_pdfa, err  # not PDFA


def test_tesseract_missing_tessdata():
    """With TESSDATA_PREFIX set to /tmp, expect missing_dependency.

    The run must exit with ``ExitCode.missing_dependency``; its stderr is
    used as the assertion message.
    """
    # Work on a copy: assigning into os.environ itself would leave the
    # bogus TESSDATA_PREFIX set for every test that runs afterwards.
    env = os.environ.copy()
    env['TESSDATA_PREFIX'] = '/tmp'

    p, _, err = run_ocrmypdf_env(
        'graph_ocred.pdf', 'not_a_pdfa.pdf', env, '-v', '1', '--skip-text')
    assert p.returncode == ExitCode.missing_dependency, err
Basic test cases 2015-07-22 02:59:25 -07:00			`#!/usr/bin/env python3`
Update release notes, add copyrights 2015-07-28 04:36:58 -07:00			`# © 2015 James R. Barlow: github.com/jbarlow83`
Basic test cases 2015-07-22 02:59:25 -07:00
Goodbye, so long, farewell, shell... 2015-07-25 00:57:07 -07:00			`from __future__ import print_function`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00			`from subprocess import Popen, PIPE, check_output`
Basic test cases 2015-07-22 02:59:25 -07:00			`import os`
New test: check skew 2015-07-22 04:00:59 -07:00			`import shutil`
			`from contextlib import suppress`
Require Py3 for tests 2015-07-22 11:21:33 -07:00			`import sys`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00			`from unittest.mock import patch, create_autospec`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`import pytest`
More test cases for other parameters 2015-07-28 02:31:18 -07:00			`from ocrmypdf.pageinfo import pdf_get_all_pageinfo`
New test case: ensure metadata is preserved from input to output 2015-08-05 17:09:38 -07:00			`import PyPDF2 as pypdf`
Refactor exit codes; test for missing tessdata Some versions of tesseract installed by homebrew end up without a functional tessdata folder, and tesseract is not helpful in this situation, so add a new test to make sure our output is at least indicative of the problem. In the process of properly handling return codes I discovered test_override_metadata triggers a NPE inside JHOVE probably due to the Unicode character checking. This could be specific to my JRE (1.6.0_65, Oracle) but it's probably JHOVE's fault. A valid PDF/A (per Acrobat) is still generated. 2015-08-11 00:17:02 -07:00			`from ocrmypdf import ExitCode`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00
Require Py3 for tests 2015-07-22 11:21:33 -07:00
			`if sys.version_info.major < 3:`
			`print("Requires Python 3.4+")`
			`sys.exit(1)`
Basic test cases 2015-07-22 02:59:25 -07:00
			`TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))`
			`PROJECT_ROOT = os.path.dirname(TESTS_ROOT)`
			`OCRMYPDF = os.path.join(PROJECT_ROOT, 'OCRmyPDF.sh')`
			`TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')`
Add another test 2015-07-22 03:16:19 -07:00			`TEST_OUTPUT = os.path.join(PROJECT_ROOT, 'tests', 'output')`
Add new test case to check invalid PDF/A case It revealed a regression - return code not the same as v2.x for invalid PDF/A. It's also not easy to get the return code out of ruffus. Will need to tweak the final step of the pipeline. 2015-08-10 13:57:28 -07:00			`TEST_BINARY_PATH = os.path.join(TEST_OUTPUT, 'bin')`
Basic test cases 2015-07-22 02:59:25 -07:00

Add another test 2015-07-22 03:16:19 -07:00			`def setup_module():`
New test: check skew 2015-07-22 04:00:59 -07:00			`with suppress(FileNotFoundError):`
			`shutil.rmtree(TEST_OUTPUT)`
			`with suppress(FileExistsError):`
Add another test 2015-07-22 03:16:19 -07:00			`os.mkdir(TEST_OUTPUT)`


Improve argument handling, test cases 2015-07-27 15:39:54 -07:00			`def run_ocrmypdf_sh(input_file, output_file, *args):`
New test: check skew 2015-07-22 04:00:59 -07:00			`sh_args = ['sh', OCRMYPDF] + list(args) + [input_file, output_file]`
Basic test cases 2015-07-22 02:59:25 -07:00			`sh = Popen(`
			`sh_args, close_fds=True, stdout=PIPE, stderr=PIPE,`
			`universal_newlines=True)`
			`out, err = sh.communicate()`
			`return sh, out, err`


New test case: ensure metadata is preserved from input to output 2015-08-05 17:09:38 -07:00			`def _make_input(input_basename):`
			`return os.path.join(TEST_RESOURCES, input_basename)`


			`def _make_output(output_basename):`
			`return os.path.join(TEST_OUTPUT, output_basename)`


Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def check_ocrmypdf(input_basename, output_basename, *args):`
New test case: ensure metadata is preserved from input to output 2015-08-05 17:09:38 -07:00			`input_file = _make_input(input_basename)`
			`output_file = _make_output(output_basename)`
New test: check skew 2015-07-22 04:00:59 -07:00
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00			`sh, _, err = run_ocrmypdf_sh(input_file, output_file, *args)`
Basic test cases 2015-07-22 02:59:25 -07:00			`assert sh.returncode == 0, err`
New test: check skew 2015-07-22 04:00:59 -07:00			`assert os.path.exists(output_file), "Output file not created"`
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00			`assert os.stat(output_file).st_size > 100, "PDF too small or empty"`
			`return output_file`
Basic test cases 2015-07-22 02:59:25 -07:00

Add new test case to check invalid PDF/A case It revealed a regression - return code not the same as v2.x for invalid PDF/A. It's also not easy to get the return code out of ruffus. Will need to tweak the final step of the pipeline. 2015-08-10 13:57:28 -07:00			`def run_ocrmypdf_env(input_basename, output_basename, env, *args):`
			`input_file = _make_input(input_basename)`
			`output_file = _make_output(output_basename)`

			`p_args = ['ocrmypdf'] + list(args) + [input_file, output_file]`
			`p = Popen(`
			`p_args, close_fds=True, stdout=PIPE, stderr=PIPE,`
			`universal_newlines=True, env=env)`
			`out, err = p.communicate()`
			`return p, out, err`


Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def test_quick():`
			`check_ocrmypdf('c02-22.pdf', 'test_quick.pdf')`
New test: check skew 2015-07-22 04:00:59 -07:00

Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def test_deskew():`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00			`# Run with deskew`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`deskewed_pdf = check_ocrmypdf('skew.pdf', 'test_deskew.pdf', '-d')`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00
			`# Now render as an image again and use Leptonica to find the skew angle`
			`# to confirm that it was deskewed`
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00			`from ocrmypdf.ghostscript import rasterize_pdf`
			`import logging`
			`log = logging.getLogger()`

New test case: ensure metadata is preserved from input to output 2015-08-05 17:09:38 -07:00			`deskewed_png = _make_output('deskewed.png')`
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00
			`rasterize_pdf(`
			`deskewed_pdf,`
			`deskewed_png,`
			`xres=150,`
			`yres=150,`
			`raster_device='pngmono',`
			`log=log)`

			`from ocrmypdf.leptonica import pixRead, pixDestroy, pixFindSkew`
			`pix = pixRead(deskewed_png)`
			`skew_angle, skew_confidence = pixFindSkew(pix)`
			`pix = pixDestroy(pix)`

			`print(skew_angle)`
			`assert -0.5 < skew_angle < 0.5, "Deskewing failed"`
New test: check skew 2015-07-22 04:00:59 -07:00

Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def test_clean():`
			`check_ocrmypdf('skew.pdf', 'test_clean.pdf', '-c')`
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00

New test case: ensure metadata is preserved from input to output 2015-08-05 17:09:38 -07:00			`def test_preserve_metadata():`
			`pdf_before = pypdf.PdfFileReader(_make_input('graph.pdf'))`

			`output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf')`

			`pdf_after = pypdf.PdfFileReader(output)`

			`for key in ('/Title', '/Author'):`
			`assert pdf_before.documentInfo[key] == pdf_after.documentInfo[key]`


Remove duplication in test case 2015-08-05 16:57:04 -07:00			`def test_override_metadata():`
			`german = 'Du siehst den Wald vor lauter Bäumen nicht.'`
			`chinese = '孔子'`
			`high_unicode = 'U+1030C is: 𐌌'`

Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`pdf = check_ocrmypdf(`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00			`'c02-22.pdf', 'test_metadata.pdf',`
Remove duplication in test case 2015-08-05 16:57:04 -07:00			`'--title', german,`
			`'--author', chinese,`
			`'--subject', high_unicode)`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00
			`out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)`
			`lines_pdfinfo = out_pdfinfo.splitlines()`
			`pdfinfo = {}`
			`for line in lines_pdfinfo:`
			`k, v = line.strip().split(':', maxsplit=1)`
			`pdfinfo[k.strip()] = v.strip()`

Remove duplication in test case 2015-08-05 16:57:04 -07:00			`assert pdfinfo['Title'] == german`
			`assert pdfinfo['Author'] == chinese`
			`assert pdfinfo['Subject'] == high_unicode`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00			`assert pdfinfo.get('Keywords', '') == ''`
Add --oversample test for hocr rendering 2015-07-27 17:18:02 -07:00

nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00			`def check_oversample(renderer):`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`oversampled_pdf = check_ocrmypdf(`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00			`'skew.pdf', 'test_oversample_%s.pdf' % renderer, '--oversample', '300',`
			`'--pdf-renderer', renderer)`
Add --oversample test for hocr rendering 2015-07-27 17:18:02 -07:00
			`pdfinfo = pdf_get_all_pageinfo(oversampled_pdf)`

			`print(pdfinfo[0]['xres'])`
			`assert abs(pdfinfo[0]['xres'] - 300) < 1`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00

			`def test_oversample():`
			`yield check_oversample, 'hocr'`
			`yield check_oversample, 'tesseract'`


			`def test_repeat_ocr():`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`sh, _, _ = run_ocrmypdf_sh('graph_ocred.pdf', 'wontwork.pdf')`
			`assert sh.returncode != 0`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00

			`def test_force_ocr():`
More test cases for other parameters 2015-07-28 02:31:18 -07:00			`out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')`
			`pdfinfo = pdf_get_all_pageinfo(out)`
			`assert pdfinfo[0]['has_text']`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00

Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def test_skip_ocr():`
More test cases 2015-07-28 03:02:35 -07:00			`check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s')`
Test cases for --tesseract-timeout 2015-07-28 01:47:30 -07:00

Add a test case to check on the @argumentsfile syntax 2015-08-05 23:17:38 -07:00			`def test_argsfile():`
			`with open(_make_output('test_argsfile.txt'), 'w') as argsfile:`
			`print('--title', 'ArgsFile Test', '--author', 'Test Cases',`
			`sep='\n', end='\n', file=argsfile)`
			`check_ocrmypdf('graph.pdf', 'test_argsfile.pdf',`
			`'@' + _make_output('test_argsfile.txt'))`


Test cases for --tesseract-timeout 2015-07-28 01:47:30 -07:00			`def check_ocr_timeout(renderer):`
More test cases for other parameters 2015-07-28 02:31:18 -07:00			`out = check_ocrmypdf('skew.pdf', 'test_timeout_%s.pdf' % renderer,`
			`'--tesseract-timeout', '1.0')`
			`pdfinfo = pdf_get_all_pageinfo(out)`
			`assert pdfinfo[0]['has_text'] == False`
Test cases for --tesseract-timeout 2015-07-28 01:47:30 -07:00

			`def test_ocr_timeout():`
			`yield check_ocr_timeout, 'hocr'`
			`yield check_ocr_timeout, 'tesseract'`
More test cases for other parameters 2015-07-28 02:31:18 -07:00

			`def test_skip_big():`
			`out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',`
			`'--skip-big', '10')`
			`pdfinfo = pdf_get_all_pageinfo(out)`
			`assert pdfinfo[0]['has_text'] == False`

More test cases 2015-07-28 03:02:35 -07:00
			`def check_maximum_options(renderer):`
			`check_ocrmypdf(`
			`'multipage.pdf', 'test_multipage%s.pdf' % renderer,`
			`'-d', '-c', '-i', '-g', '-f', '-k', '--oversample', '300',`
			`'--skip-big', '10', '--title', 'Too Many Weird Files',`
			`'--author', 'py.test', '--pdf-renderer', renderer)`


			`def test_maximum_options():`
			`yield check_maximum_options, 'hocr'`
			`yield check_maximum_options, 'tesseract'`
Add new test case to check invalid PDF/A case It revealed a regression - return code not the same as v2.x for invalid PDF/A. It's also not easy to get the return code out of ruffus. Will need to tweak the final step of the pipeline. 2015-08-10 13:57:28 -07:00

			`def override_binary(binary, replacement):`
			`with suppress(FileExistsError):`
			`os.makedirs(TEST_BINARY_PATH)`

			`replacement_path = os.path.abspath(os.path.join(TESTS_ROOT,`
			`replacement))`
			`binary_path = os.path.abspath(os.path.join(TEST_BINARY_PATH,`
			`binary))`
			`assert not os.path.lexists(binary_path)`
			`print("symlink %s -> %s" % (replacement_path, binary_path))`
			`os.symlink(replacement_path, binary_path)`

			`os.chmod(replacement_path, int('755', base=8))`

			`return os.path.dirname(binary_path) + os.pathsep + os.environ["PATH"]`


			`@pytest.fixture`
			`def break_ghostscript_pdfa():`
			`return override_binary('gs', 'replace_ghostscript_nopdfa.py')`


			`def test_ghostscript_pdfa_fails(break_ghostscript_pdfa):`
			`env = os.environ`
			`env['PATH'] = break_ghostscript_pdfa`

			`p, out, err = run_ocrmypdf_env(`
			`'graph_ocred.pdf', 'not_a_pdfa.pdf', env, '-v', '1', '--skip-text')`
Refactor exit codes; test for missing tessdata Some versions of tesseract installed by homebrew end up without a functional tessdata folder, and tesseract is not helpful in this situation, so add a new test to make sure our output is at least indicative of the problem. In the process of properly handling return codes I discovered test_override_metadata triggers a NPE inside JHOVE probably due to the Unicode character checking. This could be specific to my JRE (1.6.0_65, Oracle) but it's probably JHOVE's fault. A valid PDF/A (per Acrobat) is still generated. 2015-08-11 00:17:02 -07:00			`assert p.returncode == ExitCode.invalid_output_pdfa, err # not PDFA`


			`def test_tesseract_missing_tessdata():`
			`env = os.environ`
			`env['TESSDATA_PREFIX'] = '/tmp'`

			`p, _, err = run_ocrmypdf_env(`
			`'graph_ocred.pdf', 'not_a_pdfa.pdf', env, '-v', '1', '--skip-text')`
			`assert p.returncode == ExitCode.missing_dependency, err`