OCRmyPDF/tests/test_main.py

#!/usr/bin/env python3

from __future__ import print_function
from subprocess import Popen, PIPE, check_output
import os
import shutil
from contextlib import suppress
import sys
from unittest.mock import patch, create_autospec
import pytest
from ocrmypdf.pageinfo import pdf_get_all_pageinfo


# The suite needs Python 3.4+ (it imports contextlib.suppress, for example).
# Compare the full (major, minor) pair so that 3.0-3.3 are rejected as well,
# which is what the message promises.
if sys.version_info < (3, 4):
    print("Requires Python 3.4+")
    sys.exit(1)

# Absolute path of the directory that holds this test module.
TESTS_ROOT = os.path.dirname(os.path.abspath(__file__))
# The project checkout is the parent of the tests directory.
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
# Shell entry point that the tests execute in a subprocess.
OCRMYPDF = os.path.join(PROJECT_ROOT, 'OCRmyPDF.sh')
# Input PDFs live in tests/resources; generated files go to tests/output.
TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')
TEST_OUTPUT = os.path.join(PROJECT_ROOT, 'tests', 'output')


def setup_module():
    """Give the test run a fresh output directory.

    Any output tree left over from an earlier run is removed, then the
    directory is created again.  A missing tree on removal and an existing
    directory on creation are both tolerated.
    """
    try:
        shutil.rmtree(TEST_OUTPUT)
    except FileNotFoundError:
        pass  # nothing left behind by a previous run

    try:
        os.mkdir(TEST_OUTPUT)
    except FileExistsError:
        pass  # directory is already there


def run_ocrmypdf_sh(input_file, output_file, *args):
    """Run the OCRmyPDF shell script and wait for it to finish.

    Extra ``args`` are placed on the command line ahead of the input and
    output paths.  Returns ``(process, stdout, stderr)``: the finished
    ``Popen`` object (which carries the return code) and both captured
    streams as text.
    """
    command = ['sh', OCRMYPDF]
    command.extend(args)
    command.extend([input_file, output_file])

    process = Popen(
        command,
        close_fds=True,
        stdout=PIPE,
        stderr=PIPE,
        universal_newlines=True,
    )
    stdout_text, stderr_text = process.communicate()
    return process, stdout_text, stderr_text


def check_ocrmypdf(input_basename, output_basename, *args):
    """Run OCRmyPDF on a test resource and sanity-check what it produced.

    ``input_basename`` is resolved inside TEST_RESOURCES and
    ``output_basename`` inside TEST_OUTPUT; ``args`` are forwarded as extra
    command-line options.  The run must exit with status 0 and leave an
    output file larger than 100 bytes.  Returns the output file's path.
    """
    source_path = os.path.join(TEST_RESOURCES, input_basename)
    result_path = os.path.join(TEST_OUTPUT, output_basename)

    process, _stdout, stderr_text = run_ocrmypdf_sh(
        source_path, result_path, *args)

    # Show the script's stderr when the exit status is non-zero
    assert process.returncode == 0, stderr_text
    assert os.path.exists(result_path), "Output file not created"
    result_size = os.stat(result_path).st_size
    assert result_size > 100, "PDF too small or empty"
    return result_path


def test_quick():
    """Smoke test: a run with no extra options must produce an output PDF."""
    source_name, result_name = 'c02-22.pdf', 'test_quick.pdf'
    check_ocrmypdf(source_name, result_name)


def test_deskew():
    """Deskew a skewed page and confirm the result is close to level."""
    # Produce the deskewed PDF with the -d option
    output_pdf = check_ocrmypdf('skew.pdf', 'test_deskew.pdf', '-d')

    # Rasterize the result so the remaining skew can be measured on an image
    from ocrmypdf.ghostscript import rasterize_pdf
    import logging
    root_log = logging.getLogger()

    output_png = os.path.join(TEST_OUTPUT, 'deskewed.png')
    rasterize_pdf(
        output_pdf, output_png,
        xres=150, yres=150, raster_device='pngmono', log=root_log)

    # Let Leptonica report the skew angle of the rendered page
    from ocrmypdf.leptonica import pixRead, pixDestroy, pixFindSkew
    image = pixRead(output_png)
    angle, _confidence = pixFindSkew(image)
    image = pixDestroy(image)

    print(angle)
    assert -0.5 < angle < 0.5, "Deskewing failed"


def test_clean():
    """A run with the -c option must complete and produce an output PDF."""
    source_name, result_name = 'skew.pdf', 'test_clean.pdf'
    check_ocrmypdf(source_name, result_name, '-c')


def test_metadata():
    """Check that metadata given on the command line reaches the output PDF.

    Title, author and subject containing non-ASCII text (including a
    character outside the Basic Multilingual Plane) are passed to OCRmyPDF,
    then read back with the external ``pdfinfo`` tool and compared.
    Keywords were not supplied, so they must be absent or empty.
    """
    title = 'Du siehst den Wald vor lauter Bäumen nicht.'
    author = '孔子'
    subject = 'U+1030C is: 𐌌'

    pdf = check_ocrmypdf(
        'c02-22.pdf', 'test_metadata.pdf',
        '--title', title,
        '--author', author,
        '--subject', subject)

    out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)

    pdfinfo = {}
    for line in out_pdfinfo.splitlines():
        # partition() splits on the first colon only and never raises, so a
        # blank or colon-less line is skipped instead of aborting the test
        # with an unpacking ValueError.
        key, colon, value = line.strip().partition(':')
        if not colon:
            continue
        pdfinfo[key.strip()] = value.strip()

    assert pdfinfo['Title'] == title
    assert pdfinfo['Author'] == author
    assert pdfinfo['Subject'] == subject
    assert pdfinfo.get('Keywords', '') == ''


def check_oversample(renderer):
    """Run with ``--oversample 300`` using ``renderer`` and check the result.

    The first page of the output must report a horizontal resolution within
    1 of the requested 300.
    """
    output_name = 'test_oversample_%s.pdf' % renderer
    options = ['--oversample', '300', '--pdf-renderer', renderer]
    output_pdf = check_ocrmypdf('skew.pdf', output_name, *options)

    pages = pdf_get_all_pageinfo(output_pdf)
    first_page = pages[0]

    print(first_page['xres'])
    assert abs(first_page['xres'] - 300) < 1


def test_oversample():
    """Exercise --oversample with each PDF renderer.

    Written as a plain loop rather than a yield-style generator test:
    current pytest releases no longer run generator tests, which would leave
    these checks silently unexecuted.  The first failing renderer stops the
    loop.
    """
    for renderer in ('hocr', 'tesseract'):
        check_oversample(renderer)


def test_repeat_ocr():
    """OCRmyPDF must exit non-zero for 'graph_ocred.pdf' without -f or -s.

    The input is presumably a PDF that already carries OCR text (going by
    its name and its use in the force/skip tests) -- confirm against the
    resource.  Paths are resolved against TEST_RESOURCES and TEST_OUTPUT the
    same way check_ocrmypdf() does, so a non-zero exit comes from OCRmyPDF
    itself and not from a bare basename that does not resolve to a file.
    """
    input_file = os.path.join(TEST_RESOURCES, 'graph_ocred.pdf')
    output_file = os.path.join(TEST_OUTPUT, 'wontwork.pdf')

    # Guard against the test passing only because the input is missing
    assert os.path.exists(input_file), "Test resource not found"

    sh, _, _ = run_ocrmypdf_sh(input_file, output_file)
    assert sh.returncode != 0


def test_force_ocr():
    """With -f, the output's first page must be reported as having text."""
    output_pdf = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')
    first_page = pdf_get_all_pageinfo(output_pdf)[0]
    assert first_page['has_text']


def test_skip_ocr():
    """With -s, a run on 'graph_ocred.pdf' must succeed and write a PDF."""
    source_name, result_name = 'graph_ocred.pdf', 'test_skip.pdf'
    check_ocrmypdf(source_name, result_name, '-s')


def check_ocr_timeout(renderer):
    """Run with a 1.0 tesseract timeout and expect a page with no text.

    ``renderer`` is passed to ``--pdf-renderer`` so that each call really
    exercises the renderer it is named after, instead of only changing the
    output file name.
    """
    out = check_ocrmypdf('skew.pdf', 'test_timeout_%s.pdf' % renderer,
                         '--tesseract-timeout', '1.0',
                         '--pdf-renderer', renderer)
    pdfinfo = pdf_get_all_pageinfo(out)
    # The run is expected to leave the first page without a text layer
    assert not pdfinfo[0]['has_text']


def test_ocr_timeout():
    """Exercise --tesseract-timeout with each PDF renderer.

    Written as a plain loop rather than a yield-style generator test:
    current pytest releases no longer run generator tests, which would leave
    these checks silently unexecuted.  The first failing renderer stops the
    loop.
    """
    for renderer in ('hocr', 'tesseract'):
        check_ocr_timeout(renderer)


def test_skip_big():
    """With ``--skip-big 10``, 'enormous.pdf' must come out without text.

    The run itself must still succeed; only the first page's ``has_text``
    flag is checked.
    """
    out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',
                         '--skip-big', '10')
    pdfinfo = pdf_get_all_pageinfo(out)
    # Truthiness check, matching the positive assertion in test_force_ocr
    assert not pdfinfo[0]['has_text']


def check_maximum_options(renderer):
    """Run the multi-page sample with many options at once for ``renderer``.

    Only successful completion and a plausible output file are checked (via
    check_ocrmypdf); the content of the output is not inspected.
    """
    output_name = 'test_multipage%s.pdf' % renderer
    options = [
        '-d', '-c', '-i', '-g', '-f', '-k',
        '--oversample', '300',
        '--skip-big', '10',
        '--title', 'Too Many Weird Files',
        '--author', 'py.test',
        '--pdf-renderer', renderer,
    ]
    check_ocrmypdf('multipage.pdf', output_name, *options)


def test_maximum_options():
    """Exercise the many-options run with each PDF renderer.

    Written as a plain loop rather than a yield-style generator test:
    current pytest releases no longer run generator tests, which would leave
    these checks silently unexecuted.  The first failing renderer stops the
    loop.
    """
    for renderer in ('hocr', 'tesseract'):
        check_maximum_options(renderer)
Basic test cases 2015-07-22 02:59:25 -07:00			`#!/usr/bin/env python3`

Goodbye, so long, farewell, shell... 2015-07-25 00:57:07 -07:00			`from __future__ import print_function`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00			`from subprocess import Popen, PIPE, check_output`
Basic test cases 2015-07-22 02:59:25 -07:00			`import os`
New test: check skew 2015-07-22 04:00:59 -07:00			`import shutil`
			`from contextlib import suppress`
Require Py3 for tests 2015-07-22 11:21:33 -07:00			`import sys`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00			`from unittest.mock import patch, create_autospec`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`import pytest`
More test cases for other parameters 2015-07-28 02:31:18 -07:00			`from ocrmypdf.pageinfo import pdf_get_all_pageinfo`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00
Require Py3 for tests 2015-07-22 11:21:33 -07:00
			`if sys.version_info.major < 3:`
			`print("Requires Python 3.4+")`
			`sys.exit(1)`
Basic test cases 2015-07-22 02:59:25 -07:00
			`TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))`
			`PROJECT_ROOT = os.path.dirname(TESTS_ROOT)`
			`OCRMYPDF = os.path.join(PROJECT_ROOT, 'OCRmyPDF.sh')`
			`TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')`
Add another test 2015-07-22 03:16:19 -07:00			`TEST_OUTPUT = os.path.join(PROJECT_ROOT, 'tests', 'output')`
Basic test cases 2015-07-22 02:59:25 -07:00

Add another test 2015-07-22 03:16:19 -07:00			`def setup_module():`
New test: check skew 2015-07-22 04:00:59 -07:00			`with suppress(FileNotFoundError):`
			`shutil.rmtree(TEST_OUTPUT)`
			`with suppress(FileExistsError):`
Add another test 2015-07-22 03:16:19 -07:00			`os.mkdir(TEST_OUTPUT)`


Improve argument handling, test cases 2015-07-27 15:39:54 -07:00			`def run_ocrmypdf_sh(input_file, output_file, *args):`
New test: check skew 2015-07-22 04:00:59 -07:00			`sh_args = ['sh', OCRMYPDF] + list(args) + [input_file, output_file]`
Basic test cases 2015-07-22 02:59:25 -07:00			`sh = Popen(`
			`sh_args, close_fds=True, stdout=PIPE, stderr=PIPE,`
			`universal_newlines=True)`
			`out, err = sh.communicate()`
			`return sh, out, err`


Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def check_ocrmypdf(input_basename, output_basename, *args):`
New test: check skew 2015-07-22 04:00:59 -07:00			`input_file = os.path.join(TEST_RESOURCES, input_basename)`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00			`output_file = os.path.join(TEST_OUTPUT, output_basename)`
New test: check skew 2015-07-22 04:00:59 -07:00
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00			`sh, _, err = run_ocrmypdf_sh(input_file, output_file, *args)`
Basic test cases 2015-07-22 02:59:25 -07:00			`assert sh.returncode == 0, err`
New test: check skew 2015-07-22 04:00:59 -07:00			`assert os.path.exists(output_file), "Output file not created"`
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00			`assert os.stat(output_file).st_size > 100, "PDF too small or empty"`
			`return output_file`
Basic test cases 2015-07-22 02:59:25 -07:00

Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def test_quick():`
			`check_ocrmypdf('c02-22.pdf', 'test_quick.pdf')`
New test: check skew 2015-07-22 04:00:59 -07:00

Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def test_deskew():`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00			`# Run with deskew`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`deskewed_pdf = check_ocrmypdf('skew.pdf', 'test_deskew.pdf', '-d')`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00
			`# Now render as an image again and use Leptonica to find the skew angle`
			`# to confirm that it was deskewed`
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00			`from ocrmypdf.ghostscript import rasterize_pdf`
			`import logging`
			`log = logging.getLogger()`

			`deskewed_png = os.path.join(TEST_OUTPUT, 'deskewed.png')`

			`rasterize_pdf(`
			`deskewed_pdf,`
			`deskewed_png,`
			`xres=150,`
			`yres=150,`
			`raster_device='pngmono',`
			`log=log)`

			`from ocrmypdf.leptonica import pixRead, pixDestroy, pixFindSkew`
			`pix = pixRead(deskewed_png)`
			`skew_angle, skew_confidence = pixFindSkew(pix)`
			`pix = pixDestroy(pix)`

			`print(skew_angle)`
			`assert -0.5 < skew_angle < 0.5, "Deskewing failed"`
New test: check skew 2015-07-22 04:00:59 -07:00

Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def test_clean():`
			`check_ocrmypdf('skew.pdf', 'test_clean.pdf', '-c')`
Improve argument handling, test cases 2015-07-27 15:39:54 -07:00

Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00			`def test_metadata():`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`pdf = check_ocrmypdf(`
Add test to confirm that metadata is transferred to final PDF/A 2015-07-27 16:11:51 -07:00			`'c02-22.pdf', 'test_metadata.pdf',`
			`'--title', 'Du siehst den Wald vor lauter Bäumen nicht.',`
			`'--author', '孔子',`
			`'--subject', 'U+1030C is: 𐌌')`

			`out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)`
			`lines_pdfinfo = out_pdfinfo.splitlines()`
			`pdfinfo = {}`
			`for line in lines_pdfinfo:`
			`k, v = line.strip().split(':', maxsplit=1)`
			`pdfinfo[k.strip()] = v.strip()`

			`assert pdfinfo['Title'] == 'Du siehst den Wald vor lauter Bäumen nicht.'`
			`assert pdfinfo['Author'] == '孔子'`
			`assert pdfinfo['Subject'] == 'U+1030C is: 𐌌'`
			`assert pdfinfo.get('Keywords', '') == ''`
Add --oversample test for hocr rendering 2015-07-27 17:18:02 -07:00

nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00			`def check_oversample(renderer):`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`oversampled_pdf = check_ocrmypdf(`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00			`'skew.pdf', 'test_oversample_%s.pdf' % renderer, '--oversample', '300',`
			`'--pdf-renderer', renderer)`
Add --oversample test for hocr rendering 2015-07-27 17:18:02 -07:00
			`pdfinfo = pdf_get_all_pageinfo(oversampled_pdf)`

			`print(pdfinfo[0]['xres'])`
			`assert abs(pdfinfo[0]['xres'] - 300) < 1`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00

			`def test_oversample():`
			`yield check_oversample, 'hocr'`
			`yield check_oversample, 'tesseract'`


			`def test_repeat_ocr():`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`sh, _, _ = run_ocrmypdf_sh('graph_ocred.pdf', 'wontwork.pdf')`
			`assert sh.returncode != 0`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00

			`def test_force_ocr():`
More test cases for other parameters 2015-07-28 02:31:18 -07:00			`out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')`
			`pdfinfo = pdf_get_all_pageinfo(out)`
			`assert pdfinfo[0]['has_text']`
nose can't really handle external tests so looking into py.test instead Specifically it trips over the need to reimport ocrmypdf.main. That in turn raises questions about whether to make that function into an external script that imports ocrmypdf... or something else. Would be possible with a loop that manipulates sys_argv and then reloads ocrmypdf.main; might need that anyway. 2015-07-27 22:07:04 -07:00

Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`def test_skip_ocr():`
More test cases 2015-07-28 03:02:35 -07:00			`check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s')`
Test cases for --tesseract-timeout 2015-07-28 01:47:30 -07:00

			`def check_ocr_timeout(renderer):`
More test cases for other parameters 2015-07-28 02:31:18 -07:00			`out = check_ocrmypdf('skew.pdf', 'test_timeout_%s.pdf' % renderer,`
			`'--tesseract-timeout', '1.0')`
			`pdfinfo = pdf_get_all_pageinfo(out)`
			`assert pdfinfo[0]['has_text'] == False`
Test cases for --tesseract-timeout 2015-07-28 01:47:30 -07:00

			`def test_ocr_timeout():`
			`yield check_ocr_timeout, 'hocr'`
			`yield check_ocr_timeout, 'tesseract'`
More test cases for other parameters 2015-07-28 02:31:18 -07:00

			`def test_skip_big():`
			`out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',`
			`'--skip-big', '10')`
			`pdfinfo = pdf_get_all_pageinfo(out)`
			`assert pdfinfo[0]['has_text'] == False`

More test cases 2015-07-28 03:02:35 -07:00
			`def check_maximum_options(renderer):`
			`check_ocrmypdf(`
			`'multipage.pdf', 'test_multipage%s.pdf' % renderer,`
			`'-d', '-c', '-i', '-g', '-f', '-k', '--oversample', '300',`
			`'--skip-big', '10', '--title', 'Too Many Weird Files',`
			`'--author', 'py.test', '--pdf-renderer', renderer)`


			`def test_maximum_options():`
			`yield check_maximum_options, 'hocr'`
			`yield check_maximum_options, 'tesseract'`