OCRmyPDF/tests/test_tess4.py

#!/usr/bin/env python3
# © 2017 James R. Barlow: github.com/jbarlow83

import pytest
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.exec import tesseract
from ocrmypdf import pdfinfo
import sys
import os
import PyPDF2 as pypdf
from contextlib import contextmanager


# Module-level shorthand for the ``spoof`` helper exposed on ``pytest.helpers``.
# NOTE(review): ``pytest.helpers`` is presumably populated by a helpers plugin
# or conftest outside this file -- confirm. The same alias is bound again
# further down in this module.
spoof = pytest.helpers.spoof


@pytest.fixture
def ensure_tess4():
    """Return an environment mapping under which Tesseract 4 should be used.

    Returns:
        A copy of ``os.environ``. When ``tesseract.v4()`` is already truthy
        the copy is unmodified; otherwise, when the ``OCRMYPDF_TESS4`` hint
        variable is set to a non-empty value, the copy additionally has
        ``OCRMYPDF_TESSERACT`` set to that value.

    Raises:
        EnvironmentError: if ``tesseract.v4()`` is falsy and no
            ``OCRMYPDF_TESS4`` hint is present.
    """
    if tesseract.v4():
        # The default "tesseract" (on $PATH, or whatever OCRMYPDF_TESSERACT
        # already points at) is v4, so the current environment works as-is.
        return os.environ.copy()

    # OCRMYPDF_TESS4 is only a hint: it is consulted if and only if the
    # default binary is not v4. That lets a machine with both versions keep
    # tess3 on PATH while still being able to run these tess4 tests.
    if not os.environ.get('OCRMYPDF_TESS4'):
        raise EnvironmentError("Can't find Tesseract 4")

    tess4_env = os.environ.copy()
    tess4_env['OCRMYPDF_TESSERACT'] = tess4_env['OCRMYPDF_TESS4']
    return tess4_env


@contextmanager
def modified_os_environ(env):
    """Temporarily rebind ``os.environ`` to *env* inside a ``with`` block.

    Args:
        env: the mapping to install as ``os.environ``. It is used as-is
            (not copied), so changes made through ``os.environ`` inside the
            block land in *env*.

    On exit -- whether the block finishes normally, returns early, or
    raises -- ``os.environ`` is rebound to the very same object it referred
    to on entry, not to a copy of it.

    NOTE(review): only the ``os.environ`` attribute is swapped; this
    presumably does not alter the process-level environment that child
    processes inherit by default -- confirm against the ``os`` docs.
    """
    # Keep a reference to the original object rather than a dict snapshot,
    # so that the real environment mapping is what gets reinstated.
    original_environ = os.environ
    os.environ = env
    try:
        yield
    finally:
        # Runs even when the with-body raises, so a failing probe cannot
        # leave the temporary environment installed.
        os.environ = original_environ


def tess4_available():
    """Report whether a usable Tesseract 4 binary can be found.

    The binary does not have to be the default "tesseract" on PATH: the
    environment produced by ``ensure_tess4`` is installed temporarily and
    the checks are repeated inside it.

    Returns:
        The result of ``tesseract.v4() and tesseract.has_textonly_pdf()``
        evaluated under that environment, or ``False`` when an
        ``EnvironmentError`` is raised at any point of the probe.
    """
    usable = False
    try:
        # ensure_tess4 picks the environment that should select tess4 ...
        probe_env = ensure_tess4()
        # ... and inside that environment we verify it really is tess4 and
        # that it supports the text-only PDF feature.
        with modified_os_environ(probe_env):
            usable = tesseract.v4() and tesseract.has_textonly_pdf()
    except EnvironmentError:
        return False
    return usable

# Skip all tests in this file if not tesseract 4.
# tess4_available() is called once, when this module is imported, and its
# result decides the skip for every test defined below.
pytestmark = pytest.mark.skipif(
    not tess4_available(),
    reason="tesseract 4.0 with textonly_pdf feature required")

# Module-level shorthands for helpers exposed on ``pytest.helpers``.
# NOTE(review): these helpers are defined outside this file (presumably in
# conftest) -- confirm their signatures there.
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof


def test_textonly_pdf(ensure_tess4, resources, outdir):
    """Run ``linn.pdf`` through the tess4 renderer with a sidecar requested.

    The test passes when ``check_ocrmypdf`` does not fail; no further
    assertions are made on the output here.
    """
    source_pdf = resources / 'linn.pdf'
    target_pdf = outdir / 'linn_textonly.pdf'
    cli_options = ['--pdf-renderer', 'tess4', '--sidecar', 'foo']

    # Run under the environment chosen by the ensure_tess4 fixture.
    check_ocrmypdf(source_pdf, target_pdf, *cli_options, env=ensure_tess4)


def test_pagesize_consistency_tess4(ensure_tess4, resources, outpdf):
    """Check the first page keeps its dimensions through cleanup + tess4.

    ``linn.pdf`` is processed with the tess4 renderer plus the cleaning,
    deskew and background-removal options; the first two entries of the
    first-page dimensions must be (approximately) unchanged afterwards.
    """
    from math import isclose

    source_pdf = resources / 'linn.pdf'
    dims_before = pytest.helpers.first_page_dimensions(source_pdf)

    preprocessing_flags = (
        '--clean', '--deskew', '--remove-background', '--clean-final')
    check_ocrmypdf(
        source_pdf, outpdf,
        '--pdf-renderer', 'tess4', *preprocessing_flags,
        env=ensure_tess4)

    dims_after = pytest.helpers.first_page_dimensions(outpdf)

    # Compare with isclose rather than ==, entry 0 first and then entry 1.
    for axis in (0, 1):
        assert isclose(dims_before[axis], dims_after[axis])


@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])
def test_skip_pages_does_not_replicate(
        ensure_tess4, resources, basename, outdir):
    """Check that the tess4 renderer does not duplicate page content.

    Each parametrized input is run with ``--force-ocr`` and a tesseract
    timeout of ``0`` (presumably so that OCR is skipped on every page --
    confirm). Every output page must then hold exactly one image, and each
    page must be as wide as the corresponding input page.
    """
    source_pdf = resources / basename
    result_pdf = outdir / basename

    check_ocrmypdf(
        source_pdf, result_pdf,
        '--pdf-renderer', 'tess4',
        '--force-ocr',
        '--tesseract-timeout', '0',
        env=ensure_tess4)

    input_info = pdfinfo.PdfInfo(source_pdf)
    output_info = pdfinfo.PdfInfo(result_pdf)

    for output_page in output_info:
        assert len(output_page.images) == 1, "skipped page was replicated"

    # Index by input page number so a shorter output fails loudly.
    input_page_count = len(input_info)
    for index in range(input_page_count):
        assert (output_info[index].width_inches
                == input_info[index].width_inches)


def test_content_preservation(ensure_tess4, resources, outpdf):
    """Check that ``masks.pdf`` keeps more than one image on its first page.

    If the tess4 renderer had flattened the page, only a single image would
    remain; more than one image means the masks were not rasterized.
    """
    source_pdf = resources / 'masks.pdf'
    cli_options = ('--pdf-renderer', 'tess4', '--tesseract-timeout', '0')

    check_ocrmypdf(source_pdf, outpdf, *cli_options, env=ensure_tess4)

    first_page = pdfinfo.PdfInfo(outpdf)[0]
    assert len(first_page.images) > 1, "masks were rasterized"
Rename ‘tesstop’ to ‘tess4’ There’s no reason text-only PDF shouldn’t become the default for tesseract 4. 2017-01-26 12:28:51 -08:00			`#!/usr/bin/env python3`
			`# © 2017 James R. Barlow: github.com/jbarlow83`

			`import pytest`
			`from ocrmypdf.exceptions import ExitCode`
			`from ocrmypdf.exec import tesseract`
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`from ocrmypdf import pdfinfo`
Fix issue #147: unpaper loses DPI information, affects --pdf-renderer tess4 2017-03-24 13:23:03 -07:00			`import sys`
Fix issues with --pdf-renderer tess4 page skipping. If the tess4 renderer needed to skip OCR on a page, it would end up duplicating the page contents onto the new page rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`import os`
			`import PyPDF2 as pypdf`
Rename “tess4” renderer to “sandwich” and make it default in Tess 3.05.01 Tesseract 3.05.01 backported the textonly_pdf=1 which allows the use of this superior PDF renderer prior to 4.00 alpha. This means that the tess4 name is no longer accurate, so call it a sandwich because of its merge-preserve characteristic. Preserve the tess4 name. Fix the documentation and tests to reflect this. Make it the default, because it’s better. It does not have the issues the “tesseract” renderer does prior to Tess 3.05.00 with rendering PDFs that Ghostscript corrupts, and it produces better output without re-rastering. Deprecate some old stuff to avoid the test suite growing obscenely large. 2017-06-13 13:09:12 -07:00			`from contextlib import contextmanager`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00

			`spoof = pytest.helpers.spoof`


			`@pytest.fixture`
			`def ensure_tess4():`
Fix Travis CI errors while looking around for Tess4 2017-05-12 00:40:00 -07:00			`if tesseract.v4():`
			`# Either "tesseract" on $PATH is already v4, or`
			`# OCRMYPDF_TESSERACT is tess4 already`
			`return os.environ.copy()`

			`if os.environ.get('OCRMYPDF_TESS4'):`
			`# OCRMYPDF_TESS4 is a hint environment variable that tells us to look`
			`# somewhere special for tess4 if and only if we need it. This allows`
			`# setting OCRMYPDF_TESS4 to test tess4 and PATH to point to tess3`
			`# on a system with both installed.`
			`env = os.environ.copy()`
			`env['OCRMYPDF_TESSERACT'] = env['OCRMYPDF_TESS4']`
			`return env`

			`raise EnvironmentError("Can't find Tesseract 4")`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00

Rename “tess4” renderer to “sandwich” and make it default in Tess 3.05.01 Tesseract 3.05.01 backported the textonly_pdf=1 which allows the use of this superior PDF renderer prior to 4.00 alpha. This means that the tess4 name is no longer accurate, so call it a sandwich because of its merge-preserve characteristic. Preserve the tess4 name. Fix the documentation and tests to reflect this. Make it the default, because it’s better. It does not have the issues the “tesseract” renderer does prior to Tess 3.05.00 with rendering PDFs that Ghostscript corrupts, and it produces better output without re-rastering. Deprecate some old stuff to avoid the test suite growing obscenely large. 2017-06-13 13:09:12 -07:00			`@contextmanager`
			`def modified_os_environ(env):`
			`old_env = os.environ.copy()`
			`os.environ = env`
			`yield`
			`os.environ = old_env`


Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`def tess4_available():`
			`"""Check if a tesseract 4 binary is available, even if it's not the`
			`official "tesseract" on PATH`

			`"""`
			`try:`
Rename “tess4” renderer to “sandwich” and make it default in Tess 3.05.01 Tesseract 3.05.01 backported the textonly_pdf=1 which allows the use of this superior PDF renderer prior to 4.00 alpha. This means that the tess4 name is no longer accurate, so call it a sandwich because of its merge-preserve characteristic. Preserve the tess4 name. Fix the documentation and tests to reflect this. Make it the default, because it’s better. It does not have the issues the “tesseract” renderer does prior to Tess 3.05.00 with rendering PDFs that Ghostscript corrupts, and it produces better output without re-rastering. Deprecate some old stuff to avoid the test suite growing obscenely large. 2017-06-13 13:09:12 -07:00			`# ensure_tess4 locates the tess4 binary we are going to check`
			`env = ensure_tess4()`
			`with modified_os_environ(env):`
			`# Now jump into this environment and make sure it really is Tess4`
			`return tesseract.v4() and tesseract.has_textonly_pdf()`
Fix Travis CI errors while looking around for Tess4 2017-05-12 00:40:00 -07:00			`except EnvironmentError:`
			`pass`
Rename ‘tesstop’ to ‘tess4’ There’s no reason text-only PDF shouldn’t become the default for tesseract 4. 2017-01-26 12:28:51 -08:00
Fix Travis CI errors while looking around for Tess4 2017-05-12 00:40:00 -07:00			`return False`
Rename ‘tesstop’ to ‘tess4’ There’s no reason text-only PDF shouldn’t become the default for tesseract 4. 2017-01-26 12:28:51 -08:00
			`# Skip all tests in this file if not tesseract 4`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00			`pytestmark = pytest.mark.skipif(`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`not tess4_available(),`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00			`reason="tesseract 4.0 with textonly_pdf feature required")`

			`check_ocrmypdf = pytest.helpers.check_ocrmypdf`
			`run_ocrmypdf = pytest.helpers.run_ocrmypdf`
			`spoof = pytest.helpers.spoof`
Rename ‘tesstop’ to ‘tess4’ There’s no reason text-only PDF shouldn’t become the default for tesseract 4. 2017-01-26 12:28:51 -08:00

Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`def test_textonly_pdf(ensure_tess4, resources, outdir):`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00			`check_ocrmypdf(`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`resources / 'linn.pdf',`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`outdir / 'linn_textonly.pdf', '--pdf-renderer', 'tess4',`
Implement sidecar text files (#126) 2017-05-10 15:22:44 -07:00			`'--sidecar', 'foo',`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`env=ensure_tess4)`
Rename ‘tesstop’ to ‘tess4’ There’s no reason text-only PDF shouldn’t become the default for tesseract 4. 2017-01-26 12:28:51 -08:00

Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`def test_pagesize_consistency_tess4(ensure_tess4, resources, outpdf):`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00			`from math import isclose`

			`infile = resources / 'linn.pdf'`

			`before_dims = pytest.helpers.first_page_dimensions(infile)`

			`check_ocrmypdf(`
			`infile,`
			`outpdf, '--pdf-renderer', 'tess4',`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`'--clean', '--deskew', '--remove-background', '--clean-final',`
			`env=ensure_tess4)`
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00
			`after_dims = pytest.helpers.first_page_dimensions(outpdf)`
Rename ‘tesstop’ to ‘tess4’ There’s no reason text-only PDF shouldn’t become the default for tesseract 4. 2017-01-26 12:28:51 -08:00
Fix issue #147: unpaper loses DPI information, affects —pdf-renderer tess4 2017-03-24 13:23:03 -07:00			`assert isclose(before_dims[0], after_dims[0])`
			`assert isclose(before_dims[1], after_dims[1])`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00

			`@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])`
			`def test_skip_pages_does_not_replicate(`
			`ensure_tess4, resources, basename, outdir):`
			`infile = resources / basename`
			`outpdf = outdir / basename`

			`check_ocrmypdf(`
			`infile,`
			`outpdf, '--pdf-renderer', 'tess4', '--force-ocr',`
			`'--tesseract-timeout', '0',`
			`env=ensure_tess4`
			`)`

Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`info_in = pdfinfo.PdfInfo(infile)`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`info = pdfinfo.PdfInfo(outpdf)`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00			`for page in info:`
Fix tess4 test using old-style pageinfo API 2017-05-29 13:51:21 -07:00			`assert len(page.images) == 1, "skipped page was replicated"`
Fix issues with —pdf-renderer tess4 page skipping If tess4 renderer needed to skip OCR on a page it would end up duplicating the page contents onto the new page, rather than creating a blank OCR layer and placing it on the output page. This created duplicated content in output files. 2017-03-29 23:31:44 -07:00
			`for n in range(len(info_in)):`
Fix tess4 test using old-style pageinfo API 2017-05-29 13:51:21 -07:00			`assert info[n].width_inches == info_in[n].width_inches`
Enable lossless reconstruction for --pdf-renderer tess4 where appropriate 2017-03-29 23:44:12 -07:00

			`def test_content_preservation(ensure_tess4, resources, outpdf):`
			`infile = resources / 'masks.pdf'`

			`check_ocrmypdf(`
			`infile,`
			`outpdf, '--pdf-renderer', 'tess4', '--tesseract-timeout', '0',`
			`env=ensure_tess4`
			`)`

Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`info = pdfinfo.PdfInfo(outpdf)`
Enable lossless reconstruction for —pdf-renderer tess4 where appropriate 2017-03-29 23:44:12 -07:00			`page = info[0]`
Fix tess4 test using old-style pageinfo API 2017-05-29 13:51:21 -07:00			`assert len(page.images) > 1, "masks were rasterized"`