OCRmyPDF/tests/test_hocrtransform.py

# © 2015 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


from __future__ import annotations

import re
from io import StringIO

import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from PIL import Image

from ocrmypdf import hocrtransform
from ocrmypdf._exec.tesseract import HOCR_TEMPLATE
from ocrmypdf.helpers import check_pdf

from .conftest import check_ocrmypdf


def text_from_pdf(filename):
    """Return the text pdfminer extracts from every page of *filename*.

    Each page is fed through a ``TextConverter`` built with default
    ``LAParams``; the converter writes into an in-memory buffer whose
    contents are returned as a single string.
    """
    text_buffer = StringIO()
    with open(filename, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        resource_manager = PDFResourceManager()
        page_interpreter = PDFPageInterpreter(
            resource_manager,
            TextConverter(resource_manager, text_buffer, laparams=LAParams()),
        )
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
    return text_buffer.getvalue()


# pylint: disable=redefined-outer-name


@pytest.fixture
def blank_hocr(tmp_path):
    """Write ``HOCR_TEMPLATE`` to ``blank.hocr`` under *tmp_path*; return the path."""
    hocr_path = tmp_path / "blank.hocr"
    hocr_path.write_text(HOCR_TEMPLATE)
    return hocr_path


def test_mono_image(blank_hocr, outdir):
    """Run HocrTransform on an 8x8 mode-'1' TIFF with a blank hOCR file.

    The resulting PDF is validated with ``check_pdf``.
    """
    tiff_path = outdir / 'mono.tif'
    pdf_path = outdir / 'mono.pdf'

    # 8x8 mode '1' image filled with 0, with the main diagonal set to 1.
    side = 8
    image = Image.new('1', (side, side), 0)
    for idx in range(side):
        image.putpixel((idx, idx), 1)
    image.save(tiff_path, format='TIFF')

    transform = hocrtransform.HocrTransform(hocr_filename=str(blank_hocr), dpi=300)
    transform.to_pdf(out_filename=str(pdf_path), image_filename=str(tiff_path))

    check_pdf(str(pdf_path))


@pytest.mark.slow
def test_hocrtransform_matches_sandwich(resources, outdir):
    """Check that the hocr and sandwich renderers emit the same set of words.

    ``ccitt.pdf`` is processed once per renderer, and the text extracted from
    each output PDF is compared as a sorted list of words.
    """
    check_ocrmypdf(resources / 'ccitt.pdf', outdir / 'hocr.pdf', '--pdf-renderer=hocr')
    check_ocrmypdf(
        resources / 'ccitt.pdf', outdir / 'tess.pdf', '--pdf-renderer=sandwich'
    )

    # Slight differences in spacing and word order can appear, so at least ensure
    # that we get all of the same words...
    def clean(s):
        # str.split() with no separator splits on runs of whitespace and drops
        # empty tokens, so leading/trailing whitespace cannot inject '' entries
        # that would make otherwise-identical word lists compare unequal.
        return '\n'.join(sorted(s.split()))

    hocr_txt = clean(text_from_pdf(outdir / 'hocr.pdf'))
    tess_txt = clean(text_from_pdf(outdir / 'tess.pdf'))

    # To debug a mismatch, dump both word lists to disk:
    # from pathlib import Path
    # Path('hocr.txt').write_text(hocr_txt)
    # Path('tess.txt').write_text(tess_txt)

    assert hocr_txt == tess_txt
New hocrtransform test 2016-01-15 14:14:08 -08:00			`# © 2015 James R. Barlow: github.com/jbarlow83`
Add license notice to all files Source files to GPL3 Exceptions: -tests/spoof/* to MIT -hocrtransform.py -_unicodefun.py Test resources to CC BY-SA 4.0 except when otherwise noted. Add GPL license. 2018-03-14 14:40:48 -07:00			`#`
Change license of all GPLv3 files to MPL-2.0 https://github.com/jbarlow83/OCRmyPDF/issues/600 2020-08-05 00:44:42 -07:00			`# This Source Code Form is subject to the terms of the Mozilla Public`
			`# License, v. 2.0. If a copy of the MPL was not distributed with this`
			`# file, You can obtain one at http://mozilla.org/MPL/2.0/.`

New hocrtransform test 2016-01-15 14:14:08 -08:00
Modernize type annotations 2022-07-23 00:39:24 -07:00			`from __future__ import annotations`

Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`import re`
			`from io import StringIO`

New hocrtransform test 2016-01-15 14:14:08 -08:00			`import pytest`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`from pdfminer.converter import TextConverter`
			`from pdfminer.layout import LAParams`
			`from pdfminer.pdfdocument import PDFDocument`
			`from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager`
			`from pdfminer.pdfpage import PDFPage`
			`from pdfminer.pdfparser import PDFParser`
Sort imports with isort 2018-12-30 01:28:15 -08:00			`from PIL import Image`

			`from ocrmypdf import hocrtransform`
Rename ocrmypdf.exec -> ocrmypdf._exec 2020-06-09 14:55:54 -07:00			`from ocrmypdf._exec.tesseract import HOCR_TEMPLATE`
Remove last vestiges of command line usage of qpdf - change to check_pdf 2020-04-26 05:33:26 -07:00			`from ocrmypdf.helpers import check_pdf`
Cleanup unused imports 2018-06-23 01:47:53 -07:00
Remove pytest_helpers_namespace 2021-04-07 01:56:51 -07:00			`from .conftest import check_ocrmypdf`

Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00
			`def text_from_pdf(filename):`
			`output_string = StringIO()`
			`with open(filename, 'rb') as in_file:`
			`parser = PDFParser(in_file)`
			`doc = PDFDocument(parser)`
			`rsrcmgr = PDFResourceManager()`
			`device = TextConverter(rsrcmgr, output_string, laparams=LAParams())`
			`interpreter = PDFPageInterpreter(rsrcmgr, device)`
			`for page in PDFPage.create_pages(doc):`
			`interpreter.process_page(page)`
			`return output_string.getvalue()`


Cleanup unused imports 2018-06-23 01:47:53 -07:00			`# pylint: disable=redefined-outer-name`
New hocrtransform test 2016-01-15 14:14:08 -08:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`@pytest.fixture`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def blank_hocr(tmp_path):`
			`filename = tmp_path / "blank.hocr"`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`filename.write_text(HOCR_TEMPLATE)`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`return filename`
New hocrtransform test 2016-01-15 14:14:08 -08:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`def test_mono_image(blank_hocr, outdir):`
New hocrtransform test 2016-01-15 14:14:08 -08:00			`im = Image.new('1', (8, 8), 0)`
			`for n in range(8):`
			`im.putpixel((n, n), 1)`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`im.save(outdir / 'mono.tif', format='TIFF')`
New hocrtransform test 2016-01-15 14:14:08 -08:00
Stricter parameter checking for many public functions 2020-12-09 10:15:15 -08:00			`hocr = hocrtransform.HocrTransform(hocr_filename=str(blank_hocr), dpi=300)`
			`hocr.to_pdf(`
			`out_filename=str(outdir / 'mono.pdf'), image_filename=str(outdir / 'mono.tif')`
			`)`
New hocrtransform test 2016-01-15 14:14:08 -08:00
Remove last vestiges of command line usage of qpdf - change to check_pdf 2020-04-26 05:33:26 -07:00			`check_pdf(str(outdir / 'mono.pdf'))`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00

Spell runslow correctly 2020-06-22 23:32:09 -07:00			`@pytest.mark.slow`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`def test_hocrtransform_matches_sandwich(resources, outdir):`
test_hocrtransform: this test is worth not caching 2020-06-22 16:31:06 -07:00			`check_ocrmypdf(resources / 'ccitt.pdf', outdir / 'hocr.pdf', '--pdf-renderer=hocr')`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`check_ocrmypdf(`
test_hocrtransform: this test is worth not caching 2020-06-22 16:31:06 -07:00			`resources / 'ccitt.pdf', outdir / 'tess.pdf', '--pdf-renderer=sandwich'`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`)`

Replace leptonica deskew with tesseract find skew and pillow rotate Also rebuild the cache. 2021-11-12 16:35:08 -08:00			`# Slight differences in spacing and word order can appear, so at least ensure`
			`# that we get all of the same words...`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`def clean(s):`
Replace leptonica deskew with tesseract find skew and pillow rotate Also rebuild the cache. 2021-11-12 16:35:08 -08:00			`s = re.sub(r'\s+', ' ', s)`
			`words = s.split(' ')`
			`return '\n'.join(sorted(words))`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00
			`hocr_txt = clean(text_from_pdf(outdir / 'hocr.pdf'))`
			`tess_txt = clean(text_from_pdf(outdir / 'tess.pdf'))`

Replace leptonica deskew with tesseract find skew and pillow rotate Also rebuild the cache. 2021-11-12 16:35:08 -08:00			`# from pathlib import Path`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`# Path('hocr.txt').write_text(hocr_txt)`
			`# Path('tess.txt').write_text(tess_txt)`

			`assert hocr_txt == tess_txt`