OCRmyPDF/tests/test_hocrtransform.py

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import re
from io import StringIO

import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from PIL import Image

from ocrmypdf import hocrtransform
from ocrmypdf._exec.tesseract import generate_hocr
from ocrmypdf.helpers import check_pdf

from .conftest import check_ocrmypdf


def text_from_pdf(filename):
    """Extract the text layer of a PDF using pdfminer.

    Args:
        filename: Path of the PDF file to open (opened in binary mode).

    Returns:
        A single string holding the text that pdfminer's ``TextConverter``
        emitted for every page, in the order the pages were processed.
    """
    buffer = StringIO()
    resource_manager = PDFResourceManager()
    # The converter writes extracted text into ``buffer`` as pages are
    # interpreted; default layout-analysis parameters are used.
    converter = TextConverter(resource_manager, buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(filename, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)

    return buffer.getvalue()


# pylint: disable=redefined-outer-name


@pytest.fixture
def blank_hocr(tmp_path):
    """Produce an hOCR file by running ``generate_hocr`` on a blank image.

    An 8x8 one-bit image with every pixel set to 0 is written to
    ``tmp_path`` as a TIFF and handed to ``generate_hocr``.

    Returns:
        Path of the generated ``blank.hocr`` file inside ``tmp_path``.
    """
    tiff_path = tmp_path / 'blank.tif'
    hocr_path = tmp_path / 'blank.hocr'

    Image.new('1', (8, 8), 0).save(tiff_path, format='TIFF')

    generate_hocr(
        input_file=tiff_path,
        output_hocr=hocr_path,
        output_text=tmp_path / 'blank.txt',
        languages=['eng'],
        engine_mode=1,
        tessconfig=[],
        pagesegmode=3,
        thresholding=0,
        user_words=None,
        user_patterns=None,
        timeout=None,
    )
    return hocr_path


def test_mono_image(blank_hocr, outdir):
    """Render a PDF from a 1-bit image and the blank hOCR, then validate it.

    The test passes when ``HocrTransform.to_pdf`` completes and ``check_pdf``
    accepts the resulting file.
    """
    tiff_path = outdir / 'mono.tif'
    pdf_path = outdir / 'mono.pdf'

    # 8x8 bilevel image whose main diagonal is set to 1.
    diagonal = Image.new('1', (8, 8), 0)
    for offset in range(8):
        diagonal.putpixel((offset, offset), 1)
    diagonal.save(tiff_path, format='TIFF')

    renderer = hocrtransform.HocrTransform(hocr_filename=str(blank_hocr), dpi=8)
    renderer.to_pdf(out_filename=str(pdf_path), image_filename=str(tiff_path))

    # To inspect the output by hand while debugging:
    #   shutil.copy(pdf_path, 'mono.pdf')
    check_pdf(str(pdf_path))


@pytest.mark.slow
def test_hocrtransform_matches_sandwich(resources, outdir):
    """Check that the hocr and sandwich renderers yield nearly the same words.

    ``ccitt.pdf`` is processed once with ``--pdf-renderer=hocr`` and once with
    ``--pdf-renderer=sandwich``. The text of each output is reduced to a set
    of whitespace-separated tokens, and the ratio of shared tokens to all
    tokens (intersection over union) must exceed 0.99.
    """
    source_pdf = resources / 'ccitt.pdf'
    hocr_pdf = outdir / 'hocr.pdf'
    tess_pdf = outdir / 'tess.pdf'

    check_ocrmypdf(source_pdf, hocr_pdf, '--pdf-renderer=hocr')
    check_ocrmypdf(source_pdf, tess_pdf, '--pdf-renderer=sandwich')

    # Spacing and word order may differ slightly between the renderers, so
    # compare the sets of words rather than the raw text.
    def clean(s):
        # Collapse every whitespace run to one space, then split on spaces.
        return set(re.sub(r'\s+', ' ', s).split(' '))

    hocr_words = clean(text_from_pdf(hocr_pdf))
    tess_words = clean(text_from_pdf(tess_pdf))

    shared = hocr_words & tess_words
    combined = hocr_words | tess_words
    similarity = len(shared) / len(combined)

    # To dump the word sets while debugging a failure:
    #   from pathlib import Path
    #   Path('hocr.txt').write_text('\n'.join(sorted(hocr_words)), encoding='utf8')
    #   Path('tess.txt').write_text('\n'.join(sorted(tess_words)), encoding='utf8')
    #   Path('mismatch.txt').write_text(
    #       '\n'.join(sorted(hocr_words ^ tess_words)), encoding='utf8'
    #   )

    assert similarity > 0.99
Change to SPDX license tracking 2022-07-28 01:06:46 -07:00			`# SPDX-FileCopyrightText: 2022 James R. Barlow`
			`# SPDX-License-Identifier: MPL-2.0`
New hocrtransform test 2016-01-15 14:14:08 -08:00
Modernize type annotations 2022-07-23 00:39:24 -07:00			`from __future__ import annotations`

Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`import re`
			`from io import StringIO`

New hocrtransform test 2016-01-15 14:14:08 -08:00			`import pytest`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`from pdfminer.converter import TextConverter`
			`from pdfminer.layout import LAParams`
			`from pdfminer.pdfdocument import PDFDocument`
			`from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager`
			`from pdfminer.pdfpage import PDFPage`
			`from pdfminer.pdfparser import PDFParser`
Sort imports with isort 2018-12-30 01:28:15 -08:00			`from PIL import Image`

			`from ocrmypdf import hocrtransform`
Fix hocrtransform test to generate blank hocr 2023-10-15 01:57:48 -07:00			`from ocrmypdf._exec.tesseract import generate_hocr`
Remove last vestiges of command line usage of qpdf - change to check_pdf 2020-04-26 05:33:26 -07:00			`from ocrmypdf.helpers import check_pdf`
Cleanup unused imports 2018-06-23 01:47:53 -07:00
Remove pytest_helpers_namespace 2021-04-07 01:56:51 -07:00			`from .conftest import check_ocrmypdf`

Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00
			`def text_from_pdf(filename):`
			`output_string = StringIO()`
			`with open(filename, 'rb') as in_file:`
			`parser = PDFParser(in_file)`
			`doc = PDFDocument(parser)`
			`rsrcmgr = PDFResourceManager()`
			`device = TextConverter(rsrcmgr, output_string, laparams=LAParams())`
			`interpreter = PDFPageInterpreter(rsrcmgr, device)`
			`for page in PDFPage.create_pages(doc):`
			`interpreter.process_page(page)`
			`return output_string.getvalue()`


Cleanup unused imports 2018-06-23 01:47:53 -07:00			`# pylint: disable=redefined-outer-name`
New hocrtransform test 2016-01-15 14:14:08 -08:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`@pytest.fixture`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def blank_hocr(tmp_path):`
Fix hocrtransform test to generate blank hocr 2023-10-15 01:57:48 -07:00			`im = Image.new('1', (8, 8), 0)`
			`im.save(tmp_path / 'blank.tif', format='TIFF')`
			`generate_hocr(`
			`input_file=tmp_path / 'blank.tif',`
			`output_hocr=tmp_path / 'blank.hocr',`
			`output_text=tmp_path / 'blank.txt',`
			`languages=['eng'],`
			`engine_mode=1,`
			`tessconfig=[],`
			`pagesegmode=3,`
			`thresholding=0,`
			`user_words=None,`
			`user_patterns=None,`
			`timeout=None,`
			`)`
			`return tmp_path / 'blank.hocr'`
New hocrtransform test 2016-01-15 14:14:08 -08:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`def test_mono_image(blank_hocr, outdir):`
New hocrtransform test 2016-01-15 14:14:08 -08:00			`im = Image.new('1', (8, 8), 0)`
			`for n in range(8):`
			`im.putpixel((n, n), 1)`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`im.save(outdir / 'mono.tif', format='TIFF')`
New hocrtransform test 2016-01-15 14:14:08 -08:00
Remove some obsolete parameters 2023-11-20 00:10:55 -08:00			`hocr = hocrtransform.HocrTransform(hocr_filename=str(blank_hocr), dpi=8)`
Stricter parameter checking for many public functions 2020-12-09 10:15:15 -08:00			`hocr.to_pdf(`
			`out_filename=str(outdir / 'mono.pdf'), image_filename=str(outdir / 'mono.tif')`
			`)`
Rationalize canvas interface 2023-11-20 15:31:44 -08:00			`# shutil.copy(outdir / 'mono.pdf', 'mono.pdf')`
Remove last vestiges of command line usage of qpdf - change to check_pdf 2020-04-26 05:33:26 -07:00			`check_pdf(str(outdir / 'mono.pdf'))`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00

Spell runslow correctly 2020-06-22 23:32:09 -07:00			`@pytest.mark.slow`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`def test_hocrtransform_matches_sandwich(resources, outdir):`
test_hocrtransform: this test is worth not caching 2020-06-22 16:31:06 -07:00			`check_ocrmypdf(resources / 'ccitt.pdf', outdir / 'hocr.pdf', '--pdf-renderer=hocr')`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`check_ocrmypdf(`
test_hocrtransform: this test is worth not caching 2020-06-22 16:31:06 -07:00			`resources / 'ccitt.pdf', outdir / 'tess.pdf', '--pdf-renderer=sandwich'`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`)`

Replace leptonica deskew with tesseract find skew and pillow rotate Also rebuild the cache. 2021-11-12 16:35:08 -08:00			`# Slight differences in spacing and word order can appear, so at least ensure`
			`# that we get all of the same words...`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`def clean(s):`
Replace leptonica deskew with tesseract find skew and pillow rotate Also rebuild the cache. 2021-11-12 16:35:08 -08:00			`s = re.sub(r'\s+', ' ', s)`
			`words = s.split(' ')`
Fix broken test_hocrtransform_matches_sandwich Expect word similarity rather than exact match. Difference appears to be due to quote styles. Thanks @QuLogic for reporting. 2025-02-09 13:55:54 -08:00			`return set(words)`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00
Fix broken test_hocrtransform_matches_sandwich Expect word similarity rather than exact match. Difference appears to be due to quote styles. Thanks @QuLogic for reporting. 2025-02-09 13:55:54 -08:00			`hocr_words = clean(text_from_pdf(outdir / 'hocr.pdf'))`
			`tess_words = clean(text_from_pdf(outdir / 'tess.pdf'))`

			`similarity = len(hocr_words & tess_words) / len(hocr_words \| tess_words)`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00
Replace leptonica deskew with tesseract find skew and pillow rotate Also rebuild the cache. 2021-11-12 16:35:08 -08:00			`# from pathlib import Path`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00
Fix broken test_hocrtransform_matches_sandwich Expect word similarity rather than exact match. Difference appears to be due to quote styles. Thanks @QuLogic for reporting. 2025-02-09 13:55:54 -08:00			`# Path('hocr.txt').write_text(sorted('\n'.join(hocr_words)))`
			`# Path('tess.txt').write_text(sorted('\n'.join(tess_words)))`
			`# Path('mismatch.txt').write_text(`
			`# '\n'.join(sorted(hocr_words ^ tess_words)), encoding='utf8'`
			`# )`

			`assert similarity > 0.99`