OCRmyPDF/tests/test_hocrtransform.py
James R. Barlow 32322a9fe9
Fix broken test_hocrtransform_matches_sandwich
Expect word similarity rather than exact match. Difference appears to be due to quote styles.

Thanks @QuLogic for reporting.
2025-02-09 13:57:50 -08:00

103 lines
3.1 KiB
Python

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import re
from io import StringIO
import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from PIL import Image
from ocrmypdf import hocrtransform
from ocrmypdf._exec.tesseract import generate_hocr
from ocrmypdf.helpers import check_pdf
from .conftest import check_ocrmypdf
def text_from_pdf(filename):
    """Return the text that pdfminer extracts from the PDF at *filename*.

    The file is opened in binary mode and every page is fed through a
    ``TextConverter`` (with default ``LAParams``) that writes into an
    in-memory string buffer; the buffer's full contents are returned.
    """
    text_buffer = StringIO()
    with open(filename, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        resource_manager = PDFResourceManager()
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams()
        )
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
    return text_buffer.getvalue()
# pylint: disable=redefined-outer-name
@pytest.fixture
def blank_hocr(tmp_path):
    """Generate an hOCR file from an 8x8 1-bit image filled with 0.

    The image is saved as ``blank.tif`` in *tmp_path* and passed to
    ``generate_hocr``, which is asked to write ``blank.hocr`` and
    ``blank.txt`` alongside it.

    Returns:
        Path of the ``blank.hocr`` file inside *tmp_path*.
    """
    tif_path = tmp_path / 'blank.tif'
    hocr_path = tmp_path / 'blank.hocr'
    text_path = tmp_path / 'blank.txt'

    Image.new('1', (8, 8), 0).save(tif_path, format='TIFF')

    # Settings forwarded verbatim to generate_hocr.
    ocr_settings = {
        'languages': ['eng'],
        'engine_mode': 1,
        'tessconfig': [],
        'pagesegmode': 3,
        'thresholding': 0,
        'user_words': None,
        'user_patterns': None,
        'timeout': None,
    }
    generate_hocr(
        input_file=tif_path,
        output_hocr=hocr_path,
        output_text=text_path,
        **ocr_settings,
    )
    return hocr_path
def test_mono_image(blank_hocr, outdir):
    """Build a PDF from a 1-bit TIFF plus the blank hOCR and run check_pdf on it.

    An 8x8 mode-'1' image with its main diagonal set to 1 is written to
    ``mono.tif`` in *outdir*; ``HocrTransform`` (dpi=8) then combines it with
    the *blank_hocr* file into ``mono.pdf``, which is passed to ``check_pdf``.
    """
    tif_path = outdir / 'mono.tif'
    pdf_path = outdir / 'mono.pdf'

    bitmap = Image.new('1', (8, 8), 0)
    for i in range(8):
        # Set the pixels along the main diagonal.
        bitmap.putpixel((i, i), 1)
    bitmap.save(tif_path, format='TIFF')

    transform = hocrtransform.HocrTransform(hocr_filename=str(blank_hocr), dpi=8)
    transform.to_pdf(out_filename=str(pdf_path), image_filename=str(tif_path))

    check_pdf(str(pdf_path))
@pytest.mark.slow
def test_hocrtransform_matches_sandwich(resources, outdir):
    """Check that the hocr and sandwich renderers yield nearly the same words.

    ``ccitt.pdf`` is processed twice via ``check_ocrmypdf``, once with
    ``--pdf-renderer=hocr`` and once with ``--pdf-renderer=sandwich``. The
    text of each output is extracted, reduced to a set of whitespace-separated
    tokens, and the two sets are compared by intersection-over-union, which
    must exceed 0.99.

    The test also requires that each output contains at least one real word,
    so it cannot pass merely because both text layers are empty.
    """
    source_pdf = resources / 'ccitt.pdf'
    hocr_pdf = outdir / 'hocr.pdf'
    tess_pdf = outdir / 'tess.pdf'

    check_ocrmypdf(source_pdf, hocr_pdf, '--pdf-renderer=hocr')
    check_ocrmypdf(source_pdf, tess_pdf, '--pdf-renderer=sandwich')

    # Slight differences in spacing and word order can appear, so at least ensure
    # that we get all of the same words...
    def clean(s):
        """Collapse whitespace runs and return the set of resulting tokens."""
        s = re.sub(r'\s+', ' ', s)
        return set(s.split(' '))

    hocr_words = clean(text_from_pdf(hocr_pdf))
    tess_words = clean(text_from_pdf(tess_pdf))

    # clean('') is {''}, so two empty text layers would otherwise compare as
    # identical (similarity 1.0) and the test would pass without checking
    # anything. Insist on at least one non-empty token from each renderer.
    assert hocr_words - {''}, 'no words extracted from hocr.pdf'
    assert tess_words - {''}, 'no words extracted from tess.pdf'

    similarity = len(hocr_words & tess_words) / len(hocr_words | tess_words)

    # Report the words found by only one renderer so a failure can be
    # diagnosed straight from the test output.
    mismatched = sorted(hocr_words ^ tess_words)
    assert similarity > 0.99, (
        f'word similarity {similarity:.4f} is not above 0.99; '
        f'words found by only one renderer: {mismatched}'
    )