# © 2015 James R. Barlow: github.com/jbarlow83 # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. import re from io import StringIO import pytest from pdfminer.converter import TextConverter from pdfminer.layout import LAParams from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PDFParser from PIL import Image from ocrmypdf import hocrtransform from ocrmypdf._exec.tesseract import HOCR_TEMPLATE from ocrmypdf.helpers import check_pdf def text_from_pdf(filename): output_string = StringIO() with open(filename, 'rb') as in_file: parser = PDFParser(in_file) doc = PDFDocument(parser) rsrcmgr = PDFResourceManager() device = TextConverter(rsrcmgr, output_string, laparams=LAParams()) interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.create_pages(doc): interpreter.process_page(page) return output_string.getvalue() # pylint: disable=redefined-outer-name check_ocrmypdf = pytest.helpers.check_ocrmypdf # pylint: disable=no-member @pytest.fixture def blank_hocr(tmp_path): filename = tmp_path / "blank.hocr" filename.write_text(HOCR_TEMPLATE) return filename def test_mono_image(blank_hocr, outdir): im = Image.new('1', (8, 8), 0) for n in range(8): im.putpixel((n, n), 1) im.save(outdir / 'mono.tif', format='TIFF') hocr = hocrtransform.HocrTransform(str(blank_hocr), 300) hocr.to_pdf(str(outdir / 'mono.pdf'), image_filename=str(outdir / 'mono.tif')) check_pdf(str(outdir / 'mono.pdf')) @pytest.mark.slow def test_hocrtransform_matches_sandwich(resources, outdir): check_ocrmypdf(resources / 'ccitt.pdf', outdir / 'hocr.pdf', '--pdf-renderer=hocr') check_ocrmypdf( resources / 'ccitt.pdf', outdir / 'tess.pdf', '--pdf-renderer=sandwich' ) def clean(s): s = re.sub(r'[ ]+', ' ', s) s = re.sub(r'[ ]?[\n]+', r'\n', s) return s hocr_txt = clean(text_from_pdf(outdir / 'hocr.pdf')) tess_txt = clean(text_from_pdf(outdir / 'tess.pdf')) # Path('hocr.txt').write_text(hocr_txt) # Path('tess.txt').write_text(tess_txt) assert hocr_txt == tess_txt