OCRmyPDF/tests/test_hocrtransform.py

101 lines
3.0 KiB
Python
Raw Normal View History

2016-01-15 14:14:08 -08:00
# © 2015 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
2016-01-15 14:14:08 -08:00
import re
from io import StringIO
from pathlib import Path
2016-01-15 14:14:08 -08:00
import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
2018-12-30 01:28:15 -08:00
from PIL import Image
from ocrmypdf import hocrtransform
2020-06-09 14:55:54 -07:00
from ocrmypdf._exec.tesseract import HOCR_TEMPLATE
from ocrmypdf.helpers import check_pdf
2018-06-23 01:47:53 -07:00
def text_from_pdf(filename):
    """Return the text pdfminer extracts from every page of a PDF.

    Args:
        filename: Path of the PDF to read; it is opened in binary mode.

    Returns:
        str: Everything the pdfminer ``TextConverter`` wrote while the pages
        were processed, in page order.
    """
    text_buffer = StringIO()

    # The converter writes extracted text into ``text_buffer`` as the
    # interpreter walks each page.
    resource_manager = PDFResourceManager()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(filename, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)

    return text_buffer.getvalue()
2018-06-23 01:47:53 -07:00
# pylint: disable=redefined-outer-name
2016-01-15 14:14:08 -08:00
# Module-level alias for the ``check_ocrmypdf`` helper exposed on
# ``pytest.helpers``, so the tests below can call it by its bare name.
check_ocrmypdf = pytest.helpers.check_ocrmypdf # pylint: disable=no-member
2016-01-15 14:14:08 -08:00
@pytest.fixture
def blank_hocr(tmp_path):
    """Write ``HOCR_TEMPLATE`` to ``blank.hocr`` under ``tmp_path``.

    Returns:
        The path of the file that was written.
    """
    # NOTE(review): the template is presumably an hOCR document with no
    # recognized text (hence "blank") -- confirm against HOCR_TEMPLATE.
    hocr_path = tmp_path / "blank.hocr"
    hocr_path.write_text(HOCR_TEMPLATE)
    return hocr_path
2016-01-15 14:14:08 -08:00
def test_mono_image(blank_hocr, outdir):
    """Run HocrTransform on a tiny 1-bit image and check the resulting PDF.

    An 8x8 mode ``'1'`` image with its main diagonal set is saved as a TIFF,
    combined with the ``blank_hocr`` file via ``HocrTransform.to_pdf``, and
    the output is passed to ``check_pdf``.
    """
    tiff_path = outdir / 'mono.tif'
    pdf_path = outdir / 'mono.pdf'

    # Start from an all-zero bitmap and set the pixels on the main diagonal.
    bitmap = Image.new('1', (8, 8), 0)
    for offset in range(8):
        bitmap.putpixel((offset, offset), 1)
    bitmap.save(tiff_path, format='TIFF')

    # NOTE(review): 300 is presumably the DPI -- confirm against HocrTransform.
    transform = hocrtransform.HocrTransform(str(blank_hocr), 300)
    transform.to_pdf(str(pdf_path), image_filename=str(tiff_path))

    # NOTE(review): the result of check_pdf is not asserted; confirm whether
    # it reports problems via its return value or by raising.
    check_pdf(str(pdf_path))
def test_hocrtransform_matches_sandwich(resources, outdir):
    """Check that the hocr and sandwich renderers produce the same text.

    ``ccitt.pdf`` is processed once with ``--pdf-renderer=hocr`` and once
    with ``--pdf-renderer=sandwich``. Text is extracted from both outputs,
    whitespace is normalized, and the two strings must compare equal.
    """
    # Each entry pairs an output file with the renderer flag that produces it.
    # Not passed (kept disabled): '--plugin', 'tests/plugins/tesseract_cache.py'
    renderings = (
        (outdir / 'hocr.pdf', '--pdf-renderer=hocr'),
        (outdir / 'tess.pdf', '--pdf-renderer=sandwich'),
    )
    for target_pdf, renderer_flag in renderings:
        check_ocrmypdf(resources / 'ccitt.pdf', target_pdf, renderer_flag)

    def normalize(text):
        """Collapse runs of spaces and runs of newlines in ``text``."""
        # Runs of spaces become a single space.
        single_spaced = re.sub(r'[ ]+', ' ', text)
        # An optional trailing space plus one or more newlines become one
        # newline.
        return re.sub(r'[ ]?[\n]+', r'\n', single_spaced)

    hocr_text = normalize(text_from_pdf(outdir / 'hocr.pdf'))
    sandwich_text = normalize(text_from_pdf(outdir / 'tess.pdf'))

    # To inspect a mismatch, dump both strings to 'hocr.txt' and 'tess.txt'
    # (e.g. with Path.write_text) before the assertion.
    assert hocr_text == sandwich_text