OCRmyPDF/tests/test_hocrtransform.py

# © 2015 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

import re
from io import StringIO
from pathlib import Path

import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from PIL import Image

from ocrmypdf import hocrtransform
from ocrmypdf._exec.tesseract import HOCR_TEMPLATE
from ocrmypdf.helpers import check_pdf


def text_from_pdf(filename):
    """Return the text pdfminer extracts from every page of a PDF.

    The file at ``filename`` is opened in binary mode and each page yielded
    by ``PDFPage.create_pages`` is run through a ``TextConverter`` configured
    with default ``LAParams``. The converter writes into an in-memory buffer,
    whose accumulated contents are returned as a single string.
    """
    text_buffer = StringIO()
    with open(filename, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        resources = PDFResourceManager()
        converter = TextConverter(resources, text_buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
    return text_buffer.getvalue()


# pylint: disable=redefined-outer-name
# Presumably required because tests take fixtures defined in this module
# (e.g. blank_hocr) as parameters of the same name.

# Module-level alias so the tests below can call check_ocrmypdf(...) directly.
# NOTE(review): pytest.helpers is presumably a helper namespace populated
# outside this file (e.g. in conftest.py) — confirm where it is registered.
check_ocrmypdf = pytest.helpers.check_ocrmypdf  # pylint: disable=no-member


@pytest.fixture
def blank_hocr(tmp_path):
    """Write ``HOCR_TEMPLATE`` to ``blank.hocr`` under ``tmp_path``.

    Returns the path of the file that was written.
    """
    hocr_path = tmp_path.joinpath("blank.hocr")
    hocr_path.write_text(HOCR_TEMPLATE)
    return hocr_path


def test_mono_image(blank_hocr, outdir):
    """Run HocrTransform on a tiny mode-'1' TIFF and the blank hOCR file.

    The resulting PDF is passed to ``check_pdf``.
    """
    tif_path = outdir / 'mono.tif'
    pdf_path = outdir / 'mono.pdf'

    # 8x8 mode-'1' image: every pixel starts at 0, the main diagonal is set to 1.
    diagonal = Image.new('1', (8, 8), 0)
    for offset in range(8):
        diagonal.putpixel((offset, offset), 1)
    diagonal.save(tif_path, format='TIFF')

    # NOTE(review): the second argument (300) is presumably the DPI — confirm
    # against the HocrTransform signature.
    transform = hocrtransform.HocrTransform(str(blank_hocr), 300)
    transform.to_pdf(str(pdf_path), image_filename=str(tif_path))

    # NOTE(review): the result of check_pdf is not asserted; confirm whether it
    # raises on an invalid PDF or only returns a status.
    check_pdf(str(pdf_path))


def test_hocrtransform_matches_sandwich(resources, outdir):
    """Check that the hocr and sandwich renderers yield the same text.

    ``ccitt.pdf`` is processed once with ``--pdf-renderer=hocr`` and once with
    ``--pdf-renderer=sandwich``. The text extracted from each output must be
    equal after whitespace normalisation.
    """
    source_pdf = resources / 'ccitt.pdf'
    hocr_pdf = outdir / 'hocr.pdf'
    tess_pdf = outdir / 'tess.pdf'

    # Both runs previously had '--plugin', 'tests/plugins/tesseract_cache.py'
    # as commented-out extra arguments; they are not passed here.
    check_ocrmypdf(source_pdf, hocr_pdf, '--pdf-renderer=hocr')
    check_ocrmypdf(source_pdf, tess_pdf, '--pdf-renderer=sandwich')

    def clean(s):
        # Collapse runs of spaces to a single space, then fold an optional
        # trailing space plus one or more newlines into a single newline.
        single_spaced = re.sub(r'[ ]+', ' ', s)
        return re.sub(r'[ ]?[\n]+', r'\n', single_spaced)

    hocr_text = clean(text_from_pdf(hocr_pdf))
    sandwich_text = clean(text_from_pdf(tess_pdf))

    # For debugging a mismatch, the two strings can be dumped to
    # 'hocr.txt' / 'tess.txt' with Path.write_text and diffed.

    assert hocr_text == sandwich_text
New hocrtransform test 2016-01-15 14:14:08 -08:00			`# © 2015 James R. Barlow: github.com/jbarlow83`
Add license notice to all files Source files to GPL3 Exceptions: -tests/spoof/* to MIT -hocrtransform.py -_unicodefun.py Test resources to CC BY-SA 4.0 except when otherwise noted. Add GPL license. 2018-03-14 14:40:48 -07:00			`#`
			`# This file is part of OCRmyPDF.`
			`#`
			`# OCRmyPDF is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# OCRmyPDF is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.`
New hocrtransform test 2016-01-15 14:14:08 -08:00
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`import re`
			`from io import StringIO`
			`from pathlib import Path`

New hocrtransform test 2016-01-15 14:14:08 -08:00			`import pytest`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`from pdfminer.converter import TextConverter`
			`from pdfminer.layout import LAParams`
			`from pdfminer.pdfdocument import PDFDocument`
			`from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager`
			`from pdfminer.pdfpage import PDFPage`
			`from pdfminer.pdfparser import PDFParser`
Sort imports with isort 2018-12-30 01:28:15 -08:00			`from PIL import Image`

			`from ocrmypdf import hocrtransform`
Rename ocrmypdf.exec -> ocrmypdf._exec 2020-06-09 14:55:54 -07:00			`from ocrmypdf._exec.tesseract import HOCR_TEMPLATE`
Remove last vestiges of command line usage of qpdf - change to check_pdf 2020-04-26 05:33:26 -07:00			`from ocrmypdf.helpers import check_pdf`
Cleanup unused imports 2018-06-23 01:47:53 -07:00
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00
			`def text_from_pdf(filename):`
			`output_string = StringIO()`
			`with open(filename, 'rb') as in_file:`
			`parser = PDFParser(in_file)`
			`doc = PDFDocument(parser)`
			`rsrcmgr = PDFResourceManager()`
			`device = TextConverter(rsrcmgr, output_string, laparams=LAParams())`
			`interpreter = PDFPageInterpreter(rsrcmgr, device)`
			`for page in PDFPage.create_pages(doc):`
			`interpreter.process_page(page)`
			`return output_string.getvalue()`


Cleanup unused imports 2018-06-23 01:47:53 -07:00			`# pylint: disable=redefined-outer-name`
New hocrtransform test 2016-01-15 14:14:08 -08:00
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`check_ocrmypdf = pytest.helpers.check_ocrmypdf # pylint: disable=no-member`

New hocrtransform test 2016-01-15 14:14:08 -08:00
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`@pytest.fixture`
Use newer pytest tmp_path API 2019-06-01 01:55:51 -07:00			`def blank_hocr(tmp_path):`
			`filename = tmp_path / "blank.hocr"`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00			`filename.write_text(HOCR_TEMPLATE)`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`return filename`
New hocrtransform test 2016-01-15 14:14:08 -08:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`def test_mono_image(blank_hocr, outdir):`
New hocrtransform test 2016-01-15 14:14:08 -08:00			`im = Image.new('1', (8, 8), 0)`
			`for n in range(8):`
			`im.putpixel((n, n), 1)`
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`im.save(outdir / 'mono.tif', format='TIFF')`
New hocrtransform test 2016-01-15 14:14:08 -08:00
Fix remaining 3.4/3.5 regressions 2017-01-26 17:53:27 -08:00			`hocr = hocrtransform.HocrTransform(str(blank_hocr), 300)`
hocrtransform: cleanup/PEP8 Some API breaking changes. 2020-04-03 22:04:42 -07:00			`hocr.to_pdf(str(outdir / 'mono.pdf'), image_filename=str(outdir / 'mono.tif'))`
New hocrtransform test 2016-01-15 14:14:08 -08:00
Remove last vestiges of command line usage of qpdf - change to check_pdf 2020-04-26 05:33:26 -07:00			`check_pdf(str(outdir / 'mono.pdf'))`
Add test to sanity check our pdf renderers 2020-06-22 16:18:38 -07:00

			`def test_hocrtransform_matches_sandwich(resources, outdir):`
			`check_ocrmypdf(`
			`resources / 'ccitt.pdf',`
			`outdir / 'hocr.pdf',`
			`'--pdf-renderer=hocr',`
			`# '--plugin',`
			`# 'tests/plugins/tesseract_cache.py',`
			`)`
			`check_ocrmypdf(`
			`resources / 'ccitt.pdf',`
			`outdir / 'tess.pdf',`
			`'--pdf-renderer=sandwich',`
			`# '--plugin',`
			`# 'tests/plugins/tesseract_cache.py',`
			`)`

			`def clean(s):`
			`s = re.sub(r'[ ]+', ' ', s)`
			`s = re.sub(r'[ ]?[\n]+', r'\n', s)`
			`return s`

			`hocr_txt = clean(text_from_pdf(outdir / 'hocr.pdf'))`
			`tess_txt = clean(text_from_pdf(outdir / 'tess.pdf'))`

			`# Path('hocr.txt').write_text(hocr_txt)`
			`# Path('tess.txt').write_text(tess_txt)`

			`assert hocr_txt == tess_txt`