OCRmyPDF/tests/test_rotation.py
James R. Barlow 7d330afd81 Delinting
2019-01-02 13:34:45 -08:00

275 lines
8.0 KiB
Python

# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import logging
from io import BytesIO
from os import fspath
from unittest.mock import Mock
import img2pdf
import pytest
from PIL import Image
import pikepdf
from ocrmypdf import leptonica
from ocrmypdf.exec import ghostscript, tesseract
from ocrmypdf.pdfinfo import PdfInfo
# pytest.helpers is dynamic
# pylint: disable=no-member
# pylint: disable=w0612
def _leptonica_version_tuple(version_string):
    """Return the dotted numeric part of a version string as a tuple of ints.

    For example ``'leptonica-1.76.0'`` yields ``(1, 76, 0)``. Returns ``None``
    when the string contains no dotted number.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import re

    match = re.search(r'(\d+(?:\.\d+)+)', version_string)
    if match is None:
        return None
    return tuple(int(part) for part in match.group(1).split('.'))


def _leptonica_too_old(version_string, minimum='leptonica-1.72'):
    """Report whether *version_string* is older than *minimum*.

    Versions are compared numerically, component by component, so that
    e.g. 1.100 is not considered older than 1.72 (which a plain string
    comparison would claim). If either string cannot be parsed, fall back
    to the plain string comparison.
    """
    current = _leptonica_version_tuple(version_string)
    required = _leptonica_version_tuple(minimum)
    if current is None or required is None:
        return version_string < minimum
    return current < required


# Skip every test in this module when Leptonica is too old.
pytestmark = pytest.mark.skipif(
    _leptonica_too_old(leptonica.get_leptonica_version()),
    reason="Leptonica is too old, correlation doesn't work",
)

# Shortcuts to the helpers registered on pytest.helpers.
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf

# PDF renderers the parametrized tests are run against.
RENDERERS = ['hocr', 'sandwich']
def check_monochrome_correlation(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Rasterize one page of each PDF and return their binary correlation.

    Each page is rendered by Ghostscript with the ``pngmono`` device at
    xres/yres of 100 into a PNG inside *outdir*. If the target PNG already
    exists, its path is printed and the existing file is reused instead of
    being rendered again. Both PNGs are then opened as Leptonica ``Pix``
    objects and the result of ``Pix.correlation_binary`` is returned.
    """
    root_logger = logging.getLogger()
    ref_image = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    test_image = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    # Reference page first, then the page under test.
    render_jobs = (
        (reference_pdf, reference_pageno, ref_image),
        (test_pdf, test_pageno, test_image),
    )
    for source_pdf, page_number, target_png in render_jobs:
        if target_png.exists():
            print(target_png)
            continue
        ghostscript.rasterize_pdf(
            source_pdf,
            target_png,
            xres=100,
            yres=100,
            raster_device='pngmono',
            log=root_logger,
            pageno=page_number,
            rotation=0,
        )

    return leptonica.Pix.correlation_binary(
        leptonica.Pix.open(ref_image), leptonica.Pix.open(test_image)
    )
def test_monochrome_correlation(resources, outdir):
    """Verify the correlation helper itself against pages of cardinal.pdf."""
    cardinal = resources / 'cardinal.pdf'

    # Page 1 faces north and page 3 faces south: an incorrectly rotated
    # image must correlate poorly with the reference.
    opposite = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=1,
        test_pdf=cardinal,
        test_pageno=3,
    )
    assert opposite < 0.10

    # The same page compared against itself must correlate strongly.
    identical = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
    )
    assert identical > 0.90
@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(spoof_tesseract_cache, renderer, resources, outdir):
    """Every page of cardinal.pdf should match page 1 after ``-r`` is applied.

    cardinal.pdf holds four copies of one image, one per cardinal direction;
    the rotations are "burned in" to the content rather than expressed with
    a /Rotate tag.
    """
    source_pdf = resources / 'cardinal.pdf'
    rotated_pdf = outdir / 'out.pdf'
    cli_args = ('-r', '-v', '1', '--pdf-renderer', renderer)
    check_ocrmypdf(source_pdf, rotated_pdf, *cli_args, env=spoof_tesseract_cache)

    # Compare each of the four output pages against the upright reference.
    for page_number in (1, 2, 3, 4):
        correlation = check_monochrome_correlation(
            outdir,
            reference_pdf=source_pdf,
            reference_pageno=1,
            test_pdf=rotated_pdf,
            test_pageno=page_number,
        )
        assert correlation > 0.80
@pytest.mark.parametrize(
    'threshold, correlation_test',
    [
        # Low threshold -> always rotate -> high correlation
        ('1', 'correlation > 0.80'),
        # High threshold -> never rotate -> low correlation
        ('99', 'correlation < 0.10'),
    ],
)
def test_autorotate_threshold(
    spoof_tesseract_cache, threshold, correlation_test, resources, outdir
):
    """Check that --rotate-pages-threshold controls whether page 3 is rotated.

    *correlation_test* is an expression string evaluated against the local
    variable ``correlation``; that local must keep this exact name.
    """
    source_pdf = resources / 'cardinal.pdf'
    result_pdf = outdir / 'out.pdf'
    check_ocrmypdf(
        source_pdf,
        result_pdf,
        '--rotate-pages-threshold',
        threshold,
        '-r',
        '-v',
        '1',
        env=spoof_tesseract_cache,
    )

    # Compare page 3 of the output against the upright page 1 of the input.
    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=source_pdf,
        reference_pageno=1,
        test_pdf=result_pdf,
        test_pageno=3,
    )
    assert eval(correlation_test)  # pylint: disable=w0123
def test_rotated_skew_timeout(resources, outpdf):
    """Check output page geometry when deskew meets a Tesseract timeout.

    The input document contains an image that is rotated 90 degrees into
    place by a /Rotate tag and intentionally skewed by altering the
    transformation matrix. This guards against a bug where preprocessing
    combined with a Tesseract timeout produced a page whose dimensions did
    not match the original's.
    """
    source = resources / 'rotated_skew.pdf'

    page_before = PdfInfo(source)[0]
    assert (
        page_before.height_pixels < page_before.width_pixels
    ), "Expected the input page to be landscape"
    assert page_before.rotation == 90, "Expected a rotated page"

    cli_args = ('--pdf-renderer', 'hocr', '--deskew', '--tesseract-timeout', '0')
    result = check_ocrmypdf(source, outpdf, *cli_args)

    page_after = PdfInfo(result)[0]
    out_width = page_after.width_pixels
    out_height = page_after.height_pixels
    assert out_height > out_width, "Expected the output page to be portrait"
    assert page_after.rotation == 0, "Expected no page rotation for output"

    # Width and height must have traded places relative to the input page.
    dimensions_swapped = (
        page_before.width_pixels == out_height
        and page_before.height_pixels == out_width
    )
    assert dimensions_swapped, "Expected page rotation to be baked in"
def test_rotate_deskew_timeout(resources, outdir):
    """Rotation plus deskew with a zero Tesseract timeout still deskews.

    Runs OCRmyPDF on rotated_skew.pdf with the sandwich renderer, then
    compares page 1 of the result against page 1 of ccitt.pdf.
    """
    deskewed_pdf = outdir / 'deskewed.pdf'
    cli_args = (
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'sandwich',
    )
    check_ocrmypdf(resources / 'rotated_skew.pdf', deskewed_pdf, *cli_args)

    similarity = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=deskewed_pdf,
        test_pageno=1,
    )
    # Confirm that the page still got deskewed
    assert similarity > 0.50
@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir):
    """Combine a rotated image with a page-level /Rotate and run OCRmyPDF.

    A reference PDF (no image rotation, no page rotation) and a test PDF
    (image rotated by *image_angle*, page /Rotate set to *page_angle*) are
    generated from typewriter.png. The test PDF is processed with
    ``--rotate-pages`` and the output must correlate with the reference.
    """

    def build_pdf(prefix, image_deg, page_deg):
        """Write a one-page PDF with the given image and page rotations."""
        picture = Image.open(fspath(resources / 'typewriter.png'))
        if image_deg != 0:
            # Convert to the counterclockwise angle that names the
            # matching Image.ROTATE_* transpose constant.
            ccw_deg = (-image_deg) % 360
            picture = picture.transpose(getattr(Image, f'ROTATE_{ccw_deg}'))

        png_buffer = BytesIO()
        picture.save(png_buffer, format='PNG')

        pdf_buffer = BytesIO()
        img2pdf.convert(
            png_buffer.getvalue(),
            layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
            outputstream=pdf_buffer,
        )
        pdf_buffer.seek(0)

        document = pikepdf.open(pdf_buffer)
        document.pages[0].Rotate = page_deg
        destination = outdir / f'{prefix}_{image_deg}_{page_deg}.pdf'
        document.save(destination)
        return destination

    reference_pdf = build_pdf('ref', 0, 0)
    rotated_pdf = build_pdf('test', image_angle, page_angle)
    output_pdf = rotated_pdf.with_suffix('.out.pdf')

    proc, _, stderr_bytes = run_ocrmypdf(
        rotated_pdf,
        output_pdf,
        '-O0',
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0.001',
        universal_newlines=False,
    )
    # stderr arrives as bytes because universal_newlines is disabled.
    stderr_text = stderr_bytes.decode('utf-8', errors='replace')
    assert proc.returncode == 0, stderr_text

    similarity = check_monochrome_correlation(
        outdir, reference_pdf, 1, output_pdf, 1
    )
    assert similarity > 0.2
def test_tesseract_orientation(resources, tmpdir):
    """Smoke-test ``tesseract.get_orientation`` on a rotated copy of crom.png.

    The return value is not asserted because the results of the
    orientation check are unreliable.
    """
    rotated_png = tmpdir / '000001.png'

    original = leptonica.Pix.open(resources / 'crom.png')
    flipped = original.rotate_orth(2)  # 180 degrees clockwise
    flipped.write_implied_format(rotated_png)

    mock_log = Mock()
    # Test results of this are unreliable
    tesseract.get_orientation(
        rotated_png, engine_mode='3', timeout=10, log=mock_log
    )