OCRmyPDF/tests/test_rotation.py

278 lines
8.1 KiB
Python
Raw Normal View History

2018-05-01 23:51:35 -07:00
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import logging
2018-08-03 00:42:59 -07:00
from io import BytesIO
from unittest.mock import Mock
2018-12-30 00:23:26 -08:00
from os import fspath
2018-05-01 23:51:35 -07:00
2018-08-03 00:42:59 -07:00
from PIL import Image
2018-05-01 23:51:35 -07:00
import pytest
2018-08-03 00:42:59 -07:00
import img2pdf
import pikepdf
2018-05-01 23:51:35 -07:00
from ocrmypdf import leptonica
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.exec import ghostscript, tesseract
2018-05-01 23:51:35 -07:00
# pytest.helpers is dynamic
# pylint: disable=no-member
# pylint: disable=w0612
def _leptonica_older_than(minimum):
    """Return True when the installed Leptonica is older than *minimum*.

    *minimum* is a tuple of ints such as ``(1, 72)``. The version string
    reported by ``leptonica.get_leptonica_version()`` is reduced to its
    numeric components so the comparison is numeric; comparing the raw
    strings would order ``'leptonica-1.100'`` before ``'leptonica-1.72'``.
    """
    import re  # local import: only needed for this one-off check

    components = re.findall(r'\d+', leptonica.get_leptonica_version())
    return tuple(int(part) for part in components) < tuple(minimum)


# Skip every test in this module when Leptonica cannot compute correlations.
pytestmark = pytest.mark.skipif(
    _leptonica_older_than((1, 72)),
    reason="Leptonica is too old, correlation doesn't work",
)

# Shortcuts to the helper callables registered on pytest.helpers.
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf

# PDF renderers exercised by the parametrized tests below.
RENDERERS = ['hocr', 'sandwich']
def check_monochrome_correlation(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Correlate one page of a reference PDF against one page of a test PDF.

    Each requested page is rasterized with Ghostscript's ``pngmono`` device
    at 100x100 resolution into *outdir*, and the two PNGs are compared with
    ``leptonica.Pix.correlation_binary``, whose result is returned.

    A PNG that already exists in *outdir* is reused instead of being
    rendered again; in that case its path is printed.
    """
    root_logger = logging.getLogger()

    def render_once(pdf, pageno, png):
        # Reuse a previous rendering of the same page if there is one.
        if png.exists():
            print(png)
        else:
            ghostscript.rasterize_pdf(
                pdf,
                png,
                xres=100,
                yres=100,
                raster_device='pngmono',
                log=root_logger,
                pageno=pageno,
                rotation=0,
            )

    ref_image = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    candidate_image = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    render_once(reference_pdf, reference_pageno, ref_image)
    render_once(test_pdf, test_pageno, candidate_image)

    return leptonica.Pix.correlation_binary(
        leptonica.Pix.open(ref_image), leptonica.Pix.open(candidate_image)
    )
def test_monochrome_correlation(resources, outdir):
    """Sanity-check the correlation helper itself using cardinal.pdf.

    A wrongly rotated page must correlate poorly with the reference, and a
    page compared against itself must correlate strongly.
    """
    cardinal = resources / 'cardinal.pdf'

    # Page 1 faces north and page 3 faces south, so they should not match.
    opposite = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=1,
        test_pdf=cardinal,
        test_pageno=3,
    )
    assert opposite < 0.10

    # The same page on both sides should match almost perfectly.
    same_page = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
    )
    assert same_page > 0.90
2018-08-03 00:57:59 -07:00
@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(spoof_tesseract_cache, renderer, resources, outdir):
    """Run OCRmyPDF with ``-r`` on cardinal.pdf and check every page's rotation.

    cardinal.pdf holds four copies of one image, one per cardinal direction.
    The rotations are burned into the page content rather than expressed
    with /Rotate. After processing, each of the four output pages must
    correlate with the first page of the input.
    """
    source_pdf = resources / 'cardinal.pdf'
    rotated_pdf = outdir / 'out.pdf'
    cli_options = ['-r', '-v', '1', '--pdf-renderer', renderer]

    check_ocrmypdf(
        source_pdf, rotated_pdf, *cli_options, env=spoof_tesseract_cache
    )

    for page_number in (1, 2, 3, 4):
        score = check_monochrome_correlation(
            outdir,
            reference_pdf=source_pdf,
            reference_pageno=1,
            test_pdf=rotated_pdf,
            test_pageno=page_number,
        )
        assert score > 0.80
2018-12-30 01:27:49 -08:00
@pytest.mark.parametrize(
    'threshold, correlation_test',
    [
        # Low threshold -> always rotate -> high correlation
        ('1', lambda correlation: correlation > 0.80),
        # High threshold -> never rotate -> low correlation
        ('99', lambda correlation: correlation < 0.10),
    ],
    ids=['threshold-1-rotates', 'threshold-99-keeps-orientation'],
)
def test_autorotate_threshold(
    spoof_tesseract_cache, threshold, correlation_test, resources, outdir
):
    """Check that ``--rotate-pages-threshold`` controls whether pages rotate.

    *threshold* is passed to ``--rotate-pages-threshold`` as a string.
    *correlation_test* is a predicate that receives the correlation between
    page 1 of cardinal.pdf and page 3 of the output and returns whether it
    is acceptable. Using a callable avoids building the assertion with
    ``eval()`` on a string.
    """
    check_ocrmypdf(
        resources / 'cardinal.pdf',
        outdir / 'out.pdf',
        '--rotate-pages-threshold',
        threshold,
        '-r',
        '-v',
        '1',
        env=spoof_tesseract_cache,
    )

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,
        test_pdf=outdir / 'out.pdf',
        test_pageno=3,
    )
    assert correlation_test(
        correlation
    ), 'unexpected correlation {} for threshold {}'.format(
        correlation, threshold
    )
def test_rotated_skew_timeout(resources, outpdf):
    """Regression test: deskew plus a tesseract timeout keeps page dimensions.

    The input holds an image that a /Rotate tag turns 90 degrees into place
    and whose transformation matrix was deliberately altered to skew it.
    Preprocessing combined with a tesseract timeout once produced a page
    whose dimensions did not match the original's.
    """
    source = resources / 'rotated_skew.pdf'

    before = PdfInfo(source)[0]
    assert (
        before.height_pixels < before.width_pixels
    ), "Expected the input page to be landscape"
    assert before.rotation == 90, "Expected a rotated page"

    # A timeout of 0 means tesseract never gets to finish on the page.
    ocr_options = (
        '--pdf-renderer',
        'hocr',
        '--deskew',
        '--tesseract-timeout',
        '0',
    )
    result = check_ocrmypdf(source, outpdf, *ocr_options)

    after = PdfInfo(result)[0]
    out_width = after.width_pixels
    out_height = after.height_pixels

    assert out_height > out_width, "Expected the output page to be portrait"

    assert after.rotation == 0, "Expected no page rotation for output"

    # Width and height must have swapped relative to the input.
    assert (before.width_pixels, before.height_pixels) == (
        out_height,
        out_width,
    ), "Expected page rotation to be baked in"
2018-05-01 23:51:35 -07:00
def test_rotate_deskew_timeout(resources, outdir):
    """Rotate and deskew with a zero tesseract timeout, then compare to ccitt.pdf.

    Runs OCRmyPDF on rotated_skew.pdf with page rotation (threshold 0),
    deskewing, a tesseract timeout of 0 and the sandwich renderer. The first
    output page must still correlate with the first page of ccitt.pdf.
    """
    deskewed_pdf = outdir / 'deskewed.pdf'
    ocr_options = [
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'sandwich',
    ]
    check_ocrmypdf(resources / 'rotated_skew.pdf', deskewed_pdf, *ocr_options)

    similarity = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=deskewed_pdf,
        test_pageno=1,
    )

    # The page should have been deskewed despite the timeout.
    assert similarity > 0.50
2018-08-03 00:42:59 -07:00
@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir):
    """Check page rotation for every mix of image rotation and /Rotate value.

    For each parameter combination a single-page PDF is built from
    typewriter.png, processed with ``--rotate-pages`` at a threshold of
    0.001, and compared against an unrotated reference PDF built the same
    way. The run must exit with code 0 and the correlation must exceed 0.2.
    """

    def make_rotate_test(prefix, image_angle, page_angle):
        """Build ``<prefix>_<image_angle>_<page_angle>.pdf`` in *outdir*.

        The image is transposed when *image_angle* is non-zero, and the
        page's ``Rotate`` entry is set to *page_angle*. Returns the path of
        the saved PDF.
        """
        memimg = BytesIO()
        # The context manager closes the file handle behind the source image.
        with Image.open(fspath(resources / 'typewriter.png')) as source:
            im = source
            if image_angle != 0:
                # PIL's ROTATE_* constants are looked up by the complementary
                # angle, i.e. (-image_angle) mod 360.
                ccw_angle = -image_angle % 360
                im = source.transpose(
                    getattr(Image, 'ROTATE_{}'.format(ccw_angle))
                )
            im.save(memimg, format='PNG')

        mempdf = BytesIO()
        img2pdf.convert(
            memimg.getvalue(),
            layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
            outputstream=mempdf,
        )
        mempdf.seek(0)

        pike = pikepdf.open(mempdf)
        pike.pages[0].Rotate = page_angle
        target = outdir / '{}_{}_{}.pdf'.format(prefix, image_angle, page_angle)
        pike.save(target)
        return target

    reference = make_rotate_test('ref', 0, 0)
    test = make_rotate_test('test', image_angle, page_angle)
    out = test.with_suffix('.out.pdf')

    p, _, err = run_ocrmypdf(
        test,
        out,
        '-O0',
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0.001',
        universal_newlines=False,
    )
    # stderr arrives as bytes because universal_newlines is disabled.
    err = err.decode('utf-8', errors='replace')
    assert p.returncode == 0, err

    assert check_monochrome_correlation(outdir, reference, 1, out, 1) > 0.2
def test_tesseract_orientation(resources, tmpdir):
    """Smoke-test ``tesseract.get_orientation`` on an image rotated 180 degrees.

    The returned orientation is not asserted because the results are
    unreliable; the test only requires that the call completes.
    """
    rotated_png = tmpdir / '000001.png'

    upright = leptonica.Pix.open(resources / 'crom.png')
    upside_down = upright.rotate_orth(2)  # 180 degrees clockwise
    upside_down.write_implied_format(rotated_png)

    # Test results of this are unreliable
    tesseract.get_orientation(
        rotated_png, engine_mode='3', timeout=10, log=Mock()
    )