OCRmyPDF/tests/test_rotation.py

229 lines
7.4 KiB
Python
Raw Normal View History

2018-05-01 23:51:35 -07:00
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import logging
2018-08-03 00:42:59 -07:00
from io import BytesIO
2018-05-01 23:51:35 -07:00
2018-08-03 00:42:59 -07:00
from PIL import Image
2018-05-01 23:51:35 -07:00
import pytest
2018-08-03 00:42:59 -07:00
import img2pdf
import pikepdf
2018-05-01 23:51:35 -07:00
from ocrmypdf import leptonica
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.exec import ghostscript
2018-08-03 00:42:59 -07:00
from ocrmypdf.helpers import fspath
2018-05-01 23:51:35 -07:00
# pytest.helpers is dynamic
# pylint: disable=no-member
# pylint: disable=w0612
def _leptonica_version_key(version):
    """Turn a version string into a tuple of ints suitable for comparison.

    Everything up to the last ``-`` is treated as a name prefix and dropped.
    The remainder is split on ``.`` and converted piece by piece, stopping at
    the first piece that is not a plain decimal number, so
    ``'leptonica-1.74.4'`` becomes ``(1, 74, 4)``.  An empty tuple means no
    leading number could be found.
    """
    numbers = []
    for piece in version.rpartition('-')[2].split('.'):
        if not piece.isdecimal():
            break
        numbers.append(int(piece))
    return tuple(numbers)


def _leptonica_older_than(minimum):
    """Return True if the Leptonica in use reports a version below *minimum*.

    Versions are compared numerically, component by component.  A plain
    string comparison orders ``'leptonica-1.100'`` before ``'leptonica-1.72'``
    and would wrongly skip this module on such a release.

    NOTE(review): assumes ``leptonica.get_leptonica_version()`` returns a
    string shaped like the literal it is compared against
    (``leptonica-<major>.<minor>[...]``) -- confirm against the binding.  If
    either string has no leading number, the plain string comparison is used.
    """
    reported = leptonica.get_leptonica_version()
    have = _leptonica_version_key(reported)
    want = _leptonica_version_key(minimum)
    if not have or not want:
        return reported < minimum
    return have < want


# Skip every test in this module when Leptonica is too old
pytestmark = pytest.mark.skipif(
    _leptonica_older_than('leptonica-1.72'),
    reason="Leptonica is too old, correlation doesn't work"
)

# Shortcuts to helpers registered on the dynamic pytest.helpers namespace
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf

# Values passed to --pdf-renderer by the parametrized tests
RENDERERS = ['hocr', 'sandwich']
def check_monochrome_correlation(
        outdir,
        reference_pdf, reference_pageno, test_pdf, test_pageno):
    """Rasterize one page from each of two PDFs and correlate the bitmaps.

    Each page is rendered through ``ghostscript.rasterize_pdf`` with the
    ``pngmono`` device at xres=100, yres=100 and rotation=0.  The PNGs are
    written into *outdir* under names derived from the PDF file name and the
    page number.  A PNG that already exists is reused (its path is printed)
    instead of being rendered again.

    Returns the result of ``leptonica.Pix.correlation_binary`` for the
    reference and test bitmaps.
    """
    log = logging.getLogger()

    def render_unless_present(pdf, pageno, png):
        if not png.exists():
            ghostscript.rasterize_pdf(
                pdf, png, xres=100, yres=100,
                raster_device='pngmono', log=log, pageno=pageno,
                rotation=0)
        else:
            print(png)

    ref_image = outdir / '{}.ref{:04d}.png'.format(
        reference_pdf.name, reference_pageno)
    test_image = outdir / '{}.test{:04d}.png'.format(
        test_pdf.name, test_pageno)

    # Render both pages first, reference before test, then open them
    jobs = (
        (reference_pdf, reference_pageno, ref_image),
        (test_pdf, test_pageno, test_image),
    )
    for pdf, pageno, png in jobs:
        render_unless_present(pdf, pageno, png)

    return leptonica.Pix.correlation_binary(
        leptonica.Pix.open(ref_image),
        leptonica.Pix.open(test_image))
def test_monochrome_correlation(resources, outdir):
    """Sanity-check the correlation helper on cardinal.pdf.

    A wrongly rotated page must correlate poorly with the reference, and a
    page compared with itself must correlate strongly.
    """
    cardinal = resources / 'cardinal.pdf'

    # Page 1 faces north, page 3 faces south
    opposite = check_monochrome_correlation(outdir, cardinal, 1, cardinal, 3)
    assert opposite < 0.10

    # Page 2 against itself
    identical = check_monochrome_correlation(outdir, cardinal, 2, cardinal, 2)
    assert identical > 0.90
2018-08-03 00:57:59 -07:00
@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(spoof_tesseract_cache, renderer, resources, outdir):
    """Every page of cardinal.pdf should come out matching page 1.

    cardinal.pdf holds four copies of one image, one per cardinal direction.
    The rotations are "burned in" to the content, not tagged with /Rotate.
    """
    source_pdf = resources / 'cardinal.pdf'
    rotated_pdf = outdir / 'out.pdf'

    check_ocrmypdf(
        source_pdf, rotated_pdf,
        '-r', '-v', '1', '--pdf-renderer', renderer,
        env=spoof_tesseract_cache)

    # Compare each of the four output pages with the first input page
    for pageno in range(1, 5):
        score = check_monochrome_correlation(
            outdir, source_pdf, 1, rotated_pdf, pageno)
        assert score > 0.80
@pytest.mark.parametrize(
    'threshold, correlation_test',
    [
        # Low threshold -> always rotate -> high correlation
        ('1', lambda correlation: correlation > 0.80),
        # High threshold -> never rotate -> low correlation
        ('99', lambda correlation: correlation < 0.10),
    ],
    # Explicit ids keep the test names the string parameters used to produce
    ids=['1-correlation > 0.80', '99-correlation < 0.10'],
)
def test_autorotate_threshold(
        spoof_tesseract_cache, threshold, correlation_test, resources, outdir):
    """Check that --rotate-pages-threshold controls whether rotation happens.

    Page 3 of the output is correlated against page 1 of cardinal.pdf, and
    *correlation_test* is a predicate the resulting score must satisfy for
    the given *threshold*.
    """
    check_ocrmypdf(
        resources / 'cardinal.pdf', outdir / 'out.pdf',
        '--rotate-pages-threshold', threshold,
        '-r', '-v', '1', env=spoof_tesseract_cache)

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,
        test_pdf=outdir / 'out.pdf',
        test_pageno=3)

    # Report the measured value; a bare predicate failure would not show it
    assert correlation_test(correlation), \
        'unexpected correlation {} for threshold {}'.format(
            correlation, threshold)
def test_rotated_skew_timeout(resources, outpdf):
    """Regression test for page dimensions after preprocessing and a timeout.

    The input document contains an image that is rotated 90 into place with
    a /Rotate tag and intentionally skewed by altering the transformation
    matrix.  Preprocessing combined with a tesseract timeout used to produce
    a page whose dimensions did not match the original's.
    """
    source = resources / 'rotated_skew.pdf'

    before = PdfInfo(source)[0]
    assert before.height_pixels < before.width_pixels, \
        "Expected the input page to be landscape"
    assert before.rotation == 90, "Expected a rotated page"

    # A timeout of 0 exercises the timeout path together with --deskew
    result = check_ocrmypdf(
        source, outpdf,
        '--pdf-renderer', 'hocr',
        '--deskew', '--tesseract-timeout', '0')

    after = PdfInfo(result)[0]
    out_width = after.width_pixels
    out_height = after.height_pixels

    assert out_height > out_width, \
        "Expected the output page to be portrait"
    assert after.rotation == 0, \
        "Expected no page rotation for output"
    # Width and height swap places once the rotation is applied to the content
    assert before.width_pixels == out_height and \
        before.height_pixels == out_width, \
        "Expected page rotation to be baked in"
def test_rotate_deskew_timeout(resources, outdir):
    """Deskewing must still take effect when tesseract times out.

    rotated_skew.pdf is processed with --deskew, a tesseract timeout of 0 and
    the sandwich renderer.  The first output page is then correlated against
    the first page of ccitt.pdf.
    """
    deskewed_pdf = outdir / 'deskewed.pdf'

    check_ocrmypdf(
        resources / 'rotated_skew.pdf',
        deskewed_pdf,
        '--deskew',
        '--tesseract-timeout', '0',
        '--pdf-renderer', 'sandwich'
    )

    score = check_monochrome_correlation(
        outdir, resources / 'ccitt.pdf', 1, deskewed_pdf, 1)

    # Confirm that the page still got deskewed
    assert score > 0.50
2018-08-03 00:42:59 -07:00
@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir):
    """Exercise --rotate-pages over combinations of image and page rotation.

    A one-page PDF is built from typewriter.png with the image rotated by
    *image_angle* and the page's /Rotate set to *page_angle*.  After running
    ocrmypdf on it, the output must correlate with an unrotated reference
    built the same way.
    """

    def make_rotate_test(prefix, image_angle, page_angle):
        """Write a single-page test PDF into outdir and return its path."""
        memimg = BytesIO()
        # The context manager closes the file behind the source image; the
        # transposed copy is a separate in-memory image.
        with Image.open(fspath(resources / 'typewriter.png')) as im:
            if image_angle != 0:
                # Look up the ROTATE_* constant for the equivalent
                # counter-clockwise angle.
                ccw_angle = -image_angle % 360
                rotated = im.transpose(
                    getattr(Image, 'ROTATE_{}'.format(ccw_angle)))
            else:
                rotated = im
            rotated.save(memimg, format='PNG')

        mempdf = BytesIO()
        img2pdf.convert(
            memimg.getvalue(),
            layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
            outputstream=mempdf
        )
        mempdf.seek(0)

        pike = pikepdf.open(mempdf)
        pike.pages[0].Rotate = page_angle
        target = outdir / '{}_{}_{}.pdf'.format(
            prefix, image_angle, page_angle)
        pike.save(target)
        return target

    reference = make_rotate_test('ref', 0, 0)
    test = make_rotate_test('test', image_angle, page_angle)
    out = test.with_suffix('.out.pdf')

    # universal_newlines=False yields bytes, hence the explicit decode below
    p, _, err = run_ocrmypdf(
        test, out,
        '-O0',
        '--rotate-pages',
        '--rotate-pages-threshold', '0.001',
        universal_newlines=False
    )
    err = err.decode('utf-8', errors='replace')
    assert p.returncode == 0, err

    correlation = check_monochrome_correlation(outdir, reference, 1, out, 1)
    assert correlation > 0.2, \
        'correlation {} too low for image_angle={} page_angle={}'.format(
            correlation, image_angle, page_angle)