OCRmyPDF/tests/test_rotation.py

271 lines
8.0 KiB
Python
Raw Normal View History

2018-05-01 23:51:35 -07:00
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import logging
2018-08-03 00:42:59 -07:00
from io import BytesIO
2018-12-30 00:23:26 -08:00
from os import fspath
2018-12-30 01:28:15 -08:00
from unittest.mock import Mock
2018-05-01 23:51:35 -07:00
2018-08-03 00:42:59 -07:00
import img2pdf
2019-12-19 15:29:56 -08:00
import pikepdf
2018-12-30 01:28:15 -08:00
import pytest
from PIL import Image
2018-05-01 23:51:35 -07:00
from ocrmypdf import leptonica
from ocrmypdf.exec import ghostscript, tesseract
2020-04-24 04:12:05 -07:00
from ocrmypdf.helpers import Resolution
2018-12-30 01:28:15 -08:00
from ocrmypdf.pdfinfo import PdfInfo
2018-05-01 23:51:35 -07:00
# pytest.helpers is dynamic
# pylint: disable=no-member
# pylint: disable=w0612
def _leptonica_version_tuple(version):
    """Convert a Leptonica version string into a tuple of ints.

    Assumes the string is shaped like ``'leptonica-1.72'`` or
    ``'leptonica-1.72.0'`` (the shape implied by the literal this module
    used to compare against) -- TODO confirm against
    ``leptonica.get_leptonica_version``.

    Parsing stops at the first non-numeric field, so an unexpected suffix
    shortens the tuple instead of raising.
    """
    _, _, dotted = version.partition('-')
    numbers = []
    for field in dotted.split('.'):
        if not field.isdecimal():
            break
        numbers.append(int(field))
    return tuple(numbers)


# Skip the whole module on an old Leptonica. The version is compared
# numerically because a plain string comparison would order a release such
# as '1.100' before '1.72'.
pytestmark = pytest.mark.skipif(
    _leptonica_version_tuple(leptonica.get_leptonica_version()) < (1, 72),
    reason="Leptonica is too old, correlation doesn't work",
)
2018-05-01 23:51:35 -07:00
# Module-level shortcuts for the helpers looked up on ``pytest.helpers``.
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf
# Values passed to ``--pdf-renderer`` by the parametrized tests below.
RENDERERS = ['hocr', 'sandwich']
def check_monochrome_correlation(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Rasterize one page of each PDF to monochrome and correlate them.

    Both pages are rendered with Ghostscript's ``pngmono`` device at
    100x100 DPI into ``outdir``. A PNG that already exists in ``outdir`` is
    reused (its path is printed) instead of being rendered again.

    Args:
        outdir: directory that receives the rasterized PNG files.
        reference_pdf: path of the PDF holding the reference page.
        reference_pageno: page number of the reference page.
        test_pdf: path of the PDF holding the page under test.
        test_pageno: page number of the page under test.

    Returns:
        The result of ``leptonica.Pix.correlation_binary`` for the two
        rasterized pages (callers compare it against thresholds in 0..1).
    """
    ref_image = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    candidate_image = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    render_jobs = (
        (reference_pdf, reference_pageno, ref_image),
        (test_pdf, test_pageno, candidate_image),
    )
    for source_pdf, page_number, image_path in render_jobs:
        if image_path.exists():
            # Already rendered into outdir: report it and reuse the file.
            print(image_path)
            continue
        ghostscript.rasterize_pdf(
            source_pdf,
            image_path,
            raster_device='pngmono',
            raster_dpi=Resolution(100, 100),
            pageno=page_number,
            rotation=0,
        )

    return leptonica.Pix.correlation_binary(
        leptonica.Pix.open(ref_image),
        leptonica.Pix.open(candidate_image),
    )
def test_monochrome_correlation(resources, outdir):
    """Sanity-check the correlation helper itself using cardinal.pdf."""
    cardinal = resources / 'cardinal.pdf'

    # Verify leptonica: a north facing page (1) compared with a south facing
    # page (3) is an incorrectly rotated pair and must correlate poorly.
    mismatched = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=1,
        test_pdf=cardinal,
        test_pageno=3,
    )
    assert mismatched < 0.10

    # The same page compared with itself must correlate strongly.
    identical = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
    )
    assert identical > 0.90
2018-08-03 00:57:59 -07:00
@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(spoof_tesseract_cache, renderer, resources, outdir):
    """Every page of cardinal.pdf should match page 1 after auto-rotation.

    cardinal.pdf contains four copies of an image rotated in each cardinal
    direction - these ones are "burned in", not tagged with /Rotate.
    """
    source_pdf = resources / 'cardinal.pdf'
    rotated_pdf = outdir / 'out.pdf'

    check_ocrmypdf(
        source_pdf,
        rotated_pdf,
        '-r',
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        env=spoof_tesseract_cache,
    )

    # Compare each of the four output pages against input page 1.
    for page_number in (1, 2, 3, 4):
        correlation = check_monochrome_correlation(
            outdir,
            reference_pdf=source_pdf,
            reference_pageno=1,
            test_pdf=rotated_pdf,
            test_pageno=page_number,
        )
        assert correlation > 0.80
2018-12-30 01:27:49 -08:00
@pytest.mark.parametrize(
    'threshold, correlation_test',
    [
        # Low threshold -> always rotate -> high correlation
        ('1', lambda correlation: correlation > 0.80),
        # High threshold -> never rotate -> low correlation
        ('99', lambda correlation: correlation < 0.10),
    ],
    # Explicit ids keep the test names readable now that the second
    # parameter is a callable rather than a string.
    ids=['1-correlation > 0.80', '99-correlation < 0.10'],
)
def test_autorotate_threshold(
    spoof_tesseract_cache, threshold, correlation_test, resources, outdir
):
    """Check that --rotate-pages-threshold controls whether pages rotate.

    Args:
        spoof_tesseract_cache: environment passed to the ocrmypdf run.
        threshold: value given to ``--rotate-pages-threshold``.
        correlation_test: predicate applied to the measured correlation
            between input page 1 and output page 3 of cardinal.pdf.
        resources: directory containing the test input files.
        outdir: directory receiving the output PDF and rasterized pages.
    """
    check_ocrmypdf(
        resources / 'cardinal.pdf',
        outdir / 'out.pdf',
        '--rotate-pages-threshold',
        threshold,
        '-r',
        env=spoof_tesseract_cache,
    )

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,
        test_pdf=outdir / 'out.pdf',
        test_pageno=3,
    )
    # A callable predicate replaces eval() of a string expression; the
    # message keeps the measured value visible on failure.
    assert correlation_test(correlation), f'unexpected correlation {correlation}'
2018-05-01 23:51:35 -07:00
def test_rotated_skew_timeout(resources, outpdf):
    """Check page dimensions survive preprocessing plus a tesseract timeout.

    The input document contains an image that is rotated 90 degrees into
    place with a /Rotate tag and intentionally skewed by altering the
    transformation matrix. This guards against a bug where the combination
    of preprocessing and a tesseract timeout produced a page whose
    dimensions did not match the original's.
    """
    source = resources / 'rotated_skew.pdf'

    page_before = PdfInfo(source)[0]
    assert (
        page_before.height_pixels < page_before.width_pixels
    ), "Expected the input page to be landscape"
    assert page_before.rotation == 90, "Expected a rotated page"

    ocr_options = (
        '--pdf-renderer',
        'hocr',
        '--deskew',
        '--tesseract-timeout',
        '0',
    )
    result_pdf = check_ocrmypdf(source, outpdf, *ocr_options)

    page_after = PdfInfo(result_pdf)[0]
    out_width = page_after.width_pixels
    out_height = page_after.height_pixels

    assert out_height > out_width, "Expected the output page to be portrait"

    assert page_after.rotation == 0, "Expected no page rotation for output"

    # Width and height must have swapped relative to the input page.
    assert (
        page_before.width_pixels == out_height
        and page_before.height_pixels == out_width
    ), "Expected page rotation to be baked in"
2018-05-01 23:51:35 -07:00
def test_rotate_deskew_timeout(resources, outdir):
    """Rotate and deskew rotated_skew.pdf with a zero tesseract timeout.

    The output page is then correlated against page 1 of ccitt.pdf.
    """
    deskewed_pdf = outdir / 'deskewed.pdf'
    cli_options = (
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'sandwich',
    )
    check_ocrmypdf(resources / 'rotated_skew.pdf', deskewed_pdf, *cli_options)

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=deskewed_pdf,
        test_pageno=1,
    )

    # Confirm that the page still got deskewed
    assert correlation > 0.50
2018-08-03 00:42:59 -07:00
@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir):
    """Check rotation for every mix of image rotation and page /Rotate.

    A test PDF is built from typewriter.png with the image rotated by
    ``image_angle`` and the page's Rotate entry set to ``page_angle``. After
    running ocrmypdf with ``--rotate-pages``, its first page is correlated
    against an unrotated reference PDF built the same way.
    """

    def make_rotate_test(prefix, image_angle, page_angle):
        """Write a one-page PDF with the given rotations; return its path."""
        memimg = BytesIO()
        with Image.open(fspath(resources / 'typewriter.png')) as im:
            if image_angle != 0:
                # Select the ROTATE_* constant for the complementary angle.
                ccw_angle = -image_angle % 360
                im = im.transpose(getattr(Image, f'ROTATE_{ccw_angle}'))
            im.save(memimg, format='PNG')

        memimg.seek(0)
        mempdf = BytesIO()
        img2pdf.convert(
            memimg.read(),
            layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
            outputstream=mempdf,
        )
        mempdf.seek(0)

        target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
        # Use the Pdf as a context manager so it is closed once saved,
        # rather than leaving it open until garbage collection.
        with pikepdf.open(mempdf) as pike:
            pike.pages[0].Rotate = page_angle
            pike.save(target)
        return target

    reference = make_rotate_test('ref', 0, 0)
    test = make_rotate_test('test', image_angle, page_angle)
    out = test.with_suffix('.out.pdf')

    p, _, err = run_ocrmypdf(
        test,
        out,
        '-O0',
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0.001',
        universal_newlines=False,
    )
    # stderr arrives as bytes because universal_newlines=False.
    err = err.decode('utf-8', errors='replace')
    assert p.returncode == 0, err

    assert check_monochrome_correlation(outdir, reference, 1, out, 1) > 0.2
2019-06-01 01:55:51 -07:00
def test_tesseract_orientation(resources, tmp_path):
    """Run tesseract orientation detection on a rotated copy of crom.png.

    Only checks that the call completes; the detected orientation is not
    asserted.
    """
    rotated_png = tmp_path / '000001.png'

    upright = leptonica.Pix.open(resources / 'crom.png')
    flipped = upright.rotate_orth(2)  # 180 degrees clockwise
    flipped.write_implied_format(rotated_png)

    # Test results of this are unreliable, so the return value is ignored
    tesseract.get_orientation(rotated_png, engine_mode='3', timeout=10)