2018-05-01 23:51:35 -07:00
|
|
|
# © 2018 James R. Barlow: github.com/jbarlow83
|
|
|
|
#
|
|
|
|
# This file is part of OCRmyPDF.
|
|
|
|
#
|
|
|
|
# OCRmyPDF is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# OCRmyPDF is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
import logging
|
2018-08-03 00:42:59 -07:00
|
|
|
from io import BytesIO
|
2018-11-02 01:32:20 -07:00
|
|
|
from unittest.mock import Mock
|
2018-12-30 00:23:26 -08:00
|
|
|
from os import fspath
|
2018-05-01 23:51:35 -07:00
|
|
|
|
2018-08-03 00:42:59 -07:00
|
|
|
from PIL import Image
|
2018-05-01 23:51:35 -07:00
|
|
|
import pytest
|
2018-08-03 00:42:59 -07:00
|
|
|
import img2pdf
|
|
|
|
import pikepdf
|
2018-05-01 23:51:35 -07:00
|
|
|
|
|
|
|
from ocrmypdf import leptonica
|
|
|
|
from ocrmypdf.pdfinfo import PdfInfo
|
2018-11-02 01:32:20 -07:00
|
|
|
from ocrmypdf.exec import ghostscript, tesseract
|
2018-05-01 23:51:35 -07:00
|
|
|
|
|
|
|
|
|
|
|
# pytest.helpers is dynamic
|
|
|
|
# pylint: disable=no-member
|
|
|
|
# pylint: disable=w0612
|
|
|
|
|
2018-10-11 01:36:53 -07:00
|
|
|
def _leptonica_version_tuple(version):
    """Convert a Leptonica version string into a tuple of integers.

    Everything before the first ASCII digit (e.g. the ``leptonica-`` prefix)
    is skipped, then each dot-separated field contributes its leading digits,
    so ``'leptonica-1.76.0'`` becomes ``(1, 76, 0)``. Parsing stops at the
    first field that does not start with a digit. A string without any digits
    yields an empty tuple, which compares lower than every real version.
    """
    ascii_digits = '0123456789'
    start = next(
        (pos for pos, char in enumerate(version) if char in ascii_digits),
        len(version),
    )
    fields = []
    for field in version[start:].split('.'):
        digits = ''
        for char in field:
            if char not in ascii_digits:
                break
            digits += char
        if not digits:
            break
        fields.append(int(digits))
    return tuple(fields)


# Compare versions numerically. Comparing the raw strings is lexicographic and
# would rank e.g. 'leptonica-1.100' below 'leptonica-1.72'.
pytestmark = pytest.mark.skipif(
    _leptonica_version_tuple(leptonica.get_leptonica_version()) < (1, 72),
    reason="Leptonica is too old, correlation doesn't work",
)

check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf

# PDF renderers exercised by the parametrized tests in this module.
RENDERERS = ['hocr', 'sandwich']
|
|
|
|
|
|
|
|
|
|
|
|
def check_monochrome_correlation(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Rasterize one page from each PDF and return their binary correlation.

    Each page is rendered by Ghostscript to a monochrome PNG (``pngmono``
    device, 100x100 resolution, no rotation) inside ``outdir``. The PNG name
    is derived from the PDF's file name and page number. If that PNG already
    exists it is reused and its path is printed instead of rendering again.

    Args:
        outdir: directory (path-like supporting ``/``) receiving the PNGs.
        reference_pdf: path of the PDF holding the reference page.
        reference_pageno: page number of the reference page.
        test_pdf: path of the PDF holding the page under test.
        test_pageno: page number of the page under test.

    Returns:
        The result of ``leptonica.Pix.correlation_binary`` for the two images.
    """
    root_log = logging.getLogger()

    ref_image = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    test_image = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    render_jobs = (
        (reference_pdf, reference_pageno, ref_image),
        (test_pdf, test_pageno, test_image),
    )
    for pdf, pageno, png in render_jobs:
        if png.exists():
            # Already rendered earlier; just report which file is reused.
            print(png)
            continue
        ghostscript.rasterize_pdf(
            pdf,
            png,
            xres=100,
            yres=100,
            raster_device='pngmono',
            log=root_log,
            pageno=pageno,
            rotation=0,
        )

    return leptonica.Pix.correlation_binary(
        leptonica.Pix.open(ref_image), leptonica.Pix.open(test_image)
    )
|
|
|
|
|
|
|
|
|
|
|
|
def test_monochrome_correlation(resources, outdir):
    """Sanity-check the correlation helper itself on cardinal.pdf.

    A page compared against a differently rotated page must correlate
    poorly, while a page compared against itself must correlate strongly.
    """
    cardinal = resources / 'cardinal.pdf'

    # Verify leptonica: the north facing page (1) against the south facing
    # page (3) is an incorrect rotation and should barely correlate.
    mismatched = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=1,
        test_pdf=cardinal,
        test_pageno=3,
    )
    assert mismatched < 0.10

    # The same page against itself should correlate almost perfectly.
    identical = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
    )
    assert identical > 0.90
|
|
|
|
|
|
|
|
|
2018-08-03 00:57:59 -07:00
|
|
|
@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(spoof_tesseract_cache, renderer, resources, outdir):
    """Every page of cardinal.pdf should match page 1 after autorotation.

    cardinal.pdf contains four copies of an image, one rotated in each
    cardinal direction. The rotations are "burned in" rather than tagged
    with /Rotate.
    """
    source_pdf = resources / 'cardinal.pdf'
    rotated_pdf = outdir / 'out.pdf'
    cli_args = ['-r', '-v', '1', '--pdf-renderer', renderer]

    check_ocrmypdf(source_pdf, rotated_pdf, *cli_args, env=spoof_tesseract_cache)

    for pageno in (1, 2, 3, 4):
        score = check_monochrome_correlation(
            outdir,
            reference_pdf=source_pdf,
            reference_pageno=1,
            test_pdf=rotated_pdf,
            test_pageno=pageno,
        )
        assert score > 0.80
|
|
|
|
|
|
|
|
|
2018-12-30 01:27:49 -08:00
|
|
|
@pytest.mark.parametrize(
    'threshold, correlation_test',
    [
        # Low threshold -> always rotate -> high correlation
        ('1', lambda corr: corr > 0.80),
        # High threshold -> never rotate -> low correlation
        ('99', lambda corr: corr < 0.10),
    ],
    # Explicit ids keep the generated test names readable now that the
    # expectation is a callable rather than a string.
    ids=['1-correlation > 0.80', '99-correlation < 0.10'],
)
def test_autorotate_threshold(
    spoof_tesseract_cache, threshold, correlation_test, resources, outdir
):
    """Check that --rotate-pages-threshold controls whether pages rotate.

    Runs OCRmyPDF on cardinal.pdf with the given threshold, then compares
    page 3 of the output with page 1 of the input.

    Args:
        threshold: value passed to ``--rotate-pages-threshold``.
        correlation_test: callable that takes the measured correlation and
            returns whether it is acceptable for this threshold.
    """
    check_ocrmypdf(
        resources / 'cardinal.pdf',
        outdir / 'out.pdf',
        '--rotate-pages-threshold',
        threshold,
        '-r',
        '-v',
        '1',
        env=spoof_tesseract_cache,
    )

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,
        test_pdf=outdir / 'out.pdf',
        test_pageno=3,
    )
    # Call the predicate directly instead of eval()-ing a string that depends
    # on a local variable name; report the measured value on failure.
    assert correlation_test(correlation), correlation
|
|
|
|
|
|
|
|
|
|
|
|
def test_rotated_skew_timeout(resources, outpdf):
    """Check page geometry when deskew is combined with a tesseract timeout.

    The input document holds an image that is rotated 90 degrees into place
    by a /Rotate tag and deliberately skewed through its transformation
    matrix. This is a regression test for a bug in which preprocessing plus
    a tesseract timeout produced a page whose dimensions did not match the
    original's.
    """
    source = resources / 'rotated_skew.pdf'
    page_before = PdfInfo(source)[0]

    is_landscape = page_before.height_pixels < page_before.width_pixels
    assert is_landscape, "Expected the input page to be landscape"
    assert page_before.rotation == 90, "Expected a rotated page"

    cli_args = (
        '--pdf-renderer',
        'hocr',
        '--deskew',
        '--tesseract-timeout',
        '0',
    )
    result = check_ocrmypdf(source, outpdf, *cli_args)

    page_after = PdfInfo(result)[0]
    out_width = page_after.width_pixels
    out_height = page_after.height_pixels

    assert out_height > out_width, "Expected the output page to be portrait"
    assert page_after.rotation == 0, "Expected no page rotation for output"

    # Width and height must have traded places relative to the input page.
    dimensions_swapped = (
        page_before.width_pixels == out_height
        and page_before.height_pixels == out_width
    )
    assert dimensions_swapped, "Expected page rotation to be baked in"
|
2018-05-01 23:51:35 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_rotate_deskew_timeout(resources, outdir):
    """Rotate and deskew rotated_skew.pdf with a zero tesseract timeout.

    The output's first page is compared with the first page of ccitt.pdf.
    """
    deskewed_pdf = outdir / 'deskewed.pdf'
    options = [
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'sandwich',
    ]
    check_ocrmypdf(resources / 'rotated_skew.pdf', deskewed_pdf, *options)

    score = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=deskewed_pdf,
        test_pageno=1,
    )

    # Confirm that the page still got deskewed
    assert score > 0.50
|
2018-08-03 00:42:59 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir):
    """Combine image rotation with a page-level /Rotate and OCR the result.

    A reference PDF (no image rotation, no page rotation) and a test PDF
    (the parametrized angles) are built from typewriter.png. The test PDF is
    run through OCRmyPDF with page rotation enabled, and its first page is
    compared with the reference's first page.
    """

    def make_rotate_test(prefix, image_angle, page_angle):
        """Write a one-page PDF with the given rotations; return its path."""
        picture = Image.open(fspath(resources / 'typewriter.png'))
        if image_angle != 0:
            # The transpose constant is looked up by the complementary
            # (counterclockwise) angle, e.g. 90 -> ROTATE_270.
            ccw_angle = -image_angle % 360
            picture = picture.transpose(getattr(Image, f'ROTATE_{ccw_angle}'))

        png_buffer = BytesIO()
        picture.save(png_buffer, format='PNG')

        pdf_buffer = BytesIO()
        img2pdf.convert(
            png_buffer.getvalue(),
            layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
            outputstream=pdf_buffer,
        )
        pdf_buffer.seek(0)

        pdf = pikepdf.open(pdf_buffer)
        pdf.pages[0].Rotate = page_angle
        destination = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
        pdf.save(destination)
        return destination

    reference = make_rotate_test('ref', 0, 0)
    test = make_rotate_test('test', image_angle, page_angle)
    out = test.with_suffix('.out.pdf')

    cli_args = ('-O0', '--rotate-pages', '--rotate-pages-threshold', '0.001')
    p, _, err = run_ocrmypdf(test, out, *cli_args, universal_newlines=False)
    # stderr arrives as bytes because universal_newlines is off.
    err = err.decode('utf-8', errors='replace')
    assert p.returncode == 0, err

    score = check_monochrome_correlation(
        outdir,
        reference_pdf=reference,
        reference_pageno=1,
        test_pdf=out,
        test_pageno=1,
    )
    assert score > 0.2
|
2018-11-02 01:32:20 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_tesseract_orientation(resources, tmpdir):
    """Run tesseract orientation detection on an upside-down crom.png.

    Only checks that the call completes; the detected orientation is not
    asserted.
    """
    original = leptonica.Pix.open(resources / 'crom.png')
    upside_down = original.rotate_orth(2)  # 180 degrees clockwise
    rotated_png = tmpdir / '000001.png'
    upside_down.write_implied_format(rotated_png)

    # Test results of this are unreliable, so the return value is ignored.
    tesseract.get_orientation(
        rotated_png, engine_mode='3', timeout=10, log=Mock()
    )
|