2018-05-01 23:51:35 -07:00
|
|
|
# © 2018 James R. Barlow: github.com/jbarlow83
|
|
|
|
#
|
|
|
|
# This file is part of OCRmyPDF.
|
|
|
|
#
|
|
|
|
# OCRmyPDF is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# OCRmyPDF is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
import logging
|
2018-08-03 00:42:59 -07:00
|
|
|
from io import BytesIO
|
2018-12-30 00:23:26 -08:00
|
|
|
from os import fspath
|
2018-12-30 01:28:15 -08:00
|
|
|
from unittest.mock import Mock
|
2018-05-01 23:51:35 -07:00
|
|
|
|
2018-08-03 00:42:59 -07:00
|
|
|
import img2pdf
|
2019-12-19 15:29:56 -08:00
|
|
|
import pikepdf
|
2018-12-30 01:28:15 -08:00
|
|
|
import pytest
|
|
|
|
from PIL import Image
|
2018-05-01 23:51:35 -07:00
|
|
|
|
|
|
|
from ocrmypdf import leptonica
|
2018-11-02 01:32:20 -07:00
|
|
|
from ocrmypdf.exec import ghostscript, tesseract
|
2020-04-24 04:12:05 -07:00
|
|
|
from ocrmypdf.helpers import Resolution
|
2018-12-30 01:28:15 -08:00
|
|
|
from ocrmypdf.pdfinfo import PdfInfo
|
2018-05-01 23:51:35 -07:00
|
|
|
|
|
|
|
# pytest.helpers is dynamic
# pylint: disable=no-member
# pylint: disable=w0612

# Module-wide skip condition: the correlation checks below need a Leptonica
# that reports at least 'leptonica-1.72'.
# NOTE(review): this is a plain string comparison, so versions are ordered
# lexicographically rather than numerically -- confirm that is adequate for
# the strings get_leptonica_version() can return.
_LEPTONICA_TOO_OLD = leptonica.get_leptonica_version() < 'leptonica-1.72'

pytestmark = pytest.mark.skipif(
    _LEPTONICA_TOO_OLD, reason="Leptonica is too old, correlation doesn't work"
)

# Short module-level aliases for the helpers exposed through pytest.helpers.
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf

# Values passed to --pdf-renderer by the renderer-parametrized tests.
RENDERERS = ['hocr', 'sandwich']
|
|
|
|
|
|
|
|
|
|
|
|
def check_monochrome_correlation(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Rasterize one page from each of two PDFs and correlate the images.

    Each page is rendered through ``ghostscript.rasterize_pdf`` with the
    ``pngmono`` device at 100x100 DPI and no rotation, producing a PNG in
    *outdir*.  If the target PNG already exists it is reused as-is (and its
    path is printed) instead of being rendered again.

    Args:
        outdir: directory that receives the rasterized PNG files.
        reference_pdf: path of the PDF containing the reference page.
        reference_pageno: page number to render from *reference_pdf*.
        test_pdf: path of the PDF containing the page under test.
        test_pageno: page number to render from *test_pdf*.

    Returns:
        The result of ``leptonica.Pix.correlation_binary`` for the reference
        and test images.
    """
    reference_png = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    test_png = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    # Reference first, then test -- the same order for rendering and loading.
    render_jobs = (
        (reference_pdf, reference_pageno, reference_png),
        (test_pdf, test_pageno, test_png),
    )
    for pdf, pageno, png in render_jobs:
        if png.exists():
            # Already rendered earlier; report the reused file and move on.
            print(png)
            continue
        ghostscript.rasterize_pdf(
            pdf,
            png,
            raster_device='pngmono',
            raster_dpi=Resolution(100, 100),
            pageno=pageno,
            rotation=0,
        )

    reference_pix = leptonica.Pix.open(reference_png)
    test_pix = leptonica.Pix.open(test_png)
    return leptonica.Pix.correlation_binary(reference_pix, test_pix)
|
|
|
|
|
|
|
|
|
|
|
|
def test_monochrome_correlation(resources, outdir):
    """Sanity-check the correlation helper itself using cardinal.pdf."""
    cardinal = resources / 'cardinal.pdf'

    # Verify leptonica: an incorrectly rotated image must correlate poorly
    # with the reference (page 1 faces north, page 3 faces south).
    opposite_pages = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=1,
        test_pdf=cardinal,
        test_pageno=3,
    )
    assert opposite_pages < 0.10

    # A page compared against itself must correlate strongly.
    same_page = check_monochrome_correlation(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
    )
    assert same_page > 0.90
|
|
|
|
|
|
|
|
|
2018-08-03 00:57:59 -07:00
|
|
|
@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(spoof_tesseract_cache, renderer, resources, outdir):
    """Every page of cardinal.pdf should match page 1 after running with -r.

    cardinal.pdf contains four copies of an image, one rotated in each
    cardinal direction.  The rotations are "burned in" to the content rather
    than tagged with /Rotate.
    """
    source_pdf = resources / 'cardinal.pdf'
    output_pdf = outdir / 'out.pdf'

    check_ocrmypdf(
        source_pdf,
        output_pdf,
        '-r',
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        env=spoof_tesseract_cache,
    )

    # Compare each of the four output pages with the first input page.
    for pageno in (1, 2, 3, 4):
        correlation = check_monochrome_correlation(
            outdir,
            reference_pdf=source_pdf,
            reference_pageno=1,
            test_pdf=output_pdf,
            test_pageno=pageno,
        )
        assert correlation > 0.80
|
|
|
|
|
|
|
|
|
2018-12-30 01:27:49 -08:00
|
|
|
@pytest.mark.parametrize(
    'threshold, correlation_test',
    [
        # Low threshold -> always rotate -> high correlation
        ('1', lambda correlation: correlation > 0.80),
        # High threshold -> never rotate -> low correlation
        ('99', lambda correlation: correlation < 0.10),
    ],
    # Explicit ids keep descriptive test node ids now that the second
    # parameter is a callable instead of a string.
    ids=['1-correlation > 0.80', '99-correlation < 0.10'],
)
def test_autorotate_threshold(
    spoof_tesseract_cache, threshold, correlation_test, resources, outdir
):
    """Check that --rotate-pages-threshold controls whether pages get rotated.

    cardinal.pdf is processed with ``-r`` and the given threshold, then page 3
    of the output is correlated against page 1 of the input.

    Args:
        spoof_tesseract_cache: fixture passed to check_ocrmypdf as ``env``.
        threshold: value given to ``--rotate-pages-threshold`` (a string).
        correlation_test: callable that receives the measured correlation and
            returns whether it is acceptable for this threshold.
        resources: fixture; directory containing the input PDFs.
        outdir: fixture; directory for output and intermediate files.
    """
    check_ocrmypdf(
        resources / 'cardinal.pdf',
        outdir / 'out.pdf',
        '--rotate-pages-threshold',
        threshold,
        '-r',
        env=spoof_tesseract_cache,
    )

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,
        test_pdf=outdir / 'out.pdf',
        test_pageno=3,
    )
    # The predicate is called directly; no eval() of parameter strings.
    assert correlation_test(
        correlation
    ), f'unexpected correlation {correlation!r} for threshold {threshold}'
|
2018-05-01 23:51:35 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_rotated_skew_timeout(resources, outpdf):
    """Page dimensions must survive deskewing when tesseract times out.

    rotated_skew.pdf holds an image that a /Rotate tag turns 90 degrees into
    place, and that was deliberately skewed by altering the transformation
    matrix.

    Regression test for a bug where preprocessing combined with a tesseract
    timeout produced a page whose dimensions did not match the original's.
    """
    source = resources / 'rotated_skew.pdf'
    before = PdfInfo(source)[0]

    # Preconditions on the fixture file itself.
    assert (
        before.height_pixels < before.width_pixels
    ), "Expected the input page to be landscape"
    assert before.rotation == 90, "Expected a rotated page"

    result = check_ocrmypdf(
        source,
        outpdf,
        '--pdf-renderer',
        'hocr',
        '--deskew',
        '--tesseract-timeout',
        '0',
    )

    after = PdfInfo(result)[0]
    out_width, out_height = after.width_pixels, after.height_pixels

    assert out_height > out_width, "Expected the output page to be portrait"
    assert after.rotation == 0, "Expected no page rotation for output"
    # Width and height must have swapped places relative to the input.
    assert (
        before.width_pixels == out_height and before.height_pixels == out_width
    ), "Expected page rotation to be baked in"
|
2018-05-01 23:51:35 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_rotate_deskew_timeout(resources, outdir):
    """Rotate and deskew rotated_skew.pdf with a zero tesseract timeout.

    The first output page is then correlated against page 1 of ccitt.pdf.
    """
    source_pdf = resources / 'rotated_skew.pdf'
    deskewed_pdf = outdir / 'deskewed.pdf'
    options = [
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'sandwich',
    ]
    check_ocrmypdf(source_pdf, deskewed_pdf, *options)

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=deskewed_pdf,
        test_pageno=1,
    )

    # Confirm that the page still got deskewed
    assert correlation > 0.50
|
2018-08-03 00:42:59 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir):
    """Run --rotate-pages over every image-rotation / page-/Rotate pairing.

    A test PDF is generated from typewriter.png with the image content turned
    by *image_angle* and the page's /Rotate entry set to *page_angle*.  After
    ocrmypdf processes it, the result is correlated against an unrotated
    reference PDF built from the same image.

    Args:
        image_angle: rotation applied to the image content, in degrees.
        page_angle: value written to the page's /Rotate entry.
        resources: fixture; directory containing typewriter.png.
        outdir: fixture; directory for generated and output files.
    """

    def make_rotate_test(prefix, image_angle, page_angle):
        """Write ``{prefix}_{image_angle}_{page_angle}.pdf`` and return its path."""
        memimg = BytesIO()
        with Image.open(fspath(resources / 'typewriter.png')) as im:
            rotated = im
            if image_angle != 0:
                # The ROTATE_* constant name carries the counterclockwise
                # equivalent of image_angle (hence the negation modulo 360).
                ccw_angle = -image_angle % 360
                rotated = im.transpose(getattr(Image, f'ROTATE_{ccw_angle}'))
            rotated.save(memimg, format='PNG')

        mempdf = BytesIO()
        img2pdf.convert(
            memimg.getvalue(),
            layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
            outputstream=mempdf,
        )
        mempdf.seek(0)

        target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
        # Context manager ensures the pikepdf handle is closed after saving.
        with pikepdf.open(mempdf) as pike:
            pike.pages[0].Rotate = page_angle
            pike.save(target)
        return target

    reference = make_rotate_test('ref', 0, 0)
    test = make_rotate_test('test', image_angle, page_angle)
    out = test.with_suffix('.out.pdf')

    p, _, err = run_ocrmypdf(
        test,
        out,
        '-O0',
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0.001',
        universal_newlines=False,
    )
    # With universal_newlines=False stderr arrives as bytes; decode leniently
    # so it can be shown as the assertion message.
    err = err.decode('utf-8', errors='replace')
    assert p.returncode == 0, err

    assert check_monochrome_correlation(outdir, reference, 1, out, 1) > 0.2
|
2018-11-02 01:32:20 -07:00
|
|
|
|
|
|
|
|
2019-06-01 01:55:51 -07:00
|
|
|
def test_tesseract_orientation(resources, tmp_path):
    """Smoke-test tesseract.get_orientation on a rotated copy of crom.png.

    Only the call itself is exercised; its return value is not asserted.
    """
    original_pix = leptonica.Pix.open(resources / 'crom.png')
    rotated_png = tmp_path / '000001.png'

    # rotate_orth(2): 180 degrees clockwise
    original_pix.rotate_orth(2).write_implied_format(rotated_png)

    # Test results of this are unreliable, so the result is discarded.
    tesseract.get_orientation(rotated_png, engine_mode='3', timeout=10)
|