OCRmyPDF/tests/test_rotation.py

365 lines
10 KiB
Python
Raw Normal View History

2022-07-28 01:06:46 -07:00
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
2018-05-01 23:51:35 -07:00
2022-07-23 00:39:24 -07:00
from __future__ import annotations
import operator
2018-08-03 00:42:59 -07:00
from io import BytesIO
from math import cos, pi, sin
2018-12-30 00:23:26 -08:00
from os import fspath
2024-02-12 01:17:08 -08:00
from subprocess import run
2018-05-01 23:51:35 -07:00
2018-08-03 00:42:59 -07:00
import img2pdf
2019-12-19 15:29:56 -08:00
import pikepdf
2018-12-30 01:28:15 -08:00
import pytest
2023-04-14 00:38:34 -07:00
from PIL import Image, ImageChops
from reportlab.pdfgen.canvas import Canvas
from ocrmypdf._exec import ghostscript
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution
2018-12-30 01:28:15 -08:00
from ocrmypdf.pdfinfo import PdfInfo
2018-05-01 23:51:35 -07:00
from .conftest import check_ocrmypdf, run_ocrmypdf_api
2021-04-07 01:56:51 -07:00
# pylintx: disable=unused-variable
2018-05-01 23:51:35 -07:00
# Values substituted for the ``renderer`` argument of parametrized tests in
# this file, which forward it to the ``--pdf-renderer`` command line option.
RENDERERS = ['hocr', 'sandwich']
def compare_images_monochrome(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Return the fraction of matching pixels between two rendered PDF pages.

    Both pages are rasterized with the ``pngmono`` device at 100x100 DPI and
    no rotation, then XORed together. The result is
    ``same / (same + different)`` taken from the XOR image's histogram.

    The intermediate PNG files are written to *outdir*; a PNG that already
    exists there is reused instead of being rendered again.
    """
    ref_png = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    tst_png = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    def render_once(pdf, pageno, png):
        # Reuse a previously rendered PNG; its path is printed so the reuse
        # is visible in the captured test output.
        if not png.exists():
            ghostscript.rasterize_pdf(
                pdf,
                png,
                raster_device='pngmono',
                raster_dpi=Resolution(100, 100),
                pageno=pageno,
                rotation=0,
            )
        else:
            print(png)

    for pdf, pageno, png in (
        (reference_pdf, reference_pageno, ref_png),
        (test_pdf, test_pageno, tst_png),
    ):
        render_once(pdf, pageno, png)

    with Image.open(ref_png) as ref_im, Image.open(tst_png) as tst_im:
        assert ref_im.mode == tst_im.mode == '1'
        xor_im = ImageChops.logical_xor(ref_im, tst_im)
        assert xor_im.mode == '1'
        bins = xor_im.histogram()

    assert (
        len(bins) == 256
    ), "Expected Pillow to convert to grayscale for histogram"

    # Only the first and last bins are populated: the first counts pixels
    # that agree, the last counts pixels that differ.
    matching, differing = bins[0], bins[-1]
    return matching / (matching + differing)
def test_monochrome_comparison(resources, outdir):
    """Self-test of compare_images_monochrome using pages of cardinal.pdf."""
    cardinal = resources / 'cardinal.pdf'

    # An incorrectly rotated image must compare poorly with the reference:
    # page 1 is the north facing page, page 3 the south facing page.
    mismatch_score = compare_images_monochrome(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=1,
        test_pdf=cardinal,
        test_pageno=3,
    )
    assert mismatch_score < 0.90

    # A page compared against itself must score high.
    identical_score = compare_images_monochrome(
        outdir,
        reference_pdf=cardinal,
        reference_pageno=2,
        test_pdf=cardinal,
        test_pageno=2,
    )
    assert identical_score > 0.95
2018-05-01 23:51:35 -07:00
2018-08-03 00:57:59 -07:00
@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(renderer, resources, outdir):
    """Every page of the auto-rotated output should match the upright page.

    cardinal.pdf contains four copies of an image rotated in each cardinal
    direction - these ones are "burned in", not tagged with /Rotate.
    """
    source_pdf = resources / 'cardinal.pdf'
    rotated_pdf = outdir / 'out.pdf'
    ocr_args = [
        '-r',
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    check_ocrmypdf(source_pdf, rotated_pdf, *ocr_args)

    # Page 1 of the input serves as the reference for all four output pages.
    for pageno in (1, 2, 3, 4):
        score = compare_images_monochrome(
            outdir,
            reference_pdf=source_pdf,
            reference_pageno=1,
            test_pdf=rotated_pdf,
            test_pageno=pageno,
        )
        assert score > 0.95
2018-05-01 23:51:35 -07:00
2018-12-30 01:27:49 -08:00
@pytest.mark.parametrize(
    'threshold, op, comparison_threshold',
    [
        ('1', operator.ge, 0.95),  # Low threshold -> always rotate -> high score
        ('99', operator.le, 0.90),  # High threshold -> never rotate -> low score
    ],
)
def test_autorotate_threshold(threshold, op, comparison_threshold, resources, outdir):
    """Check that --rotate-pages-threshold controls whether a page is rotated.

    Page 3 of the output is compared with page 1 of cardinal.pdf, and the
    resulting score is checked against ``comparison_threshold`` using ``op``.
    """
    source_pdf = resources / 'cardinal.pdf'
    result_pdf = outdir / 'out.pdf'
    check_ocrmypdf(
        source_pdf,
        result_pdf,
        '--rotate-pages-threshold',
        threshold,
        '-r',
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    )

    score = compare_images_monochrome(
        outdir,
        reference_pdf=source_pdf,
        reference_pageno=1,
        test_pdf=result_pdf,
        test_pageno=3,
    )
    assert op(score, comparison_threshold)
2018-05-01 23:51:35 -07:00
def test_rotated_skew_timeout(resources, outpdf):
    """Check rotated skew timeout.

    This document contains an image that is rotated 90 into place with a
    /Rotate tag and intentionally skewed by altering the transformation matrix.

    This tests for a bug where the combination of preprocessing and a tesseract
    timeout produced a page whose dimensions did not match the original's.
    """
    input_file = resources / 'rotated_skew.pdf'

    # Preconditions on the fixture: landscape pixel dimensions, /Rotate of 90.
    page_before = PdfInfo(input_file)[0]
    in_h, in_w = page_before.height_pixels, page_before.width_pixels
    assert in_h < in_w, "Expected the input page to be landscape"
    assert page_before.rotation == 90, "Expected a rotated page"

    ocr_args = ('--pdf-renderer', 'hocr', '--deskew', '--tesseract-timeout', '0')
    out = check_ocrmypdf(input_file, outpdf, *ocr_args)

    page_after = PdfInfo(out)[0]
    w, h = page_after.width_pixels, page_after.height_pixels

    assert h > w, "Expected the output page to be portrait"

    assert page_after.rotation == 0, "Expected no page rotation for output"

    # Width and height must be exactly swapped relative to the input.
    assert (in_w, in_h) == (h, w), "Expected page rotation to be baked in"
2018-05-01 23:51:35 -07:00
def test_rotate_deskew_ocr_timeout(resources, outdir):
    """Deskew must still be applied when OCR is given a zero timeout.

    rotated_skew.pdf is processed with page rotation (threshold 0), deskew
    and ``--tesseract-timeout 0``; the result is compared with page 1 of
    ccitt.pdf.
    """
    deskewed_pdf = outdir / 'deskewed.pdf'
    ocr_args = [
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'hocr',
    ]
    check_ocrmypdf(resources / 'rotated_skew.pdf', deskewed_pdf, *ocr_args)

    score = compare_images_monochrome(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=deskewed_pdf,
        test_pageno=1,
    )

    # Confirm that the page still got deskewed
    assert score > 0.95
2018-08-03 00:42:59 -07:00
def make_rotate_test(imagefile, outdir, prefix, image_angle, page_angle):
    """Build a one-page PDF from an image with pixel and /Rotate rotation.

    The image is transposed when *image_angle* is non-zero: the angle is
    negated modulo 360 to select the matching ``Image.Transpose.ROTATE_*``
    member. The image is then converted to a PDF at a fixed 200x200 DPI
    layout, and the page's ``Rotate`` entry is set to *page_angle*.

    Returns the path of the saved file,
    ``outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'``.
    """
    png_buffer = BytesIO()
    with Image.open(fspath(imagefile)) as im:
        rotated = im
        if image_angle != 0:
            ccw_angle = -image_angle % 360
            method = getattr(Image.Transpose, f'ROTATE_{ccw_angle}')
            rotated = im.transpose(method)
        rotated.save(png_buffer, format='PNG')

    pdf_buffer = BytesIO()
    img2pdf.convert(
        png_buffer.getvalue(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
        outputstream=pdf_buffer,
        **IMG2PDF_KWARGS,
    )
    # Rewind so pikepdf reads the generated PDF from its beginning.
    pdf_buffer.seek(0)

    target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
    with pikepdf.open(pdf_buffer) as pdf:
        pdf.pages[0].Rotate = page_angle
        pdf.save(target)
    return target
2018-08-03 00:42:59 -07:00
@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir, caplog):
    """Run --rotate-pages on every image-angle and /Rotate combination.

    The output is compared with an unrotated reference built from the same
    image and must exceed a similarity of 0.2.
    """
    typewriter = resources / 'typewriter.png'
    reference = make_rotate_test(typewriter, outdir, 'ref', 0, 0)
    test = make_rotate_test(typewriter, outdir, 'test', image_angle, page_angle)
    out = test.with_suffix('.out.pdf')

    ocr_args = ('-O0', '--rotate-pages', '--rotate-pages-threshold', '0.001')
    exitcode = run_ocrmypdf_api(test, out, *ocr_args)
    assert exitcode == 0, caplog.text

    similarity = compare_images_monochrome(outdir, reference, 1, out, 1)
    assert similarity > 0.2
@pytest.mark.slow
@pytest.mark.parametrize('page_rotate_angle', (0, 90, 180, 270))
def test_page_rotate_tag(page_rotate_angle, resources, outdir, caplog):
    """Check that a /Rotate tag which corrects a misrotated image is honored.

    Pages that have an image that is misrotated but restored to correct
    rotation with a /Rotate should be processed correctly and yield text.
    """
    test = make_rotate_test(
        resources / 'crom.png',
        outdir,
        'test',
        -page_rotate_angle,
        page_rotate_angle,
    )
    out = test.with_suffix('.out.pdf')

    exitcode = run_ocrmypdf_api(test, out, '-O0')
    assert exitcode == 0, caplog.text

    # Extract the text layer with the external pdftotext tool ('-' = stdout).
    completed = run(['pdftotext', '-enc', 'UTF-8', out, '-'], capture_output=True)
    test_text = completed.stdout.strip().decode('utf-8')
    assert 'is a' in test_text, test_text
def test_rasterize_rotates(resources, tmp_path):
    """Check that the rasterize hook applies the requested rotation.

    graph.pdf page 1 is rasterized at 20x20 DPI with rotations of 90 and 180
    degrees; the size of each resulting PNG must match the expected
    (width, height) for that rotation.
    """
    pm = get_plugin_manager([])

    # (rotation passed to the hook, expected size of the output image)
    cases = (
        (90, (83, 200)),
        (180, (200, 83)),
    )
    for rotation, expected_size in cases:
        img = tmp_path / f'img{rotation}.png'
        pm.hook.rasterize_pdf_page(
            input_file=resources / 'graph.pdf',
            output_file=img,
            raster_device='pngmono',
            raster_dpi=Resolution(20, 20),
            page_dpi=Resolution(20, 20),
            pageno=1,
            rotation=rotation,
            filter_vector=False,
            stop_on_soft_error=True,
        )
        # Open through a context manager so the file handle is always
        # released, including when the assertion fails.
        with Image.open(img) as im:
            assert im.size == expected_size, "Image not rotated"
def test_simulated_scan(outdir):
    """Generate a four-page fake scan and check the output page orientations.

    Each page holds the text "Page N" drawn with a rotation of 2, 91, 179
    and 271 degrees respectively. After processing with --force-ocr,
    --deskew and --rotate-pages, pages 2 and 4 must be landscape and pages
    1 and 3 portrait, judged by the third and fourth mediabox entries.
    """
    fakescan = outdir / 'fakescan.pdf'
    canvas = Canvas(fspath(fakescan), pagesize=(209.8, 297.6))

    # (angle in degrees, x, y) for the text on each page
    page_vars = [(2, 36, 250), (91, 170, 240), (179, 190, 36), (271, 36, 36)]
    for pageno, (angle, x, y) in enumerate(page_vars, start=1):
        text = canvas.beginText()
        text.setFont('Helvetica', 20)
        cos_a, sin_a = cos(angle / 180.0 * pi), sin(angle / 180.0 * pi)
        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, x, y)
        text.textOut(f'Page {pageno}')
        canvas.drawText(text)
        canvas.showPage()
    canvas.save()

    result_pdf = outdir / 'out.pdf'
    check_ocrmypdf(
        fakescan,
        result_pdf,
        '--force-ocr',
        '--deskew',
        '--rotate-pages',
        '--plugin',
        'tests/plugins/tesseract_debug_rotate.py',
    )

    with pikepdf.open(result_pdf) as pdf:

        def extent(index):
            # Third and fourth mediabox entries of the zero-based page index.
            box = pdf.pages[index].mediabox
            return box[2], box[3]

        width, height = extent(1)
        assert width > height, "Wrong orientation: not landscape"
        width, height = extent(3)
        assert width > height, "Wrong orientation: Not landscape"
        width, height = extent(0)
        assert width < height, "Wrong orientation: Not portrait"
        width, height = extent(2)
        assert width < height, "Wrong orientation: Not portrait"