# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

import logging
import re
from io import BytesIO
from os import fspath
from unittest.mock import Mock

import img2pdf
import pytest
from PIL import Image

import pikepdf
from ocrmypdf import leptonica
from ocrmypdf.exec import ghostscript, tesseract
from ocrmypdf.pdfinfo import PdfInfo

# pytest.helpers is dynamic
# pylint: disable=no-member
# pylint: disable=w0612

# Oldest Leptonica release whose binary correlation these tests rely on.
_MIN_LEPTONICA = (1, 72)


def _leptonica_too_old(version_string, minimum=_MIN_LEPTONICA):
    """Return True if *version_string* names a release older than *minimum*.

    The version is compared numerically, component by component, so that a
    release such as 1.100 is not mistaken for being older than 1.72 (which is
    what a plain string comparison would conclude).

    If the string does not look like ``leptonica-<major>.<minor>[...]`` the
    function falls back to a plain string comparison against
    ``'leptonica-<minimum>'``.
    """
    match = re.match(r'leptonica-(\d+(?:\.\d+)*)', version_string)
    if match is None:
        fallback = 'leptonica-' + '.'.join(str(part) for part in minimum)
        return version_string < fallback
    found = tuple(int(part) for part in match.group(1).split('.'))
    return found < tuple(minimum)


pytestmark = pytest.mark.skipif(
    _leptonica_too_old(leptonica.get_leptonica_version()),
    reason="Leptonica is too old, correlation doesn't work",
)

check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf


RENDERERS = ['hocr', 'sandwich']


def check_monochrome_correlation(
    outdir, reference_pdf, reference_pageno, test_pdf, test_pageno
):
    """Rasterize one page from each PDF and return their binary correlation.

    Both pages are rendered by Ghostscript with the ``pngmono`` device at
    100x100 resolution into *outdir*, then opened with Leptonica and compared
    with ``Pix.correlation_binary``.

    Args:
        outdir: directory in which the intermediate PNG files are written.
        reference_pdf: path of the PDF providing the reference page.
        reference_pageno: page number passed to Ghostscript for the reference.
        test_pdf: path of the PDF providing the page under test.
        test_pageno: page number passed to Ghostscript for the test page.

    Returns:
        Whatever ``leptonica.Pix.correlation_binary`` returns for the two
        rasterized pages; callers in this module compare it against
        thresholds between 0 and 1.
    """
    gslog = logging.getLogger()

    reference_png = outdir / f'{reference_pdf.name}.ref{reference_pageno:04d}.png'
    test_png = outdir / f'{test_pdf.name}.test{test_pageno:04d}.png'

    def rasterize(pdf, pageno, png):
        # An existing PNG is reused as-is; its path is printed so the reuse
        # shows up in captured test output.
        if png.exists():
            print(png)
            return
        ghostscript.rasterize_pdf(
            pdf,
            png,
            xres=100,
            yres=100,
            raster_device='pngmono',
            log=gslog,
            pageno=pageno,
            rotation=0,
        )

    rasterize(reference_pdf, reference_pageno, reference_png)
    rasterize(test_pdf, test_pageno, test_png)

    pix_ref = leptonica.Pix.open(reference_png)
    pix_test = leptonica.Pix.open(test_png)

    return leptonica.Pix.correlation_binary(pix_ref, pix_test)


def test_monochrome_correlation(resources, outdir):
    """Sanity-check the correlation helper itself on cardinal.pdf."""
    # Verify leptonica: check that an incorrect rotated image has poor
    # correlation with reference
    corr = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,  # north facing page
        test_pdf=resources / 'cardinal.pdf',
        test_pageno=3,  # south facing page
    )
    assert corr < 0.10

    # A page compared against itself must correlate strongly.
    corr = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=2,
        test_pdf=resources / 'cardinal.pdf',
        test_pageno=2,
    )
    assert corr > 0.90


@pytest.mark.slow
@pytest.mark.parametrize('renderer', RENDERERS)
def test_autorotate(spoof_tesseract_cache, renderer, resources, outdir):
    """Every page of cardinal.pdf should match page 1 after autorotation."""
    # cardinal.pdf contains four copies of an image rotated in each cardinal
    # direction - these ones are "burned in" not tagged with /Rotate
    check_ocrmypdf(
        resources / 'cardinal.pdf',
        outdir / 'out.pdf',
        '-r',
        '-v',
        '1',
        '--pdf-renderer',
        renderer,
        env=spoof_tesseract_cache,
    )
    for n in range(1, 4 + 1):
        correlation = check_monochrome_correlation(
            outdir,
            reference_pdf=resources / 'cardinal.pdf',
            reference_pageno=1,
            test_pdf=outdir / 'out.pdf',
            test_pageno=n,
        )
        assert correlation > 0.80


@pytest.mark.parametrize(
    'threshold, correlation_test',
    [
        # Low thresh -> always rotate -> high corr
        pytest.param('1', lambda corr: corr > 0.80, id='threshold-1-rotates'),
        # High thresh -> never rotate -> low corr
        pytest.param('99', lambda corr: corr < 0.10, id='threshold-99-keeps'),
    ],
)
def test_autorotate_threshold(
    spoof_tesseract_cache, threshold, correlation_test, resources, outdir
):
    """Check that --rotate-pages-threshold gates whether page 3 is rotated.

    *correlation_test* is a predicate applied to the correlation between
    page 1 of the input and page 3 of the output.
    """
    check_ocrmypdf(
        resources / 'cardinal.pdf',
        outdir / 'out.pdf',
        '--rotate-pages-threshold',
        threshold,
        '-r',
        '-v',
        '1',
        env=spoof_tesseract_cache,
    )

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'cardinal.pdf',
        reference_pageno=1,
        test_pdf=outdir / 'out.pdf',
        test_pageno=3,
    )
    assert correlation_test(correlation), (
        f"unexpected correlation {correlation!r} for threshold {threshold}"
    )


def test_rotated_skew_timeout(resources, outpdf):
    """This document contains an image that is rotated 90 into place with a
    /Rotate tag and intentionally skewed by altering the transformation matrix.

    This tests for a bug where the combination of preprocessing and a tesseract
    timeout produced a page whose dimensions did not match the original's.
    """

    input_file = resources / 'rotated_skew.pdf'
    in_pageinfo = PdfInfo(input_file)[0]

    assert (
        in_pageinfo.height_pixels < in_pageinfo.width_pixels
    ), "Expected the input page to be landscape"
    assert in_pageinfo.rotation == 90, "Expected a rotated page"

    out = check_ocrmypdf(
        input_file,
        outpdf,
        '--pdf-renderer',
        'hocr',
        '--deskew',
        '--tesseract-timeout',
        '0',
    )

    out_pageinfo = PdfInfo(out)[0]
    w, h = out_pageinfo.width_pixels, out_pageinfo.height_pixels

    assert h > w, "Expected the output page to be portrait"

    assert out_pageinfo.rotation == 0, "Expected no page rotation for output"

    assert (
        in_pageinfo.width_pixels == h and in_pageinfo.height_pixels == w
    ), "Expected page rotation to be baked in"


def test_rotate_deskew_timeout(resources, outdir):
    """Deskew must still be applied when tesseract times out immediately."""
    check_ocrmypdf(
        resources / 'rotated_skew.pdf',
        outdir / 'deskewed.pdf',
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0',
        '--deskew',
        '--tesseract-timeout',
        '0',
        '--pdf-renderer',
        'sandwich',
    )

    correlation = check_monochrome_correlation(
        outdir,
        reference_pdf=resources / 'ccitt.pdf',
        reference_pageno=1,
        test_pdf=outdir / 'deskewed.pdf',
        test_pageno=1,
    )

    # Confirm that the page still got deskewed
    assert correlation > 0.50


@pytest.mark.slow
@pytest.mark.parametrize('page_angle', (0, 90, 180, 270))
@pytest.mark.parametrize('image_angle', (0, 90, 180, 270))
def test_rotate_page_level(image_angle, page_angle, resources, outdir):
    """Combine a rotated image with a page-level /Rotate and run autorotation.

    A reference PDF (no image rotation, no /Rotate) and a test PDF (rotated
    by *image_angle* and tagged with *page_angle*) are generated from
    typewriter.png; the OCRed test PDF is then correlated with the reference.
    """

    def make_rotate_test(prefix, image_angle, page_angle):
        """Build a one-page PDF and return the path it was saved to."""
        im = Image.open(fspath(resources / 'typewriter.png'))
        if image_angle != 0:
            # Look up the PIL transpose constant for the negated angle,
            # normalised into 0..359.
            ccw_angle = -image_angle % 360
            im = im.transpose(getattr(Image, f'ROTATE_{ccw_angle}'))
        memimg = BytesIO()
        im.save(memimg, format='PNG')
        memimg.seek(0)
        mempdf = BytesIO()
        img2pdf.convert(
            memimg.read(),
            layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
            outputstream=mempdf,
        )
        mempdf.seek(0)
        pike = pikepdf.open(mempdf)
        pike.pages[0].Rotate = page_angle
        target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
        pike.save(target)
        return target

    reference = make_rotate_test('ref', 0, 0)
    test = make_rotate_test('test', image_angle, page_angle)
    out = test.with_suffix('.out.pdf')

    p, _, err = run_ocrmypdf(
        test,
        out,
        '-O0',
        '--rotate-pages',
        '--rotate-pages-threshold',
        '0.001',
        universal_newlines=False,
    )
    # stderr is requested as bytes (universal_newlines=False); decode it
    # leniently so it can be used as the assertion message.
    err = err.decode('utf-8', errors='replace')
    assert p.returncode == 0, err

    assert check_monochrome_correlation(outdir, reference, 1, out, 1) > 0.2


def test_tesseract_orientation(resources, tmpdir):
    """Smoke test: get_orientation runs on a rotated image without raising."""
    pix = leptonica.Pix.open(resources / 'crom.png')
    pix_rotated = pix.rotate_orth(2)  # 180 degrees clockwise
    pix_rotated.write_implied_format(tmpdir / '000001.png')

    log = Mock()
    tesseract.get_orientation(  # Test results of this are unreliable
        tmpdir / '000001.png', engine_mode='3', timeout=10, log=log
    )