# OCRmyPDF/tests/test_optimize.py

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

from os import fspath
from pathlib import Path
from unittest.mock import patch

import img2pdf
import pikepdf
import pytest
from PIL import Image, ImageDraw

from ocrmypdf import optimize as opt
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._exec.ghostscript import rasterize_pdf
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution

from .conftest import check_ocrmypdf

# Skip markers for tests that depend on optional external executables.
# Each one skips the decorated test when the tool's ``available()`` check fails.
needs_pngquant = pytest.mark.skipif(
    not pngquant.available(),
    reason="pngquant not installed",
)
needs_jbig2enc = pytest.mark.skipif(
    not jbig2enc.available(),
    reason="jbig2enc not installed",
)


@needs_pngquant
@pytest.mark.parametrize('pdf', ['multipage.pdf', 'palette.pdf'])
def test_basic(resources, pdf, outpdf):
    """Optimize a sample PDF at level 3 and check the output did not grow much."""
    source_pdf = resources / pdf
    opt.main(source_pdf, outpdf, level=3)

    output_size = Path(outpdf).stat().st_size
    input_size = Path(source_pdf).stat().st_size
    # The output is allowed to be marginally larger than the input (the 0.98
    # factor), rather than being required to be strictly smaller.
    assert 0.98 * output_size <= input_size


@needs_pngquant
def test_mono_not_inverted(resources, outdir):
    """Optimize ``2400dpi.pdf`` and check its rasterized corner pixel is light."""
    optimized_pdf = outdir / 'out.pdf'
    raster_png = outdir / 'im.png'

    opt.main(resources / '2400dpi.pdf', optimized_pdf, level=3)

    # Rasterize to grayscale at a very low resolution; only the pixel at
    # (0, 0) is inspected below.
    rasterize_pdf(
        optimized_pdf,
        raster_png,
        raster_device='pnggray',
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(raster_png)) as raster:
        corner_value = raster.getpixel((0, 0))
    assert corner_value > 240, "Expected white background"


@needs_pngquant
def test_jpg_png_params(resources, outpdf):
    """Run the pipeline on ``crom.png`` with explicit JPEG and PNG quality options."""
    cli_options = [
        '--image-dpi',
        '200',
        '--optimize',
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'crom.png', outpdf, *cli_options)


@needs_jbig2enc
@pytest.mark.parametrize('lossy', [False, True])
def test_jbig2_lossy(lossy, resources, outpdf):
    """Check that ``ccitt.pdf`` is re-encoded as JBIG2, with or without lossy mode.

    With ``--jbig2-lossy`` the first image's decode parameters must carry a
    ``/JBIG2Globals`` entry; without it the image must have no decode
    parameters at all.
    """
    args = [
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    if lossy:
        args.append('--jbig2-lossy')

    check_ocrmypdf(*args)

    # Open the result in a context manager so the file handle is released even
    # when an assertion fails, matching the other tests in this module.
    with pikepdf.open(outpdf) as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'

        if lossy:
            assert '/JBIG2Globals' in pim.decode_parms[0]
        else:
            assert len(pim.decode_parms) == 0


@needs_pngquant
@needs_jbig2enc
def test_flate_to_jbig2(resources, outdir):
    """Check that an 8-bit grayscale PNG ends up as a JBIG2 image in the output PDF."""
    # This test requires an image that pngquant is capable of converting to
    # 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        # Use a separate name for the converted copy instead of rebinding the
        # ``with`` target.
        gray = im.convert('L')
        gray.save(fspath(outdir / 'type8.png'))

    check_ocrmypdf(
        outdir / 'type8.png',
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--png-quality',
        '50',
        '--optimize',
        '3',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    # Open the result in a context manager so the file handle is released even
    # when the assertion fails, matching the other tests in this module.
    with pikepdf.open(outdir / 'out.pdf') as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'


@needs_pngquant
def test_multiple_pngs(resources, outdir):
    """Check that the first image on every page of a two-PNG PDF gets smaller.

    ``pngquant.quantize`` is patched with a stand-in that overwrites the whole
    image with a single fill value before saving it.
    """
    source_pdf = outdir / 'in.pdf'
    result_pdf = outdir / 'out.pdf'

    # Build the input PDF from the two PNG resources.
    with source_pdf.open('wb') as stream:
        img2pdf.convert(
            fspath(resources / 'baiona_colormapped.png'),
            fspath(resources / 'baiona_gray.png'),
            outputstream=stream,
            **IMG2PDF_KWARGS,
        )

    def fake_quantize(input_file, output_file, *_args):
        # Cover the full image area with one value, then write it out.
        with Image.open(input_file) as image:
            canvas = ImageDraw.Draw(image)
            canvas.rectangle((0, 0, image.width, image.height), fill=128)
            image.save(output_file)

    with patch(
        'ocrmypdf.optimize.pngquant.quantize', side_effect=fake_quantize
    ) as quantize_mock:
        check_ocrmypdf(
            source_pdf,
            result_pdf,
            '--optimize',
            '3',
            '--jobs',
            '1',
            '--use-threads',
            '--output-type',
            'pdf',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )
        quantize_mock.assert_called()

    with pikepdf.open(source_pdf) as before, pikepdf.open(result_pdf) as after:
        for n, page_before in enumerate(before.pages):
            image_before = next(iter(page_before.images.values()))
            image_after = next(iter(after.pages[n].images.values()))
            size_after = len(image_after.read_raw_bytes())
            size_before = len(image_before.read_raw_bytes())
            # The page index is the assertion message, identifying which page failed.
            assert size_after < size_before, n


def test_optimize_off(resources, outpdf):
    """Run the pipeline on ``trivial.pdf`` with ``--optimize=0`` and PDF output."""
    cli_options = [
        '--optimize=0',
        '--output-type',
        'pdf',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'trivial.pdf', outpdf, *cli_options)


def test_group3(resources, outdir):
    """Check that ``extract_image_filter`` accepts Group 4 but rejects Group 3.

    The ``/Im1`` image in ``ccitt.pdf`` is tried as-is first, then again after
    its ``/K`` decode parameter has been set to 0.
    """
    with pikepdf.open(resources / 'ccitt.pdf') as pdf:
        image = pdf.pages[0].Resources.XObject['/Im1']
        object_id = image.objgen[0]

        as_shipped = opt.extract_image_filter(pdf, outdir, image, object_id)
        assert as_shipped is not None, "Group 4 should be allowed"

        # Change K in place on the opened PDF, then repeat the extraction.
        image.DecodeParms['/K'] = 0
        after_edit = opt.extract_image_filter(pdf, outdir, image, object_id)
        assert after_edit is None, "Group 3 should be disallowed"
Change to SPDX license tracking 2022-07-28 01:06:46 -07:00			`# SPDX-FileCopyrightText: 2022 James R. Barlow`
			`# SPDX-License-Identifier: MPL-2.0`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00
Modernize type annotations 2022-07-23 00:39:24 -07:00			`from __future__ import annotations`

Drop support for Python 3.5 2018-12-30 00:23:26 -08:00			`from os import fspath`
Sort imports with isort 2018-12-30 01:28:15 -08:00			`from pathlib import Path`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`from unittest.mock import patch`
Make optimize test do a little more 2018-05-18 17:50:39 -07:00
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`import img2pdf`
Sort imports 2019-12-19 15:29:56 -08:00			`import pikepdf`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00			`import pytest`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`from PIL import Image, ImageDraw`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00
Rename _optimize to optimize.py 2018-06-22 17:51:57 -07:00			`from ocrmypdf import optimize as opt`
Rename ocrmypdf.exec -> ocrmypdf._exec 2020-06-09 14:55:54 -07:00			`from ocrmypdf._exec import jbig2enc, pngquant`
			`from ocrmypdf._exec.ghostscript import rasterize_pdf`
Use better img2pdf settings where possible while supporting old versions Fixes #894 2022-01-14 11:55:54 -08:00			`from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00
Remove pytest_helpers_namespace 2021-04-07 01:56:51 -07:00			`from .conftest import check_ocrmypdf`
Add test to optimize if jbig2 is present 2018-07-02 23:49:11 -07:00
tests: tag tests that need pngquant, jbig2enc 2020-12-30 01:58:57 -08:00			`needs_pngquant = pytest.mark.skipif(`
			`not pngquant.available(), reason="pngquant not installed"`
			`)`
			`needs_jbig2enc = pytest.mark.skipif(`
			`not jbig2enc.available(), reason="jbig2enc not installed"`
			`)`
Add test to optimize if jbig2 is present 2018-07-02 23:49:11 -07:00
tests: tag tests that need pngquant, jbig2enc 2020-12-30 01:58:57 -08:00
			`@needs_pngquant`
Make optimize test do a little more 2018-05-18 17:50:39 -07:00			`@pytest.mark.parametrize('pdf', ['multipage.pdf', 'palette.pdf'])`
			`def test_basic(resources, pdf, outpdf):`
			`infile = resources / pdf`
			`opt.main(infile, outpdf, level=3)`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00
Ensure test_optimize passes Linearization sends it over the edge 2019-07-27 16:47:53 -07:00			`assert 0.98 * Path(outpdf).stat().st_size <= Path(infile).stat().st_size`
Add test case to ensure mono is not inverted 2018-06-29 00:25:11 -07:00

tests: tag tests that need pngquant, jbig2enc 2020-12-30 01:58:57 -08:00			`@needs_pngquant`
Add test case to ensure mono is not inverted 2018-06-29 00:25:11 -07:00			`def test_mono_not_inverted(resources, outdir):`
			`infile = resources / '2400dpi.pdf'`
			`opt.main(infile, outdir / 'out.pdf', level=3)`

			`rasterize_pdf(`
Reformat with black 2018-12-30 01:27:49 -08:00			`outdir / 'out.pdf',`
			`outdir / 'im.png',`
			`raster_device='pnggray',`
Refactor 'xyres' into Resolution 2020-04-24 04:12:05 -07:00			`raster_dpi=Resolution(10, 10),`
Add test case to ensure mono is not inverted 2018-06-29 00:25:11 -07:00			`)`

Use context managers to ensure Pillow images are closed 2019-09-03 17:19:12 -07:00			`with Image.open(fspath(outdir / 'im.png')) as im:`
Turning on Ghostscript interpolation changes this test Seems acceptable. We don't normally use Ghostscript to downsample PDFs like is happening in this test. 2021-11-15 16:36:24 -08:00			`assert im.getpixel((0, 0)) > 240, "Expected white background"`
Add test to optimize if jbig2 is present 2018-07-02 23:49:11 -07:00

tests: tag tests that need pngquant, jbig2enc 2020-12-30 01:58:57 -08:00			`@needs_pngquant`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`def test_jpg_png_params(resources, outpdf):`
Add test to optimize if jbig2 is present 2018-07-02 23:49:11 -07:00			`check_ocrmypdf(`
Reformat with black 2018-12-30 01:27:49 -08:00			`resources / 'crom.png',`
			`outpdf,`
			`'--image-dpi',`
			`'200',`
			`'--optimize',`
			`'3',`
			`'--jpg-quality',`
			`'50',`
			`'--png-quality',`
			`'20',`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
optimize: only enable lossy JBIG2 for -O3 2018-10-03 00:38:58 -07:00			`)`


tests: tag tests that need pngquant, jbig2enc 2020-12-30 01:58:57 -08:00			`@needs_jbig2enc`
Change JBIG2 lossy mode to require --jbig2-lossy 2018-10-04 01:20:49 -07:00			`@pytest.mark.parametrize('lossy', [False, True])`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`def test_jbig2_lossy(lossy, resources, outpdf):`
Change JBIG2 lossy mode to require --jbig2-lossy 2018-10-04 01:20:49 -07:00			`args = [`
Reformat with black 2018-12-30 01:27:49 -08:00			`resources / 'ccitt.pdf',`
			`outpdf,`
			`'--image-dpi',`
			`'200',`
			`'--optimize',`
optimize: recognize and produce [/FlateDecode /DCTDecode] images 2022-02-08 00:38:08 -08:00			`'3',`
Reformat with black 2018-12-30 01:27:49 -08:00			`'--jpg-quality',`
			`'50',`
			`'--png-quality',`
			`'20',`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
Change JBIG2 lossy mode to require --jbig2-lossy 2018-10-04 01:20:49 -07:00			`]`
			`if lossy:`
			`args.append('--jbig2-lossy')`

Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`check_ocrmypdf(*args)`
optimize: only enable lossy JBIG2 for -O3 2018-10-03 00:38:58 -07:00
			`pdf = pikepdf.open(outpdf)`
			`pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))`
			`assert pim.filters[0] == '/JBIG2Decode'`

Change JBIG2 lossy mode to require --jbig2-lossy 2018-10-04 01:20:49 -07:00			`if lossy:`
optimize: only enable lossy JBIG2 for -O3 2018-10-03 00:38:58 -07:00			`assert '/JBIG2Globals' in pim.decode_parms[0]`
			`else:`
			`assert len(pim.decode_parms) == 0`
optimize: Reorganize so JBIG2 can be performed on images reduced to 1bpp Closes #297 2018-10-04 11:53:11 -07:00

tests: tag tests that need pngquant, jbig2enc 2020-12-30 01:58:57 -08:00			`@needs_pngquant`
			`@needs_jbig2enc`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`def test_flate_to_jbig2(resources, outdir):`
optimize: Reorganize so JBIG2 can be performed on images reduced to 1bpp Closes #297 2018-10-04 11:53:11 -07:00			`# This test requires an image that pngquant is capable of converting to`
			`# to 1bpp - so use an existing 1bpp image, convert up, confirm it can`
			`# convert down`
Use context managers to ensure Pillow images are closed 2019-09-03 17:19:12 -07:00			`with Image.open(fspath(resources / 'typewriter.png')) as im:`
			`assert im.mode in ('1', 'P')`
			`im = im.convert('L')`
			`im.save(fspath(outdir / 'type8.png'))`
optimize: Reorganize so JBIG2 can be performed on images reduced to 1bpp Closes #297 2018-10-04 11:53:11 -07:00
			`check_ocrmypdf(`
Reformat with black 2018-12-30 01:27:49 -08:00			`outdir / 'type8.png',`
			`outdir / 'out.pdf',`
			`'--image-dpi',`
			`'100',`
			`'--png-quality',`
optimize: use Decode to invert 1bpp PNGs for now 2019-03-03 17:50:12 -08:00			`'50',`
Reformat with black 2018-12-30 01:27:49 -08:00			`'--optimize',`
			`'3',`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
optimize: Reorganize so JBIG2 can be performed on images reduced to 1bpp Closes #297 2018-10-04 11:53:11 -07:00			`)`

			`pdf = pikepdf.open(outdir / 'out.pdf')`
			`pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))`
			`assert pim.filters[0] == '/JBIG2Decode'`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00

tests: tag tests that need pngquant, jbig2enc 2020-12-30 01:58:57 -08:00			`@needs_pngquant`
Merge branch 'release/v10' into trialmerge 2020-06-09 15:12:40 -07:00			`def test_multiple_pngs(resources, outdir):`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`with Path.open(outdir / 'in.pdf', 'wb') as inpdf:`
			`img2pdf.convert(`
			`fspath(resources / 'baiona_colormapped.png'),`
			`fspath(resources / 'baiona_gray.png'),`
			`outputstream=inpdf,`
Use better img2pdf settings where possible while supporting old versions Fixes #894 2022-01-14 11:55:54 -08:00			`**IMG2PDF_KWARGS,`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`)`

Delinting 2021-04-07 02:09:45 -07:00			`def mockquant(input_file, output_file, *_args):`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`with Image.open(input_file) as im:`
			`draw = ImageDraw.Draw(im)`
			`draw.rectangle((0, 0, im.width, im.height), fill=128)`
			`im.save(output_file)`

tests: assert that most patched functions are called We were not actually checking if functions we patched we called when expected. 2020-12-28 23:51:55 -08:00			`with patch('ocrmypdf.optimize.pngquant.quantize') as mock:`
			`mock.side_effect = mockquant`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`check_ocrmypdf(`
			`outdir / 'in.pdf',`
			`outdir / 'out.pdf',`
			`'--optimize',`
			`'3',`
			`'--jobs',`
			`'1',`
			`'--use-threads',`
			`'--output-type',`
			`'pdf',`
Merge branch 'release/v10' into trialmerge 2020-06-09 15:12:40 -07:00			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`)`
tests: assert that most patched functions are called We were not actually checking if functions we patched we called when expected. 2020-12-28 23:51:55 -08:00			`mock.assert_called()`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00
			`with pikepdf.open(outdir / 'in.pdf') as inpdf, pikepdf.open(`
			`outdir / 'out.pdf'`
			`) as outpdf:`
			`for n in range(len(inpdf.pages)):`
			`inim = next(iter(inpdf.pages[n].images.values()))`
			`outim = next(iter(outpdf.pages[n].images.values()))`
			`assert len(outim.read_raw_bytes()) < len(inim.read_raw_bytes()), n`
tests: confirm that we produce pdf when optimization is off 2021-01-24 01:53:36 -08:00

			`def test_optimize_off(resources, outpdf):`
			`check_ocrmypdf(`
			`resources / 'trivial.pdf',`
			`outpdf,`
			`'--optimize=0',`
			`'--output-type',`
			`'pdf',`
			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
			`)`
Exclude Group 3 images from optimization 2021-03-20 23:28:21 -07:00

			`def test_group3(resources, outdir):`
			`with pikepdf.open(resources / 'ccitt.pdf') as pdf:`
			`im = pdf.pages[0].Resources.XObject['/Im1']`
			`assert (`
			`opt.extract_image_filter(pdf, outdir, im, im.objgen[0]) is not None`
			`), "Group 4 should be allowed"`

			`im.DecodeParms['/K'] = 0`
			`assert (`
			`opt.extract_image_filter(pdf, outdir, im, im.objgen[0]) is None`
			`), "Group 3 should be disallowed"`