OCRmyPDF/tests/test_optimize.py

# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

from os import fspath
from pathlib import Path
from unittest.mock import patch

import img2pdf
import pikepdf
import pytest
from PIL import Image, ImageDraw

from ocrmypdf import optimize as opt
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._exec.ghostscript import rasterize_pdf
from ocrmypdf.helpers import Resolution

# Module-level shorthand for the check_ocrmypdf helper exposed on
# pytest.helpers. The tests below call it with an input path, an output path
# and then a sequence of option strings.
# NOTE(review): the helper is presumably registered in conftest.py - confirm.
check_ocrmypdf = pytest.helpers.check_ocrmypdf  # pylint: disable=e1101


@pytest.mark.parametrize('pdf', ['multipage.pdf', 'palette.pdf'])
def test_basic(resources, pdf, outpdf):
    """Optimize a sample PDF at level 3 and compare file sizes.

    The check is ``0.98 * output size <= input size``, so the optimized
    file is allowed to be marginally larger than the input.
    """
    source = resources / pdf
    opt.main(source, outpdf, level=3)

    optimized_size = Path(outpdf).stat().st_size
    original_size = Path(source).stat().st_size
    # A small tolerance is applied instead of requiring a strictly smaller file.
    assert 0.98 * optimized_size <= original_size


def test_mono_not_inverted(resources, outdir):
    """Check that the optimized '2400dpi.pdf' still has a white background.

    The optimized output is rasterized to a grayscale PNG at 10x10 DPI and
    the pixel at (0, 0) is expected to have the value 255.
    """
    optimized_pdf = outdir / 'out.pdf'
    raster_png = outdir / 'im.png'

    opt.main(resources / '2400dpi.pdf', optimized_pdf, level=3)

    rasterize_pdf(
        optimized_pdf,
        raster_png,
        raster_device='pnggray',
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(raster_png)) as rendered:
        top_left = rendered.getpixel((0, 0))
    assert top_left == 255, "Expected white background"


@pytest.mark.skipif(not pngquant.available(), reason='need pngquant')
def test_jpg_png_params(resources, outpdf):
    """Run check_ocrmypdf on 'crom.png' with explicit JPEG/PNG quality options.

    The test only invokes check_ocrmypdf; it makes no assertions of its own
    about the produced file.
    """
    options = [
        '--image-dpi',
        '200',
        '--optimize',
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'crom.png', outpdf, *options)


@pytest.mark.skipif(not jbig2enc.available(), reason='need jbig2enc')
@pytest.mark.parametrize('lossy', [False, True])
def test_jbig2_lossy(lossy, resources, outpdf):
    """Check the JBIG2 encoding of 'ccitt.pdf' with and without --jbig2-lossy.

    In both cases the first image on the first page must use /JBIG2Decode as
    its first filter. With ``lossy`` set, the first decode parameters entry
    must contain /JBIG2Globals; otherwise no decode parameters are expected.
    """
    args = [
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    if lossy:
        args.append('--jbig2-lossy')

    check_ocrmypdf(*args)

    # Use a context manager so the PDF is closed even when an assertion fails.
    with pikepdf.open(outpdf) as pdf:
        first_image = next(iter(pdf.pages[0].images.values()))
        pim = pikepdf.PdfImage(first_image)
        assert pim.filters[0] == '/JBIG2Decode'

        if lossy:
            assert '/JBIG2Globals' in pim.decode_parms[0]
        else:
            assert len(pim.decode_parms) == 0


@pytest.mark.skipif(
    not jbig2enc.available() or not pngquant.available(),
    reason='need jbig2enc and pngquant',
)
def test_flate_to_jbig2(resources, outdir):
    """Check that an 8-bit grayscale PNG ends up JBIG2-encoded in the output.

    'typewriter.png' (mode '1' or 'P') is converted to mode 'L', run through
    check_ocrmypdf at optimization level 3, and the first image on the first
    output page must use /JBIG2Decode as its first filter.
    """
    # This test requires an image that pngquant is capable of converting
    # to 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    upconverted_png = outdir / 'type8.png'
    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        # Keep the converted image under its own name instead of rebinding
        # the variable managed by the ``with`` statement.
        gray = im.convert('L')
        gray.save(fspath(upconverted_png))

    check_ocrmypdf(
        upconverted_png,
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--png-quality',
        '50',
        '--optimize',
        '3',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    # Use a context manager so the PDF is closed even when the assertion fails.
    with pikepdf.open(outdir / 'out.pdf') as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'


def test_multiple_pngs(resources, outdir):
    """Check that every page's image shrinks, not only the first one.

    A PDF is built from two PNG files with img2pdf and processed by
    check_ocrmypdf at optimization level 3 while ``pngquant.quantize`` is
    replaced by a stand-in that paints a filled rectangle over the whole
    image. For each page, the raw bytes of the first image in the output
    must be shorter than those of the first image in the input.
    """
    source_pdf = outdir / 'in.pdf'
    result_pdf = outdir / 'out.pdf'

    with source_pdf.open('wb') as stream:
        img2pdf.convert(
            fspath(resources / 'baiona_colormapped.png'),
            fspath(resources / 'baiona_gray.png'),
            with_pdfrw=False,
            outputstream=stream,
        )

    def fake_quantize(input_file, output_file, _quality_min, _quality_max):
        # Stand-in for pngquant.quantize: cover the image with one fill value
        # and save it to the requested output path.
        with Image.open(input_file) as picture:
            canvas = ImageDraw.Draw(picture)
            canvas.rectangle((0, 0, picture.width, picture.height), fill=128)
            picture.save(output_file)

    cli_options = [
        '--optimize',
        '3',
        '--jobs',
        '1',
        '--use-threads',
        '--output-type',
        'pdf',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    with patch('ocrmypdf.optimize.pngquant.quantize', new=fake_quantize):
        check_ocrmypdf(source_pdf, result_pdf, *cli_options)

    with pikepdf.open(source_pdf) as before, pikepdf.open(result_pdf) as after:
        for n in range(len(before.pages)):
            original_image = next(iter(before.pages[n].images.values()))
            optimized_image = next(iter(after.pages[n].images.values()))
            optimized_len = len(optimized_image.read_raw_bytes())
            original_len = len(original_image.read_raw_bytes())
            # The page index is the assertion message, identifying the page.
            assert optimized_len < original_len, n
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00			`# © 2018 James R. Barlow: github.com/jbarlow83`
			`#`
			`# This file is part of OCRmyPDF.`
			`#`
			`# OCRmyPDF is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# OCRmyPDF is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.`

Drop support for Python 3.5 2018-12-30 00:23:26 -08:00			`from os import fspath`
Sort imports with isort 2018-12-30 01:28:15 -08:00			`from pathlib import Path`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`from unittest.mock import patch`
Make optimize test do a little more 2018-05-18 17:50:39 -07:00
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`import img2pdf`
Sort imports 2019-12-19 15:29:56 -08:00			`import pikepdf`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00			`import pytest`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`from PIL import Image, ImageDraw`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00
Rename _optimize to optimize.py 2018-06-22 17:51:57 -07:00			`from ocrmypdf import optimize as opt`
Rename ocrmypdf.exec -> ocrmypdf._exec 2020-06-09 14:55:54 -07:00			`from ocrmypdf._exec import jbig2enc, pngquant`
			`from ocrmypdf._exec.ghostscript import rasterize_pdf`
Refactor 'xyres' into Resolution 2020-04-24 04:12:05 -07:00			`from ocrmypdf.helpers import Resolution`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00
Delinting 2019-01-02 13:34:45 -08:00			`check_ocrmypdf = pytest.helpers.check_ocrmypdf # pylint: disable=e1101`
Add test to optimize if jbig2 is present 2018-07-02 23:49:11 -07:00

Make optimize test do a little more 2018-05-18 17:50:39 -07:00			`@pytest.mark.parametrize('pdf', ['multipage.pdf', 'palette.pdf'])`
			`def test_basic(resources, pdf, outpdf):`
			`infile = resources / pdf`
			`opt.main(infile, outpdf, level=3)`
optimize: move a lot of image scanning code to pikepdf 2018-05-14 22:21:53 -07:00
Ensure test_optimize passes Linearization sends it over the edge 2019-07-27 16:47:53 -07:00			`assert 0.98 * Path(outpdf).stat().st_size <= Path(infile).stat().st_size`
Add test case to ensure mono is not inverted 2018-06-29 00:25:11 -07:00

			`def test_mono_not_inverted(resources, outdir):`
			`infile = resources / '2400dpi.pdf'`
			`opt.main(infile, outdir / 'out.pdf', level=3)`

			`rasterize_pdf(`
Reformat with black 2018-12-30 01:27:49 -08:00			`outdir / 'out.pdf',`
			`outdir / 'im.png',`
			`raster_device='pnggray',`
Refactor 'xyres' into Resolution 2020-04-24 04:12:05 -07:00			`raster_dpi=Resolution(10, 10),`
Add test case to ensure mono is not inverted 2018-06-29 00:25:11 -07:00			`)`

Use context managers to ensure Pillow images are closed 2019-09-03 17:19:12 -07:00			`with Image.open(fspath(outdir / 'im.png')) as im:`
			`assert im.getpixel((0, 0)) == 255, "Expected white background"`
Add test to optimize if jbig2 is present 2018-07-02 23:49:11 -07:00

tests: mark test as requiring pngquant 2019-08-11 16:15:49 -07:00			`@pytest.mark.skipif(not pngquant.available(), reason='need pngquant')`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`def test_jpg_png_params(resources, outpdf):`
Add test to optimize if jbig2 is present 2018-07-02 23:49:11 -07:00			`check_ocrmypdf(`
Reformat with black 2018-12-30 01:27:49 -08:00			`resources / 'crom.png',`
			`outpdf,`
			`'--image-dpi',`
			`'200',`
			`'--optimize',`
			`'3',`
			`'--jpg-quality',`
			`'50',`
			`'--png-quality',`
			`'20',`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
optimize: only enable lossy JBIG2 for -O3 2018-10-03 00:38:58 -07:00			`)`


			`@pytest.mark.skipif(not jbig2enc.available(), reason='need jbig2enc')`
Change JBIG2 lossy mode to require --jbig2-lossy 2018-10-04 01:20:49 -07:00			`@pytest.mark.parametrize('lossy', [False, True])`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`def test_jbig2_lossy(lossy, resources, outpdf):`
Change JBIG2 lossy mode to require --jbig2-lossy 2018-10-04 01:20:49 -07:00			`args = [`
Reformat with black 2018-12-30 01:27:49 -08:00			`resources / 'ccitt.pdf',`
			`outpdf,`
			`'--image-dpi',`
			`'200',`
			`'--optimize',`
			`3,`
			`'--jpg-quality',`
			`'50',`
			`'--png-quality',`
			`'20',`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
Change JBIG2 lossy mode to require --jbig2-lossy 2018-10-04 01:20:49 -07:00			`]`
			`if lossy:`
			`args.append('--jbig2-lossy')`

Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`check_ocrmypdf(*args)`
optimize: only enable lossy JBIG2 for -O3 2018-10-03 00:38:58 -07:00
			`pdf = pikepdf.open(outpdf)`
			`pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))`
			`assert pim.filters[0] == '/JBIG2Decode'`

Change JBIG2 lossy mode to require --jbig2-lossy 2018-10-04 01:20:49 -07:00			`if lossy:`
optimize: only enable lossy JBIG2 for -O3 2018-10-03 00:38:58 -07:00			`assert '/JBIG2Globals' in pim.decode_parms[0]`
			`else:`
			`assert len(pim.decode_parms) == 0`
optimize: Reorganize so JBIG2 can be performed on images reduced to 1bpp Closes #297 2018-10-04 11:53:11 -07:00

Reformat with black 2018-12-30 01:27:49 -08:00			`@pytest.mark.skipif(`
			`not jbig2enc.available() or not pngquant.available(),`
			`reason='need jbig2enc and pngquant',`
			`)`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`def test_flate_to_jbig2(resources, outdir):`
optimize: Reorganize so JBIG2 can be performed on images reduced to 1bpp Closes #297 2018-10-04 11:53:11 -07:00			`# This test requires an image that pngquant is capable of converting to`
			`# to 1bpp - so use an existing 1bpp image, convert up, confirm it can`
			`# convert down`
Use context managers to ensure Pillow images are closed 2019-09-03 17:19:12 -07:00			`with Image.open(fspath(resources / 'typewriter.png')) as im:`
			`assert im.mode in ('1', 'P')`
			`im = im.convert('L')`
			`im.save(fspath(outdir / 'type8.png'))`
optimize: Reorganize so JBIG2 can be performed on images reduced to 1bpp Closes #297 2018-10-04 11:53:11 -07:00
			`check_ocrmypdf(`
Reformat with black 2018-12-30 01:27:49 -08:00			`outdir / 'type8.png',`
			`outdir / 'out.pdf',`
			`'--image-dpi',`
			`'100',`
			`'--png-quality',`
optimize: use Decode to invert 1bpp PNGs for now 2019-03-03 17:50:12 -08:00			`'50',`
Reformat with black 2018-12-30 01:27:49 -08:00			`'--optimize',`
			`'3',`
Abolish spoof_tesseract_noop 2020-06-01 03:06:40 -07:00			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
optimize: Reorganize so JBIG2 can be performed on images reduced to 1bpp Closes #297 2018-10-04 11:53:11 -07:00			`)`

			`pdf = pikepdf.open(outdir / 'out.pdf')`
			`pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))`
			`assert pim.filters[0] == '/JBIG2Decode'`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00

Merge branch 'release/v10' into trialmerge 2020-06-09 15:12:40 -07:00			`def test_multiple_pngs(resources, outdir):`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`with Path.open(outdir / 'in.pdf', 'wb') as inpdf:`
			`img2pdf.convert(`
			`fspath(resources / 'baiona_colormapped.png'),`
			`fspath(resources / 'baiona_gray.png'),`
			`with_pdfrw=False,`
			`outputstream=inpdf,`
			`)`

			`def mockquant(input_file, output_file, _quality_min, _quality_max):`
			`with Image.open(input_file) as im:`
			`draw = ImageDraw.Draw(im)`
			`draw.rectangle((0, 0, im.width, im.height), fill=128)`
			`im.save(output_file)`

			`with patch('ocrmypdf.optimize.pngquant.quantize', new=mockquant):`
			`check_ocrmypdf(`
			`outdir / 'in.pdf',`
			`outdir / 'out.pdf',`
			`'--optimize',`
			`'3',`
			`'--jobs',`
			`'1',`
			`'--use-threads',`
			`'--output-type',`
			`'pdf',`
Merge branch 'release/v10' into trialmerge 2020-06-09 15:12:40 -07:00			`'--plugin',`
			`'tests/plugins/tesseract_noop.py',`
Fix issue where only first PNG-style image would be optimized 2020-04-25 03:50:11 -07:00			`)`

			`with pikepdf.open(outdir / 'in.pdf') as inpdf, pikepdf.open(`
			`outdir / 'out.pdf'`
			`) as outpdf:`
			`for n in range(len(inpdf.pages)):`
			`inim = next(iter(inpdf.pages[n].images.values()))`
			`outim = next(iter(outpdf.pages[n].images.values()))`
			`assert len(outim.read_raw_bytes()) < len(inim.read_raw_bytes()), n`