OCRmyPDF/tests/test_pageinfo.py

#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83

from ocrmypdf import pageinfo
from reportlab.pdfgen.canvas import Canvas
from PIL import Image
from tempfile import NamedTemporaryFile
from contextlib import suppress
import os
import shutil
import pytest
import img2pdf
import pytest
import sys


# Refuse to run on interpreters older than the advertised minimum.
# Comparing the whole version tuple keeps the check in agreement with the
# message; this module relies on contextlib.suppress, which first shipped
# in Python 3.4.
if sys.version_info < (3, 4):
    print("Requires Python 3.4+")
    sys.exit(1)

# Locations derived from where this test module lives on disk.
TESTS_ROOT = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)

# Fixed paths below the tests directory and the project root.
SPOOF_PATH = os.path.join(TESTS_ROOT, 'spoof')
OCRMYPDF = os.path.join(PROJECT_ROOT, 'OCRmyPDF.sh')
TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')

# Where generated PDFs are written; the OCRMYPDF_TEST_OUTPUT environment
# variable overrides the in-tree default.
TEST_OUTPUT = os.getenv(
    'OCRMYPDF_TEST_OUTPUT',
    os.path.join(PROJECT_ROOT, 'tests', 'output', 'pageinfo'))


def setup_module():
    """Start the module's tests with a freshly created TEST_OUTPUT directory.

    Any existing output tree is removed first, then the directory is
    recreated.  A missing tree on removal and an already-existing directory
    on creation are both tolerated.
    """
    try:
        shutil.rmtree(TEST_OUTPUT)
    except FileNotFoundError:
        # Nothing left over from a previous run.
        pass

    try:
        os.makedirs(TEST_OUTPUT)
    except FileExistsError:
        # The directory is already there; that is all we need.
        pass


def _make_input(input_basename):
    """Return the path of *input_basename* inside TEST_RESOURCES."""
    resource_path = os.path.join(TEST_RESOURCES, input_basename)
    return resource_path


def _make_output(output_basename):
    """Return the path of *output_basename* inside TEST_OUTPUT."""
    output_path = os.path.join(TEST_OUTPUT, output_basename)
    return output_path


def test_single_page_text():
    """A page holding only a line of text reports text and no images."""
    inch = 72  # PDF points per inch
    filename = _make_output('text.pdf')

    # Build a one-page 8" x 6" PDF containing a single line of text.
    canvas = Canvas(filename, pagesize=(8 * inch, 6 * inch))
    line = canvas.beginText()
    line.setFont('Helvetica', 12)
    line.setTextOrigin(1 * inch, 3 * inch)
    line.textLine("Methink'st thou art a general offence and every"
                  " man should beat thee.")
    canvas.drawText(line)
    canvas.showPage()
    canvas.save()

    pages = pageinfo.pdf_get_all_pageinfo(filename)
    page_count = len(pages)
    assert page_count == 1

    only_page = pages[0]
    assert only_page['has_text']

    image_count = len(only_page['images'])
    assert image_count == 0


def test_single_page_image():
    """Check pageinfo on a one-page PDF wrapping a single 8x8 mono image.

    The image is encoded as PNG in memory, converted to a PDF with img2pdf
    at 8 dpi on both axes, written to TEST_OUTPUT and then inspected with
    pageinfo.pdf_get_all_pageinfo.
    """
    # Function-scope import so this test does not depend on edits elsewhere.
    from io import BytesIO

    filename = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    # 8x8 1-bit image: background 0 with the main diagonal set to 1.
    im = Image.new('1', (8, 8), 0)
    for n in range(8):
        im.putpixel((n, n), 1)

    # Encode the PNG straight into memory.  Saving to a NamedTemporaryFile
    # by name and reading it back through the still-open handle only works
    # on platforms that allow a second open of the file (not Windows), and
    # the bytes are all img2pdf needs anyway.
    png_buffer = BytesIO()
    im.save(png_buffer, format='PNG')
    im_bytes = png_buffer.getvalue()

    # Request 8 dpi horizontally and vertically.
    imgsize = ((img2pdf.ImgSize.dpi, 8), (img2pdf.ImgSize.dpi, 8))
    layout_fun = img2pdf.get_layout_fun(None, imgsize, None, None, None)

    pdf_bytes = img2pdf.convert(
            im_bytes, producer="img2pdf", with_pdfrw=False,
            layout_fun=layout_fun)

    with open(filename, 'wb') as pdf:
        pdf.write(pdf_bytes)

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)

    assert len(pdfinfo) == 1
    page = pdfinfo[0]

    assert not page['has_text']
    assert len(page['images']) == 1

    pdfimage = page['images'][0]
    assert pdfimage['width'] == 8
    assert pdfimage['color'] == 'gray'

    # While unexpected, this is correct
    # PDF spec says /FlateDecode image must have /BitsPerComponent 8
    # So mono images get upgraded to 8-bit
    assert pdfimage['bpc'] == 8

    # DPI in a 1"x1" is the image width
    assert abs(pdfimage['dpi_w'] - 8) < 1e-5
    assert abs(pdfimage['dpi_h'] - 8) < 1e-5


def test_single_page_inline_image():
    """Check the reported DPI of an 8x8 mono image drawn inline by reportlab.

    The image is drawn into a 72x72 pt (1"x1") area, so its horizontal
    resolution is expected to be 8 dpi.
    """
    filename = os.path.join(TEST_OUTPUT, 'image-mono-inline.pdf')
    pdf = Canvas(filename, pagesize=(8*72, 6*72))
    with NamedTemporaryFile() as im_tmp:
        # 8x8 1-bit image: background 0 with the main diagonal set to 1.
        im = Image.new('1', (8, 8), 0)
        for n in range(8):
            im.putpixel((n, n), 1)
        im.save(im_tmp.name, format='PNG')
        # Draw image in a 72x72 pt or 1"x1" area
        pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72)
        # Finish the PDF while the temporary image file still exists.
        pdf.showPage()
        pdf.save()

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)
    # Shown by pytest only when the test fails; helps diagnose DPI issues.
    print(pdfinfo)
    pdfimage = pdfinfo[0]['images'][0]
    # abs() is required: without it any value below 8 (even 0 or a negative
    # number) would satisfy the comparison.
    assert abs(pdfimage['dpi_w'] - 8) < 1e-5


def test_jpeg():
    """Check encoding and horizontal DPI of the first image in c02-22.pdf."""
    filename = _make_input('c02-22.pdf')

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)

    pdfimage = pdfinfo[0]['images'][0]
    assert pdfimage['enc'] == 'jpeg'
    # Two-sided tolerance: without abs() every value below 150 would pass.
    assert abs(pdfimage['dpi_w'] - 150) < 1e-5
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`#!/usr/bin/env python3`
Update release notes, add copyrights 2015-07-28 04:36:58 -07:00			`# © 2015 James R. Barlow: github.com/jbarlow83`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
			`from ocrmypdf import pageinfo`
			`from reportlab.pdfgen.canvas import Canvas`
			`from PIL import Image`
			`from tempfile import NamedTemporaryFile`
			`from contextlib import suppress`
			`import os`
			`import shutil`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`import pytest`
Use img2pdf in test case because it does a better job 2015-07-30 03:35:56 -07:00			`import img2pdf`
Suppress failing test 2015-09-10 07:01:14 -07:00			`import pytest`
Move pageinfo test into tests folder 2016-01-11 17:40:44 -08:00			`import sys`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00

Move pageinfo test into tests folder 2016-01-11 17:40:44 -08:00			`if sys.version_info.major < 3:`
			`print("Requires Python 3.4+")`
			`sys.exit(1)`

			`TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))`
			`SPOOF_PATH = os.path.join(TESTS_ROOT, 'spoof')`
			`PROJECT_ROOT = os.path.dirname(TESTS_ROOT)`
			`OCRMYPDF = os.path.join(PROJECT_ROOT, 'OCRmyPDF.sh')`
			`TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')`
Fixup other docker test suite errors Outstanding failures: test_pageinfo::test_jpeg tests involving unpaper due to version <6.1 failures 2015-08-20 02:36:28 -07:00			`TEST_OUTPUT = os.environ.get(`
			`'OCRMYPDF_TEST_OUTPUT',`
Move pageinfo test into tests folder 2016-01-11 17:40:44 -08:00			`default=os.path.join(PROJECT_ROOT, 'tests', 'output', 'pageinfo'))`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00

			`def setup_module():`
			`with suppress(FileNotFoundError):`
			`shutil.rmtree(TEST_OUTPUT)`
			`with suppress(FileExistsError):`
Use os.makedirs for test output directories Broke Travis 2016-01-16 02:47:56 -08:00			`os.makedirs(TEST_OUTPUT)`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00

Move pageinfo test into tests folder 2016-01-11 17:40:44 -08:00			`def _make_input(input_basename):`
			`return os.path.join(TEST_RESOURCES, input_basename)`


			`def _make_output(output_basename):`
			`return os.path.join(TEST_OUTPUT, output_basename)`


Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`def test_single_page_text():`
			`filename = os.path.join(TEST_OUTPUT, 'text.pdf')`
			`pdf = Canvas(filename, pagesize=(872, 672))`
			`text = pdf.beginText()`
			`text.setFont('Helvetica', 12)`
pageinfo: drop pdftotext and use PyPDF instead 2015-07-26 18:23:37 -07:00			`text.setTextOrigin(172, 372)`
			`text.textLine("Methink'st thou art a general offence and every"`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`" man should beat thee.")`
			`pdf.drawText(text)`
			`pdf.showPage()`
			`pdf.save()`

			`pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)`

			`assert len(pdfinfo) == 1`
			`page = pdfinfo[0]`

			`assert page['has_text']`
			`assert len(page['images']) == 0`


			`def test_single_page_image():`
			`filename = os.path.join(TEST_OUTPUT, 'image-mono.pdf')`
Use img2pdf in test case because it does a better job 2015-07-30 03:35:56 -07:00
Fix img2pdf usage in test case (to make Travis CI happy again) 2016-02-06 23:41:32 -08:00			`with NamedTemporaryFile(mode='wb+', suffix='.png') as im_tmp:`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`im = Image.new('1', (8, 8), 0)`
			`for n in range(8):`
			`im.putpixel((n, n), 1)`
			`im.save(im_tmp.name, format='PNG')`
Use img2pdf in test case because it does a better job 2015-07-30 03:35:56 -07:00
Fix img2pdf usage in test case (to make Travis CI happy again) 2016-02-06 23:41:32 -08:00			`imgsize = ((img2pdf.ImgSize.dpi, 8), (img2pdf.ImgSize.dpi, 8))`
			`layout_fun = img2pdf.get_layout_fun(None, imgsize, None, None, None)`

			`im_tmp.seek(0)`
			`im_bytes = im_tmp.read()`
			`pdf_bytes = img2pdf.convert(`
			`im_bytes, producer="img2pdf", with_pdfrw=False,`
			`layout_fun=layout_fun)`

Use img2pdf in test case because it does a better job 2015-07-30 03:35:56 -07:00			`with open(filename, 'wb') as pdf:`
			`pdf.write(pdf_bytes)`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
			`pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)`

			`assert len(pdfinfo) == 1`
			`page = pdfinfo[0]`

			`assert not page['has_text']`
			`assert len(page['images']) == 1`

			`pdfimage = page['images'][0]`
			`assert pdfimage['width'] == 8`
Use img2pdf in test case because it does a better job 2015-07-30 03:35:56 -07:00			`assert pdfimage['color'] == 'gray'`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
			`# While unexpected, this is correct`
			`# PDF spec says /FlateDecode image must have /BitsPerComponent 8`
			`# So mono images get upgraded to 8-bit`
			`assert pdfimage['bpc'] == 8`

			`# DPI in a 1"x1" is the image width`
Compute image pixel density without performing rectangle intersection (+5 squashed commits) Squashed commits: [0e27904] Partially implement DPI calculation with rotation of the image Fixes test suite [a64f662] pageinfo: all tests pass [c5b811a] Fix typos [cdd2286] Can now find inline images for efficiently [60dde8d] First cut at implementing intelligent DPI detection based on content stream Broke many of the test cases 2016-02-26 18:19:39 -08:00			`assert abs(pdfimage['dpi_w'] - 8) < 1e-5`
			`assert abs(pdfimage['dpi_h'] - 8) < 1e-5`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00

			`def test_single_page_inline_image():`
			`filename = os.path.join(TEST_OUTPUT, 'image-mono-inline.pdf')`
			`pdf = Canvas(filename, pagesize=(872, 672))`
			`with NamedTemporaryFile() as im_tmp:`
			`im = Image.new('1', (8, 8), 0)`
			`for n in range(8):`
			`im.putpixel((n, n), 1)`
			`im.save(im_tmp.name, format='PNG')`
			`# Draw image in a 72x72 pt or 1"x1" area`
			`pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72)`
			`pdf.showPage()`
			`pdf.save()`

Add support for inline images 2016-02-26 22:44:28 -08:00			`pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)`
			`print(pdfinfo)`
			`pdfimage = pdfinfo[0]['images'][0]`
			`assert (pdfimage['dpi_w'] - 8) < 1e-5`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
More testing: JPEG 2015-07-27 00:25:24 -07:00
			`def test_jpeg():`
Move pageinfo test into tests folder 2016-01-11 17:40:44 -08:00			`filename = _make_input('c02-22.pdf')`
More testing: JPEG 2015-07-27 00:25:24 -07:00
			`pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)`

			`pdfimage = pdfinfo[0]['images'][0]`
			`assert pdfimage['enc'] == 'jpeg'`
Compute image pixel density without performing rectangle intersection (+5 squashed commits) Squashed commits: [0e27904] Partially implement DPI calculation with rotation of the image Fixes test suite [a64f662] pageinfo: all tests pass [c5b811a] Fix typos [cdd2286] Can now find inline images for efficiently [60dde8d] First cut at implementing intelligent DPI detection based on content stream Broke many of the test cases 2016-02-26 18:19:39 -08:00			`assert (pdfimage['dpi_w'] - 150) < 1e-5`
More testing: JPEG 2015-07-27 00:25:24 -07:00