OCRmyPDF/tests/test_pdfinfo.py

# © 2015 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

from ocrmypdf import pdfinfo
from reportlab.pdfgen.canvas import Canvas
from PIL import Image
from tempfile import NamedTemporaryFile
from math import isclose
from ocrmypdf.pdfinfo import Colorspace, Encoding
from contextlib import suppress
import os
import shutil
import pytest
import img2pdf
import sys
import PyPDF2 as pypdf
import pikepdf
import pickle


def test_single_page_text(outdir):
    """A generated page holding one line of text has text and no images."""
    filename = outdir / 'text.pdf'

    # Single page of 8*72 x 6*72 pt with one line of Helvetica text on it
    canvas = Canvas(str(filename), pagesize=(8*72, 6*72))
    textobj = canvas.beginText()
    textobj.setFont('Helvetica', 12)
    textobj.setTextOrigin(1*72, 3*72)
    textobj.textLine("Methink'st thou art a general offence and every"
                     " man should beat thee.")
    canvas.drawText(textobj)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(filename)
    assert len(info) == 1

    first_page = info[0]
    assert first_page.has_text
    assert len(first_page.images) == 0


def test_single_page_image(outdir):
    """A PDF built by img2pdf from an 8x8 1-bit PNG has one gray image."""
    filename = outdir / 'image-mono.pdf'
    png_path = outdir / 'tmp.png'

    # 8x8 mode '1' image: every pixel 0 except the main diagonal, set to 1
    mono = Image.new('1', (8, 8), 0)
    for i in range(8):
        mono.putpixel((i, i), 1)
    mono.save(str(png_path), format='PNG')

    # Request a layout of 8 dpi on both axes
    dpi_spec = (img2pdf.ImgSize.dpi, 8)
    layout_fun = img2pdf.get_layout_fun(
        None, (dpi_spec, dpi_spec), None, None, None)

    pdf_bytes = img2pdf.convert(
        png_path.read_bytes(),
        producer="img2pdf",
        with_pdfrw=False,
        layout_fun=layout_fun,
    )
    filename.write_bytes(pdf_bytes)

    info = pdfinfo.PdfInfo(filename)
    assert len(info) == 1

    first_page = info[0]
    assert not first_page.has_text
    assert len(first_page.images) == 1

    found = first_page.images[0]
    assert found.width == 8
    assert found.color == Colorspace.gray

    # DPI in a 1"x1" is the image width
    assert isclose(found.xres, 8)
    assert isclose(found.yres, 8)


def test_single_page_inline_image(outdir):
    """An inline image drawn by reportlab is found with the expected size.

    The 8x8 source PNG is written into ``outdir`` (as the non-inline image
    test does) instead of into a ``NamedTemporaryFile``.  The file has to be
    reopened by name, first when the image is saved and again when it is
    drawn, and reopening a still-open named temporary file by name is not
    portable: it works on Unix but fails on Windows.
    """
    filename = outdir / 'image-mono-inline.pdf'
    im_tmp = outdir / 'tmp-inline.png'

    # 8x8 mode '1' image: every pixel 0 except the main diagonal, set to 1
    im = Image.new('1', (8, 8), 0)
    for n in range(8):
        im.putpixel((n, n), 1)
    im.save(str(im_tmp), format='PNG')

    canvas = Canvas(str(filename), pagesize=(8*72, 6*72))
    # Draw image in a 72x72 pt or 1"x1" area
    canvas.drawInlineImage(str(im_tmp), 0, 0, width=72, height=72)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(filename)
    # Kept from the original test: also exercises printing of a PdfInfo
    print(info)
    pdfimage = info[0].images[0]
    assert isclose(pdfimage.xres, 8)
    assert pdfimage.color == Colorspace.rgb  # reportlab produces color image
    assert pdfimage.width == 8


def test_jpeg(resources, outdir):
    """The first image of c02-22.pdf is JPEG-encoded with an xres of 150."""
    info = pdfinfo.PdfInfo(resources / 'c02-22.pdf')
    first_image = info[0].images[0]

    assert first_image.enc == Encoding.jpeg
    assert isclose(first_image.xres, 150)


def test_form_xobject(resources):
    """The first image reported for formxobject.pdf has a width of 50."""
    info = pdfinfo.PdfInfo(resources / 'formxobject.pdf')
    first_image = info[0].images[0]

    assert first_image.width == 50


def test_no_contents(resources):
    """The first page of no_contents.pdf yields no images and no text."""
    filename = resources / 'no_contents.pdf'

    pdf = pdfinfo.PdfInfo(filename)
    page = pdf[0]
    assert len(page.images) == 0
    # Plain truthiness check, matching how the other tests in this module
    # assert on has_text, instead of comparing to False with ==
    assert not page.has_text


def test_oversized_page(resources):
    """The first image of poster.pdf has width * xres above 200."""
    info = pdfinfo.PdfInfo(resources / 'poster.pdf')
    poster_image = info[0].images[0]

    extent = poster_image.width * poster_image.xres
    assert extent > 200, "this is supposed to be oversized"


def test_pickle(resources):
    """A PdfInfo object can be serialized with pickle.

    Multiprocessing requires our information to be pickleable.  If this
    fails, we are probably keeping some unpickleable pikepdf object or
    other external data around.
    """
    info = pdfinfo.PdfInfo(resources / 'formxobject.pdf')
    pickle.dumps(info)


def test_regex():
    """regex_remove_char_tags matches the <char> samples and no others."""
    pattern = pdfinfo.regex_remove_char_tags

    # Samples the pattern has to match from the start of the string
    char_tags = (
        b'<char bbox="0 108 0 108" c="/"/>',
        b'<char bbox="0 108 0 108" c=">"/>',
        b'<char bbox="0 108 0 108" c="X"/>',
    )
    # Samples the pattern has to leave alone
    other_tags = (
        b'<span stuff="c">',
        b'<span>',
        b'</span>',
        b'</page>',
    )

    for sample in char_tags:
        assert pattern.match(sample)
    for sample in other_tags:
        assert not pattern.match(sample)


def test_vector(resources):
    """The first page of vector.pdf is flagged as having vector content."""
    info = pdfinfo.PdfInfo(resources / 'vector.pdf')
    first_page = info[0]

    assert first_page.has_vector
Update release notes, add copyrights 2015-07-28 04:36:58 -07:00			`# © 2015 James R. Barlow: github.com/jbarlow83`
Add license notice to all files Source files to GPL3 Exceptions: -tests/spoof/* to MIT -hocrtransform.py -_unicodefun.py Test resources to CC BY-SA 4.0 except when otherwise noted. Add GPL license. 2018-03-14 14:40:48 -07:00			`#`
			`# This file is part of OCRmyPDF.`
			`#`
			`# OCRmyPDF is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# OCRmyPDF is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`from ocrmypdf import pdfinfo`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`from reportlab.pdfgen.canvas import Canvas`
			`from PIL import Image`
			`from tempfile import NamedTemporaryFile`
Refactor from ImageInfo index to attribute accessing 2017-05-18 18:39:14 -07:00			`from math import isclose`
Rename pageinfo to pdfinfo 2017-05-19 15:48:23 -07:00			`from ocrmypdf.pdfinfo import Colorspace, Encoding`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`from contextlib import suppress`
			`import os`
			`import shutil`
Drop nose, all tests working reasonably again Although the real issue was that the ruffus pipeline cannot be executed twice in the same process due to its reliance on global variables. The new OO pipeline in ruffus 2.6 would be one resolution that would allow for more comprehensive testing as opposed to farming out the execution to subprocess and inspecting the results, as is currently done. 2015-07-28 00:43:22 -07:00			`import pytest`
Use img2pdf in test case because it does a better job 2015-07-30 03:35:56 -07:00			`import img2pdf`
Move pageinfo test into tests folder 2016-01-11 17:40:44 -08:00			`import sys`
Add _naive_find_text to search for text when fitz is not available 2018-03-27 13:32:38 -07:00			`import PyPDF2 as pypdf`
Main changeset for pikepdf-based refactor pdfinfo 2018-05-24 22:22:01 -07:00			`import pikepdf`
			`import pickle`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`def test_single_page_text(outdir):`
			`filename = outdir / 'text.pdf'`
			`pdf = Canvas(str(filename), pagesize=(872, 672))`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`text = pdf.beginText()`
			`text.setFont('Helvetica', 12)`
pageinfo: drop pdftotext and use PyPDF instead 2015-07-26 18:23:37 -07:00			`text.setTextOrigin(172, 372)`
			`text.textLine("Methink'st thou art a general offence and every"`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`" man should beat thee.")`
			`pdf.drawText(text)`
			`pdf.showPage()`
			`pdf.save()`

pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`info = pdfinfo.PdfInfo(filename)`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`assert len(info) == 1`
			`page = info[0]`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
Access PageInfo instance variables instead of dictionary 2017-05-18 17:12:04 -07:00			`assert page.has_text`
			`assert len(page.images) == 0`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`def test_single_page_image(outdir):`
			`filename = outdir / 'image-mono.pdf'`
Use img2pdf in test case because it does a better job 2015-07-30 03:35:56 -07:00
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`im_tmp = outdir / 'tmp.png'`
			`im = Image.new('1', (8, 8), 0)`
			`for n in range(8):`
			`im.putpixel((n, n), 1)`
			`im.save(str(im_tmp), format='PNG')`
Fix img2pdf usage in test case (to make Travis CI happy again) 2016-02-06 23:41:32 -08:00
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`imgsize = ((img2pdf.ImgSize.dpi, 8), (img2pdf.ImgSize.dpi, 8))`
			`layout_fun = img2pdf.get_layout_fun(None, imgsize, None, None, None)`
Fix img2pdf usage in test case (to make Travis CI happy again) 2016-02-06 23:41:32 -08:00
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`im_bytes = im_tmp.read_bytes()`
			`pdf_bytes = img2pdf.convert(`
			`im_bytes, producer="img2pdf", with_pdfrw=False,`
			`layout_fun=layout_fun)`
Fix remaining 3.4/3.5 regressions 2017-01-26 17:53:27 -08:00			`filename.write_bytes(pdf_bytes)`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`info = pdfinfo.PdfInfo(filename)`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`assert len(info) == 1`
			`page = info[0]`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
Access PageInfo instance variables instead of dictionary 2017-05-18 17:12:04 -07:00			`assert not page.has_text`
			`assert len(page.images) == 1`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
Access PageInfo instance variables instead of dictionary 2017-05-18 17:12:04 -07:00			`pdfimage = page.images[0]`
Refactor from ImageInfo index to attribute accessing 2017-05-18 18:39:14 -07:00			`assert pdfimage.width == 8`
Replace magic strings colorspace and encoding with Enums 2017-05-18 22:32:27 -07:00			`assert pdfimage.color == Colorspace.gray`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
			`# DPI in a 1"x1" is the image width`
Refactor from ImageInfo index to attribute accessing 2017-05-18 18:39:14 -07:00			`assert isclose(pdfimage.xres, 8)`
			`assert isclose(pdfimage.yres, 8)`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00

Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`def test_single_page_inline_image(outdir):`
			`filename = outdir / 'image-mono-inline.pdf'`
			`pdf = Canvas(str(filename), pagesize=(872, 672))`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00			`with NamedTemporaryFile() as im_tmp:`
			`im = Image.new('1', (8, 8), 0)`
			`for n in range(8):`
			`im.putpixel((n, n), 1)`
			`im.save(im_tmp.name, format='PNG')`
			`# Draw image in a 72x72 pt or 1"x1" area`
			`pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72)`
			`pdf.showPage()`
			`pdf.save()`

pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`pdf = pdfinfo.PdfInfo(filename)`
			`print(pdf)`
			`pdfimage = pdf[0].images[0]`
Refactor from ImageInfo index to attribute accessing 2017-05-18 18:39:14 -07:00			`assert isclose(pdfimage.xres, 8)`
Replace magic strings colorspace and encoding with Enums 2017-05-18 22:32:27 -07:00			`assert pdfimage.color == Colorspace.rgb # reportlab produces color image`
Refactor from ImageInfo index to attribute accessing 2017-05-18 18:39:14 -07:00			`assert pdfimage.width == 8`
Add some pageinfo test cases; found problem with inline images 2015-07-26 15:24:42 -07:00
More testing: JPEG 2015-07-27 00:25:24 -07:00
Refactor test suite to use fixtures to manage paths 2017-01-26 16:38:59 -08:00			`def test_jpeg(resources, outdir):`
			`filename = resources / 'c02-22.pdf'`
More testing: JPEG 2015-07-27 00:25:24 -07:00
pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`pdf = pdfinfo.PdfInfo(filename)`
More testing: JPEG 2015-07-27 00:25:24 -07:00
pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`pdfimage = pdf[0].images[0]`
Replace magic strings colorspace and encoding with Enums 2017-05-18 22:32:27 -07:00			`assert pdfimage.enc == Encoding.jpeg`
Refactor from ImageInfo index to attribute accessing 2017-05-18 18:39:14 -07:00			`assert isclose(pdfimage.xres, 150)`
More testing: JPEG 2015-07-27 00:25:24 -07:00
Create test case for Form XObjects 2017-02-14 12:51:15 -08:00
			`def test_form_xobject(resources):`
			`filename = resources / 'formxobject.pdf'`

pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`pdf = pdfinfo.PdfInfo(filename)`
			`pdfimage = pdf[0].images[0]`
Refactor from ImageInfo index to attribute accessing 2017-05-18 18:39:14 -07:00			`assert pdfimage.width == 50`
Fix #156 - NoneType has no ‘getObject’ for pages with no /Contents 2017-05-01 15:46:15 -07:00

			`def test_no_contents(resources):`
			`filename = resources / 'no_contents.pdf'`

pdfinfo: replace most remaining dict-style access 2017-05-19 16:17:36 -07:00			`pdf = pdfinfo.PdfInfo(filename)`
			`assert len(pdf[0].images) == 0`
—output-type=pdf now outputs /UserUnit PDFs at the correct size This currently distorts the output size because Tesseract assumes it knows the DPI better than we do. Does not work for Ghostscript, because it emerges that Ghostscript honors /UserUnit for rasterizing but not in pdfwrite (resolve/wontfix). https://bugs.ghostscript.com/show_bug.cgi?id=690781 Ghostscript’s output would need to be patched in a PDF/A safe way for this to work. Temporary route may be to block Ghostscript if /UserUnit. 2017-05-24 23:26:07 -07:00			`assert pdf[0].has_text == False`


			`def test_oversized_page(resources):`
			`pdf = pdfinfo.PdfInfo(resources / 'poster.pdf')`
			`image = pdf[0].images[0]`
lint: Remove shebangs from non-executable files 2018-02-24 12:38:58 -08:00			`assert image.width * image.xres > 200, "this is supposed to be oversized"`
Document need for pdfinfo to be pickleable 2018-05-24 22:24:13 -07:00

			`def test_pickle(resources):`
			`# For multiprocessing we must be able to pickle our information - if`
			`# this fails then we are probably storing some unpickleabe pikepdf or`
			`# other external data around`
			`filename = resources / 'formxobject.pdf'`
			`pdf = pdfinfo.PdfInfo(filename)`
			`pickle.dumps(pdf)`
pdfinfo: improve the regex 2018-07-04 00:59:32 -07:00

			`def test_regex():`
			`rx = pdfinfo.regex_remove_char_tags`

			`must_match = [`
			`b'<char bbox="0 108 0 108" c="/"/>',`
			`b'<char bbox="0 108 0 108" c=">"/>',`
			`b'<char bbox="0 108 0 108" c="X"/>',`
			`]`
			`must_not_match = [`
			`b'<span stuff="c">',`
			`b'<span>',`
			`b'</span>',`
			`b'</page>'`
			`]`

			`for s in must_match:`
			`assert rx.match(s)`
			`for s in must_not_match:`
			`assert not rx.match(s)`
pdfinfo: learn to detect vector graphic objects 2018-10-18 01:21:51 -07:00

			`def test_vector(resources):`
			`filename = resources / 'vector.pdf'`
			`pdf = pdfinfo.PdfInfo(filename)`
			`assert pdf[0].has_vector`