2015-07-28 04:36:58 -07:00
|
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
2018-03-14 14:40:48 -07:00
|
|
|
#
|
2020-08-05 00:44:42 -07:00
|
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
|
2018-12-30 01:28:15 -08:00
|
|
|
import pickle
|
2021-01-08 15:04:52 -08:00
|
|
|
from io import BytesIO
|
2018-12-30 01:28:15 -08:00
|
|
|
from math import isclose
|
|
|
|
|
|
|
|
import img2pdf
|
2019-12-19 15:29:56 -08:00
|
|
|
import pikepdf
|
2018-12-30 01:28:15 -08:00
|
|
|
import pytest
|
|
|
|
from PIL import Image
|
2021-01-08 15:04:52 -08:00
|
|
|
from reportlab.lib.units import inch
|
2018-12-30 01:28:15 -08:00
|
|
|
from reportlab.pdfgen.canvas import Canvas
|
|
|
|
|
|
|
|
from ocrmypdf import pdfinfo
|
2021-01-01 01:11:32 -08:00
|
|
|
from ocrmypdf.exceptions import InputFileError
|
2021-04-15 23:26:14 -07:00
|
|
|
from ocrmypdf.helpers import Resolution
|
2018-12-30 01:28:15 -08:00
|
|
|
from ocrmypdf.pdfinfo import Colorspace, Encoding
|
2021-01-01 01:11:32 -08:00
|
|
|
from ocrmypdf.pdfinfo.layout import PDFPage
|
|
|
|
|
2019-01-02 13:34:45 -08:00
|
|
|
# pylint: disable=protected-access
|
|
|
|
|
2015-07-26 15:24:42 -07:00
|
|
|
|
2017-01-26 16:38:59 -08:00
|
|
|
def test_single_page_text(outdir):
    """A generated one-page PDF holding only a text line has text and no images."""
    pdf_path = outdir / 'text.pdf'
    page_size = (8 * inch, 6 * inch)
    canvas = Canvas(str(pdf_path), pagesize=page_size)

    # Build a single line of 12 pt Helvetica and draw it on the only page.
    textobj = canvas.beginText()
    textobj.setFont('Helvetica', 12)
    textobj.setTextOrigin(1 * inch, 3 * inch)
    textobj.textLine(
        "Methink'st thou art a general offence and every"
        " man should beat thee."
    )
    canvas.drawText(textobj)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(pdf_path)
    assert len(info) == 1

    first_page = info[0]
    assert first_page.has_text
    assert len(first_page.images) == 0
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
2021-01-08 15:04:52 -08:00
|
|
|
@pytest.fixture(scope='session')
def eight_by_eight():
    """Return an 8x8 mode-'1' image filled with 0 whose main diagonal is set to 1."""
    side = 8
    image = Image.new('1', (side, side), 0)
    # Walk the (n, n) coordinates of the main diagonal.
    for xy in zip(range(side), range(side)):
        image.putpixel(xy, 1)
    return image
|
|
|
|
|
|
|
|
|
|
|
|
def test_single_page_image(eight_by_eight, outpdf):
    """An 8x8 PNG converted by img2pdf at 8 DPI yields one gray 8 DPI image."""
    # Serialize the fixture image to an in-memory PNG for img2pdf to read.
    png_buffer = BytesIO()
    eight_by_eight.save(png_buffer, format='PNG')
    png_buffer.seek(0)

    # Same DPI specification on both axes.
    dpi_spec = (img2pdf.ImgSize.dpi, 8)
    layout_fun = img2pdf.get_layout_fun(None, (dpi_spec, dpi_spec), None, None, None)

    with outpdf.open('wb') as out_stream:
        img2pdf.convert(
            png_buffer,
            producer="img2pdf",
            with_pdfrw=False,
            layout_fun=layout_fun,
            outputstream=out_stream,
        )

    # The output file is closed before it is analysed.
    info = pdfinfo.PdfInfo(outpdf)
    assert len(info) == 1

    first_page = info[0]
    assert not first_page.has_text
    assert len(first_page.images) == 1

    only_image = first_page.images[0]
    assert only_image.width == 8
    assert only_image.color == Colorspace.gray

    # DPI in a 1"x1" is the image width
    assert isclose(only_image.dpi.x, 8)
    assert isclose(only_image.dpi.y, 8)
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
2021-04-16 00:21:11 -07:00
|
|
|
def test_single_page_inline_image(eight_by_eight, outdir):
    """An 8x8 inline image drawn into a 72x72 pt box is reported as gray, 8 DPI."""
    pdf_path = outdir / 'image-mono-inline.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(8 * 72, 6 * 72))

    # Draw image in a 72x72 pt or 1"x1" area
    canvas.drawInlineImage(eight_by_eight, 0, 0, width=72, height=72)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(pdf_path)
    print(info)

    inline_image = info[0].images[0]
    assert isclose(inline_image.dpi.x, 8)
    assert inline_image.color == Colorspace.gray
    assert inline_image.width == 8
|
2015-07-26 15:24:42 -07:00
|
|
|
|
2015-07-27 00:25:24 -07:00
|
|
|
|
2020-06-09 15:27:14 -07:00
|
|
|
def test_jpeg(resources):
    """The first image of c02-22.pdf is JPEG-encoded with a horizontal DPI of 150."""
    info = pdfinfo.PdfInfo(resources / 'c02-22.pdf')

    first_image = info[0].images[0]
    assert first_image.enc == Encoding.jpeg
    assert isclose(first_image.dpi.x, 150)
|
2015-07-27 00:25:24 -07:00
|
|
|
|
2017-02-14 12:51:15 -08:00
|
|
|
|
|
|
|
def test_form_xobject(resources):
    """The first image found on page one of formxobject.pdf is 50 wide."""
    info = pdfinfo.PdfInfo(resources / 'formxobject.pdf')

    first_image = info[0].images[0]
    assert first_image.width == 50
|
2017-05-01 15:46:15 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_no_contents(resources):
    """Page one of no_contents.pdf reports neither images nor text."""
    info = pdfinfo.PdfInfo(resources / 'no_contents.pdf')

    first_page = info[0]
    assert len(first_page.images) == 0
    assert not first_page.has_text
|
2017-05-24 23:26:07 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_oversized_page(resources):
    """The first image on page one of poster.pdf must register as oversized."""
    info = pdfinfo.PdfInfo(resources / 'poster.pdf')
    first_image = info[0].images[0]

    # NOTE(review): this multiplies the image width by its DPI; a physical
    # size would normally be width / dpi -- confirm the intended quantity
    # before changing the check.
    extent = first_image.width * first_image.dpi.x
    assert extent > 200, "this is supposed to be oversized"
|
2018-05-24 22:24:13 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_pickle(resources):
    """PdfInfo for graph_ocred.pdf can be serialized with pickle.

    Multiprocessing requires that this information be picklable. A failure
    here probably means some unpicklable pikepdf object or other external
    data is being kept around.
    """
    info = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')
    pickle.dumps(info)
|
2018-07-04 00:59:32 -07:00
|
|
|
|
|
|
|
|
2018-10-18 01:21:51 -07:00
|
|
|
def test_vector(resources):
    """Page one of vector.pdf is reported as having vector content but no text."""
    info = pdfinfo.PdfInfo(resources / 'vector.pdf')

    first_page = info[0]
    assert first_page.has_vector
    assert not first_page.has_text
|
|
|
|
|
|
|
|
|
|
|
|
def test_ocr_detection(resources):
    """Page one of graph_ocred.pdf is reported as having text but no vector content."""
    info = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')

    first_page = info[0]
    assert not first_page.has_vector
    assert first_page.has_text
|
2018-11-15 16:22:53 -08:00
|
|
|
|
|
|
|
|
2018-11-15 21:54:26 -08:00
|
|
|
@pytest.mark.parametrize(
    'testfile',
    (
        'truetype_font_nomapping.pdf',
        'type3_font_nomapping.pdf',
    ),
)
def test_corrupt_font_detection(resources, testfile):
    """Detailed analysis flags page one of each listed file as having corrupt text."""
    info = pdfinfo.PdfInfo(resources / testfile, detailed_analysis=True)

    first_page = info[0]
    assert first_page.has_corrupt_text
|
2018-12-15 14:55:11 -08:00
|
|
|
|
|
|
|
|
|
|
|
def test_stack_abuse():
    """Check how content-stream interpretation reacts to unbalanced q/Q operators.

    Three content streams are fed to ``pdfinfo.info._interpret_contents``:

    * 35 ``q`` operators: the first recorded warning must mention 'overflowed';
    * one ``q`` followed by four ``Q``: the first recorded warning must
      mention 'underflowed';
    * 135 ``q`` operators: ``RuntimeError`` must be raised.

    Warnings are recorded with ``warnings.catch_warnings(record=True)``
    instead of ``pytest.warns(None)``. The latter is deprecated in pytest 7
    and rejected by pytest 8, and like it, this form records warnings without
    requiring that any be emitted.
    """
    # Local import keeps this block self-contained; the module's import
    # section does not bring ``warnings`` into scope.
    import warnings

    p = pikepdf.Pdf.new()

    stream = pikepdf.Stream(p, b'q ' * 35)
    with warnings.catch_warnings(record=True) as record:
        # 'always' prevents a previously seen warning from being suppressed.
        warnings.simplefilter('always')
        pdfinfo.info._interpret_contents(stream)
    assert 'overflowed' in str(record[0].message)

    stream = pikepdf.Stream(p, b'q Q Q Q Q')
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')
        pdfinfo.info._interpret_contents(stream)
    assert 'underflowed' in str(record[0].message)

    stream = pikepdf.Stream(p, b'q ' * 135)
    # Any warnings emitted on the way to the error are captured and ignored.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter('always')
        with pytest.raises(RuntimeError):
            pdfinfo.info._interpret_contents(stream)
|
2021-01-01 01:11:32 -08:00
|
|
|
|
|
|
|
|
|
|
|
def test_pages_issue700(monkeypatch, resources):
    """PdfInfo raises InputFileError mentioning pdfminer when no pages are yielded."""

    def yield_nothing(*args, **kwargs):
        # Stand-in for PDFPage.get_pages that produces an empty iterator.
        return iter([])

    monkeypatch.setattr(PDFPage, 'get_pages', yield_nothing)

    source = resources / 'cardinal.pdf'
    with pytest.raises(InputFileError, match="pdfminer"):
        pdfinfo.PdfInfo(
            source,
            detailed_analysis=True,
            progbar=False,
            max_workers=1,
        )
|
2021-04-15 23:26:14 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_image_scale0(resources, outpdf):
    """An XObject drawn through an all-zero matrix gives a non-finite image DPI.

    The first page of cmyk.pdf is turned into a form XObject, copied into a
    new 72x72 pt PDF and drawn with the matrix ``0 0 0 0 0 0``. The resulting
    page's first image must report a non-finite DPI and the page DPI must
    equal ``Resolution(0, 0)``.
    """
    with pikepdf.open(resources / 'cmyk.pdf') as source_pdf:
        form_xobj = source_pdf.pages[0].as_form_xobject()

        # The source stays open while its object is copied and the new file saved.
        new_pdf = pikepdf.Pdf.new()
        new_pdf.add_blank_page(page_size=(72, 72))
        target_page = new_pdf.pages[0]
        objname = target_page.add_resource(
            new_pdf.copy_foreign(form_xobj), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        print(objname)

        content = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        new_pdf.pages[0].Contents = pikepdf.Stream(new_pdf, content)
        new_pdf.save(outpdf)

    info = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1
    )
    first_page = info.pages[0]
    assert not first_page._images[0].dpi.is_finite
    assert info.pages[0].dpi == Resolution(0, 0)
|