2015-07-28 04:36:58 -07:00
|
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
2018-03-14 14:40:48 -07:00
|
|
|
#
|
|
|
|
# This file is part of OCRmyPDF.
|
|
|
|
#
|
|
|
|
# OCRmyPDF is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# OCRmyPDF is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
2015-07-26 15:24:42 -07:00
|
|
|
|
2017-05-19 15:48:23 -07:00
|
|
|
from ocrmypdf import pdfinfo
|
2015-07-26 15:24:42 -07:00
|
|
|
from reportlab.pdfgen.canvas import Canvas
|
|
|
|
from PIL import Image
|
|
|
|
from tempfile import NamedTemporaryFile
|
2017-05-18 18:39:14 -07:00
|
|
|
from math import isclose
|
2017-05-19 15:48:23 -07:00
|
|
|
from ocrmypdf.pdfinfo import Colorspace, Encoding
|
2015-07-26 15:24:42 -07:00
|
|
|
from contextlib import suppress
|
|
|
|
import os
|
|
|
|
import shutil
|
2015-07-28 00:43:22 -07:00
|
|
|
import pytest
|
2015-07-30 03:35:56 -07:00
|
|
|
import img2pdf
|
2016-01-11 17:40:44 -08:00
|
|
|
import sys
|
2018-03-27 13:32:38 -07:00
|
|
|
import PyPDF2 as pypdf
|
2018-05-24 22:22:01 -07:00
|
|
|
import pikepdf
|
|
|
|
import pickle
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
2017-01-26 16:38:59 -08:00
|
|
|
def test_single_page_text(outdir):
    """A generated page holding only one line of text has text and no images.

    Writes ``text.pdf`` into *outdir* with reportlab, then checks what
    ``pdfinfo.PdfInfo`` reports for its single page.
    """
    pdf_path = outdir / 'text.pdf'
    page_size = (8*72, 6*72)
    canvas = Canvas(str(pdf_path), pagesize=page_size)

    # One line of 12 pt Helvetica with its origin at (1*72, 3*72) points
    line = canvas.beginText()
    line.setFont('Helvetica', 12)
    line.setTextOrigin(1*72, 3*72)
    line.textLine("Methink'st thou art a general offence and every"
                  " man should beat thee.")
    canvas.drawText(line)
    canvas.showPage()
    canvas.save()

    report = pdfinfo.PdfInfo(pdf_path)

    assert len(report) == 1
    first_page = report[0]

    assert first_page.has_text
    assert len(first_page.images) == 0
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
2017-01-26 16:38:59 -08:00
|
|
|
def test_single_page_image(outdir):
    """A PDF made by img2pdf from one 8x8 mono PNG reports exactly that image.

    The PNG is laid out with an image size of 8 DPI in both directions, and
    the test then checks the page count, text flag, image count, width,
    colorspace and resolution reported by ``pdfinfo.PdfInfo``.
    """
    pdf_path = outdir / 'image-mono.pdf'
    png_path = outdir / 'tmp.png'

    # 8x8 one-bit image, all zero except for the main diagonal
    bitmap = Image.new('1', (8, 8), 0)
    for i in range(8):
        bitmap.putpixel((i, i), 1)
    bitmap.save(str(png_path), format='PNG')

    dpi_8 = (img2pdf.ImgSize.dpi, 8)
    imgsize = (dpi_8, dpi_8)
    layout_fun = img2pdf.get_layout_fun(None, imgsize, None, None, None)

    png_data = png_path.read_bytes()
    pdf_data = img2pdf.convert(
        png_data,
        producer="img2pdf",
        with_pdfrw=False,
        layout_fun=layout_fun,
    )
    pdf_path.write_bytes(pdf_data)

    report = pdfinfo.PdfInfo(pdf_path)

    assert len(report) == 1
    first_page = report[0]

    assert not first_page.has_text
    assert len(first_page.images) == 1

    found = first_page.images[0]
    assert found.width == 8
    assert found.color == Colorspace.gray

    # DPI in a 1"x1" is the image width
    assert isclose(found.xres, 8)
    assert isclose(found.yres, 8)
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
2017-01-26 16:38:59 -08:00
|
|
|
def test_single_page_inline_image(outdir):
    """An inline image drawn by reportlab is found and measured by PdfInfo.

    An 8x8 monochrome PNG is drawn as an inline image into a 72x72 pt
    (1"x1") area of a generated PDF; the test then checks the resolution,
    colorspace and width that ``pdfinfo.PdfInfo`` reports for it.
    """
    filename = outdir / 'image-mono-inline.pdf'

    # The scratch PNG lives in outdir rather than in a NamedTemporaryFile:
    # a still-open NamedTemporaryFile cannot be reopened by name on Windows,
    # and both PIL and reportlab access the file by name.
    im_path = outdir / 'tmp-inline.png'

    im = Image.new('1', (8, 8), 0)
    for n in range(8):
        im.putpixel((n, n), 1)
    im.save(str(im_path), format='PNG')

    canvas = Canvas(str(filename), pagesize=(8*72, 6*72))
    # Draw image in a 72x72 pt or 1"x1" area
    canvas.drawInlineImage(str(im_path), 0, 0, width=72, height=72)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(filename)
    print(info)  # shown by pytest only when the test fails
    pdfimage = info[0].images[0]
    assert isclose(pdfimage.xres, 8)
    assert pdfimage.color == Colorspace.rgb  # reportlab produces color image
    assert pdfimage.width == 8
|
2015-07-26 15:24:42 -07:00
|
|
|
|
2015-07-27 00:25:24 -07:00
|
|
|
|
2017-01-26 16:38:59 -08:00
|
|
|
def test_jpeg(resources, outdir):
    """The first image of ``c02-22.pdf`` is reported as JPEG with xres 150.

    *outdir* is requested as a fixture but is not used by the test body.
    """
    report = pdfinfo.PdfInfo(resources / 'c02-22.pdf')

    first_image = report[0].images[0]
    assert first_image.enc == Encoding.jpeg
    assert isclose(first_image.xres, 150)
|
2015-07-27 00:25:24 -07:00
|
|
|
|
2017-02-14 12:51:15 -08:00
|
|
|
|
|
|
|
def test_form_xobject(resources):
    """The first image reported for ``formxobject.pdf`` has a width of 50."""
    report = pdfinfo.PdfInfo(resources / 'formxobject.pdf')

    first_image = report[0].images[0]
    assert first_image.width == 50
|
2017-05-01 15:46:15 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_no_contents(resources):
    """The first page of ``no_contents.pdf`` reports no images and no text."""
    filename = resources / 'no_contents.pdf'

    pdf = pdfinfo.PdfInfo(filename)
    assert len(pdf[0].images) == 0
    # Test truthiness instead of comparing with ``== False``, matching the
    # ``assert not page.has_text`` form used elsewhere in this module.
    assert not pdf[0].has_text
|
|
|
|
|
|
|
|
|
|
|
|
def test_oversized_page(resources):
    """The first image of ``poster.pdf`` must exceed the oversize threshold."""
    report = pdfinfo.PdfInfo(resources / 'poster.pdf')
    first_image = report[0].images[0]

    # NOTE(review): the check multiplies width by xres; confirm this is the
    # intended measure of "oversized" for this fixture.
    measure = first_image.width * first_image.xres
    assert measure > 200, "this is supposed to be oversized"
|
2018-05-24 22:24:13 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_pickle(resources):
    """A ``PdfInfo`` object can be serialized with ``pickle.dumps``."""
    # Multiprocessing requires that our information can be pickled. If this
    # raises, we are probably keeping some unpickleable pikepdf object or
    # other external data around.
    report = pdfinfo.PdfInfo(resources / 'formxobject.pdf')
    pickle.dumps(report)
|
2018-07-04 00:59:32 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_regex():
    """``regex_remove_char_tags`` matches ``<char .../>`` tags and no others.

    The samples cover ``c`` attribute values that themselves contain markup
    characters (``/`` and ``>``), plus span and page tags that the pattern
    must leave alone.
    """
    pattern = pdfinfo.regex_remove_char_tags

    char_tags = (
        b'<char bbox="0 108 0 108" c="/"/>',
        b'<char bbox="0 108 0 108" c=">"/>',
        b'<char bbox="0 108 0 108" c="X"/>',
    )
    other_tags = (
        b'<span stuff="c">',
        b'<span>',
        b'</span>',
        b'</page>',
    )

    for tag in char_tags:
        assert pattern.match(tag)
    for tag in other_tags:
        assert not pattern.match(tag)
|
2018-10-18 01:21:51 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_vector(resources):
    """The first page of ``vector.pdf`` is reported as having vector content."""
    report = pdfinfo.PdfInfo(resources / 'vector.pdf')
    first_page = report[0]
    assert first_page.has_vector
|