2015-07-26 15:24:42 -07:00
|
|
|
#!/usr/bin/env python3
|
2015-07-28 04:36:58 -07:00
|
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
from ocrmypdf import pageinfo
|
|
|
|
from reportlab.pdfgen.canvas import Canvas
|
|
|
|
from PIL import Image
|
|
|
|
from tempfile import NamedTemporaryFile
|
|
|
|
from contextlib import suppress
|
|
|
|
import os
|
|
|
|
import shutil
|
2015-07-28 00:43:22 -07:00
|
|
|
import pytest
|
2015-07-30 03:35:56 -07:00
|
|
|
import img2pdf
|
2015-09-10 07:01:14 -07:00
|
|
|
import pytest
|
2016-01-11 17:40:44 -08:00
|
|
|
import sys
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
2016-01-11 17:40:44 -08:00
|
|
|
# Abort early on interpreters older than the advertised minimum.
# The whole version tuple is compared (not just the major number) so that
# Python 3.0-3.3 is rejected too: this module imports contextlib.suppress,
# which only exists from Python 3.4 onwards.
if sys.version_info < (3, 4):
    print("Requires Python 3.4+")
    sys.exit(1)
|
|
|
|
|
|
|
|
# Directory layout, all derived from the location of this test module.
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
SPOOF_PATH = os.path.join(TESTS_ROOT, 'spoof')
OCRMYPDF = os.path.join(PROJECT_ROOT, 'OCRmyPDF.sh')
TEST_RESOURCES = os.path.join(PROJECT_ROOT, 'tests', 'resources')

# Where generated PDFs are written. The OCRMYPDF_TEST_OUTPUT environment
# variable, when set, takes precedence over the in-tree default.
TEST_OUTPUT = os.environ.get(
    'OCRMYPDF_TEST_OUTPUT',
    os.path.join(PROJECT_ROOT, 'tests', 'output', 'pageinfo'),
)
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
|
|
|
def setup_module():
    """Give the tests in this module an empty TEST_OUTPUT directory.

    Any existing TEST_OUTPUT tree is removed and the directory is then
    created again.
    """
    # Throw away output from an earlier run; a missing directory is fine.
    try:
        shutil.rmtree(TEST_OUTPUT)
    except FileNotFoundError:
        pass

    # Recreate the directory, tolerating the case where it already exists.
    try:
        os.makedirs(TEST_OUTPUT)
    except FileExistsError:
        pass
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
2016-01-11 17:40:44 -08:00
|
|
|
def _make_input(input_basename):
    """Return the path of ``input_basename`` inside TEST_RESOURCES."""
    resource_path = os.path.join(TEST_RESOURCES, input_basename)
    return resource_path
|
|
|
|
|
|
|
|
|
|
|
|
def _make_output(output_basename):
    """Return the path of ``output_basename`` inside TEST_OUTPUT."""
    output_path = os.path.join(TEST_OUTPUT, output_basename)
    return output_path
|
|
|
|
|
|
|
|
|
2015-07-26 15:24:42 -07:00
|
|
|
def test_single_page_text():
    """A one-page PDF containing only a line of text has text and no images."""
    out_path = os.path.join(TEST_OUTPUT, 'text.pdf')

    # Render one line of Helvetica text onto a (8*72) x (6*72) point page.
    canvas = Canvas(out_path, pagesize=(8*72, 6*72))
    text_obj = canvas.beginText()
    text_obj.setFont('Helvetica', 12)
    text_obj.setTextOrigin(1*72, 3*72)
    text_obj.textLine("Methink'st thou art a general offence and every"
                      " man should beat thee.")
    canvas.drawText(text_obj)
    canvas.showPage()
    canvas.save()

    pages = pageinfo.pdf_get_all_pageinfo(out_path)
    assert len(pages) == 1

    only_page = pages[0]
    assert only_page['has_text']
    assert len(only_page['images']) == 0
|
|
|
|
|
|
|
|
|
|
|
|
def test_single_page_image():
    """A PDF made by img2pdf from an 8x8 mono PNG reports one 8-DPI image."""
    out_path = os.path.join(TEST_OUTPUT, 'image-mono.pdf')

    with NamedTemporaryFile(mode='wb+', suffix='.png') as png_tmp:
        # 8x8 1-bit image, all zero except for the main diagonal.
        bitmap = Image.new('1', (8, 8), 0)
        for i in range(8):
            bitmap.putpixel((i, i), 1)
        bitmap.save(png_tmp.name, format='PNG')

        dpi_spec = (img2pdf.ImgSize.dpi, 8)
        imgsize = (dpi_spec, dpi_spec)
        layout_fun = img2pdf.get_layout_fun(None, imgsize, None, None, None)

        # The PNG was written through the file name, so rewind the open
        # handle before reading the bytes back.
        png_tmp.seek(0)
        png_bytes = png_tmp.read()
        pdf_bytes = img2pdf.convert(
            png_bytes,
            producer="img2pdf",
            with_pdfrw=False,
            layout_fun=layout_fun)

        with open(out_path, 'wb') as pdf_file:
            pdf_file.write(pdf_bytes)

    pages = pageinfo.pdf_get_all_pageinfo(out_path)
    assert len(pages) == 1

    only_page = pages[0]
    assert not only_page['has_text']
    assert len(only_page['images']) == 1

    image_info = only_page['images'][0]
    assert image_info['width'] == 8
    assert image_info['color'] == 'gray'

    # While unexpected, this is correct
    # PDF spec says /FlateDecode image must have /BitsPerComponent 8
    # So mono images get upgraded to 8-bit
    assert image_info['bpc'] == 8

    # DPI in a 1"x1" is the image width
    assert abs(image_info['dpi_w'] - 8) < 1e-5
    assert abs(image_info['dpi_h'] - 8) < 1e-5
|
2015-07-26 15:24:42 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_single_page_inline_image():
    """An 8x8 inline image drawn into a 72x72 pt area is reported at 8 DPI."""
    filename = os.path.join(TEST_OUTPUT, 'image-mono-inline.pdf')
    pdf = Canvas(filename, pagesize=(8*72, 6*72))
    with NamedTemporaryFile() as im_tmp:
        # 8x8 1-bit image, all zero except for the main diagonal.
        im = Image.new('1', (8, 8), 0)
        for n in range(8):
            im.putpixel((n, n), 1)
        im.save(im_tmp.name, format='PNG')

        # Draw image in a 72x72 pt or 1"x1" area
        pdf.drawInlineImage(im_tmp.name, 0, 0, width=72, height=72)
        pdf.showPage()
        pdf.save()

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)
    # Kept as a diagnostic: shows the parsed page info in the test output.
    print(pdfinfo)
    pdfimage = pdfinfo[0]['images'][0]

    # Compare the absolute difference. Without abs() any DPI below 8 would
    # make the difference negative and satisfy the assertion.
    assert abs(pdfimage['dpi_w'] - 8) < 1e-5
|
2015-07-26 15:24:42 -07:00
|
|
|
|
2015-07-27 00:25:24 -07:00
|
|
|
|
|
|
|
def test_jpeg():
    """The first image of the c02-22.pdf resource is a JPEG at 150 DPI."""
    filename = _make_input('c02-22.pdf')

    pdfinfo = pageinfo.pdf_get_all_pageinfo(filename)

    pdfimage = pdfinfo[0]['images'][0]
    assert pdfimage['enc'] == 'jpeg'

    # Compare the absolute difference. Without abs() any DPI below 150 would
    # make the difference negative and satisfy the assertion.
    assert abs(pdfimage['dpi_w'] - 150) < 1e-5
|
2015-07-27 00:25:24 -07:00
|
|
|
|