Fix #156 - NoneType has no ‘getObject’ for pages with no /Contents

This commit is contained in:
James R. Barlow 2017-05-01 15:46:15 -07:00
parent b9b12e2879
commit aa859a4139
5 changed files with 37 additions and 1 deletions

View File

@ -430,7 +430,7 @@ def _find_images(pdf, container, shorthand=None):
"""
if container.get('/Type') == '/Page':
if container.get('/Type') == '/Page' and '/Contents' in container:
# For a /Page the content stream is attached to the page's /Contents
page = container
contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)

View File

@ -85,6 +85,9 @@ under the terms of the license in LICENSE.rst.
* - overlay.pdf
- @maxandersen
- PDF file generated by PDFPen pro that triggered content stream parse errors
* - no_contents.pdf
- @jbarlow83
- synthetic PDF with a blank page that has no /Contents entry
Assemblies
==========

View File

@ -0,0 +1,21 @@
%PDF-1.3
%¿÷¢þ
1 0 obj
<< /Pages 2 0 R /Type /Catalog >>
endobj
2 0 obj
<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>
endobj
3 0 obj
<< /MediaBox [ 0 0 720 720 ] /Parent 2 0 R /Resources << >> /Type /Page >>
endobj
xref
0 4
0000000000 65535 f
0000000015 00000 n
0000000064 00000 n
0000000123 00000 n
trailer << /Root 1 0 R /Size 4 /ID [<52bba3c78160d0c6e851b59110e5d076><52bba3c78160d0c6e851b59110e5d076>] >>
startxref
213
%%EOF

View File

@ -870,4 +870,8 @@ def test_gs_raster_failure(spoof_no_tess_gs_raster_fail, resources, outpdf):
env=spoof_no_tess_gs_raster_fail)
print(err)
assert p.returncode == ExitCode.child_process_error
def test_no_contents(spoof_tesseract_noop, resources, outpdf):
    """Regression test for #156: process a PDF whose page has no /Contents.

    Runs the pipeline with ``--force-ocr`` on ``no_contents.pdf`` using the
    ``spoof_tesseract_noop`` environment.
    """
    input_pdf = resources / 'no_contents.pdf'
    check_ocrmypdf(
        input_pdf, outpdf, '--force-ocr', env=spoof_tesseract_noop)

View File

@ -108,3 +108,11 @@ def test_form_xobject(resources):
pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
pdfimage = pdfinfo[0]['images'][0]
assert pdfimage['width'] == 50
def test_no_contents(resources):
    """Regression test for #156: page info for a page with no /Contents.

    The first page of ``no_contents.pdf`` must report no images and no text.
    """
    filename = resources / 'no_contents.pdf'
    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
    first_page = pdfinfo[0]
    assert len(first_page['images']) == 0
    # Truthiness check instead of `== False` (PEP 8, pycodestyle E712).
    assert not first_page['has_text']