Fix #156 - NoneType has no ‘getObject’ for pages with no /Contents

2025-12-29 16:10:06 +00:00 · 2017-05-01 15:46:15 -07:00 · 2017-05-01 15:46:15 -07:00 · aa859a4139
commit aa859a4139
parent b9b12e2879
5 changed files with 37 additions and 1 deletions
--- a/ocrmypdf/pageinfo.py
+++ b/ocrmypdf/pageinfo.py
@@ -430,7 +430,7 @@ def _find_images(pdf, container, shorthand=None):

    """

-    if container.get('/Type') == '/Page':
+    if container.get('/Type') == '/Page' and '/Contents' in container:
        # For a /Page the content stream is attached to the page's /Contents
        page = container
        contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
--- a/tests/resources/README.rst
+++ b/tests/resources/README.rst
@@ -85,6 +85,9 @@ under the terms of the license in LICENSE.rst.
    *   - overlay.pdf
        - @maxandersen
        - PDF file generated by PDFPen pro that triggered content stream parse errors
+    *   - no_contents.pdf
+        - @jbarlow83
+        - synthetic PDF with a blank page that has no /Contents entry

 Assemblies
 ==========
--- a/tests/resources/no_contents.pdf
+++ b/tests/resources/no_contents.pdf
@@ -0,0 +1,21 @@
+%PDF-1.3
+%¿÷¢þ
+1 0 obj
+<< /Pages 2 0 R /Type /Catalog >>
+endobj
+2 0 obj
+<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>
+endobj
+3 0 obj
+<< /MediaBox [ 0 0 720 720 ] /Parent 2 0 R /Resources << >> /Type /Page >>
+endobj
+xref
+0 4
+0000000000 65535 f 
+0000000015 00000 n 
+0000000064 00000 n 
+0000000123 00000 n 
+trailer << /Root 1 0 R /Size 4 /ID [<52bba3c78160d0c6e851b59110e5d076><52bba3c78160d0c6e851b59110e5d076>] >>
+startxref
+213
+%%EOF
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -870,4 +870,8 @@ def test_gs_raster_failure(spoof_no_tess_gs_raster_fail, resources, outpdf):
        env=spoof_no_tess_gs_raster_fail)
    print(err)
    assert p.returncode == ExitCode.child_process_error
+
+
+def test_no_contents(spoof_tesseract_noop, resources, outpdf):
+    check_ocrmypdf(resources / 'no_contents.pdf', outpdf, '--force-ocr',
                   env=spoof_tesseract_noop)
--- a/tests/test_pageinfo.py
+++ b/tests/test_pageinfo.py
@@ -108,3 +108,11 @@ def test_form_xobject(resources):
    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
    pdfimage = pdfinfo[0]['images'][0]
    assert pdfimage['width'] == 50
+
+
+def test_no_contents(resources):
+    filename = resources / 'no_contents.pdf'
+
+    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
+    assert len(pdfinfo[0]['images']) == 0
+    assert pdfinfo[0]['has_text'] == False