Fix table of contents not preserved in PDF/A

This commit is contained in:
James R. Barlow 2018-03-26 02:23:19 -07:00
parent bc56b8e058
commit 45dbff6401
5 changed files with 37 additions and 9 deletions

View File

@ -29,7 +29,7 @@ from enum import Enum
from contextlib import contextmanager
import PyPDF2 as pypdf
from fitz import Document
import fitz
from .helpers import universal_open
@ -555,7 +555,7 @@ def borrow_stream(stream):
def _page_has_text(infile, pageno):
doc = Document(infile)
doc = fitz.Document(infile)
text = doc.getPageText(pageno)
if text.strip() != '':
return True
@ -692,6 +692,7 @@ class PdfInfo:
def __init__(self, infile):
    """Gather page information and the table of contents for *infile*.

    Keeps *infile* as given, the result of
    ``_pdf_get_all_pageinfo(infile)``, and the value returned by
    ``getToC()`` on a ``fitz.Document`` opened from *infile*.
    """
    self._infile = infile
    self._pages = _pdf_get_all_pageinfo(infile)
    document = fitz.Document(infile)
    self._toc = document.getToC()
@property
def pages(self):
@ -712,6 +713,10 @@ class PdfInfo:
raise NotImplementedError("can't get filename from stream")
return self._infile
# Read-only accessor for the table of contents captured in __init__
# (the value returned by fitz.Document(infile).getToC()).
@property
def table_of_contents(self):
return self._toc
# Subscript access: passes `item` straight through to self._pages[item].
def __getitem__(self, item):
return self._pages[item]

View File

@ -963,11 +963,14 @@ def merge_pages_ghostscript(
ghostscript.generate_pdfa(
pdf_version=input_pdfinfo.min_version,
pdf_pages=pdf_pages,
output_file=output_file,
output_file=output_file + '_toc.pdf',
compression=options.pdfa_image_compression,
log=log,
threads=options.jobs or 1,
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
doc = fitz.Document(output_file + '_toc.pdf')
doc.setToC(input_pdfinfo.table_of_contents)
doc.save(output_file)
def merge_pages_qpdf(

View File

@ -128,6 +128,7 @@ Assemblies
These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files.
- baiona_gray.png (from baiona.png)
- cardinal.pdf (four cardinal directions, baked-in rotated copies of LinnSequencer.jpg)
- ccitt.pdf (LinnSequencer.jpg, converted to CCITT encoding)
- encrypted_algo4.pdf (congress.jpg, encrypted with algorithm 4 - not supported by PyPDF2)
@ -135,12 +136,11 @@ These test resources are assemblies or derivatives from other previously mention
- jbig2.pdf (congress.jpg, converted to JBIG2 encoding)
- multipage.pdf (from several other files)
- palette.pdf (congress.jpg, converted to a 256-color palette)
- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg)
- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix)
- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password")
- baiona_gray.png (from baiona.png)
- poster.pdf (from LinnSequencer.jpg)
- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg)
- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password")
- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix)
- toc.pdf (from formxobject.pdf, trivial.pdf)
.. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg

BIN
tests/resources/toc.pdf Normal file

Binary file not shown.

View File

@ -18,6 +18,7 @@
import pytest
import PyPDF2 as pypdf
import fitz
from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.exceptions import ExitCode
@ -94,3 +95,22 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
env=spoof_tesseract_noop)
assert p.returncode == ExitCode.bad_args, err
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
                             resources, outpdf):
    """Check that the table of contents of toc.pdf survives processing.

    The table of contents is read with PyMuPDF before and after running
    ocrmypdf, for every combination of the parametrized OCR option and
    output type, and the two values must compare equal.
    """
    source_pdf = resources / 'toc.pdf'
    toc_before = fitz.Document(str(source_pdf)).getToC()

    cli_args = [ocr_option, '--output-type', output_type]
    check_ocrmypdf(
        source_pdf, outpdf,
        *cli_args,
        env=spoof_tesseract_noop)

    toc_after = fitz.Document(str(outpdf)).getToC()
    # Printed so that a failing run shows both outlines in the captured output.
    print(toc_before)
    print(toc_after)
    assert toc_before == toc_after