Fix table of contents not preserved in PDF/A

This commit is contained in:
James R. Barlow 2018-03-26 02:23:19 -07:00
parent bc56b8e058
commit 45dbff6401
5 changed files with 37 additions and 9 deletions

View File

@ -29,7 +29,7 @@ from enum import Enum
from contextlib import contextmanager from contextlib import contextmanager
import PyPDF2 as pypdf import PyPDF2 as pypdf
from fitz import Document import fitz
from .helpers import universal_open from .helpers import universal_open
@ -555,7 +555,7 @@ def borrow_stream(stream):
def _page_has_text(infile, pageno): def _page_has_text(infile, pageno):
doc = Document(infile) doc = fitz.Document(infile)
text = doc.getPageText(pageno) text = doc.getPageText(pageno)
if text.strip() != '': if text.strip() != '':
return True return True
@ -692,6 +692,7 @@ class PdfInfo:
def __init__(self, infile): def __init__(self, infile):
self._infile = infile self._infile = infile
self._pages = _pdf_get_all_pageinfo(infile) self._pages = _pdf_get_all_pageinfo(infile)
self._toc = fitz.Document(infile).getToC()
@property @property
def pages(self): def pages(self):
@ -712,6 +713,10 @@ class PdfInfo:
raise NotImplementedError("can't get filename from stream") raise NotImplementedError("can't get filename from stream")
return self._infile return self._infile
# Read-only accessor for the table of contents that __init__ stored in
# self._toc (the value returned by fitz.Document(infile).getToC()).
@property
def table_of_contents(self):
return self._toc
def __getitem__(self, item): def __getitem__(self, item):
return self._pages[item] return self._pages[item]

View File

@ -963,11 +963,14 @@ def merge_pages_ghostscript(
ghostscript.generate_pdfa( ghostscript.generate_pdfa(
pdf_version=input_pdfinfo.min_version, pdf_version=input_pdfinfo.min_version,
pdf_pages=pdf_pages, pdf_pages=pdf_pages,
output_file=output_file, output_file=output_file + '_toc.pdf',
compression=options.pdfa_image_compression, compression=options.pdfa_image_compression,
log=log, log=log,
threads=options.jobs or 1, threads=options.jobs or 1,
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2')) pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
doc = fitz.Document(output_file + '_toc.pdf')
doc.setToC(input_pdfinfo.table_of_contents)
doc.save(output_file)
def merge_pages_qpdf( def merge_pages_qpdf(

View File

@ -128,6 +128,7 @@ Assemblies
These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files. These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files.
- baiona_gray.png (from baiona.png)
- cardinal.pdf (four cardinal directions, baked-in rotated copies of LinnSequencer.jpg) - cardinal.pdf (four cardinal directions, baked-in rotated copies of LinnSequencer.jpg)
- ccitt.pdf (LinnSequencer.jpg, converted to CCITT encoding) - ccitt.pdf (LinnSequencer.jpg, converted to CCITT encoding)
- encrypted_algo4.pdf (congress.jpg, encrypted with algorithm 4 - not supported by PyPDF2) - encrypted_algo4.pdf (congress.jpg, encrypted with algorithm 4 - not supported by PyPDF2)
@ -135,12 +136,11 @@ These test resources are assemblies or derivatives from other previously mention
- jbig2.pdf (congress.jpg, converted to JBIG2 encoding) - jbig2.pdf (congress.jpg, converted to JBIG2 encoding)
- multipage.pdf (from several other files) - multipage.pdf (from several other files)
- palette.pdf (congress.jpg, converted to a 256-color palette) - palette.pdf (congress.jpg, converted to a 256-color palette)
- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg)
- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix)
- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password")
- baiona_gray.png (from baiona.png)
- poster.pdf (from LinnSequencer.jpg) - poster.pdf (from LinnSequencer.jpg)
- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg)
- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password")
- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix)
- toc.pdf (from formxobject.pdf, trivial.pdf)
.. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg .. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg

BIN
tests/resources/toc.pdf Normal file

Binary file not shown.

View File

@ -18,6 +18,7 @@
import pytest import pytest
import PyPDF2 as pypdf import PyPDF2 as pypdf
import fitz
from ocrmypdf.pdfa import file_claims_pdfa from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.exceptions import ExitCode from ocrmypdf.exceptions import ExitCode
@ -94,3 +95,22 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
env=spoof_tesseract_noop) env=spoof_tesseract_noop)
assert p.returncode == ExitCode.bad_args, err assert p.returncode == ExitCode.bad_args, err
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
                             resources, outpdf):
    """Check that the table of contents survives processing.

    For every combination of OCR option and output type, the value
    returned by ``getToC()`` for the output file must equal the value
    returned for the ``toc.pdf`` input resource.
    """
    source_pdf = resources / 'toc.pdf'
    # Capture the reference table of contents before the tool runs.
    expected_toc = fitz.Document(str(source_pdf)).getToC()

    cli_args = [ocr_option, '--output-type', output_type]
    check_ocrmypdf(source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop)

    actual_toc = fitz.Document(str(outpdf)).getToC()
    # Printed so that pytest shows both values when the assertion fails.
    print(expected_toc)
    print(actual_toc)
    assert expected_toc == actual_toc