diff --git a/src/ocrmypdf/pdfinfo.py b/src/ocrmypdf/pdfinfo.py index 08931d2a..8f5ff932 100644 --- a/src/ocrmypdf/pdfinfo.py +++ b/src/ocrmypdf/pdfinfo.py @@ -29,7 +29,7 @@ from enum import Enum from contextlib import contextmanager import PyPDF2 as pypdf -from fitz import Document +import fitz from .helpers import universal_open @@ -555,7 +555,7 @@ def borrow_stream(stream): def _page_has_text(infile, pageno): - doc = Document(infile) + doc = fitz.Document(infile) text = doc.getPageText(pageno) if text.strip() != '': return True @@ -692,6 +692,7 @@ class PdfInfo: def __init__(self, infile): self._infile = infile self._pages = _pdf_get_all_pageinfo(infile) + self._toc = fitz.Document(infile).getToC() @property def pages(self): @@ -712,6 +713,10 @@ class PdfInfo: raise NotImplementedError("can't get filename from stream") return self._infile + @property + def table_of_contents(self): + return self._toc + def __getitem__(self, item): return self._pages[item] diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py index 32b742a9..9ebb0b3a 100644 --- a/src/ocrmypdf/pipeline.py +++ b/src/ocrmypdf/pipeline.py @@ -963,11 +963,14 @@ def merge_pages_ghostscript( ghostscript.generate_pdfa( pdf_version=input_pdfinfo.min_version, pdf_pages=pdf_pages, - output_file=output_file, + output_file=output_file + '_toc.pdf', compression=options.pdfa_image_compression, log=log, threads=options.jobs or 1, pdfa_part=('1' if options.output_type == 'pdfa-1' else '2')) + doc = fitz.Document(output_file + '_toc.pdf') + doc.setToC(input_pdfinfo.table_of_contents) + doc.save(output_file) def merge_pages_qpdf( diff --git a/tests/resources/README.rst b/tests/resources/README.rst index c3956acd..769f10d3 100644 --- a/tests/resources/README.rst +++ b/tests/resources/README.rst @@ -128,6 +128,7 @@ Assemblies These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files. +- baiona_gray.png (from baiona.png) - cardinal.pdf (four cardinal directions, baked-in rotated copies of LinnSequencer.jpg) - ccitt.pdf (LinnSequencer.jpg, converted to CCITT encoding) - encrypted_algo4.pdf (congress.jpg, encrypted with algorithm 4 - not supported by PyPDF2) @@ -135,12 +136,11 @@ These test resources are assemblies or derivatives from other previously mention - jbig2.pdf (congress.jpg, converted to JBIG2 encoding) - multipage.pdf (from several other files) - palette.pdf (congress.jpg, converted to a 256-color palette) -- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg) -- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix) -- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password") -- baiona_gray.png (from baiona.png) - poster.pdf (from LinnSequencer.jpg) - +- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg) +- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password") +- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix) +- toc.pdf (from formxobject.pdf, trivial.pdf) .. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg diff --git a/tests/resources/toc.pdf b/tests/resources/toc.pdf new file mode 100644 index 00000000..6f710c78 Binary files /dev/null and b/tests/resources/toc.pdf differ diff --git a/tests/test_metadata.py b/tests/test_metadata.py index fdb301c3..02b632b7 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -18,6 +18,7 @@ import pytest import PyPDF2 as pypdf +import fitz from ocrmypdf.pdfa import file_claims_pdfa from ocrmypdf.exceptions import ExitCode @@ -93,4 +94,23 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf): '--output-type', 'pdfa', env=spoof_tesseract_noop) - assert p.returncode == ExitCode.bad_args, err \ No newline at end of file + assert p.returncode == ExitCode.bad_args, err + + +@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr']) +@pytest.mark.parametrize('output_type', ['pdf', 'pdfa']) +def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option, + resources, outpdf): + input_file = resources / 'toc.pdf' + before_toc = fitz.Document(str(input_file)).getToC() + + check_ocrmypdf( + input_file, outpdf, + ocr_option, + '--output-type', output_type, + env=spoof_tesseract_noop) + + after_toc = fitz.Document(str(outpdf)).getToC() + print(before_toc) + print(after_toc) + assert before_toc == after_toc