mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-08-16 04:31:45 +00:00
Fix table of contents not preserved in PDF/A
This commit is contained in:
parent
bc56b8e058
commit
45dbff6401
@ -29,7 +29,7 @@ from enum import Enum
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import PyPDF2 as pypdf
|
import PyPDF2 as pypdf
|
||||||
from fitz import Document
|
import fitz
|
||||||
|
|
||||||
from .helpers import universal_open
|
from .helpers import universal_open
|
||||||
|
|
||||||
@ -555,7 +555,7 @@ def borrow_stream(stream):
|
|||||||
|
|
||||||
|
|
||||||
def _page_has_text(infile, pageno):
|
def _page_has_text(infile, pageno):
|
||||||
doc = Document(infile)
|
doc = fitz.Document(infile)
|
||||||
text = doc.getPageText(pageno)
|
text = doc.getPageText(pageno)
|
||||||
if text.strip() != '':
|
if text.strip() != '':
|
||||||
return True
|
return True
|
||||||
@ -692,6 +692,7 @@ class PdfInfo:
|
|||||||
def __init__(self, infile):
|
def __init__(self, infile):
|
||||||
self._infile = infile
|
self._infile = infile
|
||||||
self._pages = _pdf_get_all_pageinfo(infile)
|
self._pages = _pdf_get_all_pageinfo(infile)
|
||||||
|
self._toc = fitz.Document(infile).getToC()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pages(self):
|
def pages(self):
|
||||||
@ -712,6 +713,10 @@ class PdfInfo:
|
|||||||
raise NotImplementedError("can't get filename from stream")
|
raise NotImplementedError("can't get filename from stream")
|
||||||
return self._infile
|
return self._infile
|
||||||
|
|
||||||
|
@property
def table_of_contents(self):
    """Return the table of contents captured for the input file.

    The value is whatever ``fitz.Document(infile).getToC()`` produced when
    this object was constructed; it is stored once and returned as-is.
    """
    return self._toc
|
||||||
|
|
||||||
def __getitem__(self, item):
|
def __getitem__(self, item):
|
||||||
return self._pages[item]
|
return self._pages[item]
|
||||||
|
|
||||||
|
@ -963,11 +963,14 @@ def merge_pages_ghostscript(
|
|||||||
ghostscript.generate_pdfa(
|
ghostscript.generate_pdfa(
|
||||||
pdf_version=input_pdfinfo.min_version,
|
pdf_version=input_pdfinfo.min_version,
|
||||||
pdf_pages=pdf_pages,
|
pdf_pages=pdf_pages,
|
||||||
output_file=output_file,
|
output_file=output_file + '_toc.pdf',
|
||||||
compression=options.pdfa_image_compression,
|
compression=options.pdfa_image_compression,
|
||||||
log=log,
|
log=log,
|
||||||
threads=options.jobs or 1,
|
threads=options.jobs or 1,
|
||||||
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
|
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
|
||||||
|
doc = fitz.Document(output_file + '_toc.pdf')
|
||||||
|
doc.setToC(input_pdfinfo.table_of_contents)
|
||||||
|
doc.save(output_file)
|
||||||
|
|
||||||
|
|
||||||
def merge_pages_qpdf(
|
def merge_pages_qpdf(
|
||||||
|
@ -128,6 +128,7 @@ Assemblies
|
|||||||
|
|
||||||
These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files.
|
These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files.
|
||||||
|
|
||||||
|
- baiona_gray.png (from baiona.png)
|
||||||
- cardinal.pdf (four cardinal directions, baked-in rotated copies of LinnSequencer.jpg)
|
- cardinal.pdf (four cardinal directions, baked-in rotated copies of LinnSequencer.jpg)
|
||||||
- ccitt.pdf (LinnSequencer.jpg, converted to CCITT encoding)
|
- ccitt.pdf (LinnSequencer.jpg, converted to CCITT encoding)
|
||||||
- encrypted_algo4.pdf (congress.jpg, encrypted with algorithm 4 - not supported by PyPDF2)
|
- encrypted_algo4.pdf (congress.jpg, encrypted with algorithm 4 - not supported by PyPDF2)
|
||||||
@ -135,12 +136,11 @@ These test resources are assemblies or derivatives from other previously mention
|
|||||||
- jbig2.pdf (congress.jpg, converted to JBIG2 encoding)
|
- jbig2.pdf (congress.jpg, converted to JBIG2 encoding)
|
||||||
- multipage.pdf (from several other files)
|
- multipage.pdf (from several other files)
|
||||||
- palette.pdf (congress.jpg, converted to a 256-color palette)
|
- palette.pdf (congress.jpg, converted to a 256-color palette)
|
||||||
- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg)
|
|
||||||
- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix)
|
|
||||||
- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password")
|
|
||||||
- baiona_gray.png (from baiona.png)
|
|
||||||
- poster.pdf (from LinnSequencer.jpg)
|
- poster.pdf (from LinnSequencer.jpg)
|
||||||
|
- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg)
|
||||||
|
- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password")
|
||||||
|
- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix)
|
||||||
|
- toc.pdf (from formxobject.pdf, trivial.pdf)
|
||||||
|
|
||||||
|
|
||||||
.. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg
|
.. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg
|
||||||
|
BIN
tests/resources/toc.pdf
Normal file
BIN
tests/resources/toc.pdf
Normal file
Binary file not shown.
@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import PyPDF2 as pypdf
|
import PyPDF2 as pypdf
|
||||||
|
import fitz
|
||||||
|
|
||||||
from ocrmypdf.pdfa import file_claims_pdfa
|
from ocrmypdf.pdfa import file_claims_pdfa
|
||||||
from ocrmypdf.exceptions import ExitCode
|
from ocrmypdf.exceptions import ExitCode
|
||||||
@ -94,3 +95,22 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
|
|||||||
env=spoof_tesseract_noop)
|
env=spoof_tesseract_noop)
|
||||||
|
|
||||||
assert p.returncode == ExitCode.bad_args, err
|
assert p.returncode == ExitCode.bad_args, err
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
                             resources, outpdf):
    """Check that the table of contents survives an ocrmypdf run.

    For every combination of OCR option and output type, ``toc.pdf`` is
    processed and the table of contents reported by fitz for the output
    is compared with the one reported for the input.
    """
    input_file = resources / 'toc.pdf'
    before_toc = fitz.Document(str(input_file)).getToC()

    # Without this guard the final comparison would pass vacuously
    # ([] == []) if the fixture ever lost its bookmarks.
    assert before_toc, "toc.pdf fixture must contain a table of contents"

    check_ocrmypdf(
        input_file, outpdf,
        ocr_option,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    after_toc = fitz.Document(str(outpdf)).getToC()

    # pytest's assertion rewriting reports both operands on failure, so no
    # explicit printing of the two lists is needed.
    assert before_toc == after_toc
|
||||||
|
Loading…
x
Reference in New Issue
Block a user