mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-08-19 14:12:38 +00:00
Add code to repair ToC with pikepdf
This commit is contained in:
parent
5e20d1d554
commit
2b5f23a2d1
@ -798,6 +798,51 @@ def _find_font(text, pdf_base):
|
|||||||
return font, font_key
|
return font, font_key
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_toc(pdf_base, pageref_remap, log):
|
||||||
|
visited = set()
|
||||||
|
queue = set()
|
||||||
|
link_keys = ('/Parent', '/First', '/Last', '/Prev', '/Next')
|
||||||
|
|
||||||
|
if not '/Outlines' in pdf_base.root:
|
||||||
|
return
|
||||||
|
if not pageref_remap:
|
||||||
|
return
|
||||||
|
|
||||||
|
def remap_dest(dest_node):
|
||||||
|
if not isinstance(dest_node, pikepdf.Array):
|
||||||
|
return
|
||||||
|
pageref = dest_node[0]
|
||||||
|
if pageref['/Type'] == '/Page' and \
|
||||||
|
pageref._objgen in pageref_remap:
|
||||||
|
new_objgen = pageref_remap[pageref._objgen]
|
||||||
|
dest_node[0] = pdf_base._get_object_id(*new_objgen)
|
||||||
|
|
||||||
|
queue.add(pdf_base.root.Outlines._objgen)
|
||||||
|
while queue:
|
||||||
|
objgen = queue.pop()
|
||||||
|
visited.add(objgen)
|
||||||
|
node = pdf_base._get_object_id(*objgen)
|
||||||
|
log.debug('fix toc: visiting %r', objgen)
|
||||||
|
|
||||||
|
# Enumerate other nodes we could visit from here
|
||||||
|
for key in link_keys:
|
||||||
|
if key not in node:
|
||||||
|
continue
|
||||||
|
item = node[key]
|
||||||
|
if not item.is_indirect:
|
||||||
|
continue
|
||||||
|
objgen = item._objgen
|
||||||
|
if objgen not in visited:
|
||||||
|
queue.add(objgen)
|
||||||
|
|
||||||
|
log.debug(repr(node))
|
||||||
|
if '/Dest' in node:
|
||||||
|
remap_dest(node['/Dest'])
|
||||||
|
elif '/A' in node:
|
||||||
|
if '/S' in node['/A'] and node['/A']['/S'] == '/GoTo':
|
||||||
|
remap_dest(node['/A']['/D'])
|
||||||
|
|
||||||
|
|
||||||
def weave_layers(
|
def weave_layers(
|
||||||
infiles,
|
infiles,
|
||||||
output_file,
|
output_file,
|
||||||
@ -821,6 +866,7 @@ def weave_layers(
|
|||||||
keep_open = []
|
keep_open = []
|
||||||
font, font_key, procset = None, None, None
|
font, font_key, procset = None, None, None
|
||||||
pdfinfo = context.get_pdfinfo()
|
pdfinfo = context.get_pdfinfo()
|
||||||
|
pagerefs = {}
|
||||||
|
|
||||||
procset = pdf_base.make_indirect(
|
procset = pdf_base.make_indirect(
|
||||||
pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]'))
|
pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]'))
|
||||||
@ -848,10 +894,14 @@ def weave_layers(
|
|||||||
if path_image is not None and path_image != path_base:
|
if path_image is not None and path_image != path_base:
|
||||||
# We are replacing the old page
|
# We are replacing the old page
|
||||||
log.debug("Replace")
|
log.debug("Replace")
|
||||||
|
old_objgen = pdf_base.pages[page_num - 1]._objgen
|
||||||
|
|
||||||
pdf_image = pikepdf.open(image)
|
pdf_image = pikepdf.open(image)
|
||||||
keep_open.append(pdf_image)
|
keep_open.append(pdf_image)
|
||||||
image_page = pdf_image.pages[0]
|
image_page = pdf_image.pages[0]
|
||||||
pdf_base.pages[page_num - 1] = image_page
|
pdf_base.pages[page_num - 1] = image_page
|
||||||
|
|
||||||
|
pagerefs[old_objgen] = pdf_base.pages[page_num - 1]._objgen
|
||||||
replacing = True
|
replacing = True
|
||||||
|
|
||||||
autorotate_correction = context.get_rotation(page_num - 1)
|
autorotate_correction = context.get_rotation(page_num - 1)
|
||||||
@ -894,6 +944,7 @@ def weave_layers(
|
|||||||
procset = pdf_base.pages[0].Resources.ProcSet
|
procset = pdf_base.pages[0].Resources.ProcSet
|
||||||
font = pdf_base.pages[0].Resources.Font.get(font_key)
|
font = pdf_base.pages[0].Resources.Font.get(font_key)
|
||||||
|
|
||||||
|
_fix_toc(pdf_base, pagerefs, log)
|
||||||
pdf_base.save(output_file)
|
pdf_base.save(output_file)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user