From 45dbff64017eb3f7c218a9aac000cb15e035bf80 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Mon, 26 Mar 2018 02:23:19 -0700 Subject: [PATCH] Fix table of contents not preserved in PDF/A --- src/ocrmypdf/pdfinfo.py | 9 +++++++-- src/ocrmypdf/pipeline.py | 5 ++++- tests/resources/README.rst | 10 +++++----- tests/resources/toc.pdf | Bin 0 -> 9698 bytes tests/test_metadata.py | 22 +++++++++++++++++++++- 5 files changed, 37 insertions(+), 9 deletions(-) create mode 100644 tests/resources/toc.pdf diff --git a/src/ocrmypdf/pdfinfo.py b/src/ocrmypdf/pdfinfo.py index 08931d2a..8f5ff932 100644 --- a/src/ocrmypdf/pdfinfo.py +++ b/src/ocrmypdf/pdfinfo.py @@ -29,7 +29,7 @@ from enum import Enum from contextlib import contextmanager import PyPDF2 as pypdf -from fitz import Document +import fitz from .helpers import universal_open @@ -555,7 +555,7 @@ def borrow_stream(stream): def _page_has_text(infile, pageno): - doc = Document(infile) + doc = fitz.Document(infile) text = doc.getPageText(pageno) if text.strip() != '': return True @@ -692,6 +692,7 @@ class PdfInfo: def __init__(self, infile): self._infile = infile self._pages = _pdf_get_all_pageinfo(infile) + self._toc = fitz.Document(infile).getToC() @property def pages(self): @@ -712,6 +713,10 @@ class PdfInfo: raise NotImplementedError("can't get filename from stream") return self._infile + @property + def table_of_contents(self): + return self._toc + def __getitem__(self, item): return self._pages[item] diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py index 32b742a9..9ebb0b3a 100644 --- a/src/ocrmypdf/pipeline.py +++ b/src/ocrmypdf/pipeline.py @@ -963,11 +963,14 @@ def merge_pages_ghostscript( ghostscript.generate_pdfa( pdf_version=input_pdfinfo.min_version, pdf_pages=pdf_pages, - output_file=output_file, + output_file=output_file + '_toc.pdf', compression=options.pdfa_image_compression, log=log, threads=options.jobs or 1, pdfa_part=('1' if options.output_type == 'pdfa-1' else '2')) + doc = fitz.Document(output_file + '_toc.pdf') + doc.setToC(input_pdfinfo.table_of_contents) + doc.save(output_file) def merge_pages_qpdf( diff --git a/tests/resources/README.rst b/tests/resources/README.rst index c3956acd..769f10d3 100644 --- a/tests/resources/README.rst +++ b/tests/resources/README.rst @@ -128,6 +128,7 @@ Assemblies These test resources are assemblies or derivatives from other previously mentioned files, released under the same license terms as their input files. +- baiona_gray.png (from baiona.png) - cardinal.pdf (four cardinal directions, baked-in rotated copies of LinnSequencer.jpg) - ccitt.pdf (LinnSequencer.jpg, converted to CCITT encoding) - encrypted_algo4.pdf (congress.jpg, encrypted with algorithm 4 - not supported by PyPDF2) @@ -135,12 +136,11 @@ These test resources are assemblies or derivatives from other previously mention - jbig2.pdf (congress.jpg, converted to JBIG2 encoding) - multipage.pdf (from several other files) - palette.pdf (congress.jpg, converted to a 256-color palette) -- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg) -- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix) -- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password") -- baiona_gray.png (from baiona.png) - poster.pdf (from LinnSequencer.jpg) - +- rotated_skew.pdf (a /Rotate'd and skewed document from LinnSequencer.jpg) +- skew-encrypted.pdf (skew.pdf with encryption - access supported by PyPDF2, password is "password") +- skew.pdf (from LinnSequencer.jpg, skew simulated by adjusting the transformation matrix) +- toc.pdf (from formxobject.pdf, trivial.pdf) .. _`Wikimedia: LinnSequencer`: https://upload.wikimedia.org/wikipedia/en/b/b7/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg diff --git a/tests/resources/toc.pdf b/tests/resources/toc.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6f710c780823eb14d242c308e07a128ae4323f8a GIT binary patch literal 9698 zcmeHNZ;T{G6-N$@j4pC2{)sVGRxaFKoT>gdJ=4SP+|BOnavN@MH@kZ|xMhts-8Hk# z_H+;Z=XUorK?I2)8jYX{qCt=^7g3^qF(x7y4FQZFj1Wx-$i+m7hDh`qCcdib?y326 zkB*GRqwre^{e+@)qBN(<>q`*Eg6Laub%(i?+az5prCatDojlk@SJZ4j_Y?2 zKIQriKXS8%0$xp8@otYIyas>O0{=mHKJeoLo*@xsZwMcw-WeLZHYwEW1?oFoa#bE8 zm$T3%9a_LANEd`IbaRn6Ewyf?Hb|X!y_kkDhDT!Bq@Wp)pK_ge6P1+$4CQt_58_4Yog(fAKrQG(YvoIUG|mO{RNw# zkbv^JR^+)Zhk?CHW3-L|b2dqs@l5(V3rNzPc%y*R35yi~AP~cAg{dcDXv%WugRsk% z0Rd<^o~*HeS7;O@p-m&iG+D{;Wu2o4&&&inXk7uqvY`QzWzc3U>M6ixrGOVn9J-9F zq$q0DVCDn=*oVdRRyGE3)z_5EW;SLKx%D&WzV^^_C!cxe`_-~~`FlS2*x|dPzuflG zFFepV_V9y8A3nUgZ{L?b{qB9u%jv(Sqls&7dE<`5m+e2Y|J*o?4HzdkS#EdSc;{8b zwBg9%{{ecA5n@0OgKrHCIL<6nfK;COgXTXE{^`yS z@4vk8>i3Sgj~gHAKJ{Yd@n?Todj9WMeWG#2;|JgS@ejOs`rF1eg~vKe^U<;8E0^wj z^G8=-X?XU{_wPUU^b1cu_p`gN_|`qzZ@&Mo7ta5<`qCTwlb4hKeD>(|==O*Iy?g$Z z&;9w_vnQXq^PV5vzWdPoe*Mw|=g;nb@ui=9XX5I^-+Vc|=7}rrzqZt}fBM~{kNoY+ zPyFML>RKp2f<1p6cDff+vK@Uc?7P{TO?O0`0H8ilDS%?APa}i>I0ogJMnG32Z z6AvHD-0K&q%)=Kz3hoE`1N0>xzQl zKW_9{xEmPjM%RhZIzw{>aV=H=C+@p|PKQQp>;y2#)=0Sgz@vm_>7&|#ngTZ*a7 zGZ4rZ1E&d56CG^UG;mWz*Gxq%YpPmQD%UB>b%$VF2z{{|_!mVNgy>+;X~$bgD=8W( zqV4VN5^pHkK^LTfzd4Bw;rgH(Ls%>JNa6)WwOmHGh!!9&MeUMLn zLL6dLt4`ITmZ|G?FyoaV0Ow9a!m4`~_9Ig`A_-%@ma1u~g#!oXmgWm-K$)1D!VTXK zAV@Hr&<+~|fN@CCTnfviQTJ}Zm)HwB0NqXv*OuFoG7j#TP6@+i)Za8)C zK_O;Y%e*^x$31(aq$sKv`?UpAI^Nn=8hQpsgL+Q1z99{so}<{{X^huZ%gjzY_@>3<0`P2w z>UD7vnsh!$Rx7UUBMmt^4k7_&9lLU^hvl3N1|db|l3BD_ z7^xa1wQN;Ycvu89>3t4dMPYKB_V6_h@c^#Z(2g-*M+0>?rz9DGerZpLx1hB1q+ z+j=PoJ6N?W3o9Ddv?5Rxqg_8HJ4HYGq!5zHY0}6J-5v|)h{+|bAc?0ZC)q&xlg_K# z%LnqK5}zZB#CV7Fuv${^I3@_*~?Vu0L*U+SZdt6DBI2IY|ASudQmgi6tz~?YJA_TDasTcqU;0Iml1?(LEzQ- z?lyM2lcc1yLt?-^-S+M{RZP$ZoHSQOq8yfyhMtR(BGXE8U<$QtW3CVH%Wt(UADY4g zypl}EuC*aTlDJNKJr6QHW^{~t3irDtdcK-9hzto^^@V0VNnEE^v1yB_cB=@xWwq!) z`cbr;wq2}Hqeb+JX%Ry#*5abfJ0f?%k7DB6G?PmeOE)WKtEku&TGZMGEfURa7afJ@ zWm_|eO3HlsK(54_%ALZ@c_e2Np2}OnRUt~?JWTMe+_&DYm;Qv`t`R`)r9t)t?j^V` z!L=s@_C$Qix-P-BCj|CHe95~0Z*WP824s@cliQF+u$(f@L$d6WC5mZwDJne{q&r~* z6(61;;p2q8^8%5`XtEqrN`_bt%w()o#wSy17%Ht;dL0Qt2YuWOY` z$%5p#Qc+53)j~S_YKDO-RcKd$GA&2r?OYk|a=7XEmi4y2s(1k{?4E|AQs0U|YDb7PMlWW_S19+IT65&MvdQh`_) zJzE%FDM=Sbybp?A4(S=tlPe&GLGlPp=I8{E@fzvqtfcxt1L?9pTB;9JzgW`D4JCa~ z^~ubJPxVBwjkYO*^Kcj=s8Pglk0YxkrSlD>iFyWyFO@QGjxv;Mn-s(X+^rzk>XL8^ zxe+2LV)rWYI8x>=t?z}VFS!S3B`s=-?-;EhlV*6x3yesIxW24K<}$(;I0IPO1+?H2 zbED9O9|>#(0^_t0Mklmb+XO5a99|Tl9ldNHo@|&KWnW+67{aI2%!Y-%2^$vg=_E8^-9qsur9B$T z&`DN%QHo2}2ktbmda3WoBZ0TLo+pnd+2cixcR@ST$1E%_f2BZWcc%R-)4Z6L*3S6V zDDTdrDl1>ABCN7|uYOqL#k66`r%A8fgecG=P;FuNti14c5yGTM+jQG)3b#0Lqqwda zxCNn)Vw;4J^V@C+NqyU+JJ<Y_#P?mL03i{KiC|15#w6$16`kOt0K#t8ydeT!cC*kAdhCj6g54LwpZv-0 z*(Tt|Deur!6E+6+Ab+9&H$Iw`@n||bMQ2_7MbQj)DKyk(0`qWNh3#I0bY(a*54V|$ z3H#U}ovFZKceqWfjJE08s62hl7=|@E2K(PShQ}}s2@2k+n3eMAxD{)Zu38Rh!k zW6sRir)Dj++Gtc}XOxC!SXHZ`&Fgx31{BeiW~DJ--+;S4NI%%kDtH1v?Q=h