Fix creation date metadata lost from input

Closes #247
This commit is contained in:
James R. Barlow 2018-04-02 17:53:39 -07:00
parent 1c1fd9616a
commit 7a1cd39b21
3 changed files with 125 additions and 12 deletions

View File

@ -19,6 +19,8 @@
from string import Template
from binascii import hexlify
from datetime import datetime
from xml.parsers.expat import ExpatError
import pkg_resources
import PyPDF2 as pypdf
@ -98,7 +100,79 @@ def encode_text_string(s: str) -> str:
return ascii_hex_str
def encode_pdf_date(d: datetime) -> str:
    """Encode Python datetime object as PDF date string

    The returned string carries no leading "D:" prefix. A naive datetime
    produces only the local time fields; a timezone-aware datetime has its
    UTC offset appended in the form +HH'mm' or -HH'mm'.

    From Adobe pdfmark manual:
    (D:YYYYMMDDHHmmSSOHH'mm')
    D: is an optional prefix. YYYY is the year. All fields after the year are
    optional. MM is the month (01-12), DD is the day (01-31), HH is the
    hour (00-23), mm are the minutes (00-59), and SS are the seconds
    (00-59). The remainder of the string defines the relation of local
    time to GMT. O is either + for a positive difference (local time is
    later than GMT) or - (minus) for a negative difference. HH' is the
    absolute value of the offset from GMT in hours, and mm' is the
    absolute value of the offset in minutes. If no GMT information is
    specified, the relation between the specified time and GMT is
    considered unknown. Regardless of whether or not GMT
    information is specified, the remainder of the string should specify
    the local time.
    """
    pdfmark_date_fmt = r'%Y%m%d%H%M%S'
    s = d.strftime(pdfmark_date_fmt)
    # %z is '' for a naive datetime, otherwise sign + HHMM (e.g. '+0430')
    tz = d.strftime('%z')
    if tz == 'Z':
        s += "+00'00'"
    elif tz != '':
        sign, tz_hours, tz_mins = tz[0], tz[1:3], tz[3:5]
        # All three fields are positional; a named placeholder here would
        # raise KeyError because no keyword argument is supplied.
        s += "{}{}'{}'".format(sign, tz_hours, tz_mins)
    return s
def decode_pdf_date(s: str) -> datetime:
    """Decode a PDF date string into a Python datetime object

    An optional leading "D:" prefix is dropped, and the apostrophes that PDF
    uses inside the UTC offset (+HH'mm') are removed so the offset can be
    parsed. The string is then tried against each supported layout in turn.

    Returns the parsed datetime, or None when the string matches none of the
    supported layouts.
    """
    pdfmark_date_fmts = (
        r'%Y%m%d%H%M%S%z',  # +0430 etc
        r'%Y%m%d%H%M%S',  # no time zone
        r'%Y%m%d%H%M%SZ')  # trailing Z
    if s.startswith('D:'):
        s = s[2:]
    # strptime's %z does not accept the apostrophes of the PDF offset
    # notation, so -07'00' must become -0700 before parsing.
    s = s.replace("'", '')
    for fmt in pdfmark_date_fmts:
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue
    return None
def _get_pdfmark_dates(pdfmark):
    """Yield the pdfmark Postscript lines for the date entries.

    Only '/CreationDate' and '/ModDate' are considered, in that order, and a
    key absent from pdfmark yields nothing. A blank value is written as null
    rather than left out: if the key is omitted Ghostscript will set it to
    now, and we do not want to erase the fact that the value was unknown.
    Setting it to an empty string breaks Ghostscript 9.22 as reported here:
    https://bugs.ghostscript.com/show_bug.cgi?id=699182
    """
    null_line = ' {} null'
    date_line = ' {} (D:{})'
    for date_key in ('/CreationDate', '/ModDate'):
        if date_key in pdfmark:
            raw_value = pdfmark[date_key]
            if raw_value.strip() == '':
                yield null_line.format(date_key)
            else:
                # Drop any prefix the value already has; the template adds it
                has_prefix = raw_value.startswith('D:')
                stamp = raw_value[2:] if has_prefix else raw_value
                yield date_line.format(date_key, stamp)
def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
"""Create a Postscript file for Ghostscript. pdfmark contains the various
objects as strings; these must be encoded in ASCII, and dates have a
special format."""
# Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce
# ERROR: VMerror (-25) on closing pdfwrite device.
# https://bugs.ghostscript.com/show_bug.cgi?id=697684
@ -107,12 +181,12 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
docinfo_line_template = ' {key} <{value}>'
def docinfo_gen():
    """Yield the DOCINFO lines: date entries first, then the text entries.

    A text key that is missing from pdfmark, or whose value is blank, is
    left out entirely.
    """
    yield from _get_pdfmark_dates(pdfmark)
    for info_key in docinfo_keys:
        if info_key not in pdfmark:
            continue
        text = pdfmark[info_key]
        if text.strip() == '':
            continue
        yield docinfo_line_template.format(
            key=info_key, value=encode_text_string(text))
docinfo = '\n'.join(docinfo_gen())
t = Template(pdfa_def_template)
@ -145,9 +219,12 @@ def file_claims_pdfa(filename):
This checks if the XMP metadata contains a PDF/A marker.
"""
pdf = pypdf.PdfFileReader(filename)
xmp = pdf.getXmpMetadata()
try:
xmp = pdf.getXmpMetadata()
except ExpatError:
return {'pass': False, 'output': 'pdf',
'conformance': 'Invalid XML metadata'}
try:
pdfa_nodes = xmp.getNodesInNamespace(

View File

@ -18,6 +18,7 @@
from contextlib import suppress
from shutil import copyfileobj
from pathlib import Path
from datetime import datetime
import sys
import os
import shutil
@ -31,7 +32,7 @@ from ruffus import formatter, regex, Pipeline, suffix
from .hocrtransform import HocrTransform
from .pdfinfo import PdfInfo, Encoding, Colorspace
from .pdfa import generate_pdfa_ps
from .pdfa import generate_pdfa_ps, encode_pdf_date
from .helpers import re_symlink, is_iterable_notstr, page_number
from .exec import ghostscript, tesseract, qpdf
from .lib import fitz
@ -871,12 +872,8 @@ def get_pdfmark(base_pdf, options):
except (KeyError, TypeError):
return ''
pdfmark = {
'/Title': from_document_info('/Title'),
'/Author': from_document_info('/Author'),
'/Keywords': from_document_info('/Keywords'),
'/Subject': from_document_info('/Subject'),
}
pdfmark = {k: from_document_info(k) for k in
('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate')}
if options.title:
pdfmark['/Title'] = options.title
if options.author:
@ -897,6 +894,7 @@ def get_pdfmark(base_pdf, options):
PROGRAM_NAME, VERSION,
renderer_tag,
tesseract.version())
pdfmark['/ModDate'] = encode_pdf_date(datetime.utcnow())
return pdfmark
@ -1030,7 +1028,7 @@ def merge_pages_mupdf(
reader_metadata = pypdf.PdfFileReader(metadata_file)
pdfmark = get_pdfmark(reader_metadata, options)
pdfmark['/Producer'] = 'PyMuPDF ' + fitz.version[0]
pymupdf_metadata = {k[1:].lower() : v for k, v in pdfmark.items()}
pymupdf_metadata = {(k[1].lower() + k[2:]) : v for k, v in pdfmark.items()}
for pdf_page in pdf_pages:
page = fitz.open(pdf_page)

View File

@ -18,8 +18,9 @@
import pytest
import PyPDF2 as pypdf
import datetime
from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.pdfa import file_claims_pdfa, encode_pdf_date, decode_pdf_date
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.lib import fitz
@ -115,3 +116,40 @@ def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
print(before_toc)
print(after_toc)
assert before_toc == after_toc
def seconds_between_dates(date1, date2):
    """Return the signed number of seconds from date1 to date2.

    The result is negative when date2 is earlier than date1.
    """
    elapsed = date2 - date1
    return elapsed.total_seconds()
@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
infile, outpdf):
# Run ocrmypdf on each input file for each output type, then compare the
# document info dates read back with PyPDF2 before and after.
input_file = resources / infile
before = pypdf.PdfFileReader(str(input_file)).getDocumentInfo()
check_ocrmypdf(
input_file, outpdf, '--output-type', output_type,
env=spoof_tesseract_noop)
after = pypdf.PdfFileReader(str(outpdf)).getDocumentInfo()
if not before:
# If the input had no document info, and so no creation date, none should
# be output; because of Ghostscript quirks we set it to null.
# This test would be better if we had a test file with /DocumentInfo but
# no /CreationDate, which we don't
assert not after['/CreationDate'] or \
isinstance(after['/CreationDate'], pypdf.generic.NullObject)
else:
# We expect that the creation date stayed the same
date_before = decode_pdf_date(before['/CreationDate'])
date_after = decode_pdf_date(after['/CreationDate'])
assert seconds_between_dates(date_before, date_after) < 1000
# We expect that the modified date is quite recent
# NOTE(review): seconds_between_dates is signed, so a negative difference
# also satisfies these "< 1000" checks - confirm that is intended.
date_after = decode_pdf_date(after['/ModDate'])
assert seconds_between_dates(
date_after, datetime.datetime.utcnow()) < 1000