mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-30 00:31:59 +00:00
parent
1c1fd9616a
commit
7a1cd39b21
@ -19,6 +19,8 @@
|
||||
|
||||
from string import Template
|
||||
from binascii import hexlify
|
||||
from datetime import datetime
|
||||
from xml.parsers.expat import ExpatError
|
||||
import pkg_resources
|
||||
import PyPDF2 as pypdf
|
||||
|
||||
@ -98,7 +100,79 @@ def encode_text_string(s: str) -> str:
|
||||
return ascii_hex_str
|
||||
|
||||
|
||||
def encode_pdf_date(d: datetime) -> str:
    """Encode Python datetime object as PDF date string

    From Adobe pdfmark manual:
    (D:YYYYMMDDHHmmSSOHH'mm')
    D: is an optional prefix. YYYY is the year. All fields after the year are
    optional. MM is the month (01-12), DD is the day (01-31), HH is the
    hour (00-23), mm are the minutes (00-59), and SS are the seconds
    (00-59). The remainder of the string defines the relation of local
    time to GMT. O is either + for a positive difference (local time is
    later than GMT) or - (minus) for a negative difference. HH' is the
    absolute value of the offset from GMT in hours, and mm' is the
    absolute value of the offset in minutes. If no GMT information is
    specified, the relation between the specified time and GMT is
    considered unknown. Regardless of whether or not GMT
    information is specified, the remainder of the string should specify
    the local time.

    The returned string does not carry the optional ``D:`` prefix. A naive
    datetime (no tzinfo) yields no offset suffix; an aware datetime yields
    a ``+HH'mm'`` / ``-HH'mm'`` suffix.

    :param d: the datetime to encode
    :return: the PDF date string, without the ``D:`` prefix
    """
    pdfmark_date_fmt = r'%Y%m%d%H%M%S'
    s = d.strftime(pdfmark_date_fmt)

    # strftime('%z') gives '' for naive datetimes and '+HHMM' / '-HHMM'
    # (possibly followed by seconds) for aware ones.
    tz = d.strftime('%z')
    if tz == 'Z':
        s += "+00'00'"
    elif tz != '':
        sign, tz_hours, tz_mins = tz[0], tz[1:3], tz[3:5]
        # All three fields are positional; a named field here would raise
        # KeyError because no keyword argument is passed to format().
        s += "{}{}'{}'".format(sign, tz_hours, tz_mins)
    return s
|
||||
|
||||
|
||||
def decode_pdf_date(s: str) -> datetime:
    """Decode a PDF date string into a Python datetime object

    Accepts the string with or without the optional ``D:`` prefix. The
    offset from GMT may be written in the PDF style ``+HH'mm'`` or as
    ``+HHmm``; the string may also have no timezone or a trailing ``Z``.

    :param s: the PDF date string
    :return: the decoded datetime, or None if the string matches none of
        the supported formats
    """
    pdfmark_date_fmts = (
        r'%Y%m%d%H%M%S%z',  # +0430 etc
        r'%Y%m%d%H%M%S',  # no time zone
        r'%Y%m%d%H%M%SZ')  # trailing Z

    if s.startswith('D:'):
        s = s[2:]

    # PDF writes the offset as +HH'mm', but strptime's %z expects +HHmm.
    # Apostrophes cannot appear elsewhere in a valid date, so drop them all.
    s = s.replace("'", '')

    for fmt in pdfmark_date_fmts:
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue
    return None
|
||||
|
||||
|
||||
def _get_pdfmark_dates(pdfmark):
|
||||
"""Encode dates for pdfmark Postscript. The best way to deal with a
|
||||
missing date entry is set it to null, because if the key is omitted
|
||||
Ghostscript will set it to now - we do not want to erase the fact that
|
||||
the value was unknown. Setting to an empty string breaks Ghostscript
|
||||
9.22 as reported here:
|
||||
https://bugs.ghostscript.com/show_bug.cgi?id=699182
|
||||
"""
|
||||
|
||||
for key in ('/CreationDate', '/ModDate'):
|
||||
if key not in pdfmark:
|
||||
continue
|
||||
if pdfmark[key].strip() == '':
|
||||
yield ' {} null'.format(key)
|
||||
continue
|
||||
date_str = pdfmark[key]
|
||||
if date_str.startswith('D:'):
|
||||
date_str = date_str[2:]
|
||||
yield ' {} (D:{})'.format(key, date_str)
|
||||
|
||||
|
||||
def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
|
||||
"""Create a Postscript file for Ghostscript. pdfmark contains the various
|
||||
objects as strings; these must be encoded in ASCII, and dates have a
|
||||
special format."""
|
||||
|
||||
# Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce
|
||||
# ERROR: VMerror (-25) on closing pdfwrite device.
|
||||
# https://bugs.ghostscript.com/show_bug.cgi?id=697684
|
||||
@ -107,12 +181,12 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
|
||||
docinfo_line_template = ' {key} <{value}>'
|
||||
|
||||
def docinfo_gen():
|
||||
yield from _get_pdfmark_dates(pdfmark)
|
||||
for key in docinfo_keys:
|
||||
if key in pdfmark and pdfmark[key].strip() != '':
|
||||
line = docinfo_line_template.format(
|
||||
key=key, value=encode_text_string(pdfmark[key]))
|
||||
yield line
|
||||
|
||||
docinfo = '\n'.join(docinfo_gen())
|
||||
|
||||
t = Template(pdfa_def_template)
|
||||
@ -145,9 +219,12 @@ def file_claims_pdfa(filename):
|
||||
|
||||
This checks if the XMP metadata contains a PDF/A marker.
|
||||
"""
|
||||
|
||||
pdf = pypdf.PdfFileReader(filename)
|
||||
xmp = pdf.getXmpMetadata()
|
||||
try:
|
||||
xmp = pdf.getXmpMetadata()
|
||||
except ExpatError:
|
||||
return {'pass': False, 'output': 'pdf',
|
||||
'conformance': 'Invalid XML metadata'}
|
||||
|
||||
try:
|
||||
pdfa_nodes = xmp.getNodesInNamespace(
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
from contextlib import suppress
|
||||
from shutil import copyfileobj
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
@ -31,7 +32,7 @@ from ruffus import formatter, regex, Pipeline, suffix
|
||||
|
||||
from .hocrtransform import HocrTransform
|
||||
from .pdfinfo import PdfInfo, Encoding, Colorspace
|
||||
from .pdfa import generate_pdfa_ps
|
||||
from .pdfa import generate_pdfa_ps, encode_pdf_date
|
||||
from .helpers import re_symlink, is_iterable_notstr, page_number
|
||||
from .exec import ghostscript, tesseract, qpdf
|
||||
from .lib import fitz
|
||||
@ -871,12 +872,8 @@ def get_pdfmark(base_pdf, options):
|
||||
except (KeyError, TypeError):
|
||||
return ''
|
||||
|
||||
pdfmark = {
|
||||
'/Title': from_document_info('/Title'),
|
||||
'/Author': from_document_info('/Author'),
|
||||
'/Keywords': from_document_info('/Keywords'),
|
||||
'/Subject': from_document_info('/Subject'),
|
||||
}
|
||||
pdfmark = {k: from_document_info(k) for k in
|
||||
('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate')}
|
||||
if options.title:
|
||||
pdfmark['/Title'] = options.title
|
||||
if options.author:
|
||||
@ -897,6 +894,7 @@ def get_pdfmark(base_pdf, options):
|
||||
PROGRAM_NAME, VERSION,
|
||||
renderer_tag,
|
||||
tesseract.version())
|
||||
pdfmark['/ModDate'] = encode_pdf_date(datetime.utcnow())
|
||||
return pdfmark
|
||||
|
||||
|
||||
@ -1030,7 +1028,7 @@ def merge_pages_mupdf(
|
||||
reader_metadata = pypdf.PdfFileReader(metadata_file)
|
||||
pdfmark = get_pdfmark(reader_metadata, options)
|
||||
pdfmark['/Producer'] = 'PyMuPDF ' + fitz.version[0]
|
||||
pymupdf_metadata = {k[1:].lower() : v for k, v in pdfmark.items()}
|
||||
pymupdf_metadata = {(k[1].lower() + k[2:]) : v for k, v in pdfmark.items()}
|
||||
|
||||
for pdf_page in pdf_pages:
|
||||
page = fitz.open(pdf_page)
|
||||
|
||||
@ -18,8 +18,9 @@
|
||||
|
||||
import pytest
|
||||
import PyPDF2 as pypdf
|
||||
import datetime
|
||||
|
||||
from ocrmypdf.pdfa import file_claims_pdfa
|
||||
from ocrmypdf.pdfa import file_claims_pdfa, encode_pdf_date, decode_pdf_date
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
from ocrmypdf.lib import fitz
|
||||
|
||||
@ -115,3 +116,40 @@ def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
|
||||
print(before_toc)
|
||||
print(after_toc)
|
||||
assert before_toc == after_toc
|
||||
|
||||
|
||||
def seconds_between_dates(date1, date2):
    """Return the signed number of seconds from date1 to date2.

    The result is positive when date2 is later than date1 and negative
    when it is earlier.
    """
    elapsed = date2 - date1
    return elapsed.total_seconds()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
                                 infile, outpdf):
    """Check that /CreationDate is carried over and /ModDate is recent.

    Runs ocrmypdf on each input file for each output type, then compares the
    document info of the input and output PDFs.
    """
    input_file = resources / infile

    before = pypdf.PdfFileReader(str(input_file)).getDocumentInfo()
    check_ocrmypdf(
        input_file, outpdf, '--output-type', output_type,
        env=spoof_tesseract_noop)
    after = pypdf.PdfFileReader(str(outpdf)).getDocumentInfo()

    if not before:
        # If there was no input creation date, none should be output
        # because of Ghostscript quirks we set it to null
        # This test would be better if we had a test file with /DocumentInfo but
        # no /CreationDate, which we don't
        assert not after['/CreationDate'] or \
            isinstance(after['/CreationDate'], pypdf.generic.NullObject)
    else:
        # We expect that the creation date stayed the same
        date_before = decode_pdf_date(before['/CreationDate'])
        date_after = decode_pdf_date(after['/CreationDate'])
        # Compare the magnitude: a signed difference would let any date far
        # in the negative direction pass the check.
        assert abs(seconds_between_dates(date_before, date_after)) < 1000

    # We expect that the modified date is quite recent
    date_after = decode_pdf_date(after['/ModDate'])
    assert abs(seconds_between_dates(
        date_after, datetime.datetime.utcnow())) < 1000
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user