Add PDF/A validation

This commit is contained in:
Jim Barlow 2015-07-23 14:48:46 -07:00
parent 6dc2782e80
commit cffd4623ca
2 changed files with 142 additions and 2 deletions

View File

@ -33,7 +33,10 @@ from .pdfa import generate_pdfa_def
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
basedir = os.path.dirname(os.path.realpath(__file__))
# Directory containing this module, with symlinks resolved.
BASEDIR = os.path.dirname(os.path.realpath(__file__))
# JHOVE installation directory: a "jhove" directory that sits next to
# (one level above) BASEDIR.
JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, '..', 'jhove'))
# JHOVE application jar, passed to "java -jar" by validate_pdfa().
JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
# JHOVE configuration file, passed via "-c" by validate_pdfa().
JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')
parser = cmdline.get_argparse(
prog="OCRmyPDF",
@ -310,7 +313,7 @@ def generate_postscript_stub(
@merge(
input=[noop, generate_postscript_stub],
output=options.output_file,
output=os.path.join(options.temp_folder, 'merged.pdf'),
extras=[_log, _pdfinfo, _pdfinfo_lock])
def merge_pages(
input_files,
@ -341,6 +344,68 @@ def merge_pages(
shutil.copy(gs_pdf.name, output_file)
@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def validate_pdfa(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Check the merged PDF with JHOVE's PDF-hul module, then publish it.

    JHOVE is run through ``java -jar`` and its standard output is scanned
    for validity and PDF/A-1 profile markers.  The verdict is only logged:
    the input file is copied to ``output_file`` whatever the verdict is.

    Args:
        input_file: Path of the PDF to validate.
        output_file: Path the PDF is copied to after the check.
        log: Logger that receives the JHOVE report and the verdict.
        pdfinfo: Not used by this function.
        pdfinfo_lock: Not used by this function.

    Raises:
        RuntimeError: If the JHOVE process exits with a non-zero status.
    """
    jhove_cmd = ['java', '-jar', JHOVE_JAR, '-c', JHOVE_CFG,
                 '-m', 'PDF-hul', input_file]
    # Only stdout carries the report; stderr is discarded.
    jhove = Popen(jhove_cmd, close_fds=True, universal_newlines=True,
                  stdout=PIPE, stderr=DEVNULL)
    report, _ = jhove.communicate()
    log.debug(report)

    if jhove.returncode != 0:
        log.error(report)
        raise RuntimeError(
            "Unexpected error while checking compliance to PDF/A file.")

    search_flags = re.IGNORECASE | re.MULTILINE

    # Any one of these markers in the report means the PDF is not valid.
    invalid_markers = (
        r'ErrorMessage',
        r'^\s+Status.*not valid',
        r'^\s+Status.*Not well-formed',
    )
    is_valid = not any(
        re.search(marker, report, search_flags) for marker in invalid_markers)
    is_pdfa = re.search(
        r'^\s+Profile:.*PDF/A-1', report, search_flags) is not None

    if not is_valid:
        log.warning('Output file: The generated PDF/A file is INVALID')
    elif not is_pdfa:
        log.warning('Output file: Generated file is a VALID PDF but not PDF/A')
    else:
        log.info('Output file: The generated PDF/A file is VALID')

    shutil.copy(input_file, output_file)
# [ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Checking compliance to PDF/A standard"
# ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG" \
# && echo "Unexpected error while checking compliance to PDF/A file. Exiting..." && exit $EXIT_OTHER_ERROR
# grep -i "Status|Message" "$FILE_VALIDATION_LOG" # summary of the validation
# [ $VERBOSITY -ge $LOG_DEBUG ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
# @active_if(not ocr_required or (ocr_required and options.exact_image))
# @transform(setup_working_directory,

75
src/pdfa.py Normal file
View File

@ -0,0 +1,75 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# © 2015: jbarlow83 (https://github.com/jbarlow83)
#
# Generate a PDFA_def.ps file for Ghostscript >= 9.14
from __future__ import print_function, absolute_import, division
from string import Template
# PostScript prologue that marks Ghostscript output as PDF/A.  It is a
# string.Template with three placeholders, each of which sits inside a
# PostScript "( ... )" string literal:
#   $icc_profile     path of the ICC profile file to embed
#   $pdf_title       document title written to the Info dictionary
#   $icc_identifier  value of /OutputConditionIdentifier
pdfa_def_template = u"""%!
% This is a sample prefix file for creating a PDF/A document.
% Feel free to modify entries marked with "Customize".
% This assumes an ICC profile to reside in the file (ISO Coated sb.icc),
% unless the user modifies the corresponding line below.
% Define entries in the document Info dictionary :
/ICCProfile ($icc_profile)
def
[ /Title ($pdf_title)
/DOCINFO pdfmark
% Define an ICC profile :
[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
[{icc_PDFA}
<<
/N currentpagedevice /ProcessColorModel known {
currentpagedevice /ProcessColorModel get dup /DeviceGray eq
{pop 1} {
/DeviceRGB eq
{3}{4} ifelse
} ifelse
} {
(ERROR, unable to determine ProcessColorModel) == flush
} ifelse
>> /PUT pdfmark
[{icc_PDFA} ICCProfile (r) file /PUT pdfmark
% Define the output intent dictionary :
[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
[{OutputIntent_PDFA} <<
/Type /OutputIntent % Must be so (the standard requires).
/S /GTS_PDFA1 % Must be so (the standard requires).
/DestOutputProfile {icc_PDFA} % Must be so (see above).
/OutputConditionIdentifier ($icc_identifier)
>> /PUT pdfmark
[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
"""


# Characters that need a backslash inside a PostScript "( ... )" string:
# the escape character itself and both parentheses (an unbalanced
# parenthesis would otherwise end, or fail to end, the literal).
_PS_STRING_ESCAPES = str.maketrans({
    '\\': '\\\\',
    '(': '\\(',
    ')': '\\)',
})


def _escape_ps_string(text):
    """Return *text* made safe for use inside a PostScript ``( ... )`` string."""
    return str(text).translate(_PS_STRING_ESCAPES)


def _get_pdfa_def(icc_profile, pdf_title, icc_identifier):
    """Fill in ``pdfa_def_template`` and return the resulting PostScript text.

    Every value is escaped first, because each placeholder lives inside a
    PostScript string literal; an unescaped ``(``, ``)`` or ``\\`` in, say,
    the document title would corrupt the generated file.

    Args:
        icc_profile: Path of the ICC profile file.
        pdf_title: Document title.
        icc_identifier: Output condition identifier.

    Returns:
        The template text with all three placeholders substituted.
    """
    return Template(pdfa_def_template).substitute(
        icc_profile=_escape_ps_string(icc_profile),
        pdf_title=_escape_ps_string(pdf_title),
        icc_identifier=_escape_ps_string(icc_identifier),
    )


def generate_pdfa_def(target_filename, pdf_title='', icc='sRGB'):
    """Write a PDFA_def.ps prologue for Ghostscript to *target_filename*.

    Args:
        target_filename: Path of the PostScript file to create or overwrite.
        pdf_title: Title to store in the document Info dictionary.
        icc: Name of the ICC profile to use; only ``'sRGB'`` is supported.

    Raises:
        NotImplementedError: If *icc* is anything other than ``'sRGB'``.
        UnicodeEncodeError: If the generated text is not pure ASCII (for
            example, a non-ASCII *pdf_title*).
    """
    # TODO: how do we find this directory on other platforms?
    if icc == 'sRGB':
        icc_profile = '/usr/local/share/ghostscript/iccprofiles/srgb.icc'
    else:
        raise NotImplementedError("Only supporting sRGB")

    ps = _get_pdfa_def(icc_profile, pdf_title, icc)

    # Since PostScript might not handle UTF-8 (it's hard to get a clear
    # answer), insist on ascii
    with open(target_filename, 'w', encoding='ascii') as f:
        f.write(ps)