Add PDF/A validation

2025-11-01 18:29:58 +00:00 · 2015-07-23 14:48:46 -07:00 · 2015-07-23 14:48:46 -07:00 · cffd4623ca
commit cffd4623ca
parent 6dc2782e80
2 changed files with 142 additions and 2 deletions
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@ -33,7 +33,10 @@ from .pdfa import generate_pdfa_def
 warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)


-basedir = os.path.dirname(os.path.realpath(__file__))
+# Directory containing this module; the JHOVE paths are resolved against it.
+BASEDIR = os.path.dirname(os.path.realpath(__file__))
+# JHOVE is looked up in a 'jhove' directory one level above BASEDIR.
+JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, '..', 'jhove'))
+JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
+JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')

 parser = cmdline.get_argparse(
    prog="OCRmyPDF",
@ -310,7 +313,7 @@ def generate_postscript_stub(

@merge(
    input=[noop, generate_postscript_stub],
-    output=options.output_file,
+    output=os.path.join(options.temp_folder, 'merged.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
 def merge_pages(
        input_files,
@ -341,6 +344,68 @@ def merge_pages(
        shutil.copy(gs_pdf.name, output_file)


+@transform(
+    input=merge_pages,
+    filter=formatter(),
+    output=options.output_file,
+    extras=[_log, _pdfinfo, _pdfinfo_lock])
+def validate_pdfa(
+        input_file,
+        output_file,
+        log,
+        pdfinfo,
+        pdfinfo_lock):
+    """Check the merged PDF with JHOVE and copy it to its final location.
+
+    Runs the JHOVE ``PDF-hul`` module on *input_file* through ``java``,
+    writes the report to the debug log, and logs whether the report
+    describes a valid PDF/A-1 file.  *input_file* is copied to
+    *output_file* whatever the verdict.  *pdfinfo* and *pdfinfo_lock* are
+    accepted but not used here.
+
+    Raises RuntimeError when the JHOVE process exits with a non-zero status.
+    """
+    search_flags = re.IGNORECASE | re.MULTILINE
+
+    # JHOVE's stderr is discarded; only its stdout report is inspected.
+    jhove = Popen(
+        ['java', '-jar', JHOVE_JAR, '-c', JHOVE_CFG, '-m', 'PDF-hul',
+         input_file],
+        close_fds=True, universal_newlines=True,
+        stdout=PIPE, stderr=DEVNULL)
+    report, _ = jhove.communicate()
+
+    log.debug(report)
+    if jhove.returncode != 0:
+        log.error(report)
+        raise RuntimeError(
+            "Unexpected error while checking compliance to PDF/A file.")
+
+    # Any one of these in the report marks the file as invalid.
+    invalid_markers = (
+        r'ErrorMessage',
+        r'^\s+Status.*not valid',
+        r'^\s+Status.*Not well-formed',
+    )
+    is_invalid = any(
+        re.search(marker, report, search_flags)
+        for marker in invalid_markers)
+    is_pdfa = bool(re.search(r'^\s+Profile:.*PDF/A-1', report, search_flags))
+
+    if is_invalid:
+        log.warning('Output file: The generated PDF/A file is INVALID')
+    elif is_pdfa:
+        log.info('Output file: The generated PDF/A file is VALID')
+    else:
+        log.warning('Output file: Generated file is a VALID PDF but not PDF/A')
+    shutil.copy(input_file, output_file)
+
+
+
+#     [ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Checking compliance to PDF/A standard"
+# ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG" \
+#     && echo "Unexpected error while checking compliance to PDF/A file. Exiting..." && exit $EXIT_OTHER_ERROR
+# grep -i "Status|Message" "$FILE_VALIDATION_LOG" # summary of the validation
+# [ $VERBOSITY -ge $LOG_DEBUG ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
+
+

 # @active_if(not ocr_required or (ocr_required and options.exact_image))
 # @transform(setup_working_directory,
--- a/src/pdfa.py
+++ b/src/pdfa.py
@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# © 2015: jbarlow83 (https://github.com/jbarlow83)
+#
+# Generate a PDFA_def.ps file for Ghostscript >= 9.14
+
+from __future__ import print_function, absolute_import, division
+from string import Template
+
+
+# Ghostscript PostScript prefix file, used as a string.Template: the
+# $icc_profile, $pdf_title and $icc_identifier placeholders are filled in
+# by _get_pdfa_def().  The '%' lines are PostScript comments that end up in
+# the generated file.
+pdfa_def_template = u"""%!
+% This is a sample prefix file for creating a PDF/A document.
+% Feel free to modify entries marked with "Customize".
+% This assumes an ICC profile to reside in the file (ISO Coated sb.icc),
+% unless the user modifies the corresponding line below.
+
+% Define entries in the document Info dictionary :
+/ICCProfile ($icc_profile)
+def
+
+[ /Title ($pdf_title)
+  /DOCINFO pdfmark
+
+% Define an ICC profile :
+
+[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
+[{icc_PDFA}
+<<
+  /N currentpagedevice /ProcessColorModel known {
+    currentpagedevice /ProcessColorModel get dup /DeviceGray eq
+    {pop 1} {
+      /DeviceRGB eq
+      {3}{4} ifelse
+    } ifelse
+  } {
+    (ERROR, unable to determine ProcessColorModel) == flush
+  } ifelse
+>> /PUT pdfmark
+[{icc_PDFA} ICCProfile (r) file /PUT pdfmark
+
+% Define the output intent dictionary :
+
+[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
+[{OutputIntent_PDFA} <<
+  /Type /OutputIntent             % Must be so (the standard requires).
+  /S /GTS_PDFA1                   % Must be so (the standard requires).
+  /DestOutputProfile {icc_PDFA}            % Must be so (see above).
+  /OutputConditionIdentifier ($icc_identifier)
+>> /PUT pdfmark
+[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
+"""
+
+
+def _escape_ps_string(text):
+    """Escape *text* for use inside a PostScript ``( ... )`` string literal."""
+    # Backslash goes first so the escapes added for the parentheses are not
+    # themselves doubled.
+    return (text.replace('\\', '\\\\')
+                .replace('(', '\\(')
+                .replace(')', '\\)'))
+
+
+def _get_pdfa_def(icc_profile, pdf_title, icc_identifier):
+    """Return the PostScript source with the template placeholders filled in.
+
+    Each value lands inside a parenthesised PostScript string in
+    ``pdfa_def_template``, so backslashes and parentheses are escaped
+    first; otherwise a title such as ``Report (draft`` would unbalance the
+    string and corrupt the generated file.  Values without those characters
+    are substituted unchanged.
+    """
+    t = Template(pdfa_def_template)
+    return t.substitute(icc_profile=_escape_ps_string(icc_profile),
+                        pdf_title=_escape_ps_string(pdf_title),
+                        icc_identifier=_escape_ps_string(icc_identifier))
+
+
+def generate_pdfa_def(target_filename, pdf_title='', icc='sRGB'):
+    """Write the PostScript PDF/A prefix file to *target_filename*.
+
+    *pdf_title* is inserted as the document title and *icc* names the
+    colour profile; only ``'sRGB'`` is accepted, anything else raises
+    NotImplementedError.  The file is written as ASCII, so text that
+    cannot be encoded as ASCII raises UnicodeEncodeError.
+    """
+    if icc != 'sRGB':
+        raise NotImplementedError("Only supporting sRGB")
+
+    # TODO: hard-coded location; how should this directory be found on
+    # other platforms?
+    srgb_profile = '/usr/local/share/ghostscript/iccprofiles/srgb.icc'
+    postscript = _get_pdfa_def(srgb_profile, pdf_title, icc)
+
+    # PostScript might not handle UTF-8 (it's hard to get a clear answer),
+    # so insist on ASCII.
+    with open(target_filename, 'w', encoding='ascii') as out:
+        out.write(postscript)