Add PDF/A validation

2025-11-02 19:00:12 +00:00 · 2015-07-23 14:48:46 -07:00 · 2015-07-23 14:48:46 -07:00 · cffd4623ca
commit cffd4623ca
parent 6dc2782e80
2 changed files with 142 additions and 2 deletions
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@ -33,7 +33,10 @@ from .pdfa import generate_pdfa_def
 warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
-basedir = os.path.dirname(os.path.realpath(__file__))
+BASEDIR = os.path.dirname(os.path.realpath(__file__))
 JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, '..', 'jhove'))
 JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
 JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')
 parser = cmdline.get_argparse(
    prog="OCRmyPDF",
@ -310,7 +313,7 @@ def generate_postscript_stub(
@merge(
    input=[noop, generate_postscript_stub],
-    output=options.output_file,
+    output=os.path.join(options.temp_folder, 'merged.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
 def merge_pages(
        input_files,
@ -341,6 +344,68 @@ def merge_pages(
        shutil.copy(gs_pdf.name, output_file)
@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def validate_pdfa(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Check the merged PDF with JHOVE and publish it as the final output.

    Runs the JHOVE ``PDF-hul`` module on *input_file* through ``java``,
    scans the textual report for validity / PDF/A-1 profile markers, logs
    the verdict, and then copies *input_file* to *output_file*.  The copy
    happens whatever the verdict is; only a non-zero JHOVE exit status
    aborts the step.

    ``pdfinfo`` and ``pdfinfo_lock`` are accepted because they are passed
    as pipeline extras, but they are not used here.

    Raises:
        RuntimeError: if the JHOVE process exits with a non-zero status.
    """
    jhove_cmd = [
        'java',
        '-jar', JHOVE_JAR,
        '-c', JHOVE_CFG,
        '-m', 'PDF-hul',
        input_file,
    ]
    jhove = Popen(jhove_cmd, close_fds=True, universal_newlines=True,
                  stdout=PIPE, stderr=DEVNULL)
    report, _ = jhove.communicate()
    log.debug(report)

    if jhove.returncode != 0:
        log.error(report)
        raise RuntimeError(
            "Unexpected error while checking compliance to PDF/A file.")

    search_flags = re.IGNORECASE | re.MULTILINE

    # Any one of these markers in the report means the PDF itself is broken.
    invalid_markers = (
        r'ErrorMessage',
        r'^\s+Status.*not valid',
        r'^\s+Status.*Not well-formed',
    )
    pdf_is_valid = not any(
        re.search(marker, report, search_flags) for marker in invalid_markers)

    # JHOVE lists the profiles the document conforms to on a "Profile:" line.
    pdf_is_pdfa = re.search(
        r'^\s+Profile:.*PDF/A-1', report, search_flags) is not None

    if not pdf_is_valid:
        log.warning('Output file: The generated PDF/A file is INVALID')
    elif pdf_is_pdfa:
        log.info('Output file: The generated PDF/A file is VALID')
    else:
        log.warning('Output file: Generated file is a VALID PDF but not PDF/A')

    shutil.copy(input_file, output_file)
 #     [ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Checking compliance to PDF/A standard"
 # ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG" \
 #     && echo "Unexpected error while checking compliance to PDF/A file. Exiting..." && exit $EXIT_OTHER_ERROR
 # grep -i "Status|Message" "$FILE_VALIDATION_LOG" # summary of the validation
 # [ $VERBOSITY -ge $LOG_DEBUG ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
 # @active_if(not ocr_required or (ocr_required and options.exact_image))
 # @transform(setup_working_directory,
--- a/src/pdfa.py
+++ b/src/pdfa.py
@ -0,0 +1,75 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
 # © 2015: jbarlow83 (https://github.com/jbarlow83)
 #
 # Generate a PDFA_def.ps file for Ghostscript >= 9.14
 from __future__ import print_function, absolute_import, division
 from string import Template
 # string.Template source for the PostScript prolog (PDFA_def.ps) handed to
 # Ghostscript.  The $icc_profile, $pdf_title and $icc_identifier placeholders
 # are filled in by _get_pdfa_def(); each one sits inside a PostScript
 # "( ... )" string literal.
 pdfa_def_template = u"""%!
 % This is a sample prefix file for creating a PDF/A document.
 % Feel free to modify entries marked with "Customize".
 % This assumes an ICC profile to reside in the file (ISO Coated sb.icc),
 % unless the user modifies the corresponding line below.
 % Define entries in the document Info dictionary :
 /ICCProfile ($icc_profile)
 def
 [ /Title ($pdf_title)
  /DOCINFO pdfmark
 % Define an ICC profile :
 [/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
 [{icc_PDFA}
 <<
  /N currentpagedevice /ProcessColorModel known {
    currentpagedevice /ProcessColorModel get dup /DeviceGray eq
    {pop 1} {
      /DeviceRGB eq
      {3}{4} ifelse
    } ifelse
  } {
    (ERROR, unable to determine ProcessColorModel) == flush
  } ifelse
 >> /PUT pdfmark
 [{icc_PDFA} ICCProfile (r) file /PUT pdfmark
 % Define the output intent dictionary :
 [/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
 [{OutputIntent_PDFA} <<
  /Type /OutputIntent             % Must be so (the standard requires).
  /S /GTS_PDFA1                   % Must be so (the standard requires).
  /DestOutputProfile {icc_PDFA}            % Must be so (see above).
  /OutputConditionIdentifier ($icc_identifier)
 >> /PUT pdfmark
 [{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
 """
 def _get_pdfa_def(icc_profile, pdf_title, icc_identifier):
     """Return the PDFA_def.ps text with the template placeholders filled in.

     The three arguments replace ``$icc_profile``, ``$pdf_title`` and
     ``$icc_identifier`` in ``pdfa_def_template``; they are inserted verbatim.
     """
     substitutions = {
         'icc_profile': icc_profile,
         'pdf_title': pdf_title,
         'icc_identifier': icc_identifier,
     }
     return Template(pdfa_def_template).substitute(substitutions)
 def generate_pdfa_def(target_filename, pdf_title='', icc='sRGB'):
     """Write a PDFA_def.ps prolog for Ghostscript to *target_filename*.

     Args:
         target_filename: path of the PostScript file to create/overwrite.
         pdf_title: document title placed in the ``/Title`` DOCINFO entry.
             Backslashes and parentheses are escaped so the title cannot
             terminate or corrupt the PostScript string literal it is
             embedded in.
         icc: name of the ICC profile to reference; only ``'sRGB'`` is
             supported.

     Raises:
         NotImplementedError: if *icc* is anything other than ``'sRGB'``.
         UnicodeEncodeError: if the generated text contains non-ASCII
             characters (for example in *pdf_title*), because the file is
             deliberately written as ASCII.
     """
     if icc != 'sRGB':
         raise NotImplementedError("Only supporting sRGB")

     # TODO: the profile location is hard-coded; work out how to find this
     # directory on other platforms.
     icc_profile = '/usr/local/share/ghostscript/iccprofiles/srgb.icc'

     # Inside a PostScript "( ... )" literal a backslash starts an escape
     # sequence and an unbalanced parenthesis ends (or never ends) the string,
     # so escape all three.  The backslash must be handled first.
     escaped_title = (pdf_title
                      .replace('\\', '\\\\')
                      .replace('(', '\\(')
                      .replace(')', '\\)'))

     ps = _get_pdfa_def(icc_profile, escaped_title, icc)

     # Since PostScript might not handle UTF-8 (it's hard to get a clear
     # answer), insist on ascii
     with open(target_filename, 'w', encoding='ascii') as f:
         f.write(ps)