mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-01 18:29:58 +00:00
Add PDF/A validation
This commit is contained in:
parent
6dc2782e80
commit
cffd4623ca
@ -33,7 +33,10 @@ from .pdfa import generate_pdfa_def
|
||||
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
|
||||
|
||||
|
||||
basedir = os.path.dirname(os.path.realpath(__file__))
|
||||
BASEDIR = os.path.dirname(os.path.realpath(__file__))
|
||||
JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, '..', 'jhove'))
|
||||
JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
|
||||
JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')
|
||||
|
||||
parser = cmdline.get_argparse(
|
||||
prog="OCRmyPDF",
|
||||
@ -310,7 +313,7 @@ def generate_postscript_stub(
|
||||
|
||||
@merge(
|
||||
input=[noop, generate_postscript_stub],
|
||||
output=options.output_file,
|
||||
output=os.path.join(options.temp_folder, 'merged.pdf'),
|
||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||
def merge_pages(
|
||||
input_files,
|
||||
@ -341,6 +344,68 @@ def merge_pages(
|
||||
shutil.copy(gs_pdf.name, output_file)
|
||||
|
||||
|
||||
@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def validate_pdfa(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Check the merged PDF with JHOVE, log the verdict, then copy it out.

    The JHOVE ``PDF-hul`` module is run on ``input_file`` through ``java``
    and its textual report is scanned:

    * the file counts as invalid when the report mentions ``ErrorMessage``
      or has a ``Status`` line saying "not valid" or "Not well-formed";
    * it counts as PDF/A when a ``Profile:`` line mentions ``PDF/A-1``.

    The verdict is only logged; ``input_file`` is copied to ``output_file``
    whatever the verdict is.  ``pdfinfo`` and ``pdfinfo_lock`` are accepted
    but not used by this function.

    Raises:
        RuntimeError: if the JHOVE process exits with a non-zero status.
    """
    command = [
        'java',
        '-jar', JHOVE_JAR,
        '-c', JHOVE_CFG,
        '-m', 'PDF-hul',
        input_file,
    ]
    # JHOVE's stderr is discarded; only its stdout report is inspected.
    jhove = Popen(command, close_fds=True, universal_newlines=True,
                  stdout=PIPE, stderr=DEVNULL)
    report, _ = jhove.communicate()

    log.debug(report)
    if jhove.returncode != 0:
        log.error(report)
        raise RuntimeError(
            "Unexpected error while checking compliance to PDF/A file.")

    search_flags = re.IGNORECASE | re.MULTILINE

    def report_has(pattern):
        # True when the JHOVE report contains a match for ``pattern``.
        return re.search(pattern, report, search_flags) is not None

    # Any single one of these markers is enough to call the file invalid.
    invalid_markers = (
        r'ErrorMessage',
        r'^\s+Status.*not valid',
        r'^\s+Status.*Not well-formed',
    )
    is_invalid = any(report_has(marker) for marker in invalid_markers)
    is_pdfa = report_has(r'^\s+Profile:.*PDF/A-1')

    if is_invalid:
        log.warning('Output file: The generated PDF/A file is INVALID')
    elif not is_pdfa:
        log.warning('Output file: Generated file is a VALID PDF but not PDF/A')
    else:
        log.info('Output file: The generated PDF/A file is VALID')

    shutil.copy(input_file, output_file)
|
||||
|
||||
|
||||
|
||||
# [ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Checking compliance to PDF/A standard"
|
||||
# ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG" \
|
||||
# && echo "Unexpected error while checking compliance to PDF/A file. Exiting..." && exit $EXIT_OTHER_ERROR
|
||||
# grep -i "Status|Message" "$FILE_VALIDATION_LOG" # summary of the validation
|
||||
# [ $VERBOSITY -ge $LOG_DEBUG ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
|
||||
|
||||
|
||||
|
||||
# @active_if(not ocr_required or (ocr_required and options.exact_image))
|
||||
# @transform(setup_working_directory,
|
||||
|
||||
75
src/pdfa.py
Normal file
75
src/pdfa.py
Normal file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# © 2015: jbarlow83 (https://github.com/jbarlow83)
|
||||
#
|
||||
# Generate a PDFA_def.ps file for Ghostscript >= 9.14
|
||||
|
||||
from __future__ import print_function, absolute_import, division
|
||||
from string import Template
|
||||
|
||||
|
||||
pdfa_def_template = u"""%!
|
||||
% This is a sample prefix file for creating a PDF/A document.
|
||||
% Feel free to modify entries marked with "Customize".
|
||||
% This assumes an ICC profile to reside in the file (ISO Coated sb.icc),
|
||||
% unless the user modifies the corresponding line below.
|
||||
|
||||
% Define entries in the document Info dictionary :
|
||||
/ICCProfile ($icc_profile)
|
||||
def
|
||||
|
||||
[ /Title ($pdf_title)
|
||||
/DOCINFO pdfmark
|
||||
|
||||
% Define an ICC profile :
|
||||
|
||||
[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
|
||||
[{icc_PDFA}
|
||||
<<
|
||||
/N currentpagedevice /ProcessColorModel known {
|
||||
currentpagedevice /ProcessColorModel get dup /DeviceGray eq
|
||||
{pop 1} {
|
||||
/DeviceRGB eq
|
||||
{3}{4} ifelse
|
||||
} ifelse
|
||||
} {
|
||||
(ERROR, unable to determine ProcessColorModel) == flush
|
||||
} ifelse
|
||||
>> /PUT pdfmark
|
||||
[{icc_PDFA} ICCProfile (r) file /PUT pdfmark
|
||||
|
||||
% Define the output intent dictionary :
|
||||
|
||||
[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
|
||||
[{OutputIntent_PDFA} <<
|
||||
/Type /OutputIntent % Must be so (the standard requires).
|
||||
/S /GTS_PDFA1 % Must be so (the standard requires).
|
||||
/DestOutputProfile {icc_PDFA} % Must be so (see above).
|
||||
/OutputConditionIdentifier ($icc_identifier)
|
||||
>> /PUT pdfmark
|
||||
[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
|
||||
"""
|
||||
|
||||
|
||||
def _escape_ps_string(value):
    """Return ``value`` as text that is safe inside a PostScript ``( )`` literal.

    Backslashes and parentheses are backslash-escaped so that the value can
    neither terminate the string literal early nor leave it unbalanced.
    """
    text = str(value)
    # Escape the backslash first so the backslashes added for the
    # parentheses below are not doubled.
    return (text.replace('\\', '\\\\')
                .replace('(', '\\(')
                .replace(')', '\\)'))


def _get_pdfa_def(icc_profile, pdf_title, icc_identifier):
    """Fill in the PDFA_def PostScript template and return the result.

    Args:
        icc_profile: path of the ICC profile file, substituted for
            ``$icc_profile``.
        pdf_title: document title, substituted for ``$pdf_title``.
        icc_identifier: output condition identifier, substituted for
            ``$icc_identifier``.

    Every value lands inside a PostScript ``( ... )`` string literal in the
    template, so each one is escaped first; values without backslashes or
    parentheses are substituted unchanged.

    Returns:
        The complete PostScript text.
    """
    return Template(pdfa_def_template).substitute(
        icc_profile=_escape_ps_string(icc_profile),
        pdf_title=_escape_ps_string(pdf_title),
        icc_identifier=_escape_ps_string(icc_identifier))
|
||||
|
||||
|
||||
def generate_pdfa_def(target_filename, pdf_title='', icc='sRGB'):
    """Write a PDFA_def PostScript prefix file to ``target_filename``.

    Args:
        target_filename: path of the file to create or overwrite.
        pdf_title: title placed in the document Info dictionary.
        icc: name of the colour profile; it is also passed on as the output
            condition identifier.  Only ``'sRGB'`` is accepted.

    Raises:
        NotImplementedError: if ``icc`` is anything other than ``'sRGB'``.

    The file is written as ASCII, so non-ASCII text in the generated
    PostScript makes the write fail.
    """
    if icc != 'sRGB':
        raise NotImplementedError("Only supporting sRGB")

    # NOTE: fixed location of a /usr/local Ghostscript install; how to find
    # this directory on other platforms is still an open question.
    srgb_profile = '/usr/local/share/ghostscript/iccprofiles/srgb.icc'
    postscript = _get_pdfa_def(srgb_profile, pdf_title, icc)

    # It is unclear whether PostScript copes with UTF-8, so insist on ASCII.
    with open(target_filename, 'w', encoding='ascii') as ps_file:
        ps_file.write(postscript)
|
||||
Loading…
x
Reference in New Issue
Block a user