mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-02 19:00:12 +00:00
Add PDF/A validation
This commit is contained in:
parent
6dc2782e80
commit
cffd4623ca
@ -33,7 +33,10 @@ from .pdfa import generate_pdfa_def
|
|||||||
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
|
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
|
||||||
|
|
||||||
|
|
||||||
basedir = os.path.dirname(os.path.realpath(__file__))
|
BASEDIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, '..', 'jhove'))
|
||||||
|
JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
|
||||||
|
JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')
|
||||||
|
|
||||||
parser = cmdline.get_argparse(
|
parser = cmdline.get_argparse(
|
||||||
prog="OCRmyPDF",
|
prog="OCRmyPDF",
|
||||||
@ -310,7 +313,7 @@ def generate_postscript_stub(
|
|||||||
|
|
||||||
@merge(
|
@merge(
|
||||||
input=[noop, generate_postscript_stub],
|
input=[noop, generate_postscript_stub],
|
||||||
output=options.output_file,
|
output=os.path.join(options.temp_folder, 'merged.pdf'),
|
||||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||||
def merge_pages(
|
def merge_pages(
|
||||||
input_files,
|
input_files,
|
||||||
@ -341,6 +344,68 @@ def merge_pages(
|
|||||||
shutil.copy(gs_pdf.name, output_file)
|
shutil.copy(gs_pdf.name, output_file)
|
||||||
|
|
||||||
|
|
||||||
|
@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def validate_pdfa(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Check the merged PDF with JHOVE's PDF-hul module, then publish it.

    The JHOVE report (its standard output) is scanned for error messages,
    a "not valid" / "Not well-formed" status line, and a PDF/A-1 profile
    line.  The verdict is only logged: ``input_file`` is copied to
    ``output_file`` whatever the outcome of the validation.

    ``pdfinfo`` and ``pdfinfo_lock`` are accepted because they are passed
    in as pipeline extras; this step does not use them.

    Raises:
        RuntimeError: if the JHOVE process exits with a non-zero status.
    """
    jhove_cmd = [
        'java',
        '-jar', JHOVE_JAR,
        '-c', JHOVE_CFG,
        '-m', 'PDF-hul',
        input_file,
    ]
    jhove = Popen(
        jhove_cmd, close_fds=True, universal_newlines=True,
        stdout=PIPE, stderr=DEVNULL)
    report, _ = jhove.communicate()

    log.debug(report)
    if jhove.returncode != 0:
        log.error(report)
        raise RuntimeError(
            "Unexpected error while checking compliance to PDF/A file.")

    search_flags = re.IGNORECASE | re.MULTILINE

    # A match on any one of these marks the file as invalid.
    failure_patterns = (
        r'ErrorMessage',
        r'^\s+Status.*not valid',
        r'^\s+Status.*Not well-formed',
    )
    has_failure = any(
        re.search(pattern, report, search_flags)
        for pattern in failure_patterns)
    declares_pdfa = re.search(
        r'^\s+Profile:.*PDF/A-1', report, search_flags) is not None

    if has_failure:
        log.warning('Output file: The generated PDF/A file is INVALID')
    elif declares_pdfa:
        log.info('Output file: The generated PDF/A file is VALID')
    else:
        log.warning('Output file: Generated file is a VALID PDF but not PDF/A')

    # Validation is advisory: the file is delivered in every case.
    shutil.copy(input_file, output_file)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# [ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Checking compliance to PDF/A standard"
|
||||||
|
# ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG" \
|
||||||
|
# && echo "Unexpected error while checking compliance to PDF/A file. Exiting..." && exit $EXIT_OTHER_ERROR
|
||||||
|
# grep -i "Status|Message" "$FILE_VALIDATION_LOG" # summary of the validation
|
||||||
|
# [ $VERBOSITY -ge $LOG_DEBUG ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# @active_if(not ocr_required or (ocr_required and options.exact_image))
|
# @active_if(not ocr_required or (ocr_required and options.exact_image))
|
||||||
# @transform(setup_working_directory,
|
# @transform(setup_working_directory,
|
||||||
|
|||||||
75
src/pdfa.py
Normal file
75
src/pdfa.py
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#
|
||||||
|
# © 2015: jbarlow83 (https://github.com/jbarlow83)
|
||||||
|
#
|
||||||
|
# Generate a PDFA_def.ps file for Ghostscript >= 9.14
|
||||||
|
|
||||||
|
from __future__ import print_function, absolute_import, division
|
||||||
|
from string import Template
|
||||||
|
|
||||||
|
|
||||||
|
# PostScript prefix file handed to Ghostscript to declare PDF/A metadata.
# The $icc_profile, $pdf_title and $icc_identifier placeholders are filled in
# by _get_pdfa_def() via string.Template; each one sits inside a PostScript
# "( ... )" string literal, so substituted values must be escaped.
pdfa_def_template = u"""%!
% This is a sample prefix file for creating a PDF/A document.
% Feel free to modify entries marked with "Customize".
% This assumes an ICC profile to reside in the file (ISO Coated sb.icc),
% unless the user modifies the corresponding line below.

% Define entries in the document Info dictionary :
/ICCProfile ($icc_profile)
def

[ /Title ($pdf_title)
/DOCINFO pdfmark

% Define an ICC profile :

[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
[{icc_PDFA}
<<
/N currentpagedevice /ProcessColorModel known {
currentpagedevice /ProcessColorModel get dup /DeviceGray eq
{pop 1} {
/DeviceRGB eq
{3}{4} ifelse
} ifelse
} {
(ERROR, unable to determine ProcessColorModel) == flush
} ifelse
>> /PUT pdfmark
[{icc_PDFA} ICCProfile (r) file /PUT pdfmark

% Define the output intent dictionary :

[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
[{OutputIntent_PDFA} <<
/Type /OutputIntent % Must be so (the standard requires).
/S /GTS_PDFA1 % Must be so (the standard requires).
/DestOutputProfile {icc_PDFA} % Must be so (see above).
/OutputConditionIdentifier ($icc_identifier)
>> /PUT pdfmark
[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
"""


# Characters that would terminate or corrupt a PostScript "( ... )" literal.
_PS_STRING_ESCAPES = {u'\\': u'\\\\', u'(': u'\\(', u')': u'\\)'}


def _ps_escape(text):
    """Backslash-escape ``\\``, ``(`` and ``)`` for a PostScript string."""
    return u''.join(_PS_STRING_ESCAPES.get(ch, ch) for ch in text)


def _ps_text_string(text):
    """Return *text* as the ASCII-only body of a PostScript text string.

    Pure-ASCII text is only delimiter-escaped.  Anything else is emitted as
    octal escapes of its UTF-16BE encoding, prefixed with the FE FF byte
    order mark, so the result never contains a non-ASCII character.
    """
    if all(ord(ch) < 128 for ch in text):
        return _ps_escape(text)
    # bytearray iterates as integers on both Python 2 and Python 3.
    encoded = bytearray(b'\xfe\xff' + text.encode('utf-16-be'))
    return u''.join(u'\\%03o' % byte for byte in encoded)


def _get_pdfa_def(icc_profile, pdf_title, icc_identifier):
    """Fill ``pdfa_def_template`` and return the PostScript source text.

    Args:
        icc_profile: path of the ICC profile file named in the template.
        pdf_title: document title placed in the DOCINFO pdfmark.
        icc_identifier: value for ``/OutputConditionIdentifier``.

    Returns:
        The template with all three placeholders substituted.

    Every value lands inside a ``( ... )`` PostScript string, so backslashes
    and parentheses are escaped instead of being inserted verbatim; an
    unbalanced parenthesis in a title would otherwise end the string early.
    A non-ASCII title is additionally encoded as escaped UTF-16BE so that
    the text can still be written out as plain ASCII.  Non-ASCII characters
    in ``icc_profile`` or ``icc_identifier`` are passed through unchanged.
    """
    return Template(pdfa_def_template).substitute(
        icc_profile=_ps_escape(icc_profile),
        pdf_title=_ps_text_string(pdf_title),
        icc_identifier=_ps_escape(icc_identifier))
|
||||||
|
|
||||||
|
|
||||||
|
def generate_pdfa_def(target_filename, pdf_title='', icc='sRGB'):
    """Write a PDFA_def.ps prefix file for Ghostscript to *target_filename*.

    Args:
        target_filename: path of the PostScript file to create or overwrite.
        pdf_title: document title forwarded to the template.
        icc: name of the colour profile; only ``'sRGB'`` is accepted, and it
            is also used as the output condition identifier.

    Raises:
        NotImplementedError: if *icc* is anything other than ``'sRGB'``.
    """
    if icc != 'sRGB':
        raise NotImplementedError("Only supporting sRGB")

    # TODO: this location is hard-coded; how should the Ghostscript ICC
    # profile directory be located on other platforms?
    icc_profile = '/usr/local/share/ghostscript/iccprofiles/srgb.icc'

    postscript = _get_pdfa_def(icc_profile, pdf_title, icc)

    # It is unclear whether PostScript handles UTF-8, so the file is written
    # strictly as ASCII.
    with open(target_filename, 'w', encoding='ascii') as ps_file:
        ps_file.write(postscript)
|
||||||
Loading…
x
Reference in New Issue
Block a user