Merge branch 'feature/pdfa3'

This commit is contained in:
James R. Barlow 2018-05-03 16:45:57 -07:00
commit 7cf83c77ca
3 changed files with 12 additions and 6 deletions

View File

@ -156,14 +156,15 @@ parser.add_argument(
'--image-dpi', metavar='DPI', type=int, '--image-dpi', metavar='DPI', type=int,
help="For input image instead of PDF, use this DPI instead of file's.") help="For input image instead of PDF, use this DPI instead of file's.")
parser.add_argument( parser.add_argument(
'--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2'], '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
default='pdfa', default='pdfa',
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for " help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
"long term archiving (default, recommended) but may not suitable " "long term archiving (default, recommended) but may not suitable "
"for users who want their file altered as little as possible. 'pdfa' " "for users who want their file altered as little as possible. 'pdfa' "
"also has problems with full Unicode text. 'pdf' attempts to " "also has problems with full Unicode text. 'pdf' attempts to "
"preserve file contents as much as possible. 'pdf-a1' creates a " "preserve file contents as much as possible. 'pdf-a1' creates a "
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'." "PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
"PDF/A3-b file."
) )
# Use null string '\0' as sentinel to indicate the user supplied no argument, # Use null string '\0' as sentinel to indicate the user supplied no argument,
@ -394,6 +395,9 @@ def check_options_output(options, log):
log.info( log.info(
"Ignoring --debug-rendering because it requires --pdf-renderer=hocr") "Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
# 'pdfa' is the generic alias for PDF/A-2; rewrite it to the explicit
# 'pdfa-2' name. This has to be an assignment: a bare `==` comparison is
# evaluated and thrown away, which would leave the alias untouched for any
# later code that inspects options.output_type.
if options.output_type == 'pdfa':
    options.output_type = 'pdfa-2'
lossless_reconstruction = False lossless_reconstruction = False
if options.pdf_renderer in ('hocr', 'sandwich'): if options.pdf_renderer in ('hocr', 'sandwich'):
if not any((options.deskew, options.clean_final, options.force_ocr, if not any((options.deskew, options.clean_final, options.force_ocr,

View File

@ -970,6 +970,7 @@ def merge_pages_ghostscript(
pdf_pages, _ = _merge_pages_common( pdf_pages, _ = _merge_pages_common(
input_files_groups, output_file, log, context) input_files_groups, output_file, log, context)
input_pdfinfo = context.get_pdfinfo() input_pdfinfo = context.get_pdfinfo()
ghostscript.generate_pdfa( ghostscript.generate_pdfa(
pdf_version=input_pdfinfo.min_version, pdf_version=input_pdfinfo.min_version,
pdf_pages=pdf_pages, pdf_pages=pdf_pages,
@ -977,7 +978,7 @@ def merge_pages_ghostscript(
compression=options.pdfa_image_compression, compression=options.pdfa_image_compression,
log=log, log=log,
threads=options.jobs or 1, threads=options.jobs or 1,
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2')) pdfa_part=options.output_type[-1]) # is pdfa-1, pdfa-2, or pdfa-3
if fitz: if fitz:
doc = fitz.Document(output_file + '_toc.pdf') doc = fitz.Document(output_file + '_toc.pdf')
doc.setToC(input_pdfinfo.table_of_contents) doc.setToC(input_pdfinfo.table_of_contents)

View File

@ -951,15 +951,16 @@ def test_sidecar_nonempty(spoof_tesseract_cache, resources, outpdf):
assert 'the' in ocr_text assert 'the' in ocr_text
def test_pdfa_1(spoof_tesseract_cache, resources, outpdf): @pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])
def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
check_ocrmypdf( check_ocrmypdf(
resources / 'ccitt.pdf', outpdf, resources / 'ccitt.pdf', outpdf,
'--output-type', 'pdfa-1', '--output-type', 'pdfa-' + pdfa_level,
env=spoof_tesseract_cache env=spoof_tesseract_cache
) )
pdfa_info = file_claims_pdfa(outpdf) pdfa_info = file_claims_pdfa(outpdf)
assert pdfa_info['conformance'] == 'PDF/A-1B' assert pdfa_info['conformance'] == 'PDF/A-{}B'.format(pdfa_level)
def test_bad_locale(): def test_bad_locale():