Merge branch 'feature/pdfa3'

This commit is contained in:
James R. Barlow 2018-05-03 16:45:57 -07:00
commit 7cf83c77ca
3 changed files with 12 additions and 6 deletions

View File

@@ -156,14 +156,15 @@ parser.add_argument(
'--image-dpi', metavar='DPI', type=int,
help="For input image instead of PDF, use this DPI instead of file's.")
parser.add_argument(
'--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2'],
'--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
default='pdfa',
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
"long term archiving (default, recommended) but may not suitable "
"for users who want their file altered as little as possible. 'pdfa' "
"also has problems with full Unicode text. 'pdf' attempts to "
"preserve file contents as much as possible. 'pdf-a1' creates a "
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'."
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
"PDF/A3-b file."
)
# Use null string '\0' as sentinel to indicate the user supplied no argument,
@@ -394,6 +395,9 @@ def check_options_output(options, log):
log.info(
"Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
if options.output_type == 'pdfa':
options.output_type == 'pdfa-2'
lossless_reconstruction = False
if options.pdf_renderer in ('hocr', 'sandwich'):
if not any((options.deskew, options.clean_final, options.force_ocr,

View File

@@ -970,6 +970,7 @@ def merge_pages_ghostscript(
pdf_pages, _ = _merge_pages_common(
input_files_groups, output_file, log, context)
input_pdfinfo = context.get_pdfinfo()
ghostscript.generate_pdfa(
pdf_version=input_pdfinfo.min_version,
pdf_pages=pdf_pages,
@@ -977,7 +978,7 @@ def merge_pages_ghostscript(
compression=options.pdfa_image_compression,
log=log,
threads=options.jobs or 1,
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
pdfa_part=options.output_type[-1]) # is pdfa-1, pdfa-2, or pdfa-3
if fitz:
doc = fitz.Document(output_file + '_toc.pdf')
doc.setToC(input_pdfinfo.table_of_contents)

View File

@@ -951,15 +951,16 @@ def test_sidecar_nonempty(spoof_tesseract_cache, resources, outpdf):
assert 'the' in ocr_text
def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
@pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])
def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
check_ocrmypdf(
resources / 'ccitt.pdf', outpdf,
'--output-type', 'pdfa-1',
'--output-type', 'pdfa-' + pdfa_level,
env=spoof_tesseract_cache
)
pdfa_info = file_claims_pdfa(outpdf)
assert pdfa_info['conformance'] == 'PDF/A-1B'
assert pdfa_info['conformance'] == 'PDF/A-{}B'.format(pdfa_level)
def test_bad_locale():