Add 'pdfa-3' as an --output-type choice.

* src/ocrmypdf/__main__.py: accept 'pdfa-3'; spell the option names in the
  help text the same way as in `choices`; in check_options_output(),
  normalize the 'pdfa' alias to 'pdfa-2' (assignment, not comparison).
* src/ocrmypdf/pipeline.py: take pdfa_part from the last character of
  options.output_type instead of hard-coding '1' or '2'.
* tests/test_main.py: parametrize the PDF/A conformance test over parts
  1, 2 and 3.

diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py
index 55bf6644..b988ff26 100755
--- a/src/ocrmypdf/__main__.py
+++ b/src/ocrmypdf/__main__.py
@@ -156,14 +156,15 @@ parser.add_argument(
     '--image-dpi', metavar='DPI', type=int,
     help="For input image instead of PDF, use this DPI instead of file's.")
 parser.add_argument(
-    '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2'],
+    '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
     default='pdfa',
     help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
          "long term archiving (default, recommended) but may not suitable "
          "for users who want their file altered as little as possible. 'pdfa' "
          "also has problems with full Unicode text. 'pdf' attempts to "
-         "preserve file contents as much as possible. 'pdf-a1' creates a "
-         "PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'."
+         "preserve file contents as much as possible. 'pdfa-1' creates a "
+         "PDF/A1-b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' creates a "
+         "PDF/A3-b file."
 )
 
 # Use null string '\0' as sentinel to indicate the user supplied no argument,
@@ -394,6 +395,11 @@ def check_options_output(options, log):
         log.info(
             "Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
 
+    # 'pdfa' is an alias for 'pdfa-2'. Normalize it here so that code reading
+    # the PDF/A part from the last character of output_type gets '2', not 'a'.
+    if options.output_type == 'pdfa':
+        options.output_type = 'pdfa-2'
+
     lossless_reconstruction = False
     if options.pdf_renderer in ('hocr', 'sandwich'):
         if not any((options.deskew, options.clean_final, options.force_ocr,
diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py
index c11c06bd..9cc7226e 100644
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@@ -970,6 +970,7 @@ def merge_pages_ghostscript(
     pdf_pages, _ = _merge_pages_common(
         input_files_groups, output_file, log, context)
     input_pdfinfo = context.get_pdfinfo()
+
     ghostscript.generate_pdfa(
         pdf_version=input_pdfinfo.min_version,
         pdf_pages=pdf_pages,
@@ -977,7 +978,7 @@ def merge_pages_ghostscript(
         compression=options.pdfa_image_compression,
         log=log,
         threads=options.jobs or 1,
-        pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
+        pdfa_part=options.output_type[-1])  # is pdfa-1, pdfa-2, or pdfa-3
     if fitz:
         doc = fitz.Document(output_file + '_toc.pdf')
         doc.setToC(input_pdfinfo.table_of_contents)
diff --git a/tests/test_main.py b/tests/test_main.py
index 32483279..e08d31b1 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -951,15 +951,16 @@ def test_sidecar_nonempty(spoof_tesseract_cache, resources, outpdf):
     assert 'the' in ocr_text
 
 
-def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
+@pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])
+def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
     check_ocrmypdf(
         resources / 'ccitt.pdf', outpdf,
-        '--output-type', 'pdfa-1',
+        '--output-type', 'pdfa-' + pdfa_level,
         env=spoof_tesseract_cache
     )
 
     pdfa_info = file_claims_pdfa(outpdf)
-    assert pdfa_info['conformance'] == 'PDF/A-1B'
+    assert pdfa_info['conformance'] == 'PDF/A-{}B'.format(pdfa_level)
 
 
 def test_bad_locale():