Merge branch 'feature/pdfa3'

2025-11-02 10:50:29 +00:00 · 2018-05-03 16:45:57 -07:00 · 2018-05-03 16:45:57 -07:00 · 7cf83c77ca
commit 7cf83c77ca
parent 8a9f174f63 df87e21c85
3 changed files with 12 additions and 6 deletions
--- a/src/ocrmypdf/main.py
+++ b/src/ocrmypdf/main.py
@ -156,14 +156,15 @@ parser.add_argument(
    '--image-dpi', metavar='DPI', type=int,
    help="For input image instead of PDF, use this DPI instead of file's.")
 parser.add_argument(
-    '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2'], 
+    '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], 
    default='pdfa',
    help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
         "long term archiving (default, recommended) but may not suitable "
         "for users who want their file altered as little as possible. 'pdfa' "
         "also has problems with full Unicode text. 'pdf' attempts to "
         "preserve file contents as much as possible. 'pdf-a1' creates a "
-         "PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'."
+         "PDF/A1-b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' creates a "
+         "PDF/A3-b file."
         )

 # Use null string '\0' as sentinel to indicate the user supplied no argument,
@ -394,6 +395,9 @@ def check_options_output(options, log):
        log.info(
            "Ignoring --debug-rendering because it requires --pdf-renderer=hocr")

+    if options.output_type == 'pdfa':
+        options.output_type = 'pdfa-2'
+
    lossless_reconstruction = False
    if options.pdf_renderer in ('hocr', 'sandwich'):
        if not any((options.deskew, options.clean_final, options.force_ocr,
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@ -970,6 +970,7 @@ def merge_pages_ghostscript(
    pdf_pages, _ = _merge_pages_common(
        input_files_groups, output_file, log, context)
    input_pdfinfo = context.get_pdfinfo()
+
    ghostscript.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=pdf_pages,
@ -977,7 +978,7 @@ def merge_pages_ghostscript(
        compression=options.pdfa_image_compression,
        log=log,
        threads=options.jobs or 1,
-        pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
+        pdfa_part=options.output_type[-1])  # is pdfa-1, pdfa-2, or pdfa-3
    if fitz:
        doc = fitz.Document(output_file + '_toc.pdf')
        doc.setToC(input_pdfinfo.table_of_contents)
--- a/tests/test_main.py
+++ b/tests/test_main.py
@ -951,15 +951,16 @@ def test_sidecar_nonempty(spoof_tesseract_cache, resources, outpdf):
    assert 'the' in ocr_text


-def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
+@pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])
+def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
    check_ocrmypdf(
        resources / 'ccitt.pdf', outpdf,
-        '--output-type', 'pdfa-1',
+        '--output-type', 'pdfa-' + pdfa_level,
        env=spoof_tesseract_cache
    )

    pdfa_info = file_claims_pdfa(outpdf)
-    assert pdfa_info['conformance'] == 'PDF/A-1B'
+    assert pdfa_info['conformance'] == 'PDF/A-{}B'.format(pdfa_level)


 def test_bad_locale():