mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-02 10:50:29 +00:00
Merge branch 'feature/pdfa3'
This commit is contained in:
commit
7cf83c77ca
@ -156,14 +156,15 @@ parser.add_argument(
|
||||
'--image-dpi', metavar='DPI', type=int,
|
||||
help="For input image instead of PDF, use this DPI instead of file's.")
|
||||
parser.add_argument(
|
||||
'--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2'],
|
||||
'--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
|
||||
default='pdfa',
|
||||
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
|
||||
"long term archiving (default, recommended) but may not be suitable "
|
||||
"for users who want their file altered as little as possible. 'pdfa' "
|
||||
"also has problems with full Unicode text. 'pdf' attempts to "
|
||||
"preserve file contents as much as possible. 'pdfa-1' creates a "
|
||||
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'."
|
||||
"PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' creates a "
|
||||
"PDF/A-3b file."
|
||||
)
|
||||
|
||||
# Use null string '\0' as sentinel to indicate the user supplied no argument,
|
||||
@ -394,6 +395,9 @@ def check_options_output(options, log):
|
||||
log.info(
|
||||
"Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
|
||||
|
||||
if options.output_type == 'pdfa':
    # 'pdfa' is an alias for PDF/A-2b. Normalize it with an assignment (the
    # previous '==' was a comparison whose result was discarded), so code
    # that derives the PDF/A part from the trailing character of
    # output_type sees '2' instead of 'a'.
    options.output_type = 'pdfa-2'
|
||||
|
||||
lossless_reconstruction = False
|
||||
if options.pdf_renderer in ('hocr', 'sandwich'):
|
||||
if not any((options.deskew, options.clean_final, options.force_ocr,
|
||||
|
||||
@ -970,6 +970,7 @@ def merge_pages_ghostscript(
|
||||
pdf_pages, _ = _merge_pages_common(
|
||||
input_files_groups, output_file, log, context)
|
||||
input_pdfinfo = context.get_pdfinfo()
|
||||
|
||||
ghostscript.generate_pdfa(
|
||||
pdf_version=input_pdfinfo.min_version,
|
||||
pdf_pages=pdf_pages,
|
||||
@ -977,7 +978,7 @@ def merge_pages_ghostscript(
|
||||
compression=options.pdfa_image_compression,
|
||||
log=log,
|
||||
threads=options.jobs or 1,
|
||||
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
|
||||
pdfa_part=options.output_type[-1]) # is pdfa-1, pdfa-2, or pdfa-3
|
||||
if fitz:
|
||||
doc = fitz.Document(output_file + '_toc.pdf')
|
||||
doc.setToC(input_pdfinfo.table_of_contents)
|
||||
|
||||
@ -951,15 +951,16 @@ def test_sidecar_nonempty(spoof_tesseract_cache, resources, outpdf):
|
||||
assert 'the' in ocr_text
|
||||
|
||||
|
||||
def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
|
||||
@pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])
|
||||
def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
|
||||
check_ocrmypdf(
|
||||
resources / 'ccitt.pdf', outpdf,
|
||||
'--output-type', 'pdfa-1',
|
||||
'--output-type', 'pdfa-' + pdfa_level,
|
||||
env=spoof_tesseract_cache
|
||||
)
|
||||
|
||||
pdfa_info = file_claims_pdfa(outpdf)
|
||||
assert pdfa_info['conformance'] == 'PDF/A-1B'
|
||||
assert pdfa_info['conformance'] == 'PDF/A-{}B'.format(pdfa_level)
|
||||
|
||||
|
||||
def test_bad_locale():
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user