mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-01 01:20:31 +00:00
Merge branch 'feature/pdfa3'
This commit is contained in:
commit
7cf83c77ca
@ -156,14 +156,15 @@ parser.add_argument(
|
|||||||
'--image-dpi', metavar='DPI', type=int,
|
'--image-dpi', metavar='DPI', type=int,
|
||||||
help="For input image instead of PDF, use this DPI instead of file's.")
|
help="For input image instead of PDF, use this DPI instead of file's.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2'],
|
'--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
|
||||||
default='pdfa',
|
default='pdfa',
|
||||||
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
|
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
|
||||||
"long term archiving (default, recommended) but may not be suitable "
|
"long term archiving (default, recommended) but may not be suitable "
|
||||||
"for users who want their file altered as little as possible. 'pdfa' "
|
"for users who want their file altered as little as possible. 'pdfa' "
|
||||||
"also has problems with full Unicode text. 'pdf' attempts to "
|
"also has problems with full Unicode text. 'pdf' attempts to "
|
||||||
"preserve file contents as much as possible. 'pdfa-1' creates a "
|
"preserve file contents as much as possible. 'pdfa-1' creates a "
|
||||||
"PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'."
|
"PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' creates a "
|
||||||
|
"PDF/A-3b file."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use null string '\0' as sentinel to indicate the user supplied no argument,
|
# Use null string '\0' as sentinel to indicate the user supplied no argument,
|
||||||
@ -394,6 +395,9 @@ def check_options_output(options, log):
|
|||||||
log.info(
|
log.info(
|
||||||
"Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
|
"Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
|
||||||
|
|
||||||
|
if options.output_type == 'pdfa':
|
||||||
|
options.output_type = 'pdfa-2'
|
||||||
|
|
||||||
lossless_reconstruction = False
|
lossless_reconstruction = False
|
||||||
if options.pdf_renderer in ('hocr', 'sandwich'):
|
if options.pdf_renderer in ('hocr', 'sandwich'):
|
||||||
if not any((options.deskew, options.clean_final, options.force_ocr,
|
if not any((options.deskew, options.clean_final, options.force_ocr,
|
||||||
|
|||||||
@ -970,6 +970,7 @@ def merge_pages_ghostscript(
|
|||||||
pdf_pages, _ = _merge_pages_common(
|
pdf_pages, _ = _merge_pages_common(
|
||||||
input_files_groups, output_file, log, context)
|
input_files_groups, output_file, log, context)
|
||||||
input_pdfinfo = context.get_pdfinfo()
|
input_pdfinfo = context.get_pdfinfo()
|
||||||
|
|
||||||
ghostscript.generate_pdfa(
|
ghostscript.generate_pdfa(
|
||||||
pdf_version=input_pdfinfo.min_version,
|
pdf_version=input_pdfinfo.min_version,
|
||||||
pdf_pages=pdf_pages,
|
pdf_pages=pdf_pages,
|
||||||
@ -977,7 +978,7 @@ def merge_pages_ghostscript(
|
|||||||
compression=options.pdfa_image_compression,
|
compression=options.pdfa_image_compression,
|
||||||
log=log,
|
log=log,
|
||||||
threads=options.jobs or 1,
|
threads=options.jobs or 1,
|
||||||
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
|
pdfa_part=options.output_type[-1]) # is pdfa-1, pdfa-2, or pdfa-3
|
||||||
if fitz:
|
if fitz:
|
||||||
doc = fitz.Document(output_file + '_toc.pdf')
|
doc = fitz.Document(output_file + '_toc.pdf')
|
||||||
doc.setToC(input_pdfinfo.table_of_contents)
|
doc.setToC(input_pdfinfo.table_of_contents)
|
||||||
|
|||||||
@ -951,15 +951,16 @@ def test_sidecar_nonempty(spoof_tesseract_cache, resources, outpdf):
|
|||||||
assert 'the' in ocr_text
|
assert 'the' in ocr_text
|
||||||
|
|
||||||
|
|
||||||
def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
|
@pytest.mark.parametrize('pdfa_level', ['1', '2', '3'])
|
||||||
|
def test_pdfa_n(spoof_tesseract_cache, pdfa_level, resources, outpdf):
|
||||||
check_ocrmypdf(
|
check_ocrmypdf(
|
||||||
resources / 'ccitt.pdf', outpdf,
|
resources / 'ccitt.pdf', outpdf,
|
||||||
'--output-type', 'pdfa-1',
|
'--output-type', 'pdfa-' + pdfa_level,
|
||||||
env=spoof_tesseract_cache
|
env=spoof_tesseract_cache
|
||||||
)
|
)
|
||||||
|
|
||||||
pdfa_info = file_claims_pdfa(outpdf)
|
pdfa_info = file_claims_pdfa(outpdf)
|
||||||
assert pdfa_info['conformance'] == 'PDF/A-1B'
|
assert pdfa_info['conformance'] == 'PDF/A-{}B'.format(pdfa_level)
|
||||||
|
|
||||||
|
|
||||||
def test_bad_locale():
|
def test_bad_locale():
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user