mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-10-23 22:09:37 +00:00
Convert the final image to a JPEG if the original image was a JPEG
Of course, this introduces recompression artifacts, and it is unnecessary when no options that modify the final image are given (no -d, -c, -i). But rather than worry about that, it would be better to ultimately find a way to combine the original PDF page with the output PDF text in the case where we want no changes to the original. This is good enough for now. The better option can apparently be achieved using pdftk's background operation or, probably better, PyPDF2's merge. If Tesseract PDF generation is used, then we need a way to remove the image. Tesseract PDF generation as of 3.03 does layout better (I think) and also properly encodes the hidden layer, which is less likely to cause display issues (I think).
This commit is contained in:
parent
638c6db05d
commit
cc2af2bc15
@ -516,7 +516,7 @@ def ocr_tesseract(
|
||||
@active_if(ocr_required)
|
||||
@merge([unpack_with_ghostscript, convert_to_png,
|
||||
deskew_imagemagick, deskew_leptonica, cleaned_to_png],
|
||||
os.path.join(options.tmp_fld, "%04i.image_for_pdf.png" % pageno))
|
||||
os.path.join(options.tmp_fld, "%04i.image_for_pdf" % pageno))
|
||||
def select_image_for_pdf(infiles, output_file):
|
||||
if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0:
|
||||
input_file = infiles[-1]
|
||||
@ -526,7 +526,12 @@ def select_image_for_pdf(infiles, output_file):
|
||||
input_file = infiles[-1]
|
||||
else:
|
||||
input_file = infiles[0]
|
||||
re_symlink(input_file, output_file, logger, logger_mutex)
|
||||
|
||||
if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
|
||||
# If all images were JPEGs originally, produce a JPEG as output
|
||||
check_call(['convert', input_file, 'jpg:' + output_file])
|
||||
else:
|
||||
re_symlink(input_file, output_file, logger, logger_mutex)
|
||||
|
||||
|
||||
@active_if(ocr_required)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user