From cc2af2bc15184463074ca3804fb859629a8f7ecb Mon Sep 17 00:00:00 2001 From: Jim Barlow Date: Wed, 11 Feb 2015 10:23:45 -0800 Subject: [PATCH] Convert the final image to a JPEG if the original image was a JPEG Of course, this introduces recompression artifacts, and is unnecessary if no options are given that modify the final image (no -d, -c, -i). But rather than worry about that, it would be better to ultimately find a way to combine the original PDF page with the output PDF text in the case where we want no changes to the original. This is good enough for now. The better option can apparently be achieved using pdftk background, or probably better, PyPDF2's merge. If Tesseract PDF generation is used then we need a way to remove the image. Tesseract PDF generation at 3.03 does layout better (I think) and also properly encodes the hidden layer, which is less likely to give display issues (I think). --- src/ocrpage.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/ocrpage.py b/src/ocrpage.py index f1ec6d29..6c6bf6cc 100755 --- a/src/ocrpage.py +++ b/src/ocrpage.py @@ -516,7 +516,7 @@ def ocr_tesseract( @active_if(ocr_required) @merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick, deskew_leptonica, cleaned_to_png], - os.path.join(options.tmp_fld, "%04i.image_for_pdf.png" % pageno)) + os.path.join(options.tmp_fld, "%04i.image_for_pdf" % pageno)) def select_image_for_pdf(infiles, output_file): if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0: input_file = infiles[-1] @@ -526,7 +526,12 @@ def select_image_for_pdf(infiles, output_file): input_file = infiles[-1] else: input_file = infiles[0] - re_symlink(input_file, output_file, logger, logger_mutex) + + if all(image['enc'] == 'jpeg' for image in pageinfo['images']): + # If all images were JPEGs originally, produce a JPEG as output + check_call(['convert', input_file, 'jpg:' + output_file]) + else: + re_symlink(input_file, output_file, logger, logger_mutex) @active_if(ocr_required)