From 4b51b521e24f8b42df6c188e5caea1b8c44fe147 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sun, 7 Feb 2016 03:27:33 -0800 Subject: [PATCH] Implement autorotate (provided lossless reconstruction is disabled) Works for a single page file, probably Although arguably rotation is not quite lossless, and the two could be mutually exclusive anyway, so maybe this is it. Did not check in some debugging changes (lossless=False, text debugging=True) PyPDF seems to get merging wrong when one of the pages is rotated. --- ocrmypdf/main.py | 78 ++++++++++-- pipeline.svg | 309 ++++++++++++++++++++++++++--------------------- 2 files changed, 240 insertions(+), 147 deletions(-) diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index fbd13769..b316257d 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -440,7 +440,69 @@ def split_pages( @transform( input=split_pages, - filter=suffix('.ocr.page.pdf'), + filter=suffix('.page.pdf'), + output='.preview.png', + output_dir=work_folder, + extras=[_log, _pdfinfo, _pdfinfo_lock]) +def rasterize_preview( + input_file, + output_file, + log, + pdfinfo, + pdfinfo_lock): + ghostscript.rasterize_pdf( + input_file=input_file, + output_file=output_file, + xres=200, + yres=200, + raster_device='pnggray', + log=log) + + +@collate( + input=[split_pages, rasterize_preview], + filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.png)"), + output=os.path.join(work_folder, r'\1\2.oriented.pdf'), + extras=[_log, _pdfinfo, _pdfinfo_lock]) +def orient_page( + infiles, + output_file, + log, + pdfinfo, + pdfinfo_lock): + + page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf')) + preview = next(ii for ii in infiles if ii.endswith('.preview.png')) + + orient_conf = tesseract.get_orientation( + preview, + language=options.language, + timeout=options.tesseract_timeout, + log=log) + print(orient_conf) + + if orient_conf.angle == 0: + re_symlink(page_pdf, output_file) + else: + if orient_conf.confidence < 15: + log.warning( + 'Low orientation confidence {:.1f}'.format( + orient_conf.confidence)) + + writer = pypdf.PdfFileWriter() + reader = pypdf.PdfFileReader(page_pdf) + page = reader.pages[0] + + # Rotate opposite of orientation + rotated_page = page.rotateClockwise(orient_conf.angle) + writer.addPage(rotated_page) + with open(output_file, 'wb') as out: + writer.write(out) + + +@transform( + input=orient_page, + filter=suffix('.ocr.oriented.pdf'), output='.page.png', output_dir=work_folder, extras=[_log, _pdfinfo, _pdfinfo_lock]) @@ -571,8 +633,8 @@ def select_image_for_pdf( @active_if(options.pdf_renderer == 'hocr') @collate( - input=[select_image_for_pdf, split_pages], - filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.page\.pdf)"), + input=[select_image_for_pdf, orient_page], + filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.image-layer.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) def select_image_layer( @@ -582,7 +644,7 @@ def select_image_layer( pdfinfo, pdfinfo_lock): - page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf')) + page_pdf = next(ii for ii in infiles if ii.endswith('.ocr.oriented.pdf')) image = next(ii for ii in infiles if ii.endswith('.image')) if lossless_reconstruction: @@ -679,8 +741,8 @@ def add_text_layer( @active_if(options.pdf_renderer == 'tesseract') @collate( - input=[preprocess_clean, split_pages], - filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.page\.pdf)"), + input=[preprocess_clean, orient_page], + filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) def tesseract_ocr_and_render_pdf( @@ -754,8 +816,8 @@ def generate_postscript_stub( @transform( - input=split_pages, - filter=suffix('.skip.page.pdf'), + input=orient_page, + filter=suffix('.skip.oriented.pdf'), output='.done.pdf', output_dir=work_folder, extras=[_log]) diff --git a/pipeline.svg b/pipeline.svg index 3c639f86..09d2acc4 100644 --- a/pipeline.svg +++ b/pipeline.svg @@ -4,261 +4,292 @@ - - + + Pipeline: - + clustertasks - -Pipeline: + +Pipeline: t0 - - - - -repair_pdf + + + + +repair_pdf t1 - - -split_pages + + +split_pages t0->t1 - - + + - -t12 - - - - -generate_postscript_stub + +t14 + + + + +generate_postscript_stub - -t0->t12 - - + +t0->t14 + + t2 - - - - -rasterize_with_ghostscript + + + + +rasterize_preview t1->t2 - - + + - -t7 + +t3 + + + + +orient_page + + +t1->t3 + + + + +t2->t3 + + + + +t4 + + + + +rasterize_with_ghostscript + + +t3->t4 + + + + +t9 select_image_layer - -t1->t7 - + +t3->t9 + + +t15 + + + + +skip_page + + +t3->t15 + + + -t13 - - - - -skip_page - - -t1->t13 - - - - -t11 +t13 tesseract_ocr_and_render_pdf - -t1->t11 - - + +t3->t13 + + - -t3 - - - - -preprocess_deskew + +t5 + + + + +preprocess_deskew - -t2->t3 - - + +t4->t5 + + - -t6 + +t8 select_image_for_pdf - -t2->t6 - - + +t4->t8 + + - -t4 - - - - -preprocess_clean + +t6 + + + + +preprocess_clean - -t3->t4 - - + +t5->t6 + + - -t3->t6 - - + +t5->t8 + + - -t5 + +t7 ocr_tesseract_hocr - -t4->t5 - - + +t6->t7 + + - -t4->t6 - - + +t6->t8 + + - -t4->t11 - - + +t6->t13 + + - -t8 + +t10 render_hocr_page - -t5->t8 + +t7->t10 - -t9 + +t11 render_hocr_debug_page - -t5->t9 + +t7->t11 - -t10 + +t12 add_text_layer - -t8->t10 + +t10->t12 - -t6->t7 + +t8->t9 - -t6->t9 + +t8->t11 - -t7->t10 + +t9->t12 - -t14 + +t16 merge_pages - -t10->t14 + +t12->t16 - -t9->t14 + +t11->t16 - -t13->t14 - + +t15->t16 + - -t11->t14 + +t13->t16 - -t12->t14 - + +t14->t16 + - -t15 + +t17 copy_final - -t14->t15 + +t16->t17