diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index fbd13769..b316257d 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -440,7 +440,69 @@ def split_pages( @transform( input=split_pages, - filter=suffix('.ocr.page.pdf'), + filter=suffix('.page.pdf'), + output='.preview.png', + output_dir=work_folder, + extras=[_log, _pdfinfo, _pdfinfo_lock]) +def rasterize_preview( + input_file, + output_file, + log, + pdfinfo, + pdfinfo_lock): + ghostscript.rasterize_pdf( + input_file=input_file, + output_file=output_file, + xres=200, + yres=200, + raster_device='pnggray', + log=log) + + +@collate( + input=[split_pages, rasterize_preview], + filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.png)"), + output=os.path.join(work_folder, r'\1\2.oriented.pdf'), + extras=[_log, _pdfinfo, _pdfinfo_lock]) +def orient_page( + infiles, + output_file, + log, + pdfinfo, + pdfinfo_lock): + + page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf')) + preview = next(ii for ii in infiles if ii.endswith('.preview.png')) + + orient_conf = tesseract.get_orientation( + preview, + language=options.language, + timeout=options.tesseract_timeout, + log=log) + print(orient_conf) + + if orient_conf.angle == 0: + re_symlink(page_pdf, output_file) + else: + if orient_conf.confidence < 15: + log.warning( + 'Low orientation confidence {:.1f}'.format( + orient_conf.confidence)) + + writer = pypdf.PdfFileWriter() + reader = pypdf.PdfFileReader(page_pdf) + page = reader.pages[0] + + # Rotate opposite of orientation + rotated_page = page.rotateClockwise(orient_conf.angle) + writer.addPage(rotated_page) + with open(output_file, 'wb') as out: + writer.write(out) + + +@transform( + input=orient_page, + filter=suffix('.ocr.oriented.pdf'), output='.page.png', output_dir=work_folder, extras=[_log, _pdfinfo, _pdfinfo_lock]) @@ -571,8 +633,8 @@ def select_image_for_pdf( @active_if(options.pdf_renderer == 'hocr') @collate( - input=[select_image_for_pdf, split_pages], - filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.page\.pdf)"), + input=[select_image_for_pdf, orient_page], + filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.image-layer.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) def select_image_layer( @@ -582,7 +644,7 @@ def select_image_layer( pdfinfo, pdfinfo_lock): - page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf')) + page_pdf = next(ii for ii in infiles if ii.endswith('.ocr.oriented.pdf')) image = next(ii for ii in infiles if ii.endswith('.image')) if lossless_reconstruction: @@ -679,8 +741,8 @@ def add_text_layer( @active_if(options.pdf_renderer == 'tesseract') @collate( - input=[preprocess_clean, split_pages], - filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.page\.pdf)"), + input=[preprocess_clean, orient_page], + filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) def tesseract_ocr_and_render_pdf( @@ -754,8 +816,8 @@ def generate_postscript_stub( @transform( - input=split_pages, - filter=suffix('.skip.page.pdf'), + input=orient_page, + filter=suffix('.skip.oriented.pdf'), output='.done.pdf', output_dir=work_folder, extras=[_log]) diff --git a/pipeline.svg b/pipeline.svg index 3c639f86..09d2acc4 100644 --- a/pipeline.svg +++ b/pipeline.svg @@ -4,261 +4,292 @@ - - + + Pipeline: - + clustertasks - -Pipeline: + +Pipeline: t0 - - - - -repair_pdf + + + + +repair_pdf t1 - - -split_pages + + +split_pages t0->t1 - - + + - -t12 - - - - -generate_postscript_stub + +t14 + + + + +generate_postscript_stub - -t0->t12 - - + +t0->t14 + + t2 - - - - -rasterize_with_ghostscript + + + + +rasterize_preview t1->t2 - - + + - -t7 + +t3 + + + + +orient_page + + +t1->t3 + + + + +t2->t3 + + + + +t4 + + + + +rasterize_with_ghostscript + + +t3->t4 + + + + +t9 select_image_layer - -t1->t7 - + +t3->t9 + + +t15 + + + + +skip_page + + +t3->t15 + + + -t13 - - - - -skip_page - - -t1->t13 - - - - -t11 +t13 tesseract_ocr_and_render_pdf - -t1->t11 - - + +t3->t13 + + - -t3 - - - - -preprocess_deskew + +t5 + + + + +preprocess_deskew - -t2->t3 - - + +t4->t5 + + - -t6 + +t8 select_image_for_pdf - -t2->t6 - - + +t4->t8 + + - -t4 - - - - -preprocess_clean + +t6 + + + + +preprocess_clean - -t3->t4 - - + +t5->t6 + + - -t3->t6 - - + +t5->t8 + + - -t5 + +t7 ocr_tesseract_hocr - -t4->t5 - - + +t6->t7 + + - -t4->t6 - - + +t6->t8 + + - -t4->t11 - - + +t6->t13 + + - -t8 + +t10 render_hocr_page - -t5->t8 + +t7->t10 - -t9 + +t11 render_hocr_debug_page - -t5->t9 + +t7->t11 - -t10 + +t12 add_text_layer - -t8->t10 + +t10->t12 - -t6->t7 + +t8->t9 - -t6->t9 + +t8->t11 - -t7->t10 + +t9->t12 - -t14 + +t16 merge_pages - -t10->t14 + +t12->t16 - -t9->t14 + +t11->t16 - -t13->t14 - + +t15->t16 + - -t11->t14 + +t13->t16 - -t12->t14 - + +t14->t16 + - -t15 + +t17 copy_final - -t14->t15 + +t16->t17