diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index b5441cae..a4d6ea7c 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -483,7 +483,6 @@ def orient_page( language=options.language, timeout=options.tesseract_timeout, log=log) - print(orient_conf) if orient_conf.angle == 0: re_symlink(page_pdf, output_file) @@ -745,6 +744,11 @@ def add_text_layer( w, h = page_image.mediaBox.getWidth(), page_image.mediaBox.getHeight() + # Rotation occurs about the bottom left corner of the image rather than + # the center. + # Rotation is applied first, then the image must be translated so that the + # bottom left corner of the image is moved to the bottom left corner of the + # page. if rotation == 0: tx, ty = 0, 0 elif rotation == 90: @@ -770,8 +774,8 @@ def add_text_layer( @active_if(options.pdf_renderer == 'tesseract') @collate( - input=[preprocess_clean, orient_page], - filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.ocr\.oriented\.pdf)"), + input=[select_image_for_pdf, orient_page], + filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) def tesseract_ocr_and_render_pdf( @@ -781,7 +785,7 @@ def tesseract_ocr_and_render_pdf( pdfinfo, pdfinfo_lock): - input_image = next((ii for ii in input_files if ii.endswith('.png')), '') + input_image = next((ii for ii in input_files if ii.endswith('.image')), '') input_pdf = next((ii for ii in input_files if ii.endswith('.pdf'))) if not input_image: # Skipping this page @@ -854,6 +858,10 @@ def skip_page( input_file, output_file, log): + # The purpose of this step is its filter to forward only the skipped + # files (.skip.oriented.pdf) while disregarding the processed ones + # (.ocr.oriented.pdf). Alternative would be for merge_pages to filter + # pages itself if it gets multiple copies of a page. re_symlink(input_file, output_file, log) diff --git a/pipeline.svg b/pipeline.svg index 09d2acc4..6c199c90 100644 --- a/pipeline.svg +++ b/pipeline.svg @@ -4,90 +4,90 @@ - + Pipeline: - + clustertasks - -Pipeline: + +Pipeline: t0 - - - - -repair_pdf + + + + +repair_pdf t1 - - -split_pages + + +split_pages t0->t1 - - + + t14 - - - - -generate_postscript_stub + + + + +generate_postscript_stub t0->t14 - - + + t2 - - - - -rasterize_preview + + + + +rasterize_preview t1->t2 - - + + t3 - - - - -orient_page + + + + +orient_page t1->t3 - - + + t2->t3 - - + + t4 - - - - -rasterize_with_ghostscript + + + + +rasterize_with_ghostscript t3->t4 - - + + t9 @@ -99,199 +99,199 @@ t3->t9 - - + + t15 - - - - -skip_page + + + + +skip_page t3->t15 - - + + t13 - - - - -tesseract_ocr_and_render_pdf + + + + +tesseract_ocr_and_render_pdf t3->t13 - - + + t5 - - - - -preprocess_deskew + + + + +preprocess_deskew t4->t5 - - + + t8 - - - - -select_image_for_pdf + + + + +select_image_for_pdf t4->t8 - - + + t6 - - - - -preprocess_clean + + + + +preprocess_clean t5->t6 - - + + t5->t8 - - + + t7 - - - - -ocr_tesseract_hocr + + + + +ocr_tesseract_hocr t6->t7 - - + + t6->t8 - - - - -t6->t13 - - + + t10 - - - - -render_hocr_page + + + + +render_hocr_page t7->t10 - - + + t11 - - - - -render_hocr_debug_page + + + + +render_hocr_debug_page t7->t11 - - + + t12 - - - - -add_text_layer + + + + +add_text_layer t10->t12 - - + + t8->t9 - - + + t8->t11 - - + + + + +t8->t13 + + t9->t12 - - + + t16 - - -merge_pages + + +merge_pages t12->t16 - - + + t11->t16 - - + + t15->t16 - - + + t13->t16 - - + + t14->t16 - - + + t17 - - - - -copy_final + + + + +copy_final t16->t17 - - + +