diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index 96f07141..978bdcda 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -20,7 +20,7 @@ from PIL import Image from functools import partial from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \ - formatter, follows, split, collate, check_if_uptodate + formatter, follows, split, collate, check_if_uptodate, graphviz import ruffus.ruffus_exceptions as ruffus_exceptions import ruffus.cmdline as cmdline import ruffus.proxy_logger as proxy_logger @@ -798,6 +798,7 @@ def preprocess_clean( filter=suffix(".pp-clean.png"), output=".hocr", extras=[_log, _pdfinfo, _pdfinfo_lock]) +@graphviz(fillcolor='"#00cc66"') def ocr_tesseract_hocr( input_file, output_file, @@ -823,6 +824,7 @@ def ocr_tesseract_hocr( filter=regex(r".*/(\d{6})(?:\.page|\.pp-deskew|\.pp-clean)\.png"), output=os.path.join(work_folder, r'\1.image'), extras=[_log, _pdfinfo, _pdfinfo_lock]) +@graphviz(shape='diamond') def select_image_for_pdf( infiles, output_file, @@ -863,6 +865,7 @@ def select_image_for_pdf( filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.image-layer.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) +@graphviz(fillcolor='"#00cc66"', shape='diamond') def select_image_layer( infiles, output_file, @@ -897,6 +900,7 @@ def select_image_layer( filter=suffix('.hocr'), output='.hocr.pdf', extras=[_log, _pdfinfo, _pdfinfo_lock]) +@graphviz(fillcolor='"#00cc66"') def render_hocr_page( input_file, output_file, @@ -919,6 +923,7 @@ def render_hocr_page( filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"), output=os.path.join(work_folder, r'\1.debug.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) +@graphviz(fillcolor='"#00cc66"') def render_hocr_debug_page( infiles, output_file, @@ -946,6 +951,7 @@ class PdfMergeFailedError(Exception): filter=regex(r".*/(\d{6})(?:\.hocr\.pdf|\.image-layer\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) +@graphviz(fillcolor='"#00cc66"') def add_text_layer( infiles, output_file, @@ -1018,6 +1024,7 @@ def add_text_layer( filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[_log, _pdfinfo, _pdfinfo_lock]) +@graphviz(fillcolor='"#66ccff"') def tesseract_ocr_and_render_pdf( input_files, output_file, @@ -1316,7 +1323,7 @@ def run_pipeline(): from shutil import copyfileobj copyfileobj(sys.stdin.buffer, stream_buffer) else: - re_symlink(options.input_file, start_input_file, log) + re_symlink(options.input_file, start_input_file, _log) cmdline.run(options) except ruffus_exceptions.RethrownJobError as e: diff --git a/pipeline.svg b/pipeline.svg index 87573b46..0f3d40aa 100644 --- a/pipeline.svg +++ b/pipeline.svg @@ -4,341 +4,335 @@ - - + + Pipeline: - + clustertasks - -Pipeline: + +Pipeline: t0 - - - - -triage + + + + +triage t1 - - - - -repair_pdf + + + + +repair_pdf t0->t1 - - + + t2 - - -split_pages + + +split_pages t1->t2 - - + + t15 - - - - -generate_postscript_stub + + + + +generate_postscript_stub t1->t15 - - + + t18 - - -merge_pages_qpdf + + +merge_pages_qpdf t1->t18 - - + + t3 - - - - -rasterize_preview + + + + +rasterize_preview t2->t3 - - + + t4 - - - - -orient_page + + + + +orient_page t2->t4 - - + + t3->t4 - - + + t5 - - - - -rasterize_with_ghostscript + + + + +rasterize_with_ghostscript t4->t5 - - + + t10 - - - - -select_image_layer + +select_image_layer t4->t10 - - + + t16 - - - - -skip_page + + + + +skip_page t4->t16 - - + + t14 - - - - -tesseract_ocr_and_render_pdf + + + + +tesseract_ocr_and_render_pdf t4->t14 - - + + t6 - - - - -preprocess_deskew + + + + +preprocess_deskew t5->t6 - - + + t9 - - - - -select_image_for_pdf + +select_image_for_pdf t5->t9 - - + + t7 - - - - -preprocess_clean + + + + +preprocess_clean t6->t7 - - + + t6->t9 - - + + t8 - - - - -ocr_tesseract_hocr + + + + +ocr_tesseract_hocr t7->t8 - - + + t7->t9 - - + + t11 - - - - -render_hocr_page + + + + +render_hocr_page t8->t11 - - + + t12 - - - - -render_hocr_debug_page + + + + +render_hocr_debug_page t8->t12 - - + + t13 - - - - -add_text_layer + + + + +add_text_layer t11->t13 - - + + t9->t10 - - + + t9->t12 - - + + t9->t14 - - + + t10->t13 - - + + t17 - - -merge_pages_ghostscript + + +merge_pages_ghostscript t13->t17 - - + + t13->t18 - - + + t12->t17 - - + + t12->t18 - - + + t16->t17 - - + + t16->t18 - - + + t14->t17 - - + + t14->t18 - - + + t15->t17 - - + + t19 - - -copy_final + + +copy_final t17->t19 - - + + t18->t19 - - + +