Remove hocr debug renderer (-g)

Because this mode produces additional pages, it is a maintenance burden. hOCR output can be debugged using hocrtransform instead.
2026-01-05 11:41:19 +00:00 · 2018-05-10 13:48:39 -07:00 · 2018-05-10 13:48:39 -07:00 · e0bb898f29
commit e0bb898f29
parent 45336c7c28
3 changed files with 1 addition and 38 deletions
--- a/README.rst
+++ b/README.rst
@ -39,7 +39,6 @@ Main features
 -  Keeps file size about the same
 -  If requested deskews and/or cleans the image before performing OCR
 -  Validates input and output files
-  Provides debug mode to enable easy verification of the OCR results
 -  Processes pages in parallel when more than one CPU core is
   available
 -  Uses `Tesseract OCR <https://github.com/tesseract-ocr/tesseract>`_ engine
--- a/src/ocrmypdf/main.py
+++ b/src/ocrmypdf/main.py
@ -337,9 +337,6 @@ debugging = parser.add_argument_group(
 debugging.add_argument(
    '-k', '--keep-temporary-files', action='store_true',
    help="Keep temporary files (helpful for debugging)")
-debugging.add_argument(
-    '-g', '--debug-rendering', action='store_true',
-    help="Render each page twice with debug information on second page")
 debugging.add_argument(
    '--flowchart', type=str,
    help="Generate the pipeline execution flowchart")
@ -386,10 +383,6 @@ def check_options_output(options, log):
            "Ghostscript, which is known to corrupt the OCR text of "
            "some PDFs produced your version of Tesseract.")

-    if options.debug_rendering and options.pdf_renderer != 'hocr':
-        log.info(
-            "Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
-
    lossless_reconstruction = False
    if options.pdf_renderer in ('hocr', 'sandwich'):
        if not any((options.deskew, options.clean_final, options.force_ocr,
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@ -739,24 +739,6 @@ def flatten_groups(groups):
            yield obj


-def render_hocr_debug_page(
-        infiles,
-        output_file,
-        log,
-        context):
-    options = context.get_options()
-    hocr = next(ii for ii in flatten_groups(infiles) if ii.endswith('.hocr'))
-    image = next(ii for ii in flatten_groups(infiles) if ii.endswith('.image'))
-
-    pageinfo = get_pageinfo(image, context)
-    dpi = get_page_square_dpi(pageinfo, options)
-
-    hocrtransform = HocrTransform(hocr, dpi)
-    hocrtransform.to_pdf(output_file, imageFileName=None,
-                         showBoundingboxes=True, invisibleText=False,
-                         interwordSpaces=True)
-
-
 def _weave_layers_graft(
        *, pdf_base, page_num, text, font, font_key, procset, rotation, log):
    log.debug("Grafting")
@ -1321,16 +1303,6 @@ def build_pipeline(options, work_folder, log, context):
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

-    task_render_hocr_debug_page = main_pipeline.collate(
-        task_func=render_hocr_debug_page,
-        input=[task_select_visible_page_image, task_ocr_tesseract_hocr],
-        filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
-        output=os.path.join(work_folder, r'\1.debug.pdf'),
-        extras=[log, context])
-    task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"')
-    task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr')
-    task_render_hocr_debug_page.active_if(options.debug_rendering)
-
    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
@ -1366,8 +1338,7 @@ def build_pipeline(options, work_folder, log, context):
    task_metadata_fixup = main_pipeline.merge(
        task_func=metadata_fixup,
        input=[task_repair_and_parse_pdf,
-               task_weave_layers,
-               task_render_hocr_debug_page,
+               task_weave_layers,               
               task_generate_postscript_stub],
        output=os.path.join(work_folder, 'metafix.pdf'),
        extras=[log, context]