Add rudimentary support for combining OCR layer with existing content

This approach appears to be very fragile due to weaknesses in PyPDF2. A better option is probably to use pdftk's watermark feature.
2025-11-02 02:40:44 +00:00 · 2015-03-10 14:28:38 -07:00 · 2015-03-10 14:28:38 -07:00 · a99ba3b696
commit a99ba3b696
parent 9229f7c6cc
2 changed files with 59 additions and 26 deletions
--- a/OCRmyPDF.sh
+++ b/OCRmyPDF.sh
@ -50,6 +50,7 @@ Usage: OCRmyPDF.sh  [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-l languag
     (which should not be the case for PDF files built from scanned images) 
 -s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
 -b : Skip big pages
+-e : Use exact PDF pages with no changes other than inserting hidden OCR text layer (mutually exclusive with -d/-c/-i/-f)
 -l : Set the language of the PDF file in order to improve OCR results (default "eng")
     Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
     Multiple languages may be specified, separated by '+' characters.
@ -93,10 +94,11 @@ PDF_NOIMG="0"			# 0=no, 1=yes (generates each PDF page twice, with and without i
 FORCE_OCR="0"			# 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data)
 SKIP_TEXT="0"			# 0=do not skip text pages, 1=skip text pages
 SKIP_BIG="0"
+EXACT_IMAGE="0"
 TESS_CFG_FILES=""		# list of additional configuration files to be used by tesseract

 # Parse optional command line arguments
-while getopts ":hvgkdcio:fsbl:C:" opt; do
+while getopts ":hvgkdcio:fsbel:C:" opt; do
 	case $opt in
 		h) usage ; exit 0 ;;
 		v) VERBOSITY=$(($VERBOSITY+1)) ;;
@ -109,6 +111,7 @@ while getopts ":hvgkdcio:fsbl:C:" opt; do
 		f) FORCE_OCR="1" ;;
 		s) SKIP_TEXT="1" ;;
 		b) SKIP_BIG="1" ;;
+		e) EXACT_IMAGE="1" ;;
 		l) LAN="$OPTARG" ;;
 		C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
 		\?)
@ -273,10 +276,10 @@ sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO
 numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "`

 # process each page of the input pdf file
-parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \
+parallel --gnu -q -k --halt-on-error 1 -j2 python3 -m src.ocrpage \
 	"$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
 	"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
-	"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
+	"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$EXACT_IMAGE" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
 ret_code="$?"
 [ $ret_code -ne 0 ] && exit $ret_code 

--- a/src/ocrpage.py
+++ b/src/ocrpage.py
@ -6,6 +6,8 @@ import os.path
 import fileinput
 import re
 from parse import parse
+import PyPDF2 as pypdf
+import shutil

 from subprocess import Popen, check_call, PIPE, CalledProcessError, \
    TimeoutExpired
@ -73,6 +75,9 @@ parser.add_argument(
 parser.add_argument(
    'skip_big', type=int,
    help="Skip OCR for pages that are very large")
+parser.add_argument(
+    'exact_image', type=int,
+    help="Use original page from PDF without re-rendering")
 parser.add_argument(
    'tess_cfg_files', default='', nargs='*',    # Implemented
    help="Tesseract configuration")
@ -225,11 +230,11 @@ def setup_working_directory(input_file, soft_link_name):
        pass


-@active_if(not ocr_required)
+@active_if(not ocr_required or (ocr_required and options.exact_image))
@transform(setup_working_directory,
           formatter(),
-           os.path.join(options.tmp_fld, '%04i.skip.pdf' % pageno))
-def skip_ocr(
+           os.path.join(options.tmp_fld, '%04i.page.pdf' % pageno))
+def extract_single_page(
        input_file,
        output_file):
    args_pdfseparate = [
@ -526,27 +531,25 @@ def ocr_tesseract(
            raise CalledProcessError(p.returncode, args_tesseract)

        if os.path.exists(output_file + '.html'):
-            # Tesseract 3.02 appends suffix ".html" on its own
-            re_symlink(output_file + ".html", output_file,
-                       logger, logger_mutex)
+            # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
+            shutil.move(output_file + '.html', output_file)
        elif os.path.exists(output_file + '.hocr'):
-            # Tesseract 3.03 appends suffix ".hocr" on its own
-            re_symlink(output_file + ".hocr", output_file,
-                       logger, logger_mutex)
+            # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
+            shutil.move(output_file + '.hocr', output_file)

-            # The filename gets inserted to hocr
-            # but Tesseract does not verify that it is escaped XML
-            # it's not necessary so strip it out
-            regex_nested_single_quotes = re.compile(
-                r"""title='image "([^"]*)";""")
-            with fileinput.input(files=(output_file,), inplace=True) as f:
-                for line in f:
-                    line = regex_nested_single_quotes.sub(
-                        r"""title='image " ";""", line)
-                    print(line, end='')  # stdout is redirected here
+        # Tesseract inserts source filename into hocr file without escaping
+        # it. This could break the XML parser. Rewrite the hocr file,
+        # replacing the filename with a space.
+        regex_nested_single_quotes = re.compile(
+            r"""title='image "([^"]*)";""")
+        with fileinput.input(files=(output_file,), inplace=True) as f:
+            for line in f:
+                line = regex_nested_single_quotes.sub(
+                    r"""title='image " ";""", line)
+                print(line, end='')  # fileinput.input redirects stdout


-@active_if(ocr_required)
+@active_if(ocr_required and not options.exact_image)
@merge([unpack_with_ghostscript, convert_to_png,
        deskew_imagemagick, deskew_leptonica, cleaned_to_png],
       os.path.join(options.tmp_fld, "%04i.image_for_pdf" % pageno))
@ -567,7 +570,7 @@ def select_image_for_pdf(infiles, output_file):
        re_symlink(input_file, output_file, logger, logger_mutex)


-@active_if(ocr_required)
+@active_if(ocr_required and not options.exact_image)
@merge([ocr_tesseract, select_image_for_pdf],
       os.path.join(options.tmp_fld, '%04i.rendered.pdf' % pageno))
 def render_page(infiles, output_file):
@ -590,7 +593,34 @@ def render_text_output_page(input_file, output_file):
                         showBoundingboxes=True, invisibleText=False)


-@merge([render_page, skip_ocr],
+@active_if(ocr_required and options.exact_image)
+@transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf")
+def render_hocr_blank_page(input_file, output_file):
+    dpi = round(max(pageinfo['xres'], pageinfo['yres']))
+
+    hocrtransform = HocrTransform(input_file, dpi)
+    hocrtransform.to_pdf(output_file, imageFileName=None,
+                         showBoundingboxes=False, invisibleText=True)
+
+
+@active_if(ocr_required and options.exact_image)
+@merge([render_hocr_blank_page, extract_single_page],
+       os.path.join(options.tmp_fld, "%04i.merged.pdf") % pageno)
+def merge_hocr_with_original_page(infiles, output_file):
+    with open(infiles[0], 'rb') as hocr_input, \
+            open(infiles[1], 'rb') as page_input, \
+            open(output_file, 'wb') as output:
+        hocr_reader = pypdf.PdfFileReader(hocr_input)
+        page_reader = pypdf.PdfFileReader(page_input)
+        writer = pypdf.PdfFileWriter()
+
+        the_page = hocr_reader.getPage(0)
+        the_page.mergePage(page_reader.getPage(0))
+        writer.addPage(the_page)
+        writer.write(output)
+
+
+@merge([render_page, merge_hocr_with_original_page, extract_single_page],
       os.path.join(options.tmp_fld, '%04i.ocred.pdf' % pageno))
 def select_final_page(infiles, output_file):
    re_symlink(infiles[-1], output_file, logger, logger_mutex)
@ -600,7 +630,7 @@ cmdline.run(options)

 # parser.add_argument(
 #     'tess_cfg_files',
-#     help="Specific configuration files to be used by Tesseract during OCRing")
+#   help="Specific configuration files to be used by Tesseract during OCRing")


 def main():