Add support for -b (skip OCR on very large pages)

2026-01-05 19:51:07 +00:00 · 2015-02-20 15:26:33 -08:00 · 2015-02-20 15:26:33 -08:00 · db311fb6a2
commit db311fb6a2
parent 02c1dcec8e
2 changed files with 18 additions and 3 deletions
--- a/OCRmyPDF.sh
+++ b/OCRmyPDF.sh
@ -49,6 +49,7 @@ Usage: OCRmyPDF.sh  [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-l languag
 -f : Force to OCR the whole document, even if some page already contain font data 
     (which should not be the case for PDF files built from scanned images) 
 -s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
+-b : Skip big pages
 -l : Set the language of the PDF file in order to improve OCR results (default "eng")
     Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
     Multiple languages may be specified, separated by '+' characters.
@ -91,10 +92,11 @@ OVERSAMPLING_DPI="0"		# 0=do not perform oversampling (dpi value under which ove
 PDF_NOIMG="0"			# 0=no, 1=yes (generates each PDF page twice, with and without image)
 FORCE_OCR="0"			# 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data)
 SKIP_TEXT="0"			# 0=do not skip text pages, 1=skip text pages
+SKIP_BIG="0"
 TESS_CFG_FILES=""		# list of additional configuration files to be used by tesseract

 # Parse optional command line arguments
-while getopts ":hvgkdcio:fsl:C:" opt; do
+while getopts ":hvgkdcio:fsbl:C:" opt; do
 	case $opt in
 		h) usage ; exit 0 ;;
 		v) VERBOSITY=$(($VERBOSITY+1)) ;;
@ -106,6 +108,7 @@ while getopts ":hvgkdcio:fsl:C:" opt; do
 		o) OVERSAMPLING_DPI="$OPTARG" ;;
 		f) FORCE_OCR="1" ;;
 		s) SKIP_TEXT="1" ;;
+		b) SKIP_BIG="1" ;;
 		l) LAN="$OPTARG" ;;
 		C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
 		\?)
@ -270,9 +273,10 @@ sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO
 numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "`

 # process each page of the input pdf file
-parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
+parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \
+	"$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
 	"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
-	"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
+	"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
 ret_code="$?"
 [ $ret_code -ne 0 ] && exit $ret_code 

--- a/src/ocrpage.py
+++ b/src/ocrpage.py
@ -71,6 +71,9 @@ parser.add_argument(
 parser.add_argument(
    'skip_text', type=int,                  # Implemented
    help="Skip OCR on pages that contain fonts and include the page anyway")
+parser.add_argument(
+    'skip_big', type=int,
+    help="Skip OCR for pages that are very large")
 parser.add_argument(
    'tess_cfg_files', default='', nargs='*',    # Implemented
    help="Tesseract configuration")
@ -166,6 +169,14 @@ ocr_required = pageinfo['images'] and \
    (options.force_ocr or
        (not (pageinfo['has_text'] and options.skip_text)))

+if ocr_required and options.skip_big:
+    area = pageinfo['width_inches'] * pageinfo['height_inches']
+    pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
+    if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
+        ocr_required = False
+        logger.info(
+            "Page {0} is very large; skipping due to -b".format(pageno))
+

 def re_symlink(input_file, soft_link_name, logger, logger_mutex):
    """