From db311fb6a2100484e0f384473dacfca57dcc6748 Mon Sep 17 00:00:00 2001
From: Jim Barlow <jim@purplerock.ca>
Date: Fri, 20 Feb 2015 15:26:33 -0800
Subject: [PATCH] Add support for -b (skip big pages)

---
 OCRmyPDF.sh    | 10 +++++++---
 src/ocrpage.py | 11 +++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/OCRmyPDF.sh b/OCRmyPDF.sh
index 39bcda63..d7020112 100755
--- a/OCRmyPDF.sh
+++ b/OCRmyPDF.sh
@@ -49,6 +49,7 @@ Usage: OCRmyPDF.sh  [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-l languag
 -f : Force to OCR the whole document, even if some page already contain font data 
      (which should not be the case for PDF files built from scanned images) 
 -s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
+-b : Skip OCR on very large pages (larger than 11x17 inches, or the equivalent pixel count at 300 dpi)
 -l : Set the language of the PDF file in order to improve OCR results (default "eng")
      Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
      Multiple languages may be specified, separated by '+' characters.
@@ -91,10 +92,11 @@ OVERSAMPLING_DPI="0"		# 0=do not perform oversampling (dpi value under which ove
 PDF_NOIMG="0"			# 0=no, 1=yes (generates each PDF page twice, with and without image)
 FORCE_OCR="0"			# 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data)
 SKIP_TEXT="0"			# 0=do not skip text pages, 1=skip text pages
+SKIP_BIG="0"			# 0=do not skip big pages, 1=skip big pages (set by -b)
 TESS_CFG_FILES=""		# list of additional configuration files to be used by tesseract
 
 # Parse optional command line arguments
-while getopts ":hvgkdcio:fsl:C:" opt; do
+while getopts ":hvgkdcio:fsbl:C:" opt; do
 	case $opt in
 		h) usage ; exit 0 ;;
 		v) VERBOSITY=$(($VERBOSITY+1)) ;;
@@ -106,6 +108,7 @@ while getopts ":hvgkdcio:fsl:C:" opt; do
 		o) OVERSAMPLING_DPI="$OPTARG" ;;
 		f) FORCE_OCR="1" ;;
 		s) SKIP_TEXT="1" ;;
+		b) SKIP_BIG="1" ;;
 		l) LAN="$OPTARG" ;;
 		C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
 		\?)
@@ -270,9 +273,10 @@ sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO
 numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "`
 
 # process each page of the input pdf file
-parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
+parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \
+	"$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
 	"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
-	"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
+	"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
 ret_code="$?"
 [ $ret_code -ne 0 ] && exit $ret_code 
 
diff --git a/src/ocrpage.py b/src/ocrpage.py
index fc5d3041..d52ae706 100755
--- a/src/ocrpage.py
+++ b/src/ocrpage.py
@@ -71,6 +71,9 @@ parser.add_argument(
 parser.add_argument(
     'skip_text', type=int,                  # Implemented
     help="Skip OCR on pages that contain fonts and include the page anyway")
+parser.add_argument(
+    'skip_big', type=int,                   # Implemented
+    help="Skip OCR for pages that are very large")
 parser.add_argument(
     'tess_cfg_files', default='', nargs='*',    # Implemented
     help="Tesseract configuration")
@@ -166,6 +169,14 @@ ocr_required = pageinfo['images'] and \
     (options.force_ocr or
         (not (pageinfo['has_text'] and options.skip_text)))
 
+if ocr_required and options.skip_big:  # -b given and page still needs OCR
+    area = pageinfo['width_inches'] * pageinfo['height_inches']  # vs 11 * 17
+    pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
+    if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
+        ocr_required = False  # wins even when force_ocr is set
+        logger.info(
+            "Page {0} is very large; skipping due to -b".format(pageno))
+
 
 def re_symlink(input_file, soft_link_name, logger, logger_mutex):
     """