From db311fb6a2100484e0f384473dacfca57dcc6748 Mon Sep 17 00:00:00 2001 From: Jim Barlow Date: Fri, 20 Feb 2015 15:26:33 -0800 Subject: [PATCH] Add support for -b (skip big pages) --- OCRmyPDF.sh | 10 +++++++--- src/ocrpage.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/OCRmyPDF.sh b/OCRmyPDF.sh index 39bcda63..d7020112 100755 --- a/OCRmyPDF.sh +++ b/OCRmyPDF.sh @@ -49,6 +49,7 @@ Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-l languag -f : Force to OCR the whole document, even if some page already contain font data (which should not be the case for PDF files built from scanned images) -s : If pages contain font data, do not perform processing on that page, but include the page in the final output. +-b : Skip big pages -l : Set the language of the PDF file in order to improve OCR results (default "eng") Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes) Multiple languages may be specified, separated by '+' characters. @@ -91,10 +92,11 @@ OVERSAMPLING_DPI="0" # 0=do not perform oversampling (dpi value under which ove PDF_NOIMG="0" # 0=no, 1=yes (generates each PDF page twice, with and without image) FORCE_OCR="0" # 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data) SKIP_TEXT="0" # 0=do not skip text pages, 1=skip text pages +SKIP_BIG="0" # 0=do not skip big pages, 1=skip big pages TESS_CFG_FILES="" # list of additional configuration files to be used by tesseract # Parse optional command line arguments -while getopts ":hvgkdcio:fsl:C:" opt; do +while getopts ":hvgkdcio:fsbl:C:" opt; do case $opt in h) usage ; exit 0 ;; v) VERBOSITY=$(($VERBOSITY+1)) ;; @@ -106,6 +108,7 @@ while getopts ":hvgkdcio:fsl:C:" opt; do o) OVERSAMPLING_DPI="$OPTARG" ;; f) FORCE_OCR="1" ;; s) SKIP_TEXT="1" ;; + b) SKIP_BIG="1" ;; l) LAN="$OPTARG" ;; C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;; \?) 
@@ -270,9 +273,10 @@ sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "` # process each page of the input pdf file -parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \ +parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \ + "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \ "$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \ - "$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO" + "$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO" ret_code="$?" [ $ret_code -ne 0 ] && exit $ret_code diff --git a/src/ocrpage.py b/src/ocrpage.py index fc5d3041..d52ae706 100755 --- a/src/ocrpage.py +++ b/src/ocrpage.py @@ -71,6 +71,9 @@ parser.add_argument( parser.add_argument( 'skip_text', type=int, # Implemented help="Skip OCR on pages that contain fonts and include the page anyway") +parser.add_argument( + 'skip_big', type=int,  # Implemented + help="Skip OCR for pages that are very large") parser.add_argument( 'tess_cfg_files', default='', nargs='*', # Implemented help="Tesseract configuration") @@ -166,6 +169,14 @@ ocr_required = pageinfo['images'] and \ (options.force_ocr or (not (pageinfo['has_text'] and options.skip_text))) +if ocr_required and options.skip_big:  # checked after force_ocr, so -b wins + area = pageinfo['width_inches'] * pageinfo['height_inches'] + pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels'] + if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17): + ocr_required = False  # over 11x17 in, or over 11x17 in at 300 dpi + logger.info( + "Page {0} is very large; skipping due to -b".format(pageno)) + def re_symlink(input_file, soft_link_name, logger, logger_mutex): """