Add support for -b (skip big pages)

This commit is contained in:
Jim Barlow 2015-02-20 15:26:33 -08:00
parent 02c1dcec8e
commit db311fb6a2
2 changed files with 18 additions and 3 deletions

View File

@ -49,6 +49,7 @@ Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-l languag
-f : Force to OCR the whole document, even if some page already contain font data
(which should not be the case for PDF files built from scanned images)
-s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
-b : Skip big pages
-l : Set the language of the PDF file in order to improve OCR results (default "eng")
Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
Multiple languages may be specified, separated by '+' characters.
@ -91,10 +92,11 @@ OVERSAMPLING_DPI="0" # 0=do not perform oversampling (dpi value under which ove
PDF_NOIMG="0" # 0=no, 1=yes (generates each PDF page twice, with and without image)
FORCE_OCR="0" # 0=do not force, 1=force (force to OCR the whole document, even if some pages already contain font data)
SKIP_TEXT="0" # 0=do not skip text pages, 1=skip text pages
SKIP_BIG="0" # 0=do not skip big pages, 1=skip big pages (set by the -b option)
TESS_CFG_FILES="" # list of additional configuration files to be used by tesseract
# Parse optional command line arguments
while getopts ":hvgkdcio:fsl:C:" opt; do
while getopts ":hvgkdcio:fsbl:C:" opt; do
case $opt in
h) usage ; exit 0 ;;
v) VERBOSITY=$(($VERBOSITY+1)) ;;
@ -106,6 +108,7 @@ while getopts ":hvgkdcio:fsl:C:" opt; do
o) OVERSAMPLING_DPI="$OPTARG" ;;
f) FORCE_OCR="1" ;;
s) SKIP_TEXT="1" ;;
b) SKIP_BIG="1" ;;
l) LAN="$OPTARG" ;;
C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
\?)
@ -270,9 +273,10 @@ sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO
numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "`
# process each page of the input pdf file
parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \
"$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
ret_code="$?"
[ $ret_code -ne 0 ] && exit $ret_code

View File

@ -71,6 +71,9 @@ parser.add_argument(
parser.add_argument(
'skip_text', type=int, # Implemented
help="Skip OCR on pages that contain fonts and include the page anyway")
# skip_big is a positional integer flag: the shell wrapper passes its SKIP_BIG
# value (0 or 1) right after skip_text and before the tesseract config files,
# so the order of these add_argument calls must match the wrapper's
# argument order.
parser.add_argument(
'skip_big', type=int,
help="Skip OCR for pages that are very large")
parser.add_argument(
'tess_cfg_files', default='', nargs='*', # Implemented
help="Tesseract configuration")
@ -166,6 +169,14 @@ ocr_required = pageinfo['images'] and \
(options.force_ocr or
(not (pageinfo['has_text'] and options.skip_text)))
# When skip_big (-b) is set, cancel OCR for oversized pages. A page counts as
# oversized if its area (width_inches * height_inches) exceeds 11 x 17, or if
# its pixel count exceeds that of an 11 x 17 inch page sampled at 300 pixels
# per inch in each direction. The page is only logged here; OCR is disabled by
# clearing ocr_required.
if ocr_required and options.skip_big:
area = pageinfo['width_inches'] * pageinfo['height_inches']
pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
ocr_required = False
logger.info(
"Page {0} is very large; skipping due to -b".format(pageno))
def re_symlink(input_file, soft_link_name, logger, logger_mutex):
"""