mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-01-05 19:51:07 +00:00
Add support for -b (skip big pages)
This commit is contained in:
parent
02c1dcec8e
commit
db311fb6a2
10
OCRmyPDF.sh
10
OCRmyPDF.sh
@ -49,6 +49,7 @@ Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-l languag
|
||||
-f : Force to OCR the whole document, even if some pages already contain font data
|
||||
(which should not be the case for PDF files built from scanned images)
|
||||
-s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
|
||||
-b : Skip OCR on very large pages (area over 11 x 17 inches, or more pixels than an 11 x 17 inch page at 300 dpi)
|
||||
-l : Set the language of the PDF file in order to improve OCR results (default "eng")
|
||||
Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
|
||||
Multiple languages may be specified, separated by '+' characters.
|
||||
@ -91,10 +92,11 @@ OVERSAMPLING_DPI="0" # 0=do not perform oversampling (dpi value under which ove
|
||||
PDF_NOIMG="0" # 0=no, 1=yes (generates each PDF page twice, with and without image)
|
||||
FORCE_OCR="0" # 0=do not force, 1=force (force to OCR the whole document, even if some pages already contain font data)
|
||||
SKIP_TEXT="0" # 0=do not skip text pages, 1=skip text pages
|
||||
SKIP_BIG="0" # 0=do not skip big pages, 1=skip OCR on very large pages
|
||||
TESS_CFG_FILES="" # list of additional configuration files to be used by tesseract
|
||||
|
||||
# Parse optional command line arguments
|
||||
while getopts ":hvgkdcio:fsl:C:" opt; do
|
||||
while getopts ":hvgkdcio:fsbl:C:" opt; do
|
||||
case $opt in
|
||||
h) usage ; exit 0 ;;
|
||||
v) VERBOSITY=$(($VERBOSITY+1)) ;;
|
||||
@ -106,6 +108,7 @@ while getopts ":hvgkdcio:fsl:C:" opt; do
|
||||
o) OVERSAMPLING_DPI="$OPTARG" ;;
|
||||
f) FORCE_OCR="1" ;;
|
||||
s) SKIP_TEXT="1" ;;
|
||||
b) SKIP_BIG="1" ;;
|
||||
l) LAN="$OPTARG" ;;
|
||||
C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
|
||||
\?)
|
||||
@ -270,9 +273,10 @@ sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO
|
||||
numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "`
|
||||
|
||||
# process each page of the input pdf file
|
||||
parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
|
||||
parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \
|
||||
"$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
|
||||
"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
|
||||
"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
|
||||
"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
|
||||
ret_code="$?"
|
||||
[ $ret_code -ne 0 ] && exit $ret_code
|
||||
|
||||
|
||||
@ -71,6 +71,9 @@ parser.add_argument(
|
||||
parser.add_argument(
|
||||
'skip_text', type=int, # Implemented
|
||||
help="Skip OCR on pages that contain fonts and include the page anyway")
|
||||
parser.add_argument(
|
||||
'skip_big', type=int,
|
||||
help="Skip OCR for pages that are very large")
|
||||
parser.add_argument(
|
||||
'tess_cfg_files', default='', nargs='*', # Implemented
|
||||
help="Tesseract configuration")
|
||||
@ -166,6 +169,14 @@ ocr_required = pageinfo['images'] and \
|
||||
(options.force_ocr or
|
||||
(not (pageinfo['has_text'] and options.skip_text)))
|
||||
|
||||
if ocr_required and options.skip_big:
|
||||
area = pageinfo['width_inches'] * pageinfo['height_inches']
|
||||
pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
|
||||
if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
|
||||
ocr_required = False
|
||||
logger.info(
|
||||
"Page {0} is very large; skipping due to -b".format(pageno))
|
||||
|
||||
|
||||
def re_symlink(input_file, soft_link_name, logger, logger_mutex):
|
||||
"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user