Merge branch 'feature/keep-text-pages' into develop

This commit is contained in:
Jim Barlow 2014-09-25 03:50:21 -07:00
commit d7130a1e56
2 changed files with 9 additions and 10 deletions

View File

@ -272,7 +272,7 @@ numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "`
# process each page of the input pdf file
parallel --gnu -q -k --halt-on-error 1 "$OCR_PAGE" "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
"$PDF_NOIMG" "$TESS_CFG_FILES" "$FORCE_OCR" "$SKIP_TEXT" < "$FILE_PAGES_INFO"
"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
ret_code="$?"
[ $ret_code -ne 0 ] && exit $ret_code

View File

@ -7,7 +7,6 @@
. "./src/config.sh"
# Initialization of variables passed by arguments
FILE_INPUT_PDF="$1" # PDF file containing the page to be OCRed
PAGE_INFO="$2" # Various characteristics of the page to be OCRed
@ -21,11 +20,9 @@ PREPROCESS_CLEAN="$9" # Clean the page to be OCRed
PREPROCESS_CLEANTOPDF="${10}" # Put the cleaned page in the OCRed PDF
OVERSAMPLING_DPI="${11}" # Oversampling resolution in dpi
PDF_NOIMG="${12}" # Request to generate also a PDF page containing only the OCRed text but no image (helpful for debugging)
TESS_CFG_FILES="${13}" # Specific configuration files to be used by Tesseract during OCRing
FORCE_OCR="${14}" # Force to OCR, even if the page already contains fonts
SKIP_TEXT="${15}" # Skip OCR on pages that contain fonts and include the page anyway
FORCE_OCR="${13}" # Force to OCR, even if the page already contains fonts
SKIP_TEXT="${14}" # Skip OCR on pages that contain fonts and include the page anyway
TESS_CFG_FILES="${15}" # Specific configuration files to be used by Tesseract during OCRing
##################################
# Detect the characteristics of the embedded image for
@ -60,8 +57,10 @@ getImgInfo() {
# check if the page already contains fonts (which should not be the case for PDFs based on scanned files)
[ `pdffonts -f $page -l $page "${FILE_INPUT_PDF}" | wc -l` -gt 2 ] && echo "Page $page: Page already contains font data !!!" && return 1
if [ `pdffonts -f $page -l $page "${FILE_INPUT_PDF}" | wc -l` -gt 2 ]; then
[ "$SKIP_TEXT" -eq "0" ] && echo "Page $page: Page already contains font data !!!"
return 1
fi
# extract raw image from pdf file to compute resolution
# unfortunately this image can have another orientation than in the pdf...
@ -122,7 +121,7 @@ ret_code="$?"
# Handle pages that already contain a text layer
if ([ "$ret_code" -eq "1" ] && [ "$SKIP_TEXT" -eq "1" ]); then
echo "Page $page: Skipping processing because page contains text..."
echo "Page $page: Skipping OCR on this page since it already contains text"
pdfseparate -f $page -l $page ${FILE_INPUT_PDF} $curOCRedPDF
exit 0
elif ([ "$ret_code" -eq "1" ] && [ "$FORCE_OCR" -eq "0" ]); then