2013-04-09 19:00:26 +02:00
|
|
|
#!/bin/sh
|
2013-04-26 11:50:39 +02:00
|
|
|
##############################################################################
|
|
|
|
|
# Copyright (c) 2013: fritz-hh from Github (https://github.com/fritz-hh)
|
|
|
|
|
##############################################################################
|
2013-04-09 19:00:26 +02:00
|
|
|
|
2013-11-27 22:34:21 +01:00
|
|
|
# Import required scripts
# ("$0" is quoted so that the import also works if the path to the script contains blanks)
. "$(dirname "$0")/src/config.sh"
|
|
|
|
|
|
2013-04-14 19:15:01 +02:00
|
|
|
|
2013-05-01 15:58:55 +02:00
|
|
|
# Start time in seconds since the epoch (used to report the duration of the run)
START=$(date +%s)
|
|
|
|
|
|
2013-04-21 21:58:58 +02:00
|
|
|
usage() {
	# Print the help text on stdout ($VERSION is expanded inside the here-document)
	cat << EOF
--------------------------------------------------------------------------------------
Script aimed at generating a searchable PDF file from a PDF file containing only images.
(The script performs optical character recognition of each respective page using the
tesseract engine)

Copyright: fritz from NAS4Free forum
Version: $VERSION

Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-l language] [-C filename] inputfile outputfile

-h : Display this help message
-v : Increase the verbosity (this option can be used more than once) (e.g. -vvv)
-k : Do not delete the temporary files
-g : Activate debug mode:
     - Generates a PDF file containing each page twice (once with the image, once without the image
       but with the OCRed text as well as the detected bounding boxes)
     - Set the verbosity to the highest possible
     - Do not delete the temporary files
-d : Deskew each page before performing OCR
-c : Clean each page before performing OCR
-i : Incorporate the cleaned image in the final PDF file (by default the original
     image, or the deskewed image if the -d option is set, is incorporated)
-o : If the resolution of an image is lower than dpi value provided as argument, provide the OCR engine with
     an oversampled image having the latter dpi value. This can improve the OCR results but can lead to a larger output PDF file.
     (default: no oversampling performed)
-l : Set the language of the PDF file in order to improve OCR results (default "eng")
     Any language supported by tesseract is supported.
-C : Pass an additional configuration file to the tesseract OCR engine.
     (this option can be used more than once)
     Note: The configuration file must be available in the "tessdata/configs" folder
           of your tesseract installation
inputfile  : PDF file to be OCRed
outputfile : The PDF/A file to be generated
--------------------------------------------------------------------------------------
EOF
}
|
|
|
|
|
|
|
|
|
|
|
2013-04-23 22:54:58 +02:00
|
|
|
#################################################
# Get an absolute path from a relative path to a file
#
# Param1 : Relative path
# Output : The absolute path (written to stdout)
# Returns: 1 if the folder in which the file is located does not exist
#            (or cannot be entered)
#          0 otherwise
#################################################
absolutePath() {
	# The work is done in a subshell: the working directory of the caller is
	# therefore never modified, and no (non POSIX) "local" variable is required
	(
		cd "$(dirname "$1")" > /dev/null 2>&1 || exit 1
		parentdir="$(pwd)"
		# avoid a leading "//" for files located directly in the root folder
		[ "$parentdir" = "/" ] && parentdir=""
		printf '%s\n' "$parentdir/$(basename "$1")"
	)
}
|
2013-04-21 21:58:58 +02:00
|
|
|
|
|
|
|
|
|
2013-04-20 22:02:33 +02:00
|
|
|
# Initialize the configuration parameters with their default values

# default verbosity level
VERBOSITY=$LOG_ERR
# default language of the PDF file (required to get good OCR results)
LAN=eng
# 0=delete the temporary files (default), 1=keep them
KEEP_TMP=0
# 0=no, 1=yes (deskew image)
PREPROCESS_DESKEW=0
# 0=no, 1=yes (clean image to improve OCR)
PREPROCESS_CLEAN=0
# 0=no, 1=yes (put cleaned image in final PDF)
PREPROCESS_CLEANTOPDF=0
# 0=do not perform oversampling
OVERSAMPLING_DPI=0
# 0=no, 1=yes (generates each PDF page twice, with and without image)
PDF_NOIMG=0
# list of additional configuration files to be used by tesseract
TESS_CFG_FILES=
|
2013-04-21 21:58:58 +02:00
|
|
|
|
|
|
|
|
# Parse optional command line arguments
# (the leading ":" of the option string selects the silent error reporting of getopts:
# unknown options and missing option arguments are handled by the "\?" and ":" cases)
while getopts ":hvgkdcio:l:C:" opt; do
	case "$opt" in
		h)
			usage
			exit 0
			;;
		v)
			VERBOSITY=$(($VERBOSITY+1))
			;;
		k)
			KEEP_TMP="1"
			;;
		g)
			# debug mode: additional PDF pages, highest verbosity, temporary files kept
			PDF_NOIMG="1"
			VERBOSITY="$LOG_DEBUG"
			KEEP_TMP="1"
			;;
		d)
			PREPROCESS_DESKEW="1"
			;;
		c)
			PREPROCESS_CLEAN="1"
			;;
		i)
			PREPROCESS_CLEANTOPDF="1"
			;;
		o)
			OVERSAMPLING_DPI="$OPTARG"
			;;
		l)
			LAN="$OPTARG"
			;;
		C)
			# the option may be repeated: each file is prepended to the list
			TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES"
			;;
		\?)
			echo "Invalid option: -$OPTARG"
			usage
			exit $EXIT_BAD_ARGS
			;;
		:)
			echo "Option -$OPTARG requires an argument"
			usage
			exit $EXIT_BAD_ARGS
			;;
	esac
done

# Drop the options processed above: only the mandatory arguments remain
shift "$((OPTIND-1))"

# Exactly two mandatory arguments are expected (input file and output file)
[ "$#" -eq 2 ] || {
	echo "Exactly two mandatory argument shall be provided ($# arguments provided)"
	usage
	exit $EXIT_BAD_ARGS
}
|
|
|
|
|
|
2013-05-05 22:33:54 +02:00
|
|
|
# Convert the paths of the input and output files to absolute paths
# (absolutePath fails if the folder containing the file cannot be entered)
if ! absolutePath "$1" > /dev/null; then
	echo "The folder in which the input file should be located does not exist. Exiting..."
	exit $EXIT_BAD_ARGS
fi
FILE_INPUT_PDF="$(absolutePath "$1")"

if ! absolutePath "$2" > /dev/null; then
	echo "The folder in which the output file should be generated does not exist. Exiting..."
	exit $EXIT_BAD_ARGS
fi
FILE_OUTPUT_PDFA="$(absolutePath "$2")"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# set script path as working directory
# ("$0" is quoted to support paths containing blanks; the script aborts if the folder cannot
# be entered, since the relative paths used afterwards would point to the wrong location)
cd "$(dirname "$0")" || { echo "Could not enter the folder of the script. Exiting..."; exit "${EXIT_OTHER_ERROR:-1}"; }

[ "$VERBOSITY" -ge "$LOG_INFO" ] && echo "$TOOLNAME version: $VERSION"
|
2013-04-26 11:50:39 +02:00
|
|
|
|
2013-04-19 22:23:28 +02:00
|
|
|
# check if the required utilities are installed
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Checking if all dependencies are installed"

# Exit with EXIT_MISSING_DEPENDENCY if a command is not available
# Param1 : name of the command to look for
# Param2 : what the user is asked to install if the command is missing
requireTool() {
	command -v "$1" > /dev/null && return 0
	echo "Please install $2. Exiting..."
	exit $EXIT_MISSING_DEPENDENCY
}

requireTool identify "ImageMagick"
requireTool parallel "GNU Parallel"
requireTool pdfimages "poppler-utils"
requireTool pdftoppm "poppler-utils"
requireTool pdftk "pdftk"
# unpaper is only needed if the pages shall be cleaned (-c option)
[ $PREPROCESS_CLEAN -eq 1 ] && requireTool unpaper "unpaper"
requireTool tesseract "tesseract and tesseract-data"
requireTool python2 "python v2.x, and the python libraries: reportlab, lxml"
requireTool gs "ghostcript"
requireTool java "java"
|
2013-04-19 23:00:00 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2014-01-03 15:59:51 +01:00
|
|
|
# Display the version of the tools if log level is LOG_DEBUG

# Print a separator line and a title, then run the command reporting the version
# Param1     : title to be displayed
# Param2...n : command (and its arguments) printing the version of the tool
printToolVersion() {
	echo "--------------------------------"
	echo "$1"
	shift
	"$@"
}

if [ "$VERBOSITY" -ge "$LOG_DEBUG" ]; then
	printToolVersion "ImageMagick version:" identify --version
	printToolVersion "GNU Parallel version:" parallel --version
	# poppler-utils: both tools used by the script are reported
	printToolVersion "Poppler-utils version:" pdfimages -v
	pdftoppm -v
	printToolVersion "pdftk version:" pdftk --version
	# unpaper is only checked as a dependency if the -c option is set: it may be missing here
	if command -v unpaper > /dev/null; then
		printToolVersion "unpaper version:" unpaper --version
	else
		printToolVersion "unpaper version:" echo "unpaper is not installed"
	fi
	printToolVersion "tesseract version:" tesseract --version
	printToolVersion "python2 version:" python2 --version
	printToolVersion "Ghostscript version:" gs --version
	printToolVersion "Java version:" java -version
	echo "--------------------------------"
fi
|
|
|
|
|
|
|
|
|
|
|
2013-04-23 22:54:58 +02:00
|
|
|
|
2013-04-19 23:00:00 +02:00
|
|
|
# Initialize path to temporary files
today=$(date +"%Y%m%d_%H%M")
# name of the input file without its extension
# (only the last extension is removed: "my.scan.pdf" gives "my.scan")
fld=$(basename "$FILE_INPUT_PDF")
fld="${fld%.*}"
TMP_FLD="./tmp/$today.filename.$fld"
FILE_TMP="$TMP_FLD/tmp.txt"						# temporary file with a very short lifetime (may be used for several things)
FILE_PAGES_INFO="$TMP_FLD/pages-info.txt"				# for each page: page #; width in pt; height in pt
FILE_OUTPUT_PDF_CAT="${TMP_FLD}/ocred.pdf"				# concatenated OCRed PDF files
FILE_OUTPUT_PDFA_WO_META="${TMP_FLD}/ocred-pdfa-wo-metadata.pdf"	# PDFA file before appending metadata
FILE_VALIDATION_LOG="${TMP_FLD}/pdf_validation.log"			# log file containing the results of the validation of the PDF/A file

# Create tmp folder
# (a folder left over with the same name is deleted first)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Creating temporary folder: \"$TMP_FLD\""
rm -r -f "${TMP_FLD}"
if ! mkdir -p "${TMP_FLD}"; then
	# nothing can be done without the temporary folder
	echo "Could not create temporary folder \"$TMP_FLD\". Exiting..."
	exit "${EXIT_OTHER_ERROR:-1}"
fi
|
2013-04-09 19:00:26 +02:00
|
|
|
|
2013-04-19 22:23:28 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2013-04-14 19:15:01 +02:00
|
|
|
# get the size of each pdf page (width / height) in pt (inch*72)
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Input file: Extracting size of each page (in pt)"
if ! identify -format "%w %h\n" "$FILE_INPUT_PDF" > "$FILE_TMP"; then
	echo "Could not get size of PDF pages. Exiting..."
	exit $EXIT_BAD_INPUT_FILE
fi
# skip the empty lines (the last one should be) and prepend the page # (4 digits) to each remaining line
awk 'length($0) > 0 {printf "%04d %s\n", ++page, $0}' "$FILE_TMP" > "$FILE_PAGES_INFO"
# the page # found on the last line is the number of pages
numpages=$(tail -n 1 "$FILE_PAGES_INFO" | cut -d" " -f1)
|
2013-04-09 19:00:26 +02:00
|
|
|
|
2013-11-27 22:34:21 +01:00
|
|
|
# OCR each page of the input pdf file
# (each line of "$FILE_PAGES_INFO" is passed to "$OCR_PAGE" in place of "{}")
# Note: "cmd || exit $?" is used on purpose instead of "! cmd && exit $?": as "!" negates the
# exit status, the latter form would exit with status 0 (success) although the OCR failed
parallel -q -k --halt-on-error 1 "$OCR_PAGE" "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
	"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" "$PDF_NOIMG" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO" \
	|| exit $?
# Disabled alternative processing the pages one after the other (without GNU Parallel):
#while read pageInfo ; do
#	"$OCR_PAGE" "$FILE_INPUT_PDF" "$pageInfo" "$numpages" "$TMP_FLD" \
#		"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" "$PDF_NOIMG" "$TESS_CFG_FILES" \
#		|| exit $?
#done < "$FILE_PAGES_INFO"
|
2013-04-19 22:23:28 +02:00
|
|
|
|
2013-04-19 23:00:00 +02:00
|
|
|
|
2013-04-09 19:00:26 +02:00
|
|
|
# concatenate all pages
# (the glob is expanded by the shell: one "*-ocred.pdf" file of the temporary folder per argument)
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Concatenating all pages"
if ! pdftk "${TMP_FLD}/"*-ocred.pdf cat output "$FILE_OUTPUT_PDF_CAT"; then
	echo "Could not concatenate individual PDF pages (\"${TMP_FLD}/*-ocred.pdf\") to one file. Exiting..."
	exit $EXIT_OTHER_ERROR
fi
|
2013-04-09 19:00:26 +02:00
|
|
|
|
2013-04-18 10:31:36 +02:00
|
|
|
# convert the pdf file to match PDF/A format
# (the whole output of ghostscript is discarded, only its exit status is checked)
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Converting to PDF/A"
if ! gs -dQUIET -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor \
	-sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 \
	-sOutputFile="$FILE_OUTPUT_PDFA" "$FILE_OUTPUT_PDF_CAT" > /dev/null 2>&1; then
	echo "Could not convert PDF file \"$FILE_OUTPUT_PDF_CAT\" to PDF/A. Exiting..."
	exit $EXIT_OTHER_ERROR
fi
|
2013-04-28 22:18:34 +02:00
|
|
|
|
2013-04-18 10:31:36 +02:00
|
|
|
# validate generated pdf file (compliance to PDF/A)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Checking compliance to PDF/A standard"
java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" > "$FILE_VALIDATION_LOG"
# summary of the validation
# (-E is required: in a basic regular expression "|" is an ordinary character, not an alternation,
# so that nothing would be displayed without it)
grep -i -E "Status|Message" "$FILE_VALIDATION_LOG"
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
# check the validation results
# (the matching lines are not displayed again: they are all part of the summary above)
pdf_valid=1
grep -i 'ErrorMessage' "$FILE_VALIDATION_LOG" > /dev/null && pdf_valid=0
grep -i 'Status.*not valid' "$FILE_VALIDATION_LOG" > /dev/null && pdf_valid=0
grep -i 'Status.*Not well-formed' "$FILE_VALIDATION_LOG" > /dev/null && pdf_valid=0
# the file is also invalid if the log does not report the PDF/A-1 profile (e.g. if the log is missing)
! grep -i 'Profile:.*PDF/A-1' "$FILE_VALIDATION_LOG" > /dev/null && echo "PDF file profile is not PDF/A-1" && pdf_valid=0
[ $pdf_valid -ne 1 ] && echo "Output file: The generated PDF/A file is INVALID"
[ $pdf_valid -ne 0 ] && [ "$VERBOSITY" -ge "$LOG_INFO" ] && echo "Output file: The generated PDF/A file is VALID"
|
2013-04-19 23:00:00 +02:00
|
|
|
|
2013-04-20 22:02:33 +02:00
|
|
|
|
2013-04-28 15:54:31 +02:00
|
|
|
|
|
|
|
|
|
2013-04-18 23:13:06 +02:00
|
|
|
# delete temporary files (unless they shall be kept: -k or -g option)
if [ "$KEEP_TMP" -eq 0 ]; then
	[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Deleting temporary files"
	rm -r -f "$TMP_FLD"
fi

# report the duration of the run (difference between the end and start times, in seconds)
END=$(date +%s)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Script took $(($END-$START)) seconds"
|
|
|
|
|
|
2013-04-26 12:23:29 +02:00
|
|
|
|
2013-04-28 22:18:34 +02:00
|
|
|
# Exit status of the script: EXIT_INVALID_OUPUT_PDFA if the validation of the
# generated file failed, 0 otherwise
if [ $pdf_valid -ne 1 ]; then
	exit $EXIT_INVALID_OUPUT_PDFA
fi
exit 0
|