#!/bin/sh
##############################################################################
# Copyright (c) 2013: fritz-hh from Github (https://github.com/fritz-hh)
##############################################################################

# Name and version of the tool (the version is shown in the help text, both
# are written to the "Creator" metadata field of the generated file)
TOOLNAME="OCRmyPDF"
VERSION="v1.0-rc2"

# Start time (seconds since the epoch), used to report the script duration
START=$(date +%s)

#################################################
# Print the help message (purpose of the script, version,
# synopsis and description of each option) to stdout
#
# Reads the global variable VERSION
#################################################
usage() {
	# The here-document delimiter is unquoted on purpose: $VERSION is expanded
	cat << EOF
--------------------------------------------------------------------------------------
Script aimed at generating a searchable PDF file from a PDF file containing only images.
(The script performs optical character recognition of each respective page using the
tesseract engine)

Copyright: fritz from NAS4Free forum
Version: $VERSION

Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-l language] [-C filename] inputfile outputfile

-h : Display this help message
-v : Increase the verbosity (this option can be used more than once)
-k : Do not delete the temporary files
-g : Activate debug mode:
- Generates a PDF file containing each page twice (once with the image, once without the image
but with the OCRed text as well as the detected bounding boxes)
- Set the verbosity to the highest possible
- Do not delete the temporary files
-d : Deskew each page before performing OCR
-c : Clean each page before performing OCR
-i : Incorporate the cleaned image in the final PDF file (by default the original image
image, or the deskewed image if the -d option is set, is incorporated)
-l : Set the language of the PDF file in order to improve OCR results (default "eng")
Any language supported by tesseract is supported.
-C : Pass an additional configuration file to the tesseract OCR engine.
(this option can be used more than once)
Note: The configuration file must be available in the "tessdata/configs" folder
of your tesseract installation
inputfile : PDF file to be OCRed
outputfile : The PDF/A file to be generated
--------------------------------------------------------------------------------------
EOF
}

#################################################
# Get an absolute path from a relative path to a file
#
# The file itself does not need to exist, only the folder in which it
# is located. The lookup is performed in a subshell, so the working
# directory of the caller is never modified.
#
# Param1 : Relative (or absolute) path to a file
# Output : The absolute path (written to stdout)
# Returns: 1 if the folder in which the file is located does not exist
#          0 otherwise
#################################################
absolutePath() {
	(
		# CDPATH is emptied so that "cd" can only enter the folder that was asked for
		CDPATH= cd -- "$(dirname "$1")" > /dev/null 2>&1 || exit 1
		absolutedir="$(pwd)"
		# avoid a doubled slash for a file located directly in "/"
		[ "$absolutedir" = "/" ] && absolutedir=""
		# printf (instead of echo) so that backslashes in the name are kept verbatim
		printf '%s\n' "$absolutedir/$(basename "$1")"
	)
}

# Initialization of constants
EXIT_BAD_ARGS="1"			# possible exit codes
EXIT_BAD_INPUT_FILE="2"
EXIT_MISSING_DEPENDENCY="3"
EXIT_INVALID_OUPUT_PDFA="4"
EXIT_OTHER_ERROR="5"
LOG_ERR="0"				# 0=only error messages
LOG_INFO="1"				# 1=error messages and some infos
LOG_DEBUG="2"				# 2=debug level logging
SRC="./src"				# location of the source folder (except source of external tools like jhove)
JHOVE="./jhove/bin/JhoveApp.jar"	# java SW for validating the final PDF/A
JHOVE_CFG="./jhove/conf/jhove.conf"	# location of the jhove config file
readonly EXIT_BAD_ARGS EXIT_BAD_INPUT_FILE EXIT_MISSING_DEPENDENCY EXIT_INVALID_OUPUT_PDFA EXIT_OTHER_ERROR
readonly LOG_ERR LOG_INFO LOG_DEBUG SRC JHOVE JHOVE_CFG

# Initialization of the configuration parameters with default values
# (they may be overridden by the command line options)
VERBOSITY="$LOG_ERR"		# default verbosity level
LAN="eng"			# default language of the PDF file (required to get good OCR results)
KEEP_TMP="0"			# 0=delete the temporary files (default), 1=keep them
PREPROCESS_DESKEW="0"		# 0=no, 1=yes (deskew image)
PREPROCESS_CLEAN="0"		# 0=no, 1=yes (clean image to improve OCR)
PREPROCESS_CLEANTOPDF="0"	# 0=no, 1=yes (put cleaned image in final PDF)
PDF_NOIMG="0"			# 0=no, 1=yes (generates each PDF page twice, with and without image)
TESS_CFG_FILES=""		# list of additional configuration files to be used by tesseract

# Parse optional command line arguments
# The leading ":" of the option string selects the silent error reporting of
# getopts: an unknown option yields "?" and a missing option argument yields
# ":", the offending option letter being stored in OPTARG in both cases.
while getopts ":hvgkdcil:C:" opt; do
	case "$opt" in
		h) usage ; exit 0 ;;
		v) VERBOSITY=$((VERBOSITY + 1)) ;;
		k) KEEP_TMP="1" ;;
		g) PDF_NOIMG="1"; VERBOSITY="10"; KEEP_TMP="1" ;;
		d) PREPROCESS_DESKEW="1" ;;
		c) PREPROCESS_CLEAN="1" ;;
		i) PREPROCESS_CLEANTOPDF="1" ;;
		l) LAN="$OPTARG" ;;
		C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;	# each -C prepends one more file to the list
		\?)
			echo "Invalid option: -$OPTARG" >&2
			usage
			exit $EXIT_BAD_ARGS ;;
		:)
			echo "Option -$OPTARG requires an argument" >&2
			usage
			exit $EXIT_BAD_ARGS ;;
	esac
done

# Remove the optional arguments parsed above.
shift $((OPTIND-1))

# Check if the number of mandatory parameters
# provided is as expected
if [ "$#" -ne 2 ]; then
	echo "Exactly two mandatory argument shall be provided ($# arguments provided)" >&2
	usage
	exit $EXIT_BAD_ARGS
fi

# Convert both file names to absolute paths now, because the working
# directory is changed just below.
# absolutePath fails if the folder containing the file does not exist.
if ! FILE_INPUT_PDF="$(absolutePath "$1")"; then
	echo "The folder in which the input file should be located does not exist. Exiting..." >&2
	exit $EXIT_BAD_ARGS
fi
if ! FILE_OUTPUT_PDFA="$(absolutePath "$2")"; then
	echo "The folder in which the output file should be generated does not exist. Exiting..." >&2
	exit $EXIT_BAD_ARGS
fi

# set script path as working directory
# (SRC, JHOVE, JHOVE_CFG and the temporary folder are paths relative to it)
if ! cd "$(dirname "$0")"; then
	echo "Could not change to the folder of the script. Exiting..." >&2
	exit $EXIT_OTHER_ERROR
fi

[ "$VERBOSITY" -ge "$LOG_INFO" ] && echo "$TOOLNAME version: $VERSION"

#################################################
# Make sure that a required command is installed
#
# Param1 : Name of the command to be looked up
# Param2 : Name of the software to be installed (used in the error message)
# Returns: 0 if the command is available
#          (otherwise the script exits with EXIT_MISSING_DEPENDENCY)
#################################################
requireCommand() {
	command -v "$1" > /dev/null && return 0
	echo "Please install $2. Exiting..." >&2
	exit $EXIT_MISSING_DEPENDENCY
}

# check if the required utilities are installed
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Checking if all dependencies are installed"
requireCommand identify "ImageMagick"
requireCommand pdfimages "poppler-utils"
requireCommand pdftoppm "poppler-utils"
requireCommand pdftk "pdftk"
# unpaper is only needed if cleaning was requested (-c)
[ "$PREPROCESS_CLEAN" -eq 1 ] && requireCommand unpaper "unpaper"
requireCommand tesseract "tesseract and tesseract-data"
requireCommand python "python, and the python libraries: reportlab, lxml"
requireCommand gs "ghostscript"
# bc is used to compute the resolution of each page
requireCommand bc "bc"
requireCommand java "java"

# Initialize path to temporary files
today=$(date +"%Y%m%d_%H%M")
# name of the input file without its extension (only the last ".xxx" is removed)
fld=$(basename "$FILE_INPUT_PDF" | sed 's/[.][^.]*$//')
TMP_FLD="./tmp/$today.filename.$fld"
FILE_TMP="$TMP_FLD/tmp.txt"						# temporary file with a very short lifetime (may be used for several things)
FILE_SIZE_PAGES="$TMP_FLD/page-sizes.txt"				# size in pt of the respective page of the input PDF file
FILES_OCRed_PDFS="${TMP_FLD}/*-ocred.pdf"				# string matching all 1 page PDF files that need to be merged
FILE_OUTPUT_PDF_CAT="${TMP_FLD}/ocred.pdf"				# concatenated OCRed PDF files
FILE_OUTPUT_PDFA_WO_META="${TMP_FLD}/ocred-pdfa-wo-metadata.pdf"	# PDFA file before appending metadata
FILE_VALIDATION_LOG="${TMP_FLD}/pdf_validation.log"			# log file containing the results of the validation of the PDF/A file

# Create tmp folder
# (a folder left over by a previous run with the same name is removed first)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Creating temporary folder: \"$TMP_FLD\""
rm -r -f "${TMP_FLD}"
if ! mkdir -p "${TMP_FLD}"; then
	echo "Could not create temporary folder \"$TMP_FLD\". Exiting..." >&2
	exit $EXIT_OTHER_ERROR
fi

# get the size of each pdf page (width / height) in pt (inch*72)
# (one "width height" line per page is written by identify)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Input file: Extracting size of each page (in pt)"
if ! identify -format "%w %h\n" "$FILE_INPUT_PDF" > "$FILE_TMP"; then
	echo "Could not get size of PDF pages. Exiting..." >&2
	exit $EXIT_BAD_INPUT_FILE
fi
sed '/^$/d' "$FILE_TMP" > "$FILE_SIZE_PAGES"	# removing empty lines (last one should be)
# number of pages = number of lines (the sed removes the padding that some wc versions print)
numpages=$(wc -l < "$FILE_SIZE_PAGES" | sed 's/^ *//g')
numpages=$(printf "%04d" "$numpages")		# add leading zeros to the page number

# Iterate the pages of the input pdf file
# (the page sizes are read through file descriptor 3, so that the tools
# called in the loop body cannot consume them from their standard input)
cpt="1"
while read -r pageSize <&3 ; do

	page=$(printf "%04d" "$cpt")	# add leading zeros to the page number
	[ "$VERBOSITY" -ge "$LOG_INFO" ] && echo "Processing page $page / $numpages"

	# create the name of the required file
	curOrigImg="$TMP_FLD/${page}_Image"			# original image available in the current PDF page
								# (the image file may have a different orientation than in the pdf file)
	curHocr="$TMP_FLD/$page.hocr"				# hocr file to be generated by the OCR SW for the current page
	curOCRedPDF="$TMP_FLD/${page}-ocred.pdf"		# PDF file containing the image + the OCRed text for the current page
	curOCRedPDFDebug="$TMP_FLD/${page}-debug-ocred.pdf"	# PDF file containing data required to find out if OCR worked correctly

	# get width / height of PDF page (in pt)
	widthPDF=$(echo "$pageSize" | cut -f1 -d" ")
	heightPDF=$(echo "$pageSize" | cut -f2 -d" ")
	[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: size ${heightPDF}x${widthPDF} (h*w in pt)"

	# extract raw image from pdf file to compute resolution
	# unfortunately this image can have another orientation than in the pdf...
	# so we will have to extract it again later using pdftoppm
	pdfimages -f "$page" -l "$page" -j "$FILE_INPUT_PDF" "$curOrigImg" 1>&2

	# count number of extracted images
	# (blanks are removed because some wc versions pad their output)
	nbImg=$(ls -1 "$curOrigImg"* | wc -l | tr -d '[:space:]')
	if [ "$nbImg" -ne 1 ]; then
		echo "Expecting exactly 1 image on page $page (found $nbImg). Exiting..." >&2
		exit $EXIT_BAD_INPUT_FILE
	fi

	# Get characteristics of the extracted image
	curImg=$(ls -1 "$curOrigImg"*)
	propCurImg=$(identify -format "%w %h %[colorspace]" "$curImg")
	widthCurImg=$(echo "$propCurImg" | cut -f1 -d" ")
	heightCurImg=$(echo "$propCurImg" | cut -f2 -d" ")
	colorspaceCurImg=$(echo "$propCurImg" | cut -f3 -d" ")

	# switch height/width values if the image has not the right orientation
	# (the product below is negative when exactly one of page / image is in landscape format)
	# we make here the assumption that vertical/horizontal dpi are equal
	# we will check that later
	if [ $((($heightPDF-$widthPDF)*($heightCurImg-$widthCurImg))) -lt 0 ]; then
		[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: Extracted image has wrong orientation. Inverting image height/width values"
		tmpval=$heightCurImg
		heightCurImg=$widthCurImg
		widthCurImg=$tmpval
	fi
	[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: size ${heightCurImg}x${widthCurImg} (h*w pixel)"

	# compute the resolution of the image
	dpi_x=$(echo "scale=5;$widthCurImg*72/$widthPDF" | bc)
	dpi_y=$(echo "scale=5;$heightCurImg*72/$heightPDF" | bc)

	# compute the maximum allowed resolution difference that can be caused by:
	# - the truncated PDF width/height in pt
	# - the precision of dpi value
	epsilon=$(echo "scale=5;($widthCurImg*72/$widthPDF^2)+($heightCurImg*72/$heightPDF^2)+0.00002" | bc)	# max inaccuracy due to truncation of PDF size in pt
	# bc prints 1 if the comparison holds and 0 otherwise:
	# the page is rejected as soon as one of the two differences reaches epsilon
	if [ "$(echo "($dpi_x - $dpi_y) < $epsilon " | bc)" -eq 0 ] || [ "$(echo "($dpi_y - $dpi_x) < $epsilon " | bc)" -eq 0 ]; then
		echo "Resolutions difference ($dpi_x/$dpi_y) higher than expected ($epsilon). Exiting..." >&2
		exit $EXIT_BAD_INPUT_FILE
	fi
	dpi=$(echo "scale=5;($dpi_x+$dpi_y)/2+0.5" | bc)	# adding 0.5 is required for rounding
	dpi=$(echo "scale=0;$dpi/1" | bc)			# round to the nearest integer

	# Identify if page image should be saved as ppm (color) or pgm (gray)
	ext="ppm"
	opt=""
	# "=" is used for the comparison: "==" is not supported by every /bin/sh
	if [ "$colorspaceCurImg" = "Gray" ]; then
		ext="pgm"
		opt="-gray"
	fi
	curImgPixmap="$TMP_FLD/$page.$ext"
	curImgPixmapDeskewed="$TMP_FLD/$page.deskewed.$ext"
	curImgPixmapClean="$TMP_FLD/$page.cleaned.$ext"

	# extract current page as image with right orientation and resolution
	# ($opt is left unquoted on purpose: it must vanish when it is empty)
	[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: Extracting image as $ext file (${dpi} dpi)"
	if ! pdftoppm -f "$page" -l "$page" -r "$dpi" $opt "$FILE_INPUT_PDF" > "$curImgPixmap"; then
		echo "Could not extract page $page as $ext from \"$FILE_INPUT_PDF\". Exiting..." >&2
		exit $EXIT_OTHER_ERROR
	fi

	# if requested deskew image (without changing its size in pixel)
	if [ "$PREPROCESS_DESKEW" -eq "1" ]; then
		[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: Deskewing image"
		if ! convert "$curImgPixmap" -deskew 40% -gravity center -extent "${widthCurImg}x${heightCurImg}" "$curImgPixmapDeskewed"; then
			echo "Could not deskew \"$curImgPixmap\". Exiting..." >&2
			exit $EXIT_OTHER_ERROR
		fi
	else
		cp "$curImgPixmap" "$curImgPixmapDeskewed"
	fi

	# if requested clean image with unpaper to get better OCR results
	if [ "$PREPROCESS_CLEAN" -eq "1" ]; then
		[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: Cleaning image with unpaper"
		if ! unpaper --dpi "$dpi" --mask-scan-size 100 \
			--no-deskew --no-grayfilter --no-blackfilter --no-mask-center --no-border-align \
			"$curImgPixmapDeskewed" "$curImgPixmapClean" 1> /dev/null; then
			echo "Could not clean \"$curImgPixmapDeskewed\". Exiting..." >&2
			exit $EXIT_OTHER_ERROR
		fi
	else
		cp "$curImgPixmapDeskewed" "$curImgPixmapClean"
	fi

	# perform OCR
	# ($TESS_CFG_FILES is left unquoted on purpose: it is a blank separated list)
	[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: Performing OCR"
	if ! tesseract -l "$LAN" "$curImgPixmapClean" "$curHocr" hocr $TESS_CFG_FILES 1> /dev/null 2> /dev/null; then
		echo "Could not OCR file \"$curImgPixmapClean\". Exiting..." >&2
		exit $EXIT_OTHER_ERROR
	fi
	# the hocr output is expected in "$curHocr.html": rename it to the name used below
	mv "$curHocr.html" "$curHocr"

	# embed text and image to new pdf file
	if [ "$PREPROCESS_CLEANTOPDF" -eq "1" ]; then
		image4finalPDF="$curImgPixmapClean"
	else
		image4finalPDF="$curImgPixmapDeskewed"
	fi
	[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: Embedding text in PDF"
	if ! python "$SRC/hocrTransform.py" -r "$dpi" -i "$image4finalPDF" "$curHocr" "$curOCRedPDF"; then
		echo "Could not create PDF file from \"$curHocr\". Exiting..." >&2
		exit $EXIT_OTHER_ERROR
	fi

	# if requested generate special debug PDF page with visible OCR text
	if [ "$PDF_NOIMG" -eq "1" ] ; then
		[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Page $page: Embedding text in PDF (debug page)"
		if ! python "$SRC/hocrTransform.py" -b -r "$dpi" "$curHocr" "$curOCRedPDFDebug"; then
			echo "Could not create PDF file from \"$curHocr\". Exiting..." >&2
			exit $EXIT_OTHER_ERROR
		fi
	fi

	# delete temporary files created for the current page
	# to avoid using too much disk space in case of PDF files having many pages
	if [ "$KEEP_TMP" -eq 0 ]; then
		rm "$curOrigImg"*.*
		rm "$curHocr"
		rm "$curImgPixmap"
		rm "$curImgPixmapDeskewed"
		rm "$curImgPixmapClean"
	fi

	# go to next page of the pdf
	cpt=$(($cpt+1))

done 3< "$FILE_SIZE_PAGES"

# concatenate all pages
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Concatenating all pages"
# The pattern is spelled out with a quoted folder name (instead of expanding
# the unquoted $FILES_OCRed_PDFS), so that a temporary folder whose name
# contains blanks is not split into several words.
if ! pdftk "${TMP_FLD}"/*-ocred.pdf cat output "$FILE_OUTPUT_PDF_CAT"; then
	echo "Could not concatenate individual PDF pages (\"$FILES_OCRed_PDFS\") to one file. Exiting..." >&2
	exit $EXIT_OTHER_ERROR
fi

# convert the pdf file to match PDF/A format
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Converting to PDF/A"
if ! gs -dQUIET -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor \
	-sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 \
	-sOutputFile="$FILE_OUTPUT_PDFA_WO_META" "$FILE_OUTPUT_PDF_CAT" 1> /dev/null 2> /dev/null; then
	echo "Could not convert PDF file \"$FILE_OUTPUT_PDF_CAT\" to PDF/A. Exiting..." >&2
	exit $EXIT_OTHER_ERROR
fi

# Write metadata
# Needs to be done after converting to PDF/A, as gs does not preserve metadata
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Update metadata (creator, producer, and title)"

#################################################
# Transform a file name into distinct words
#
# The folder and the extension (i.e. the last ".xxx") are removed, "_" and
# "-" are replaced by blanks, and a blank is inserted between a lower case
# and an upper case letter as well as between a letter and a digit.
#
# Param1 : Path to a file
# Output : The resulting words (written to stdout)
#################################################
fileNameToTitle() {
	basename "$1" | sed \
		-e 's/[.][^.]*$//' \
		-e 's/_/ /g' -e 's/-/ /g' \
		-e 's/\([[:lower:]]\)\([[:upper:]]\)/\1 \2/g' \
		-e 's/\([[:alpha:]]\)\([[:digit:]]\)/\1 \2/g' \
		-e 's/\([[:digit:]]\)\([[:alpha:]]\)/\1 \2/g'
}
title=$(fileNameToTitle "$FILE_INPUT_PDF")

# Copy the PDF/A file to its final location while setting the title, creator
# and producer fields. The metadata is handed over to pdftk on its standard
# input ("-"); $title, $TOOLNAME, $VERSION and the gs version are expanded
# by the shell inside the here-document.
if ! pdftk "$FILE_OUTPUT_PDFA_WO_META" update_info_utf8 - output "$FILE_OUTPUT_PDFA" << EOF
InfoBegin
InfoKey: Title
InfoValue: $title
InfoBegin
InfoKey: Creator
InfoValue: $TOOLNAME $VERSION
InfoBegin
InfoKey: Producer
InfoValue: ghostscript $(gs --version), pdftk
EOF
then
	echo "Could not write the metadata to \"$FILE_OUTPUT_PDFA\". Exiting..." >&2
	exit $EXIT_OTHER_ERROR
fi

#################################################
# Check the content of a jhove validation log
#
# Every check is always performed (no early return), so that all the
# problems found in the log are reported on stderr.
#
# Param1 : Path to the validation log
# Returns: 0 if the log contains no error message, no "not valid" or
#            "Not well-formed" status, and reports a PDF/A-1 profile
#          1 otherwise
#################################################
checkValidationLog() {
	local logvalid
	logvalid=0
	grep -i 'ErrorMessage' "$1" >&2 && logvalid=1
	grep -i 'Status.*not valid' "$1" >&2 && logvalid=1
	grep -i 'Status.*Not well-formed' "$1" >&2 && logvalid=1
	if ! grep -i 'Profile:.*PDF/A-1' "$1" > /dev/null; then
		echo "PDF file profile is not PDF/A-1" >&2
		logvalid=1
	fi
	return $logvalid
}

# validate generated pdf file (compliance to PDF/A)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Checking compliance to PDF/A standard"
java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" > "$FILE_VALIDATION_LOG"
# -E is required: in a basic regular expression "|" would be a literal character
grep -i -E "Status|Message" "$FILE_VALIDATION_LOG"	# summary of the validation
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""

# check the validation results
# (pdf_valid is read again at the very end of the script to select the exit code)
pdf_valid=1
checkValidationLog "$FILE_VALIDATION_LOG" || pdf_valid=0
[ $pdf_valid -ne 1 ] && echo "Output file: The generated PDF/A file is INVALID" >&2
[ $pdf_valid -ne 0 ] && [ "$VERBOSITY" -ge "$LOG_INFO" ] && echo "Output file: The generated PDF/A file is VALID"

# delete temporary files
# (unless they shall be kept: options -k or -g)
if [ "$KEEP_TMP" -eq 0 ]; then
	[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Deleting temporary files"
	rm -r -f "${TMP_FLD}"
fi

END=$(date +%s)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Script took $(($END-$START)) seconds"

# the exit code reflects the result of the PDF/A validation
if [ "$pdf_valid" -ne 1 ]; then
	exit $EXIT_INVALID_OUPUT_PDFA
fi
exit 0