From fcac99bc73d21a793aba84cac93b70e40b917cf5 Mon Sep 17 00:00:00 2001 From: fritz-hh Date: Thu, 18 Apr 2013 11:16:40 +0200 Subject: [PATCH] OCRmyPDF.sh: code clean-up --- OCRmyPDF.sh | 67 ++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/OCRmyPDF.sh b/OCRmyPDF.sh index 046bbb0e..ce183fbf 100644 --- a/OCRmyPDF.sh +++ b/OCRmyPDF.sh @@ -7,7 +7,9 @@ KEEP_TMP="1" infile="$1" tmp="./tmp" -FILE_SIZEPAGES="$tmp/infile-page-sizes.txt" +FILE_SIZE_PAGES="$tmp/page-sizes.txt" # size in pt of the respective page of the input PDF file +FILE_OUTPUT_PDF="${tmp}/ocred.pdf" # name of the OCRed PDF file before conversion to PDF/A +FILE_OUTPUT_PDFA="${tmp}/ocred-pdfa.pdf" # name of the final PDF/A file # delete tmp files rm -r -f "${tmp}" @@ -15,18 +17,24 @@ mkdir -p "${tmp}" # get the size of each pdf page (width / height) in pt (inch*72) echo "Extracting size of each page (in pt)" -identify -format "%w %h\n" "$infile" > "$FILE_SIZEPAGES" -sed -I "" '/^$/d' "$FILE_SIZEPAGES" # removing empty lines (last one should be) -numpages=`cat "$FILE_SIZEPAGES" | wc -l` +identify -format "%w %h\n" "$infile" > "$FILE_SIZE_PAGES" +sed -I "" '/^$/d' "$FILE_SIZE_PAGES" # removing empty lines (last one should be) +numpages=`cat "$FILE_SIZE_PAGES" | wc -l` echo "PDF file has $numpages pages" # Itterate the pages of the pdf file page="1" -cat "$FILE_SIZEPAGES" | while read pageSize ; do +cat "$FILE_SIZE_PAGES" | while read pageSize ; do # add leading zeros to the page number page=`printf "%04d" $page` + # create the name of the required file + curOrigImg="$tmp/${page}_Image" # original image available in the current PDF page + # (the image file may have a different orientation than in the pdf file) + curHocr="$tmp/$page.hocr" # hocr file to be generated by the OCR SW for the current page + curOCRedPDF="$tmp/${page}-ocred.pdf" # PDF file containing the image + the OCRed text for the current page + echo "Page $page: Computing embedded 
image resolution" # get width / height of PDF page heightPDF=`echo $pageSize | cut -f1 -d" "` @@ -34,52 +42,49 @@ cat "$FILE_SIZEPAGES" | while read pageSize ; do # extract raw image from pdf file to compute resolution # unfortunatelly this image may not be rotated as in the pdf... # so we will have to extract it again later - pdfimages -f $page -l $page -j "$infile" "$tmp/${page}_orig" 1>&2 + pdfimages -f $page -l $page -j "$infile" "$curOrigImg" 1>&2 # count number of extracted images - nbImg=`ls -1 "$tmp/${page}_orig"* | wc -l` + nbImg=`ls -1 "$curOrigImg"* | wc -l` [ $nbImg -ne "1" ] && echo "Not exactly 1 image on page $page. Exiting" && exit 1 # Get the characteristic of the extracted image - origImg=`ls -1 "$tmp/${page}_orig"*` - origImg_woext=`echo "$origImg" | sed 's/\.[^.]*$//'` - origImg_ext=`echo "$origImg" | sed 's/^.*[.]//'` - propImg=`identify -format "%w %h %[colorspace]" "$origImg"` - heightImg=`echo "$propImg" | cut -f1 -d" "` - widthImg=`echo "$propImg" | cut -f2 -d" "` - colorspaceImg=`echo "$propImg" | cut -f3 -d" "` + curOrigImg01=`ls -1 "$curOrigImg"*` + propCurOrigImg01=`identify -format "%w %h %[colorspace]" "$curOrigImg01"` + heightCurOrigImg01=`echo "$propCurOrigImg01" | cut -f1 -d" "` + widthICurOrigImg01=`echo "$propCurOrigImg01" | cut -f2 -d" "` + colorspaceCurOrigImg01=`echo "$propCurOrigImg01" | cut -f3 -d" "` # compute the resolution of the whole page (taking into account all images) - dpi=$(($heightImg*72/$heightPDF)) + dpi=$(($heightCurOrigImg01*72/$heightPDF)) echo "Page $page: Resolution: ${dpi} dpi" - # extract current page as image with right rotation + # Identify if page image should be saved as ppm (color) or pgm (gray) echo "Page $page: Extracting image as ppm/pgm" - if [ $colorspaceImg == "Gray" ]; then + if [ $colorspaceCurOrigImg01 == "Gray" ]; then ext="pgm" opt="-gray" else ext="ppm" opt="" fi - pdftoppm -f $page -l $page -r $dpi $opt $infile > "${tmp}/${page}.$ext" + curImgPixmap="$tmp/$page.$ext" + 
curImgPixmapClean="$tmp/$page.for-ocr.$ext" + + # extract current page as image with right orientation and resolution + pdftoppm -f $page -l $page -r $dpi $opt $infile > "$curImgPixmap" # improve quality of the image with unpaper to get better OCR results echo "Page $page: Preprocessing image with unpaper" unpaper --dpi $dpi --mask-scan-size 100 \ --no-grayfilter --no-blackfilter --no-mask-center --no-border-align \ - "$tmp/$page.$ext" "$tmp/$page.forocr.$ext" 1> /dev/null + "$curImgPixmap" "$curImgPixmapClean" 1> /dev/null # perform OCR echo "Page $page: Performing OCR" - tesseract -l "$LAN" "$tmp/$page.forocr.$ext" "$tmp/$page.hocr" hocr 1> /dev/null 2> /dev/null - mv "${tmp}/${page}.hocr.html" "$tmp/$page.hocr" + tesseract -l "$LAN" "$curImgPixmapClean" "$curHocr" hocr 1> /dev/null 2> /dev/null + mv "$curHocr.html" "$curHocr" - # compress image to be put inside the pdf file - #echo "Page $page: Compressing image for final PDF file" - #convert -colorspace "$colorspaceImg" "$tmp/$page.forocr.$ext" "$tmp/$page.forpdf.jpg" - # embed text and image to new pdf file echo "Page $page: Embedding text in PDF" - #python hocrTransform.py -r $dpi -i "$tmp/$page.forpdf.jpg" "$tmp/$page.hocr" "$tmp/${page}-ocred.pdf" - python hocrTransform.py -r $dpi -i "$tmp/$page.forocr.$ext" "$tmp/$page.hocr" "$tmp/${page}-ocred.pdf" + python hocrTransform.py -r $dpi -i "$curImgPixmapClean" "$curHocr" "$curOCRedPDF" # go to next page of the pdf page=$(($page+1)) @@ -87,15 +92,15 @@ done # concatenate all pages -pdftk ${tmp}/*-ocred.pdf cat output "${tmp}/ocred.pdf" +pdftk ${tmp}/*-ocred.pdf cat output "$FILE_OUTPUT_PDF" # insert metadata # TODO # convert the pdf file to match PDF/A format -gs -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 -sOutputFile=${tmp}/ocred-pdfa.pdf ${tmp}/ocred.pdf +gs -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 
-sOutputFile=$FILE_OUTPUT_PDFA "$FILE_OUTPUT_PDF" # validate generated pdf file (compliance to PDF/A) echo "Check compliance of generated PDF to PDF/A standard" -#java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "${tmp}/ocred-pdfa.pdf" |egrep "Status|Message" -java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "${tmp}/ocred-pdfa.pdf" +#java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "$FILE_OUTPUT_PDFA" |egrep "Status|Message" +java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "$FILE_OUTPUT_PDFA"