OCRmyPDF/OCRmyPDF.sh

102 lines
3.7 KiB
Bash
Raw Normal View History

2013-04-09 19:00:26 +02:00
#!/bin/sh
# Adds an OCR text layer to a scanned PDF and converts the result to PDF/A.
# The PDF to process is given as the first command-line argument.

# Print the usage line only when the script is called incorrectly (no argument,
# or an argument that is not an existing file) and stop before touching anything.
if [ "$#" -lt 1 ] || [ ! -f "$1" ]; then
  echo "usage: $0 filename.pdf" >&2
  exit 1
fi

# Language handed to tesseract through its -l option.
LAN="eng"
# Flag for keeping the temporary files; it is not read anywhere in the visible script.
KEEP_TMP="1"
2013-04-09 19:00:26 +02:00
# Scratch directory, relative to the current working directory.
tmp=./tmp
# Text file that receives one "width height" line per page of the input PDF.
FILE_SIZEPAGES=${tmp}/infile-page-sizes.txt
# PDF to process, taken from the first command-line argument.
infile=$1
2013-04-09 19:00:26 +02:00
# delete tmp files left over from a previous run and recreate an empty directory
# (":?" aborts instead of running rm on an empty path if tmp is ever unset)
rm -r -f "${tmp:?}"
mkdir -p "${tmp}"
# get the size of each pdf page (width / height) in pt (inch*72)
echo "Extracting size of each page (in pt)"
# Empty lines (the last line is expected to be one) are filtered out while the
# list is written. This replaces the in-place "sed -I ''" call, which only
# exists in BSD sed and fails with GNU sed.
identify -format "%w %h\n" "$infile" | sed '/^$/d' > "$FILE_SIZEPAGES"
# One line per page. tr strips the padding some wc implementations put in
# front of the count.
numpages=$(wc -l < "$FILE_SIZEPAGES" | tr -d ' ')
2013-04-09 19:00:26 +02:00
echo "PDF file has $numpages pages"

# Iterate over the pages of the pdf file.
# The page-size list is fed to the loop through a redirection instead of a pipe,
# so the loop runs in the current shell and "exit 1" really stops the script.
page=1
# Each line holds "width height" of one page in pt; only the width is used below.
while read -r widthPDF heightPDF; do
  # Zero-padded page label for messages and file names. The numeric counter is
  # kept separately because a padded value such as 0008 is an invalid octal
  # constant inside $(( )).
  pageId=$(printf "%04d" "$page")
  echo "Page $pageId: Computing embedded image resolution"

  # extract raw image from pdf file to compute resolution
  # unfortunately this image may not be rotated as in the pdf...
  # so we will have to extract it again later
  pdfimages -f "$page" -l "$page" -j "$infile" "$tmp/${pageId}_orig" 1>&2

  # count the extracted images with a glob (no parsing of ls output)
  nbImg=0
  origImg=""
  for candidate in "$tmp/${pageId}_orig"*; do
    [ -e "$candidate" ] || continue   # glob matched nothing
    nbImg=$((nbImg + 1))
    origImg=$candidate
  done
  if [ "$nbImg" -ne 1 ]; then
    echo "Not exactly 1 image on page $pageId. Exiting"
    exit 1
  fi

  # Get the characteristics of the extracted image: "width height colorspace"
  propImg=$(identify -format "%w %h %[colorspace]" "$origImg")
  widthImg=$(echo "$propImg" | cut -f1 -d" ")
  colorspaceImg=$(echo "$propImg" | cut -f3 -d" ")

  # compute the resolution of the page: image width in pixels over page width
  # in inches (pt / 72)
  dpi=$((widthImg * 72 / widthPDF))
  echo "Page $pageId: Resolution: ${dpi} dpi"

  # extract current page as image with right rotation
  echo "Page $pageId: Extracting image as ppm/pgm"
  # "=" is the portable string comparison for [ ]; "==" is rejected by some sh
  if [ "$colorspaceImg" = "Gray" ]; then
    ext="pgm"
    opt="-gray"
  else
    ext="ppm"
    opt=""
  fi
  # $opt is left unquoted on purpose: when empty it must not become an argument
  pdftoppm -f "$page" -l "$page" -r "$dpi" $opt "$infile" > "${tmp}/${pageId}.$ext"

  # improve quality of the image with unpaper to get better OCR results
  echo "Page $pageId: Preprocessing image with unpaper"
  unpaper --dpi "$dpi" --mask-scan-size 100 \
    --no-grayfilter --no-blackfilter --no-mask-center --no-border-align \
    "$tmp/$pageId.$ext" "$tmp/$pageId.forocr.$ext" 1> /dev/null

  # perform OCR
  echo "Page $pageId: Performing OCR"
  tesseract -l "$LAN" "$tmp/$pageId.forocr.$ext" "$tmp/$pageId.hocr" hocr 1> /dev/null 2> /dev/null
  # Older tesseract releases append ".html" to the output base, newer ones
  # (3.03 and later) append ".hocr"; accept either name.
  for hocrOut in "$tmp/$pageId.hocr.html" "$tmp/$pageId.hocr.hocr"; do
    if [ -f "$hocrOut" ]; then
      mv "$hocrOut" "$tmp/$pageId.hocr"
      break
    fi
  done
  # tesseract's own messages are discarded above, so check its result here
  if [ ! -f "$tmp/$pageId.hocr" ]; then
    echo "Page $pageId: OCR produced no hOCR file. Exiting"
    exit 1
  fi

  # compress image to be put inside the pdf file
  #echo "Page $pageId: Compressing image for final PDF file"
  #convert -colorspace "$colorspaceImg" "$tmp/$pageId.forocr.$ext" "$tmp/$pageId.forpdf.jpg"

  # embed text and image to new pdf file
  echo "Page $pageId: Embedding text in PDF"
  #python hocrTransform.py -r $dpi -i "$tmp/$pageId.forpdf.jpg" "$tmp/$pageId.hocr" "$tmp/${pageId}-ocred.pdf"
  python hocrTransform.py -r "$dpi" -i "$tmp/$pageId.forocr.$ext" "$tmp/$pageId.hocr" "$tmp/${pageId}-ocred.pdf"

  # go to next page of the pdf
  page=$((page + 1))
done < "$FILE_SIZEPAGES"
# concatenate all pages
# The per-page files carry a zero-padded page number, so the glob expands in
# page order. Stop here if pdftk fails: the following steps need its output.
if ! pdftk "${tmp}"/*-ocred.pdf cat output "${tmp}/ocred.pdf"; then
  echo "Could not concatenate the OCRed pages. Exiting" >&2
  exit 1
fi
# insert metadata
# TODO
2013-04-09 19:00:26 +02:00
2013-04-18 10:31:36 +02:00
# convert the pdf file to match PDF/A format
gs -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 "-sOutputFile=${tmp}/ocred-pdfa.pdf" "${tmp}/ocred.pdf"
# validate generated pdf file (compliance to PDF/A)
echo "Check compliance of generated PDF to PDF/A standard"
# Location of the JHOVE jar; set JHOVE_JAR in the environment to override the
# default path.
JHOVE_JAR="${JHOVE_JAR:-/root/jhove-1_9/jhove/bin/JhoveApp.jar}"
#java -jar "$JHOVE_JAR" -m PDF-hul "${tmp}/ocred-pdfa.pdf" |egrep "Status|Message"
java -jar "$JHOVE_JAR" -m PDF-hul "${tmp}/ocred-pdfa.pdf"