mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-10-27 07:49:22 +00:00
OCRmyPDF.sh: code clean-up
This commit is contained in:
parent
42208aa5fe
commit
fcac99bc73
67
OCRmyPDF.sh
67
OCRmyPDF.sh
@ -7,7 +7,9 @@ KEEP_TMP="1"
|
||||
infile="$1"
|
||||
|
||||
tmp="./tmp"
|
||||
FILE_SIZEPAGES="$tmp/infile-page-sizes.txt"
|
||||
FILE_SIZE_PAGES="$tmp/page-sizes.txt" # size in pt of the respective page of the input PDF file
|
||||
FILE_OUTPUT_PDF="${tmp}/ocred.pdf" # name of the OCRed PDF file before conversion to PDF/A
|
||||
FILE_OUTPUT_PDFA="${tmp}/ocred-pdfa.pdf" # name of the final PDF/A file
|
||||
|
||||
# delete tmp files
|
||||
rm -r -f "${tmp}"
|
||||
@ -15,18 +17,24 @@ mkdir -p "${tmp}"
|
||||
|
||||
# get the size of each pdf page (width / height) in pt (inch*72)
|
||||
echo "Extracting size of each page (in pt)"
|
||||
identify -format "%w %h\n" "$infile" > "$FILE_SIZEPAGES"
|
||||
sed -I "" '/^$/d' "$FILE_SIZEPAGES" # removing empty lines (last one should be)
|
||||
numpages=`cat "$FILE_SIZEPAGES" | wc -l`
|
||||
identify -format "%w %h\n" "$infile" > "$FILE_SIZE_PAGES"
|
||||
sed -I "" '/^$/d' "$FILE_SIZE_PAGES" # removing empty lines (last one should be)
|
||||
numpages=`cat "$FILE_SIZE_PAGES" | wc -l`
|
||||
echo "PDF file has $numpages pages"
|
||||
|
||||
# Iterate over the pages of the pdf file
|
||||
page="1"
|
||||
cat "$FILE_SIZEPAGES" | while read pageSize ; do
|
||||
cat "$FILE_SIZE_PAGES" | while read pageSize ; do
|
||||
|
||||
# add leading zeros to the page number
|
||||
page=`printf "%04d" $page`
|
||||
|
||||
# create the name of the required file
|
||||
curOrigImg="$tmp/${page}_Image" # original image available in the current PDF page
|
||||
# (the image file may have a different orientation than in the pdf file)
|
||||
curHocr="$tmp/$page.hocr" # hocr file to be generated by the OCR SW for the current page
|
||||
curOCRedPDF="$tmp/${page}-ocred.pdf" # PDF file containing the image + the OCRed text for the current page
|
||||
|
||||
echo "Page $page: Computing embedded image resolution"
|
||||
# get width / height of PDF page
|
||||
heightPDF=`echo $pageSize | cut -f1 -d" "`
|
||||
@ -34,52 +42,49 @@ cat "$FILE_SIZEPAGES" | while read pageSize ; do
|
||||
# extract raw image from pdf file to compute resolution
|
||||
# unfortunately this image may not be rotated as in the pdf...
|
||||
# so we will have to extract it again later
|
||||
pdfimages -f $page -l $page -j "$infile" "$tmp/${page}_orig" 1>&2
|
||||
pdfimages -f $page -l $page -j "$infile" "$curOrigImg" 1>&2
|
||||
# count number of extracted images
|
||||
nbImg=`ls -1 "$tmp/${page}_orig"* | wc -l`
|
||||
nbImg=`ls -1 "$curOrigImg"* | wc -l`
|
||||
[ $nbImg -ne "1" ] && echo "Not exactly 1 image on page $page. Exiting" && exit 1
|
||||
# Get the characteristics of the extracted image
|
||||
origImg=`ls -1 "$tmp/${page}_orig"*`
|
||||
origImg_woext=`echo "$origImg" | sed 's/\.[^.]*$//'`
|
||||
origImg_ext=`echo "$origImg" | sed 's/^.*[.]//'`
|
||||
propImg=`identify -format "%w %h %[colorspace]" "$origImg"`
|
||||
heightImg=`echo "$propImg" | cut -f1 -d" "`
|
||||
widthImg=`echo "$propImg" | cut -f2 -d" "`
|
||||
colorspaceImg=`echo "$propImg" | cut -f3 -d" "`
|
||||
curOrigImg01=`ls -1 "$curOrigImg"*`
|
||||
propCurOrigImg01=`identify -format "%w %h %[colorspace]" "$curOrigImg01"`
|
||||
heightCurOrigImg01=`echo "$propCurOrigImg01" | cut -f1 -d" "`
|
||||
widthICurOrigImg01=`echo "$propCurOrigImg01" | cut -f2 -d" "`
|
||||
colorspaceCurOrigImg01=`echo "$propCurOrigImg01" | cut -f3 -d" "`
|
||||
# compute the resolution of the whole page (taking into account all images)
|
||||
dpi=$(($heightImg*72/$heightPDF))
|
||||
dpi=$(($heightCurOrigImg01*72/$heightPDF))
|
||||
echo "Page $page: Resolution: ${dpi} dpi"
|
||||
|
||||
# extract current page as image with right rotation
|
||||
# Identify if page image should be saved as ppm (color) or pgm (gray)
|
||||
echo "Page $page: Extracting image as ppm/pgm"
|
||||
if [ $colorspaceImg == "Gray" ]; then
|
||||
if [ $colorspaceCurOrigImg01 == "Gray" ]; then
|
||||
ext="pgm"
|
||||
opt="-gray"
|
||||
else
|
||||
ext="ppm"
|
||||
opt=""
|
||||
fi
|
||||
pdftoppm -f $page -l $page -r $dpi $opt $infile > "${tmp}/${page}.$ext"
|
||||
curImgPixmap="$tmp/$page.$ext"
|
||||
curImgPixmapClean="$tmp/$page.for-ocr.$ext"
|
||||
|
||||
# extract current page as image with right orientation and resolution
|
||||
pdftoppm -f $page -l $page -r $dpi $opt $infile > "$curImgPixmap"
|
||||
|
||||
# improve quality of the image with unpaper to get better OCR results
|
||||
echo "Page $page: Preprocessing image with unpaper"
|
||||
unpaper --dpi $dpi --mask-scan-size 100 \
|
||||
--no-grayfilter --no-blackfilter --no-mask-center --no-border-align \
|
||||
"$tmp/$page.$ext" "$tmp/$page.forocr.$ext" 1> /dev/null
|
||||
"$curImgPixmap" "$curImgPixmapClean" 1> /dev/null
|
||||
|
||||
# perform OCR
|
||||
echo "Page $page: Performing OCR"
|
||||
tesseract -l "$LAN" "$tmp/$page.forocr.$ext" "$tmp/$page.hocr" hocr 1> /dev/null 2> /dev/null
|
||||
mv "${tmp}/${page}.hocr.html" "$tmp/$page.hocr"
|
||||
tesseract -l "$LAN" "$curImgPixmapClean" "$curHocr" hocr 1> /dev/null 2> /dev/null
|
||||
mv "$curHocr.html" "$curHocr"
|
||||
|
||||
# compress image to be put inside the pdf file
|
||||
#echo "Page $page: Compressing image for final PDF file"
|
||||
#convert -colorspace "$colorspaceImg" "$tmp/$page.forocr.$ext" "$tmp/$page.forpdf.jpg"
|
||||
|
||||
# embed text and image to new pdf file
|
||||
echo "Page $page: Embedding text in PDF"
|
||||
#python hocrTransform.py -r $dpi -i "$tmp/$page.forpdf.jpg" "$tmp/$page.hocr" "$tmp/${page}-ocred.pdf"
|
||||
python hocrTransform.py -r $dpi -i "$tmp/$page.forocr.$ext" "$tmp/$page.hocr" "$tmp/${page}-ocred.pdf"
|
||||
python hocrTransform.py -r $dpi -i "$curImgPixmapClean" "$curHocr" "$curOCRedPDF"
|
||||
|
||||
# go to next page of the pdf
|
||||
page=$(($page+1))
|
||||
@ -87,15 +92,15 @@ done
|
||||
|
||||
|
||||
# concatenate all pages
|
||||
pdftk ${tmp}/*-ocred.pdf cat output "${tmp}/ocred.pdf"
|
||||
pdftk ${tmp}/*-ocred.pdf cat output "$FILE_OUTPUT_PDF"
|
||||
|
||||
# insert metadata
|
||||
# TODO
|
||||
|
||||
# convert the pdf file to match PDF/A format
|
||||
gs -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 -sOutputFile=${tmp}/ocred-pdfa.pdf ${tmp}/ocred.pdf
|
||||
gs -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 -sOutputFile=$FILE_OUTPUT_PDFA "$FILE_OUTPUT_PDF"
|
||||
|
||||
# validate generated pdf file (compliance to PDF/A)
|
||||
echo "Check compliance of generated PDF to PDF/A standard"
|
||||
#java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "${tmp}/ocred-pdfa.pdf" |egrep "Status|Message"
|
||||
java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "${tmp}/ocred-pdfa.pdf"
|
||||
#java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "$FILE_OUTPUT_PDFA" |egrep "Status|Message"
|
||||
java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "$FILE_OUTPUT_PDFA"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user