OCRmyPDF.sh: code clean-up

2025-10-27 07:49:22 +00:00 · 2013-04-18 11:16:40 +02:00 · 2013-04-18 11:16:40 +02:00 · fcac99bc73
commit fcac99bc73
parent 42208aa5fe
1 changed files with 36 additions and 31 deletions
--- a/OCRmyPDF.sh
+++ b/OCRmyPDF.sh
@ -7,7 +7,9 @@ KEEP_TMP="1"
 infile="$1"

 tmp="./tmp"
-FILE_SIZEPAGES="$tmp/infile-page-sizes.txt"
+FILE_SIZE_PAGES="$tmp/page-sizes.txt"		# size in pt of the respective page of the input PDF file
+FILE_OUTPUT_PDF="${tmp}/ocred.pdf"		# name of the OCRed PDF file before conversion to PDF/A
+FILE_OUTPUT_PDFA="${tmp}/ocred-pdfa.pdf"	# name of the final PDF/A file

 # delete tmp files
 rm -r -f "${tmp}"
@ -15,18 +17,24 @@ mkdir -p "${tmp}"

 # get the size of each pdf page (width / height) in pt (inch*72)
 echo "Extracting size of each page (in pt)"
-identify -format "%w %h\n" "$infile" > "$FILE_SIZEPAGES"
-sed -I "" '/^$/d' "$FILE_SIZEPAGES"	# removing empty lines (last one should be)
-numpages=`cat "$FILE_SIZEPAGES" | wc -l`
+identify -format "%w %h\n" "$infile" > "$FILE_SIZE_PAGES"
+sed -I "" '/^$/d' "$FILE_SIZE_PAGES"	# removing empty lines (last one should be)
+numpages=`cat "$FILE_SIZE_PAGES" | wc -l`
 echo "PDF file has $numpages pages"

 # Itterate the pages of the pdf file
 page="1"
-cat "$FILE_SIZEPAGES" | while read pageSize ; do
+cat "$FILE_SIZE_PAGES" | while read pageSize ; do

 	# add leading zeros to the page number
 	page=`printf "%04d" $page`

+	# create the names of the files required for the current page
+	curOrigImg="$tmp/${page}_Image"		# original image available in the current PDF page 
+						# (the image file may have a different orientation than in the pdf file)
+	curHocr="$tmp/$page.hocr"		# hocr file to be generated by the OCR SW for the current page
+	curOCRedPDF="$tmp/${page}-ocred.pdf"	# PDF file containing the image + the OCRed text for the current page
+	
 	echo "Page $page: Computing embedded image resolution"
 	# get width / height of PDF page
 	heightPDF=`echo $pageSize | cut -f1 -d" "`
@ -34,52 +42,49 @@ cat "$FILE_SIZEPAGES" | while read pageSize ; do
 	# extract raw image from pdf file to compute resolution
 	# unfortunatelly this image may not be rotated as in the pdf...
 	# so we will have to extract it again later
-	pdfimages -f $page -l $page -j "$infile" "$tmp/${page}_orig" 1>&2	
+	pdfimages -f $page -l $page -j "$infile" "$curOrigImg" 1>&2	
 	# count number of extracted images
-	nbImg=`ls -1 "$tmp/${page}_orig"* | wc -l`
+	nbImg=`ls -1 "$curOrigImg"* | wc -l`
 	[ $nbImg -ne "1" ] && echo "Not exactly 1 image on page $page. Exiting" && exit 1
 	# Get the characteristic of the extracted image
-	origImg=`ls -1 "$tmp/${page}_orig"*`
-	origImg_woext=`echo "$origImg" | sed 's/\.[^.]*$//'`
-	origImg_ext=`echo "$origImg" | sed 's/^.*[.]//'`
-	propImg=`identify -format "%w %h %[colorspace]" "$origImg"`
-	heightImg=`echo "$propImg" | cut -f1 -d" "`
-	widthImg=`echo "$propImg" | cut -f2 -d" "`
-	colorspaceImg=`echo "$propImg" | cut -f3 -d" "`
+	curOrigImg01=`ls -1 "$curOrigImg"*`
+	propCurOrigImg01=`identify -format "%w %h %[colorspace]" "$curOrigImg01"`
+	heightCurOrigImg01=`echo "$propCurOrigImg01" | cut -f1 -d" "`
+	widthICurOrigImg01=`echo "$propCurOrigImg01" | cut -f2 -d" "`
+	colorspaceCurOrigImg01=`echo "$propCurOrigImg01" | cut -f3 -d" "`
 	# compute the resolution of the whole page (taking into account all images)
-	dpi=$(($heightImg*72/$heightPDF))
+	dpi=$(($heightCurOrigImg01*72/$heightPDF))
 	echo "Page $page: Resolution: ${dpi} dpi"

-	# extract current page as image with right rotation
+	# Identify if page image should be saved as ppm (color) or pgm (gray)
 	echo "Page $page: Extracting image as ppm/pgm"
-	if [ $colorspaceImg == "Gray" ]; then
+	if [ $colorspaceCurOrigImg01 == "Gray" ]; then
 		ext="pgm"
 		opt="-gray"
 	else
 		ext="ppm"
 		opt=""		
 	fi
-	pdftoppm -f $page -l $page -r $dpi $opt $infile > "${tmp}/${page}.$ext"
+	curImgPixmap="$tmp/$page.$ext"
+	curImgPixmapClean="$tmp/$page.for-ocr.$ext"
+	
+	# extract current page as image with right orientation and resolution
+	pdftoppm -f $page -l $page -r $dpi $opt $infile > "$curImgPixmap"

 	# improve quality of the image with unpaper to get better OCR results
 	echo "Page $page: Preprocessing image with unpaper"
 	unpaper --dpi $dpi --mask-scan-size 100 \
 		--no-grayfilter --no-blackfilter --no-mask-center --no-border-align \
-		"$tmp/$page.$ext" "$tmp/$page.forocr.$ext" 1> /dev/null
+		"$curImgPixmap" "$curImgPixmapClean" 1> /dev/null

 	# perform OCR
 	echo "Page $page: Performing OCR"
-	tesseract -l "$LAN" "$tmp/$page.forocr.$ext" "$tmp/$page.hocr" hocr 1> /dev/null 2> /dev/null 
-	mv "${tmp}/${page}.hocr.html" "$tmp/$page.hocr"
+	tesseract -l "$LAN" "$curImgPixmapClean" "$curHocr" hocr 1> /dev/null 2> /dev/null 
+	mv "$curHocr.html" "$curHocr"

-	# compress image to be put inside the pdf file
-	#echo "Page $page: Compressing image for final PDF file"
-	#convert -colorspace "$colorspaceImg" "$tmp/$page.forocr.$ext" "$tmp/$page.forpdf.jpg"
-	
 	# embed text and image to new pdf file
 	echo "Page $page: Embedding text in PDF"
-	#python hocrTransform.py -r $dpi -i "$tmp/$page.forpdf.jpg" "$tmp/$page.hocr" "$tmp/${page}-ocred.pdf"
-	python hocrTransform.py -r $dpi -i "$tmp/$page.forocr.$ext" "$tmp/$page.hocr" "$tmp/${page}-ocred.pdf"
+	python hocrTransform.py -r $dpi -i "$curImgPixmapClean" "$curHocr" "$curOCRedPDF"
 	
 	# go to next page of the pdf
 	page=$(($page+1))
@ -87,15 +92,15 @@ done


 # concatenate all pages
-pdftk ${tmp}/*-ocred.pdf cat output "${tmp}/ocred.pdf"
+pdftk ${tmp}/*-ocred.pdf cat output "$FILE_OUTPUT_PDF"

 # insert metadata
 # TODO

 # convert the pdf file to match PDF/A format
-gs -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 -sOutputFile=${tmp}/ocred-pdfa.pdf ${tmp}/ocred.pdf
+gs -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 -sOutputFile=$FILE_OUTPUT_PDFA "$FILE_OUTPUT_PDF"

 # validate generated pdf file (compliance to PDF/A)
 echo "Check compliance of generated PDF to PDF/A standard" 
-#java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "${tmp}/ocred-pdfa.pdf" |egrep "Status|Message"
-java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "${tmp}/ocred-pdfa.pdf"
+#java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "$FILE_OUTPUT_PDFA" |egrep "Status|Message"
+java -jar /root/jhove-1_9/jhove/bin/JhoveApp.jar -m PDF-hul "$FILE_OUTPUT_PDFA"