#!/bin/sh
# OCRmyPDF/OCRmyPDF.sh
#
# Add an OCR text layer to a scanned PDF and convert the result to PDF/A.
# Usage: ./scan-archive.sh filename.pdf
#
# All intermediate and output files are written below ./tmp, which is
# wiped at startup.

# The input PDF is mandatory: print the usage and stop instead of running
# the whole pipeline with an empty file name.
if [ "$#" -lt 1 ] || [ -z "$1" ]; then
	echo "usage: ./scan-archive.sh filename.pdf" >&2
	exit 2
fi

LAN="eng"	# language code handed to the OCR engine
KEEP_TMP="1"	# NOTE(review): not read anywhere in the visible script - confirm before removing

infile="$1"
if [ ! -r "$infile" ]; then
	echo "Cannot read input file: $infile" >&2
	exit 1
fi

tmp="./tmp"
FILE_SIZE_PAGES="$tmp/page-sizes.txt"		# size in pt of the respective page of the input PDF file
FILE_OUTPUT_PDF="${tmp}/ocred.pdf"		# name of the OCRed PDF file before conversion to PDF/A
FILE_OUTPUT_PDFA="${tmp}/ocred-pdfa.pdf"	# name of the final PDF/A file

# delete tmp files left over from a previous run and start from an empty
# working directory (":?" aborts instead of expanding to an empty path)
rm -r -f "${tmp:?}"
mkdir -p "${tmp}" || exit 1

# get the size of each pdf page (width / height) in pt (inch*72)
# One "<width> <height>" line is written per page to $FILE_SIZE_PAGES.
echo "Extracting size of each page (in pt)"
# Empty lines (the last one should be) are dropped while the list is written.
# In-place editing (-i / -I) is a non-POSIX sed extension whose syntax differs
# between GNU and BSD sed, so the filter runs in the pipeline instead.
identify -format "%w %h\n" "$infile" | sed '/^$/d' > "$FILE_SIZE_PAGES"
# some wc implementations pad the count with blanks; strip them
numpages=$(wc -l < "$FILE_SIZE_PAGES" | tr -d '[:space:]')
if [ "${numpages:-0}" -eq 0 ]; then
	# an empty list means identify failed or the file has no page
	echo "Could not determine the page sizes of $infile" >&2
	exit 1
fi
echo "PDF file has $numpages pages"

# Iterate over the pages of the pdf file.
# For every line ("<width> <height>" in pt) of $FILE_SIZE_PAGES this loop
# extracts the page image, cleans it with unpaper, runs tesseract on it and
# writes a one-page PDF "$tmp/<page>-ocred.pdf" with image and text.
#
# pageNum is the plain decimal counter; $page is its zero-padded label used in
# file names and messages. They are kept apart because a zero-padded value fed
# back into $((...)) or printf %d is parsed as octal and fails at "0008".
#
# The list is read on file descriptor 3 instead of through a pipe: the loop
# then runs in the main shell (so "exit" really stops the script) and the
# commands inside the loop cannot swallow the list from stdin.
pageNum=1
while read -r pageSize <&3 ; do

	# add leading zeros to the page number
	page=$(printf "%04d" "$pageNum")

	# create the name of the required file
	curOrigImg="$tmp/${page}_Image"		# original image available in the current PDF page
						# (the image file may have a different orientation than in the pdf file)
	curHocr="$tmp/$page.hocr"		# hocr file to be generated by the OCR SW for the current page
	curOCRedPDF="$tmp/${page}-ocred.pdf"	# PDF file containing the image + the OCRed text for the current page

	echo "Page $page: Computing embedded image resolution"
	# get the width of the PDF page (first field of the "%w %h" line)
	widthPDF=${pageSize%% *}
	# extract raw image from pdf file to compute resolution
	# unfortunately this image may not be rotated as in the pdf...
	# so we will have to extract it again later
	pdfimages -f "$pageNum" -l "$pageNum" -j "$infile" "$curOrigImg" 1>&2
	# count number of extracted images (glob instead of parsing ls output;
	# the -e test discards the literal pattern left by a glob without match)
	nbImg=0
	curOrigImg01=""
	for extractedImg in "$curOrigImg"* ; do
		if [ -e "$extractedImg" ]; then
			nbImg=$((nbImg+1))
			curOrigImg01="$extractedImg"
		fi
	done
	if [ "$nbImg" -ne 1 ]; then
		echo "Not exactly 1 image on page $page. Exiting"
		exit 1
	fi
	# Get the characteristic of the extracted image: "<width> <height> <colorspace>"
	propCurOrigImg01=$(identify -format "%w %h %[colorspace]" "$curOrigImg01")
	widthCurOrigImg01=$(echo "$propCurOrigImg01" | cut -f1 -d" ")
	colorspaceCurOrigImg01=$(echo "$propCurOrigImg01" | cut -f3 -d" ")
	# both widths must be positive integers, otherwise the division below fails
	if ! [ "$widthPDF" -gt 0 ] 2> /dev/null || ! [ "$widthCurOrigImg01" -gt 0 ] 2> /dev/null; then
		echo "Page $page: Invalid page or image size. Exiting" >&2
		exit 1
	fi
	# compute the resolution of the page: image width in pixel per inch of page width
	# NOTE(review): if the embedded image is rotated relative to the page this
	# compares an image width with the wrong page dimension - confirm
	dpi=$((widthCurOrigImg01*72/widthPDF))
	echo "Page $page: Resolution: ${dpi} dpi"

	# Identify if page image should be saved as ppm (color) or pgm (gray)
	echo "Page $page: Extracting image as ppm/pgm"
	# "=" is the POSIX string comparison; "==" is not supported by every /bin/sh
	if [ "$colorspaceCurOrigImg01" = "Gray" ]; then
		ext="pgm"
		opt="-gray"
	else
		ext="ppm"
		opt=""
	fi
	curImgPixmap="$tmp/$page.$ext"
	curImgPixmapClean="$tmp/$page.for-ocr.$ext"

	# extract current page as image with right orientation and resolution
	# ($opt is deliberately unquoted: when empty it must vanish from the command line)
	pdftoppm -f "$pageNum" -l "$pageNum" -r "$dpi" $opt "$infile" > "$curImgPixmap"

	# improve quality of the image with unpaper to get better OCR results
	echo "Page $page: Preprocessing image with unpaper"
	unpaper --dpi "$dpi" --mask-scan-size 100 \
		--no-grayfilter --no-blackfilter --no-mask-center --no-border-align \
		"$curImgPixmap" "$curImgPixmapClean" 1> /dev/null

	# perform OCR
	echo "Page $page: Performing OCR"
	tesseract -l "$LAN" "$curImgPixmapClean" "$curHocr" hocr 1> /dev/null 2> /dev/null
	# tesseract appends its own extension to the given output base name:
	# ".html" for the version this script was written for; newer releases
	# are believed to append ".hocr" instead - TODO confirm
	if [ -f "$curHocr.html" ]; then
		mv "$curHocr.html" "$curHocr"
	elif [ -f "$curHocr.hocr" ]; then
		mv "$curHocr.hocr" "$curHocr"
	else
		echo "Page $page: OCR produced no hOCR output. Exiting" >&2
		exit 1
	fi

	# embed text and image to new pdf file
	echo "Page $page: Embedding text in PDF"
	if ! python hocrTransform.py -r "$dpi" -i "$curImgPixmapClean" "$curHocr" "$curOCRedPDF"; then
		# without this file the page would silently be missing from the result
		echo "Page $page: Could not create the OCRed PDF page. Exiting" >&2
		exit 1
	fi

	# go to next page of the pdf
	pageNum=$((pageNum+1))
done 3< "$FILE_SIZE_PAGES"


# concatenate all pages
# The glob stays outside the quotes so that it still expands; it matches the
# per-page "<page>-ocred.pdf" files below $tmp in lexical order.
if ! pdftk "${tmp}"/*-ocred.pdf cat output "$FILE_OUTPUT_PDF"; then
	echo "Could not concatenate the OCRed pages" >&2
	exit 1
fi

# insert metadata
# TODO

# convert the pdf file to match PDF/A format
# (stop here on failure: there would be nothing meaningful to validate)
if ! gs -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 "-sOutputFile=$FILE_OUTPUT_PDFA" "$FILE_OUTPUT_PDF"; then
	echo "Conversion to PDF/A failed" >&2
	exit 1
fi

# validate generated pdf file (compliance to PDF/A)
# The location of the JHOVE jar can be overridden with the JHOVE_JAR
# environment variable; the default is the path used so far.
JHOVE_JAR="${JHOVE_JAR:-/root/jhove-1_9/jhove/bin/JhoveApp.jar}"
echo "Check compliance of generated PDF to PDF/A standard"
#java -jar "$JHOVE_JAR" -m PDF-hul "$FILE_OUTPUT_PDFA" |egrep "Status|Message"
java -jar "$JHOVE_JAR" -m PDF-hul "$FILE_OUTPUT_PDFA"