2013-04-09 19:00:26 +02:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								#!/bin/sh
 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-26 11:50:39 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								############################################################################## 
							 
						 
					
						
							
								
									
										
										
										
											2014-01-04 21:24:33 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Copyright (c) 2013-14: fritz-hh from Github (https://github.com/fritz-hh) 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-26 11:50:39 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								############################################################################## 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-09 19:00:26 +02:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-09-25 03:11:27 -07:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Darwin/OS X has not evolved a proper readlink yet, so shadow it with a
								# small Python-based replacement that canonicalizes a path.
								# POSIX "=" and a plain "name() { ...; }" definition are used (instead of
								# "==" and the "function" keyword) so that strict /bin/sh implementations
								# are still able to parse this script.
								if [ "$(uname)" = "Darwin" ]; then
									# Only the "readlink -f PATH" call form is supported:
									#   $1 : option (ignored)
									#   $2 : path to resolve; the resolved path is printed on stdout
									readlink() {
										python3 -c 'import os,sys; print(os.path.realpath(sys.argv[1]))' "$2"
									}
								fi
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-11-27 22:34:21 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Import required scripts
								# BASEPATH is the directory containing this script (symlinks resolved).
								# Every expansion is quoted so that installation paths containing spaces
								# or glob characters do not get split or expanded.
								BASEPATH="$(dirname "$(readlink -f "$0")")"
								. "$BASEPATH/src/config.sh"
							 
						 
					
						
							
								
									
										
										
										
											2013-11-27 22:34:21 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-04 21:43:41 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Set variables corresponding to the input parameters
								# All command line arguments joined into a single string; "$*" is the
								# well-defined way to flatten the positional parameters into a scalar.
								ARGUMENTS="$*"

								# Script start time, in seconds since the epoch
								START=$(date +%s)
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								#################################################
								# Print the help message of the script on stdout
								#
								# Reads  : $VERSION (expanded inside the message)
								# Returns: exit status of cat
								#################################################
								usage() {
									# "<<-" strips leading tabs from the message and from the
									# terminator, so the here-document may be indented with tabs.
									cat <<-EOF
								--------------------------------------------------------------------------------------
								Script aimed at generating a searchable PDF file from a PDF file containing only images.
								(The script performs optical character recognition of each respective page using the
								tesseract engine)

								Copyright: fritz-hh from Github (https://github.com/fritz-hh)
								Version: $VERSION

								Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-s] [-b] [-e] [-l language] [-C filename] inputfile outputfile

								-h : Display this help message
								-v : Increase the verbosity (this option can be used more than once) (e.g. -vvv)
								-k : Do not delete the temporary files
								-g : Activate debug mode:
								     - Generates a PDF file containing each page twice (once with the image, once without the image
								       but with the OCRed text as well as the detected bounding boxes)
								     - Set the verbosity to the highest possible
								     - Do not delete the temporary files
								-d : Deskew each page before performing OCR
								-c : Clean each page before performing OCR
								-i : Incorporate the cleaned image in the final PDF file (by default the original
								     image, or the deskewed image if the -d option is set)
								-o : If the resolution of an image is lower than dpi value provided as argument, provide the OCR engine with
								     an oversampled image having the latter dpi value. This can improve the OCR results but can lead to a larger output PDF file.
								     (default: no oversampling performed)
								-f : Force to OCR the whole document, even if some pages already contain font data
								     (which should not be the case for PDF files built from scanned images)
								-s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
								-b : Skip big pages
								-e : Use exact PDF pages with no changes other than inserting hidden OCR text layer (mutually exclusive with -d/-c/-i/-f)
								-l : Set the language of the PDF file in order to improve OCR results (default "eng")
								     Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
								     Multiple languages may be specified, separated by '+' characters.
								-C : Pass an additional configuration file to the tesseract OCR engine.
								     (this option can be used more than once)
								     Note 1: The configuration file must be available in the "tessdata/configs" folder of your tesseract installation
								inputfile  : PDF file to be OCRed
								outputfile : The PDF/A file that will be generated
								--------------------------------------------------------------------------------------
								EOF
								}
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-04-23 22:54:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								#################################################
								# Get an absolute path from a relative path to a file
								#
								# Param1 : Relative path
								# Output : The absolute path, printed on stdout
								# Returns: 1 if the folder in which the file is located does not exist
								#          0 otherwise
								#################################################
								absolutePath() {
									# Everything runs in a subshell: the caller's working directory is
									# never changed, and no "local" (not POSIX) variables are needed.
									(
										# CDPATH is emptied so that a relative folder is always resolved
										# against the current directory; "--" protects names starting
										# with a dash.
										CDPATH= cd -- "$(dirname -- "$1")" 1> /dev/null 2> /dev/null || exit 1
										folder="$(pwd)"
										# Avoid producing "//name" for files located directly under "/"
										if [ "$folder" = "/" ]; then
											folder=""
										fi
										# printf instead of echo: paths must be printed verbatim
										printf '%s/%s\n' "$folder" "$(basename -- "$1")"
									)
								}
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-04-20 22:02:33 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Initialization of the configuration parameters with default values
								# (each value may be overridden by the matching command line option)
								VERBOSITY="$LOG_ERR"		# default verbosity level (LOG_ERR is presumably set by src/config.sh - confirm)
								LAN="eng"			# default language of the PDF file (required to get good OCR results)
								KEEP_TMP="0"			# 0=no, 1=yes (keep the temporary files)
								PREPROCESS_DESKEW="0"		# 0=no, 1=yes (deskew image)
								PREPROCESS_CLEAN="0"		# 0=no, 1=yes (clean image to improve OCR)
								PREPROCESS_CLEANTOPDF="0"	# 0=no, 1=yes (put cleaned image in final PDF)
								OVERSAMPLING_DPI="0"		# 0=do not perform oversampling (dpi value under which oversampling should be performed)
								PDF_NOIMG="0"			# 0=no, 1=yes (generates each PDF page twice, with and without image)
								FORCE_OCR="0"			# 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data)
								SKIP_TEXT="0"			# 0=do not skip text pages, 1=skip text pages
								SKIP_BIG="0"			# 0=do not skip big pages, 1=skip big pages (option -b)
								EXACT_IMAGE="0"			# 0=no, 1=yes (use exact PDF pages, only inserting the hidden OCR text layer; option -e)
								TESS_CFG_FILES=""		# list of additional configuration files to be used by tesseract
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# Parse optional command line arguments 
							 
						 
					
						
							
								
									
										
										
										
											2015-03-10 14:28:38 -07:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								while  getopts  ":hvgkdcio:fsbel:C:"  opt;  do 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
									case  $opt  in
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
										h)  usage ;  exit  0  ; ; 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
										v)  VERBOSITY = $(( $VERBOSITY + 1 ))  ; ; 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
										k)  KEEP_TMP = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2013-11-29 10:34:05 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										g)  PDF_NOIMG = "1" ;  VERBOSITY = " $LOG_DEBUG " ;  KEEP_TMP = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-23 21:36:34 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										d)  PREPROCESS_DESKEW = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										c)  PREPROCESS_CLEAN = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-23 21:36:34 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										i)  PREPROCESS_CLEANTOPDF = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2013-12-30 23:44:38 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										o)  OVERSAMPLING_DPI = " $OPTARG "  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2014-01-04 21:24:33 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										f)  FORCE_OCR = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2014-02-04 21:25:38 -08:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										s)  SKIP_TEXT = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2015-02-20 15:26:33 -08:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										b)  SKIP_BIG = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2015-03-10 14:28:38 -07:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										e)  EXACT_IMAGE = "1"  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										l)  LAN = " $OPTARG "  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-23 21:36:34 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										C)  TESS_CFG_FILES = " $OPTARG   $TESS_CFG_FILES "  ; ; 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
										\? ) 
							 
						 
					
						
							
								
									
										
										
										
											2013-12-30 23:44:38 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
											echo  " Invalid option: - $OPTARG " 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
											usage
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
											exit  $EXIT_BAD_ARGS  ; ; 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
										:) 
							 
						 
					
						
							
								
									
										
										
										
											2013-12-30 23:44:38 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
											echo  " Option - $OPTARG  requires an argument " 
							 
						 
					
						
							
								
									
										
										
										
											2013-04-21 21:58:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
											usage
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
											exit  $EXIT_BAD_ARGS  ; ; 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
									esac 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								done 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# Discard the options already consumed (everything before index OPTIND)
								# so that only the mandatory positional arguments remain in "$@".
								shift "$((OPTIND - 1))"

								# Exactly two mandatory arguments are expected; for any other count
								# print the problem and the usage text, then leave with EXIT_BAD_ARGS.
								[ "$#" -eq "2" ] || {
									echo "Exactly two mandatory argument shall be provided ($# arguments provided)"
									usage
									exit $EXIT_BAD_ARGS
								}
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-02-04 21:25:38 -08:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Options -f (sets FORCE_OCR) and -s (sets SKIP_TEXT) contradict each
								# other: refuse the command line when both have been set to "1".
								# Two separate tests joined with && are used instead of the "-a"
								# operator, whose behaviour with this many operands is unspecified.
								if [ "$SKIP_TEXT" -eq "1" ] && [ "$FORCE_OCR" -eq "1" ]; then
									echo "Options -f and -s are mutually exclusive; choose one or the other"
									usage
									# Must be exactly $EXIT_BAD_ARGS: a misspelt variable name expands to
									# nothing and "exit" would then return the status of usage instead.
									exit $EXIT_BAD_ARGS
								fi
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-05-05 22:33:54 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# First mandatory argument: the input PDF.
								# A non-zero status from absolutePath is reported as a missing folder.
								if ! absolutePath "$1" > /dev/null; then
									echo "The folder in which the input file should be located does not exist. Exiting..." && exit $EXIT_BAD_ARGS
								fi
								FILE_INPUT_PDF="$(absolutePath "$1")"

								# Second mandatory argument: the output PDF/A file, handled the same way.
								if ! absolutePath "$2" > /dev/null; then
									echo "The folder in which the output file should be generated does not exist. Exiting..." && exit $EXIT_BAD_ARGS
								fi
								FILE_OUTPUT_PDFA="$(absolutePath "$2")"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# Use the script path (BASEPATH) as working directory from here on
								cd "$BASEPATH"

								# At debug verbosity, print the tool name/version and the arguments
								if [ $VERBOSITY -ge $LOG_DEBUG ]; then
									echo "$TOOLNAME version: $VERSION"
									echo "Arguments: $ARGUMENTS"
								fi
							 
						 
					
						
							
								
									
										
										
										
											2013-04-26 11:50:39 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-04-19 22:23:28 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Make sure every external utility and python library needed is installed.
								# Each check prints an installation hint and leaves the script with
								# EXIT_MISSING_DEPENDENCY as soon as one of them is not found.
								# NOTE(review): the interpreter check looks for "python2" whereas the
								# library checks are run with "python3" -- confirm which one is intended.
								if [ $VERBOSITY -ge $LOG_DEBUG ]; then
									echo "Checking if all dependencies are installed"
								fi
								command -v identify > /dev/null \
									|| { echo "Please install ImageMagick. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								command -v parallel > /dev/null \
									|| { echo "Please install GNU Parallel. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								command -v pdfimages > /dev/null \
									|| { echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								command -v pdffonts > /dev/null \
									|| { echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								command -v pdftoppm > /dev/null \
									|| { echo "Please install poppler-utils with the option --enable-splash-output enabled. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								command -v pdfseparate > /dev/null \
									|| { echo "Please install or update poppler-utils to at least 0.24.5. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								# unpaper is only looked for when PREPROCESS_CLEAN is set to 1
								if [ $PREPROCESS_CLEAN -eq 1 ]; then
									command -v unpaper > /dev/null \
										|| { echo "Please install unpaper. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								fi
								command -v tesseract > /dev/null \
									|| { echo "Please install tesseract and tesseract-data. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								command -v python2 > /dev/null \
									|| { echo "Please install python v2.x. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								# The python libraries are probed by trying to import them
								python3 -c 'import lxml' 2>/dev/null \
									|| { echo "Please install the python library lxml. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								python3 -c 'import reportlab' 2>/dev/null \
									|| { echo "Please install the python library reportlab. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								command -v gs > /dev/null \
									|| { echo "Please install ghostscript. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
								command -v java > /dev/null \
									|| { echo "Please install java. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
							 
						 
					
						
							
								
									
										
										
										
											2013-04-19 23:00:00 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-10 22:59:33 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Require a sufficiently recent tesseract: older versions are known to
								# produce malformed hocr output and should not be used.
								# Even 3.02.01 fails in a few cases (see issue #28); it is accepted anyway
								# because 3.02.02 is not yet available for some widespread linux distributions.
								reqtessversion="3.02.01"
								# Keep only digits and dots of the line of "tesseract -v" naming tesseract
								tessversion=$(tesseract -v 2>&1 | grep "tesseract" | sed s/[^0-9.]//g)
								# Remove the second dot of both versions so that bc can subtract them as
								# decimal numbers; bc prints 1 when the installed one is the smaller.
								tesstooold=$(echo "$(echo $tessversion | sed s/[.]//2)-$(echo $reqtessversion | sed s/[.]//2) < 0" | bc)
								if [ "$tesstooold" -eq "1" ]; then
									echo "Please install tesseract ${reqtessversion} or newer (currently installed version is ${tessversion})" && exit $EXIT_MISSING_DEPENDENCY
								fi
							 
						 
					
						
							
								
									
										
										
										
											2014-01-18 21:02:15 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-10 22:59:33 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Require a sufficiently recent GNU parallel: older versions do not
								# support the -q flag (required to escape special characters).
								reqparallelversion="20121122"
								# Output of "--minversion 0" is kept only to be shown in the message below
								parallelversion=$(parallel --minversion 0)
								if ! parallel --minversion "$reqparallelversion" > /dev/null; then
									echo "Please install GNU parallel ${reqparallelversion} or newer (currently installed version is ${parallelversion})" && exit $EXIT_MISSING_DEPENDENCY
								fi
							 
						 
					
						
							
								
									
										
										
										
											2014-01-07 21:04:28 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-12 21:30:42 -08:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# ensure pdftoppm is provided by poppler-utils, not the older xpdf version:
								# the output of "pdftoppm -v" must mention 'Poppler'
								if ! pdftoppm -v 2>&1 | grep -q 'Poppler'; then
									echo "Please remove xpdf and install poppler-utils. Exiting..."
									# "exit" is required here: without it the value of
									# EXIT_MISSING_DEPENDENCY would be run as a command name and the
									# script would carry on with the wrong pdftoppm.
									exit $EXIT_MISSING_DEPENDENCY
								fi
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-18 22:22:19 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-03 15:59:51 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# When the log level is LOG_DEBUG, print the version reported by each tool,
								# each section being introduced by a separator line and a title.
								if [ $VERBOSITY -ge $LOG_DEBUG ]; then
									printf '%s\n' "--------------------------------" "ImageMagick version:"
									identify --version
									printf '%s\n' "--------------------------------" "GNU Parallel version:"
									parallel --version
									printf '%s\n' "--------------------------------" "Poppler-utils version:"
									pdfimages -v
									pdftoppm -v
									pdffonts -v
									pdfseparate -v
									printf '%s\n' "--------------------------------" "unpaper version:"
									unpaper --version
									printf '%s\n' "--------------------------------" "tesseract version:"
									tesseract --version
									printf '%s\n' "--------------------------------" "python2 version:"
									python2 --version
									printf '%s\n' "--------------------------------" "Ghostscript version:"
									gs --version
									printf '%s\n' "--------------------------------" "Java version:"
									java -version
									printf '%s\n' "--------------------------------"
								fi
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-18 21:38:22 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Every language listed in LAN (names separated by '+') must appear as a
								# full line in the output of "tesseract --list-langs".
								for currentlan in $(echo "$LAN" | tr '+' ' '); do
									# Language found: move on to the next one
									tesseract --list-langs 2>&1 | grep "^$currentlan\$" > /dev/null && continue

									# Language missing: show what tesseract offers on a single line, then give up
									echo "The language \"$currentlan\" is not supported by tesseract."
									tesseract --list-langs 2>&1 | tr '\n' ' '; echo
									echo "Exiting..."
									exit $EXIT_BAD_ARGS
								done
							 
						 
					
						
							
								
									
										
										
										
											2013-04-23 22:54:58 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-14 22:57:10 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-18 21:38:22 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Create the folder that holds every temporary file of this run (path stored in TMP_FLD).
								# Goal: place it in a sub-folder of $TMPDIR (or of "/tmp" if $TMPDIR is unset).
								# Unfortunately, Linux mktemp is not compatible with FreeBSD/OSX mktemp:
								#   - the Linux version requires no argument
								#   - FreeBSD requires '-t prefix' to be used so that $TMPDIR is taken into account
								#   - in Linux '-t template' is handled differently than in FreeBSD
								# Therefore the Linux call is tried first and the FreeBSD/OSX call is only the fallback.

								# prefix made of date, time and pdf file name without extension (used by the fallback call only)
								prefix="com.github.ocrmypdf.$(date +"%Y%m%d_%H%M").$(basename "$FILE_INPUT_PDF" | sed 's/[.][^.]*$//')"

								# The exit status of the assignment is the status of the command substitution,
								# i.e. non-zero only when both mktemp variants failed.
								if ! TMP_FLD=$(mktemp -d 2>/dev/null || mktemp -d -t "${prefix}" 2>/dev/null); then
									# Name the folder that mktemp most likely tried to use, to help the user fix it
									if [ -z "$TMPDIR" ]; then
										echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"/tmp\" exists"
									else
										echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"$TMPDIR\" exists"
									fi
									exit $EXIT_FILE_ACCESS_ERROR
								fi
								[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Created temporary folder: \"$TMP_FLD\""
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-04 17:24:35 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Paths of the working files; all of them live inside the temporary folder TMP_FLD
								FILE_TMP="${TMP_FLD}/tmp.txt"						# temporary file with a very short lifetime (may be used for several things)
								FILE_PAGES_INFO="${TMP_FLD}/pages-info.txt"				# for each page: page #; width in pt; height in pt
								FILE_VALIDATION_LOG="${TMP_FLD}/pdf_validation.log"			# log file containing the results of the validation of the PDF/A file
							 
						 
					
						
							
								
									
										
										
										
											2013-04-19 23:00:00 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-04-19 22:23:28 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-15 21:20:27 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Get the size of each pdf page (width / height) in pt (i.e. inch/72).
								# Result: FILE_PAGES_INFO holds one line per page ("<4-digit page #> <width> <height>")
								# and numpages holds the page number found on its last line.
								[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Input file: Extracting size of each page (in pt)"
								if ! identify -format "%w %h\n" "$FILE_INPUT_PDF" > "$FILE_TMP"; then
									# Explicit branch: the script must stop here even if the message cannot be printed
									echo "Could not get size of PDF pages. Exiting..."
									exit $EXIT_BAD_INPUT_FILE
								fi
								# remove empty lines (the last one should be empty) and add the zero-padded page # before each line
								sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO"
								# first field of the last line = number of the last page (still zero-padded)
								numpages=$(tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" ")
							 
						 
					
						
							
								
									
										
										
										
											2013-04-09 19:00:26 +02:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-07 21:57:18 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Process each page of the input pdf file.
								# GNU parallel reads FILE_PAGES_INFO line by line and starts one "python3 -m src.ocrpage"
								# job per line; "{}" is replaced by that line (page #, width, height).
								# All remaining arguments are passed unchanged to every job.
								parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \
									"$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
									"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
									"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$EXACT_IMAGE" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
								# Save the status immediately: any later command would overwrite $?
								ret_code="$?"
								# Propagate the failure status of the page processing as exit code of the script
								if [ $ret_code -ne 0 ]; then
									exit $ret_code
								fi
							 
						 
					
						
							
								
									
										
										
										
											2013-04-19 23:00:00 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2014-01-15 21:20:27 +01:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Concatenate all pages and convert the pdf file to match PDF/A format.
								# Inputs given to ghostscript: PDFA_def.ps from the current working directory, followed by
								# every file of the temporary folder whose name matches *ocred*.pdf (in glob order).
								# The glob is intentionally left outside the quotes so that the shell expands it.
								[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Concatenating all pages to the final PDF/A file"
								if ! gs -dQUIET -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -sColorConversionStrategy=/RGB \
									-sProcessColorModel=DeviceRGB -dPDFA -sPDFACompatibilityPolicy=2 \
									-sOutputICCProfile=srgb.icc \
									-sOutputFile="$FILE_OUTPUT_PDFA" "$(pwd)/PDFA_def.ps" "${TMP_FLD}/"*ocred*.pdf; then
									# Explicit branch: the script must stop here even if the message cannot be printed
									echo "Could not concatenate all pages to the final PDF/A file. Exiting..."
									exit $EXIT_OTHER_ERROR
								fi
							 
						 
					
						
							
								
									
										
										
										
											2013-04-28 22:18:34 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-04-18 10:31:36 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Validate generated pdf file (compliance to PDF/A).
								# The validator report (stdout) is stored in FILE_VALIDATION_LOG; its stderr is discarded.
								[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Checking compliance to PDF/A standard"
								if ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG"; then
									# Explicit branch: the script must stop here even if the message cannot be printed
									echo "Unexpected error while checking compliance to PDF/A file. Exiting..."
									exit $EXIT_OTHER_ERROR
								fi
								if [ $VERBOSITY -ge $LOG_DEBUG ]; then
									# Summary of the validation.
									# -E is required: in a basic regular expression "|" is a literal character, so the
									# previous pattern "Status|Message" never matched any line of the log.
									# Shown in debug mode only, so that the default output of the script is unchanged.
									grep -E -i "Status|Message" "$FILE_VALIDATION_LOG"
									echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
								fi
							 
						 
					
						
							
								
									
										
										
										
											2015-03-24 23:05:42 -07:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-04-26 12:23:29 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Check the validation results.
								# pdf_valid stays 1 only if the log contains no error message, no "not valid" or
								# "Not well-formed" status and mentions the PDF/A-1 profile.
								pdf_valid=1
								# The matching lines of the first three checks are deliberately printed (grep output is not discarded)
								grep -i 'ErrorMessage' "$FILE_VALIDATION_LOG" && pdf_valid=0
								grep -i 'Status.*not valid' "$FILE_VALIDATION_LOG" && pdf_valid=0
								grep -i 'Status.*Not well-formed' "$FILE_VALIDATION_LOG" && pdf_valid=0
								if ! grep -i 'Profile:.*PDF/A-1' "$FILE_VALIDATION_LOG" > /dev/null; then
									# Explicit branch: the file is flagged invalid even if the message cannot be printed
									echo "PDF file profile is not PDF/A-1"
									pdf_valid=0
								fi
								# An invalid file is always reported; a valid one only from verbosity level LOG_INFO
								if [ $pdf_valid -ne 1 ]; then
									echo "Output file: The generated PDF/A file is INVALID"
								elif [ $VERBOSITY -ge $LOG_INFO ]; then
									echo "Output file: The generated PDF/A file is VALID"
								fi
							 
						 
					
						
							
								
									
										
										
										
											2013-04-19 23:00:00 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-04-18 23:13:06 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Delete temporary files (the whole temporary folder), unless KEEP_TMP is set to a non-zero value
								if [ $KEEP_TMP -eq 0 ]; then
									[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Deleting temporary files"
									rm -r -f "${TMP_FLD}"
								fi
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2013-05-01 15:58:55 +02:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# Report the run time (debug mode only); START is expected to hold the start time in seconds since the epoch
								END=$(date +%s)
								[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Script took $(($END-$START)) seconds"

								# Exit code of the script: EXIT_INVALID_OUTPUT_PDFA if the validation failed, 0 otherwise
								if [ $pdf_valid -ne 1 ]; then
									exit $EXIT_INVALID_OUTPUT_PDFA
								fi
								exit 0