mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-02 02:40:44 +00:00
314 lines
15 KiB
Bash
Executable File
314 lines
15 KiB
Bash
Executable File
#!/bin/sh
|
|
##############################################################################
|
|
# Copyright (c) 2013-14: fritz-hh from Github (https://github.com/fritz-hh)
|
|
##############################################################################
|
|
|
|
# Darwin/OS X has not evolved a proper readlink yet, so shadow it with a shell
# function backed by Python's os.path.realpath.
# The script calls it as "readlink -f <path>": $1 is the "-f" flag (ignored)
# and $2 is the path to resolve.
# NOTE: written with "=" and without the "function" keyword so that it also
# parses under strictly POSIX shells (this script runs under #!/bin/sh).
if [ "$(uname)" = "Darwin" ]; then
  readlink() {
    python3 -c 'import os,sys; print(os.path.realpath(sys.argv[1]))' "$2"
  }
fi
|
|
|
|
# Import required scripts
# BASEPATH is the folder containing this script (as resolved by "readlink -f").
# Every expansion is quoted so that an installation path containing blanks works.
BASEPATH="$(dirname "$(readlink -f "$0")")"
. "$BASEPATH/src/config.sh"

# Keep a printable copy of the command line arguments (displayed in debug mode)
ARGUMENTS="$*"

# Start time in seconds since the epoch (used to report the script duration)
START="$(date +%s)"
|
|
|
|
# Print the help text (purpose, version, synopsis and description of every
# command line option) to stdout. $VERSION is expanded inside the here-document.
usage() {
	cat << EOF
--------------------------------------------------------------------------------------
Script aimed at generating a searchable PDF file from a PDF file containing only images.
(The script performs optical character recognition of each respective page using the
tesseract engine)

Copyright: fritz-hh from Github (https://github.com/fritz-hh)
Version: $VERSION

Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-s] [-b] [-e] [-l language] [-C filename] inputfile outputfile

-h : Display this help message
-v : Increase the verbosity (this option can be used more than once) (e.g. -vvv)
-k : Do not delete the temporary files
-g : Activate debug mode:
     - Generates a PDF file containing each page twice (once with the image, once without the image
       but with the OCRed text as well as the detected bounding boxes)
     - Set the verbosity to the highest possible
     - Do not delete the temporary files
-d : Deskew each page before performing OCR
-c : Clean each page before performing OCR
-i : Incorporate the cleaned image in the final PDF file (by default the original
     image, or the deskewed image if the -d option is set)
-o : If the resolution of an image is lower than dpi value provided as argument, provide the OCR engine with
     an oversampled image having the latter dpi value. This can improve the OCR results but can lead to a larger output PDF file.
     (default: no oversampling performed)
-f : Force to OCR the whole document, even if some page already contain font data
     (which should not be the case for PDF files built from scanned images)
-s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
-b : Skip big pages
-e : Use exact PDF pages with no changes other than inserting hidden OCR text layer (mutually exclusive with -d/-c/-i/-f)
-l : Set the language of the PDF file in order to improve OCR results (default "eng")
     Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
     Multiple languages may be specified, separated by '+' characters.
-C : Pass an additional configuration file to the tesseract OCR engine.
     (this option can be used more than once)
     Note 1: The configuration file must be available in the "tessdata/configs" folder of your tesseract installation
inputfile  : PDF file to be OCRed
outputfile : The PDF/A file that will be generated
--------------------------------------------------------------------------------------
EOF
}
|
|
|
|
|
|
#################################################
# Get an absolute path from a relative path to a file
#
# Param1 : Relative path
# Outputs: The absolute path (written to stdout)
# Returns: 1 if the folder in which the file is located does not exist
#          0 otherwise
#################################################
absolutePath() {
  local folder
  # Resolve the folder inside the command substitution's subshell, so the
  # working directory of the caller is never modified (even on failure)
  folder="$(cd "$(dirname "$1")" > /dev/null 2>&1 && pwd)" || return 1
  # Strip a trailing "/" (only present when the folder is "/") to avoid "//file"
  echo "${folder%/}/$(basename "$1")"
  return 0
}
|
|
|
|
|
|
# Initialization of the configuration parameters with default values
# (each value may be overridden by the corresponding command line option)
VERBOSITY="$LOG_ERR"          # default verbosity level
LAN="eng"                     # default language of the PDF file (required to get good OCR results)
KEEP_TMP="0"                  # 0=no, 1=yes (keep the temporary files)
PREPROCESS_DESKEW="0"         # 0=no, 1=yes (deskew image)
PREPROCESS_CLEAN="0"          # 0=no, 1=yes (clean image to improve OCR)
PREPROCESS_CLEANTOPDF="0"     # 0=no, 1=yes (put cleaned image in final PDF)
OVERSAMPLING_DPI="0"          # 0=do not perform oversampling (dpi value under which oversampling should be performed)
PDF_NOIMG="0"                 # 0=no, 1=yes (generates each PDF page twice, with and without image)
FORCE_OCR="0"                 # 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data)
SKIP_TEXT="0"                 # 0=do not skip text pages, 1=skip text pages
SKIP_BIG="0"                  # 0=no, 1=yes (set by option -b: skip big pages)
EXACT_IMAGE="0"               # 0=no, 1=yes (set by option -e: use exact PDF pages)
TESS_CFG_FILES=""             # list of additional configuration files to be used by tesseract
|
|
|
|
# Parse optional command line arguments
# The leading ":" of the option string selects the silent error mode of getopts:
# an unknown option is reported through the "\?" case and a missing option
# argument through the ":" case (in both cases $OPTARG holds the option letter).
while getopts ":hvgkdcio:fsbel:C:" opt; do
  case "$opt" in
    h) usage ; exit 0 ;;
    v) VERBOSITY=$(($VERBOSITY+1)) ;;
    k) KEEP_TMP="1" ;;
    g) PDF_NOIMG="1"; VERBOSITY="$LOG_DEBUG"; KEEP_TMP="1" ;;
    d) PREPROCESS_DESKEW="1" ;;
    c) PREPROCESS_CLEAN="1" ;;
    i) PREPROCESS_CLEANTOPDF="1" ;;
    o) OVERSAMPLING_DPI="$OPTARG" ;;
    f) FORCE_OCR="1" ;;
    s) SKIP_TEXT="1" ;;
    b) SKIP_BIG="1" ;;
    e) EXACT_IMAGE="1" ;;
    l) LAN="$OPTARG" ;;
    # -C may be repeated: each file name is prepended to the blank separated list
    C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
    \?)
      echo "Invalid option: -$OPTARG"
      usage
      exit $EXIT_BAD_ARGS ;;
    :)
      echo "Option -$OPTARG requires an argument"
      usage
      exit $EXIT_BAD_ARGS ;;
  esac
done

# Remove the optional arguments parsed above.
shift $((OPTIND-1))
|
|
|
|
# Check if the number of mandatory parameters provided is as expected
# (after the "shift" above only inputfile and outputfile must remain)
if [ "$#" -ne 2 ]; then
  echo "Exactly two mandatory argument shall be provided ($# arguments provided)"
  usage
  exit $EXIT_BAD_ARGS
fi

# Options -f (force OCR of the whole document) and -s (skip pages containing
# font data) contradict each other
if [ "$SKIP_TEXT" -eq 1 ] && [ "$FORCE_OCR" -eq 1 ]; then
  echo "Options -f and -s are mutually exclusive; choose one or the other"
  usage
  exit $EXIT_BAD_ARGS
fi
|
|
|
|
# Convert the input and output file names into absolute paths.
# This is done before the working directory is changed below.
# absolutePath fails if the folder containing the file does not exist.
if ! FILE_INPUT_PDF="$(absolutePath "$1")"; then
  echo "The folder in which the input file should be located does not exist. Exiting..."
  exit $EXIT_BAD_ARGS
fi
if ! FILE_OUTPUT_PDFA="$(absolutePath "$2")"; then
  echo "The folder in which the output file should be generated does not exist. Exiting..."
  exit $EXIT_BAD_ARGS
fi

# set script path as working directory
if ! cd "$BASEPATH"; then
  echo "Could not change the working directory to \"$BASEPATH\". Exiting..."
  exit $EXIT_FILE_ACCESS_ERROR
fi
|
|
|
|
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "$TOOLNAME version: $VERSION"
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Arguments: $ARGUMENTS"

# check if the required utilities are installed
# (each line prints an installation hint and aborts if the tool is missing)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Checking if all dependencies are installed"
! command -v identify > /dev/null && echo "Please install ImageMagick. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v parallel > /dev/null && echo "Please install GNU Parallel. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdfimages > /dev/null && echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdffonts > /dev/null && echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdftoppm > /dev/null && echo "Please install poppler-utils with the option --enable-splash-output enabled. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdfseparate > /dev/null && echo "Please install or update poppler-utils to at least 0.24.5. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
# unpaper is only required if the pages shall be cleaned (option -c)
[ "$PREPROCESS_CLEAN" -eq 1 ] && ! command -v unpaper > /dev/null && echo "Please install unpaper. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v tesseract > /dev/null && echo "Please install tesseract and tesseract-data. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
# the interpreter checked here must be the one actually used by this script (python3)
! command -v python3 > /dev/null && echo "Please install python v3.x. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! python3 -c 'import lxml' 2>/dev/null && echo "Please install the python library lxml. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! python3 -c 'import reportlab' 2>/dev/null && echo "Please install the python library reportlab. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v gs > /dev/null && echo "Please install ghostscript. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v java > /dev/null && echo "Please install java. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
|
|
|
|
|
|
# ensure the right tesseract version is installed
# older versions are known to produce malformed hocr output and should not be used
# Even 3.02.01 fails in few cases (see issue #28). I decided to allow this version anyway because
# 3.02.02 is not yet available for some widespread linux distributions
reqtessversion="3.02.01"
# keep only the digits and dots of the first line mentioning "tesseract"
tessversion="$(tesseract -v 2>&1 | grep "tesseract" | head -n 1 | sed 's/[^0-9.]//g')"
# Compare both versions component by component (numerically) using awk, so that
# no additional tool is needed for the comparison.
# Prints 1 if the installed version is older than the required one, 0 otherwise.
tesstooold="$(awk -v cur="$tessversion" -v req="$reqtessversion" 'BEGIN {
  ncur = split(cur, c, ".")
  nreq = split(req, r, ".")
  n = (ncur > nreq) ? ncur : nreq
  for (i = 1; i <= n; i++) {
    # a missing or empty component counts as 0
    if (c[i] + 0 < r[i] + 0) { print 1; exit }
    if (c[i] + 0 > r[i] + 0) { print 0; exit }
  }
  print 0
}')"
if [ "$tesstooold" = "1" ]; then
  echo "Please install tesseract ${reqtessversion} or newer (currently installed version is ${tessversion})"
  exit $EXIT_MISSING_DEPENDENCY
fi
|
|
|
|
# ensure the right GNU parallel version is installed
# older version do not support -q flag (required to escape special characters)
reqparallelversion="20121122"
# "--minversion 0" is used to obtain the installed version for the error message
parallelversion="$(parallel --minversion 0)"
if ! parallel --minversion "$reqparallelversion" > /dev/null; then
  echo "Please install GNU parallel ${reqparallelversion} or newer (currently installed version is ${parallelversion})"
  exit $EXIT_MISSING_DEPENDENCY
fi
|
|
|
|
# ensure pdftoppm is provided by poppler-utils, not the older xpdf version
# (the version banner of the poppler variant contains the word "Poppler")
if ! pdftoppm -v 2>&1 | grep -q 'Poppler'; then
  echo "Please remove xpdf and install poppler-utils. Exiting..."
  exit $EXIT_MISSING_DEPENDENCY
fi
|
|
|
|
|
|
# Display the version of the tools if log level is LOG_DEBUG
if [ "$VERBOSITY" -ge "$LOG_DEBUG" ]; then
  echo "--------------------------------"
  echo "ImageMagick version:"
  identify --version
  echo "--------------------------------"
  echo "GNU Parallel version:"
  parallel --version
  echo "--------------------------------"
  echo "Poppler-utils version:"
  pdfimages -v
  pdftoppm -v
  pdffonts -v
  pdfseparate -v
  echo "--------------------------------"
  echo "unpaper version:"
  # unpaper is optional (only needed for option -c), so it may be absent here
  if command -v unpaper > /dev/null; then
    unpaper --version
  else
    echo "(unpaper is not installed)"
  fi
  echo "--------------------------------"
  echo "tesseract version:"
  tesseract --version
  echo "--------------------------------"
  # report the interpreter that is actually used by this script
  echo "python3 version:"
  python3 --version
  echo "--------------------------------"
  echo "Ghostscript version:"
  gs --version
  echo "--------------------------------"
  echo "Java version:"
  java -version
  echo "--------------------------------"
fi
|
|
|
|
|
|
# check if the languages passed to tesseract are all supported
# (the list of supported languages is queried only once)
supportedlans="$(tesseract --list-langs 2>&1)"
# $LAN may contain several languages separated by "+"
for currentlan in $(echo "$LAN" | sed 's/+/ /g'); do
  # -F -x: the language must match a complete line literally (no regex interpretation)
  if ! printf '%s\n' "$supportedlans" | grep -F -x -- "$currentlan" > /dev/null; then
    echo "The language \"$currentlan\" is not supported by tesseract."
    printf '%s\n' "$supportedlans" | tr '\n' ' '; echo
    echo "Exiting..."
    exit $EXIT_BAD_ARGS
  fi
done
|
|
|
|
|
|
# Initialize path to temporary files using mktemp
# Goal: save tmp file in a sub-folder of the $TMPDIR environment variable (or in "/tmp" if unset)
# Unfortunately, Linux mktemp is not compatible with FreeBSD/OSX mktemp
# Linux version requires no arg
# FreeBSD requires '-t prefix' to be used so that $TMPDIR is taken into account
# But in Linux '-t template' is handled differently than in FreeBSD
# Therefore different calls must be used for Linux and for FreeBSD
# prefix made of date, time and pdf file name without extension
prefix="com.github.ocrmypdf.$(date +"%Y%m%d_%H%M").$(basename "$FILE_INPUT_PDF" | sed 's/[.][^.]*$//')"
# try Linux syntax first, if it fails try FreeBSD/OSX
if ! TMP_FLD="$(mktemp -d 2>/dev/null || mktemp -d -t "${prefix}" 2>/dev/null)"; then
  # name the folder that was expected to host the temporary files
  echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"${TMPDIR:-/tmp}\" exists"
  exit $EXIT_FILE_ACCESS_ERROR
fi
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Created temporary folder: \"$TMP_FLD\""

FILE_TMP="${TMP_FLD}/tmp.txt"                           # temporary file with a very short lifetime (may be used for several things)
FILE_PAGES_INFO="${TMP_FLD}/pages-info.txt"             # for each page: page #; width in pt; height in pt
FILE_VALIDATION_LOG="${TMP_FLD}/pdf_validation.log"     # log file containing the results of the validation of the PDF/A file
|
|
|
|
|
|
# get the size of each pdf page (width / height) in pt (i.e. inch/72)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Input file: Extracting size of each page (in pt)"
if ! identify -format "%w %h\n" "$FILE_INPUT_PDF" > "$FILE_TMP"; then
  echo "Could not get size of PDF pages. Exiting..."
  exit $EXIT_BAD_INPUT_FILE
fi
# removing empty lines (last one should be) and add page # before each line
# (the page number is zero padded to 4 digits)
awk '$0 != "" { printf "%04d %s\n", ++page, $0 }' "$FILE_TMP" > "$FILE_PAGES_INFO"
# the number of pages is the page number found on the last line
numpages="$(tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" ")"
|
|
|
|
# process each page of the input pdf file
# GNU parallel reads one line of $FILE_PAGES_INFO per job and substitutes it for "{}";
# -k keeps the output in input order and --halt-on-error 1 stops after a failing job
parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \
  "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
  "$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
  "$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$EXACT_IMAGE" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
ret_code="$?"
# propagate the exit code of the page processing to the caller of this script
if [ "$ret_code" -ne 0 ]; then
  exit "$ret_code"
fi
|
|
|
|
# concatenate all pages and convert the pdf file to match PDF/A format
# (PDFA_def.ps is looked up in the current working directory; the glob selects
# the per-page "*ocred*.pdf" files located in the temporary folder)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Concatenating all pages to the final PDF/A file"
if ! gs -dQUIET -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -sColorConversionStrategy=/RGB \
  -sProcessColorModel=DeviceRGB -dPDFA -sPDFACompatibilityPolicy=2 \
  -sOutputICCProfile=srgb.icc \
  -sOutputFile="$FILE_OUTPUT_PDFA" "$(pwd)/PDFA_def.ps" "${TMP_FLD}/"*ocred*.pdf; then
  echo "Could not concatenate all pages to the final PDF/A file. Exiting..."
  exit $EXIT_OTHER_ERROR
fi
|
|
|
|
# validate generated pdf file (compliance to PDF/A)
# The JHOVE report is written to $FILE_VALIDATION_LOG (stderr is discarded)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Checking compliance to PDF/A standard"
if ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG"; then
  echo "Unexpected error while checking compliance to PDF/A file. Exiting..."
  exit $EXIT_OTHER_ERROR
fi
# summary of the validation
# -E is required: in a basic regular expression "|" is an ordinary character,
# so without it only the literal text "Status|Message" would be searched for
grep -E -i "Status|Message" "$FILE_VALIDATION_LOG"
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
|
|
|
|
# check the validation results
# pdf_valid: 1=the output file is considered valid, 0=at least one check failed
pdf_valid=1
# the matching lines of the three following checks are displayed on stdout
grep -i 'ErrorMessage' "$FILE_VALIDATION_LOG" && pdf_valid=0
grep -i 'Status.*not valid' "$FILE_VALIDATION_LOG" && pdf_valid=0
grep -i 'Status.*Not well-formed' "$FILE_VALIDATION_LOG" && pdf_valid=0
# the log must mention a PDF/A-1 profile
if ! grep -i 'Profile:.*PDF/A-1' "$FILE_VALIDATION_LOG" > /dev/null; then
  echo "PDF file profile is not PDF/A-1"
  pdf_valid=0
fi
if [ "$pdf_valid" -ne 1 ]; then
  echo "Output file: The generated PDF/A file is INVALID"
elif [ "$VERBOSITY" -ge "$LOG_INFO" ]; then
  echo "Output file: The generated PDF/A file is VALID"
fi
|
|
|
|
# delete temporary files (unless the user asked to keep them: options -k / -g)
if [ "$KEEP_TMP" -eq 0 ]; then
  [ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Deleting temporary files"
  rm -r -f "${TMP_FLD}"
fi

# report the duration of the whole script run (debug mode only)
END="$(date +%s)"
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Script took $(($END-$START)) seconds"

# the exit code tells the caller whether the generated file is a valid PDF/A
if [ "$pdf_valid" -ne 1 ]; then
  exit $EXIT_INVALID_OUTPUT_PDFA
fi
exit 0
|