mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-01 18:29:58 +00:00
The flag -dUseCIEColor is now deprecated, as it invokes the old engine, which introduces color errors. The new engine requires a PDF/A file header with a hardcoded location of an ICC profile to use, now included in the project. Portable iterations should generate a PDFA_def.ps based on the target system; for now OS X with homebrew is presumed. I have selected sRGB since scanners tend to capture RGB and printing is not a major consideration for PDF/A. Also note all file paths given to gs must be absolute. May its creators be forever haunted for their failure to document this unexpected quirk.
317 lines
14 KiB
Bash
Executable File
317 lines
14 KiB
Bash
Executable File
#!/bin/sh
##############################################################################
# Copyright (c) 2013-14: fritz-hh from Github (https://github.com/fritz-hh)
##############################################################################

# Darwin/OS X has not evolved a proper readlink yet, so on that platform the
# command is shadowed by a shell function that delegates to Python.
# The function is written for the "readlink -f <path>" call form used below:
# the first argument (the "-f" flag) is ignored and the second one is resolved.
if [ "$(uname)" = "Darwin" ]; then
  readlink() {
    # print(...) with a single argument works with Python 2 and Python 3 alike
    python -c 'import os,sys; print(os.path.realpath(sys.argv[1]))' "$2"
  }
fi

# Import required scripts
# BASEPATH is the folder containing this script (resolved through readlink -f).
# Every expansion is quoted so that an installation path containing blanks works.
BASEPATH="$(dirname "$(readlink -f "$0")")"
. "$BASEPATH/src/config.sh"

# Set variables corresponding to the input parameters
# The raw command line is kept as one string; it is only echoed in debug mode.
ARGUMENTS="$*"

# Start time in seconds since the epoch (used to report the run time at the end)
START=$(date +%s)

#################################################
# Print the help text of the script on stdout.
# Reads the global VERSION (expanded inside the here-document).
# Takes no parameters.
#################################################
usage() {
  cat << EOF
--------------------------------------------------------------------------------------
Script aimed at generating a searchable PDF file from a PDF file containing only images.
(The script performs optical character recognition of each respective page using the
tesseract engine)

Copyright: fritz-hh from Github (https://github.com/fritz-hh)
Version: $VERSION

Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-s] [-l language] [-C filename] inputfile outputfile

-h : Display this help message
-v : Increase the verbosity (this option can be used more than once) (e.g. -vvv)
-k : Do not delete the temporary files
-g : Activate debug mode:
     - Generates a PDF file containing each page twice (once with the image, once without the image
       but with the OCRed text as well as the detected bounding boxes)
     - Set the verbosity to the highest possible
     - Do not delete the temporary files
-d : Deskew each page before performing OCR
-c : Clean each page before performing OCR
-i : Incorporate the cleaned image in the final PDF file (by default the original image,
     or the deskewed image if the -d option is set)
-o : If the resolution of an image is lower than dpi value provided as argument, provide the OCR engine with
     an oversampled image having the latter dpi value. This can improve the OCR results but can lead to a larger output PDF file.
     (default: no oversampling performed)
-f : Force to OCR the whole document, even if some page already contain font data
     (which should not be the case for PDF files built from scanned images)
-s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
-l : Set the language of the PDF file in order to improve OCR results (default "eng")
     Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
     Multiple languages may be specified, separated by '+' characters.
-C : Pass an additional configuration file to the tesseract OCR engine.
     (this option can be used more than once)
     Note 1: The configuration file must be available in the "tessdata/configs" folder of your tesseract installation
inputfile  : PDF file to be OCRed
outputfile : The PDF/A file that will be generated
--------------------------------------------------------------------------------------
EOF
}

#################################################
# Get an absolute path from a relative path to a file
#
# Param1 : Relative path
# Output : The absolute path (written to stdout). The file itself does not
#          need to exist, only the folder containing it.
# Returns: 1 if the folder in which the file is located does not exist
#          0 otherwise
#################################################
absolutePath() {
  # The directory change happens in a subshell, so the working directory of
  # the caller is never modified and does not have to be restored.
  (
    cd "$(dirname "$1")" > /dev/null 2>&1 || exit 1
    # printf instead of echo: the path is printed verbatim even if it
    # contains backslashes or starts with a dash
    printf '%s\n' "$(pwd)/$(basename "$1")"
  )
}

# Initialize the configuration parameters with default values
# (each of them can be overridden by a command line option parsed below)
VERBOSITY="$LOG_ERR"        # default verbosity level
LAN="eng"                   # default language of the PDF file (required to get good OCR results)
KEEP_TMP="0"                # 0=no, 1=yes (keep the temporary files)
PREPROCESS_DESKEW="0"       # 0=no, 1=yes (deskew image)
PREPROCESS_CLEAN="0"        # 0=no, 1=yes (clean image to improve OCR)
PREPROCESS_CLEANTOPDF="0"   # 0=no, 1=yes (put cleaned image in final PDF)
OVERSAMPLING_DPI="0"        # 0=do not perform oversampling (dpi value under which oversampling should be performed)
PDF_NOIMG="0"               # 0=no, 1=yes (generates each PDF page twice, with and without image)
FORCE_OCR="0"               # 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data)
SKIP_TEXT="0"               # 0=do not skip text pages, 1=skip text pages
TESS_CFG_FILES=""           # list of additional configuration files to be used by tesseract

# Parse optional command line arguments
# The leading ":" of the option string selects silent error reporting, so that
# unknown options ("?") and missing option arguments (":") are handled here.
while getopts ":hvgkdcio:fsl:C:" opt; do
  case "$opt" in
    h) usage ; exit 0 ;;
    v) VERBOSITY=$((VERBOSITY+1)) ;;
    k) KEEP_TMP="1" ;;
    g) PDF_NOIMG="1"; VERBOSITY="$LOG_DEBUG"; KEEP_TMP="1" ;;
    d) PREPROCESS_DESKEW="1" ;;
    c) PREPROCESS_CLEAN="1" ;;
    i) PREPROCESS_CLEANTOPDF="1" ;;
    o) OVERSAMPLING_DPI="$OPTARG" ;;
    f) FORCE_OCR="1" ;;
    s) SKIP_TEXT="1" ;;
    l) LAN="$OPTARG" ;;
    # -C may be repeated: each new file is put in front of the blank separated list
    C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
    \?)
      echo "Invalid option: -$OPTARG"
      usage
      exit $EXIT_BAD_ARGS ;;
    :)
      echo "Option -$OPTARG requires an argument"
      usage
      exit $EXIT_BAD_ARGS ;;
  esac
done

# Remove the optional arguments parsed above.
shift $((OPTIND-1))

# Check if the number of mandatory parameters provided is as expected
# (after the shift above only inputfile and outputfile may remain)
if [ "$#" -ne 2 ]; then
  echo "Exactly two mandatory arguments shall be provided ($# arguments provided)"
  usage
  exit $EXIT_BAD_ARGS
fi

# -s (skip pages containing text) and -f (force OCR of such pages) contradict each other
if [ "$SKIP_TEXT" -eq 1 ] && [ "$FORCE_OCR" -eq 1 ]; then
  echo "Options -f and -s are mutually exclusive; choose one or the other"
  usage
  exit $EXIT_BAD_ARGS
fi

# Convert both file arguments to absolute paths (the working directory is
# changed to the script folder afterwards, which would break relative paths).
# absolutePath fails if the folder containing the file does not exist; the
# exit status of the command substitution is the status of the assignment.
if ! FILE_INPUT_PDF="$(absolutePath "$1")"; then
  echo "The folder in which the input file should be located does not exist. Exiting..."
  exit $EXIT_BAD_ARGS
fi
if ! FILE_OUTPUT_PDFA="$(absolutePath "$2")"; then
  echo "The folder in which the output file should be generated does not exist. Exiting..."
  exit $EXIT_BAD_ARGS
fi

# set script path as working directory
# (later steps use paths relative to it, e.g. "$(pwd)/PDFA_def.ps", so a
# failure to change the directory must stop the script)
if ! cd "$BASEPATH"; then
  echo "Could not change to the script folder \"$BASEPATH\". Exiting..."
  exit $EXIT_FILE_ACCESS_ERROR
fi

[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "$TOOLNAME version: $VERSION"
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Arguments: $ARGUMENTS"

# check if the required utilities are installed
# Each check prints an installation hint and exits with
# EXIT_MISSING_DEPENDENCY as soon as one utility cannot be found.
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Checking if all dependencies are installed"
! command -v identify > /dev/null && echo "Please install ImageMagick. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v parallel > /dev/null && echo "Please install GNU Parallel. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdfimages > /dev/null && echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdffonts > /dev/null && echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdftoppm > /dev/null && echo "Please install poppler-utils with the option --enable-splash-output enabled. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdfseparate > /dev/null && echo "Please install or update poppler-utils to at least 0.24.5. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
# unpaper is only needed if the pages shall be cleaned (option -c)
[ "$PREPROCESS_CLEAN" -eq 1 ] && ! command -v unpaper > /dev/null && echo "Please install unpaper. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v tesseract > /dev/null && echo "Please install tesseract and tesseract-data. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
# bc is used below to compare the tesseract version numbers
! command -v bc > /dev/null && echo "Please install bc. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v python2 > /dev/null && echo "Please install python v2.x. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! python2 -c 'import lxml' 2>/dev/null && echo "Please install the python library lxml. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! python2 -c 'import reportlab' 2>/dev/null && echo "Please install the python library reportlab. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
# the page processing further below is started with "python3 -m src.ocrpage"
# NOTE(review): lxml and reportlab are only verified for python2 above; confirm
# which interpreter actually needs them
! command -v python3 > /dev/null && echo "Please install python v3.x. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v gs > /dev/null && echo "Please install ghostscript. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v java > /dev/null && echo "Please install java. Exiting..." && exit $EXIT_MISSING_DEPENDENCY

# ensure the right tesseract version is installed
# older versions are known to produce malformed hocr output and should not be used
# Even 3.02.01 fails in few cases (see issue #28). I decided to allow this version anyway because
# 3.02.02 is not yet available for some widespread linux distributions
reqtessversion="3.02.01"
# keep only digits and dots of the line of "tesseract -v" containing "tesseract"
tessversion=$(tesseract -v 2>&1 | grep "tesseract" | sed 's/[^0-9.]//g')
# Removing the second dot turns e.g. "3.02.01" into the decimal number "3.0201",
# so that bc can compare both versions numerically (prints 1 if too old, else 0)
tesstooold=$(echo "$(echo "$tessversion" | sed 's/[.]//2')-$(echo "$reqtessversion" | sed 's/[.]//2') < 0" | bc)
if [ "$tesstooold" -eq 1 ]; then
  echo "Please install tesseract ${reqtessversion} or newer (currently installed version is ${tessversion})"
  exit $EXIT_MISSING_DEPENDENCY
fi

# ensure the right GNU parallel version is installed
# older version do not support -q flag (required to escape special characters)
reqparallelversion="20121122"
parallelversion=$(parallel --minversion 0)
if ! parallel --minversion "$reqparallelversion" > /dev/null; then
  echo "Please install GNU parallel ${reqparallelversion} or newer (currently installed version is ${parallelversion})"
  exit $EXIT_MISSING_DEPENDENCY
fi

# ensure pdftoppm is provided by poppler-utils, not the older xpdf version
# (the exit keyword is required here: without it the script would try to run
# the exit code as a command and then carry on with the wrong pdftoppm)
if ! pdftoppm -v 2>&1 | grep -q 'Poppler'; then
  echo "Please remove xpdf and install poppler-utils. Exiting..."
  exit $EXIT_MISSING_DEPENDENCY
fi

# Display the version of the tools if log level is LOG_DEBUG
if [ "$VERBOSITY" -ge "$LOG_DEBUG" ]; then
  echo "--------------------------------"
  echo "ImageMagick version:"
  identify --version
  echo "--------------------------------"
  echo "GNU Parallel version:"
  parallel --version
  echo "--------------------------------"
  echo "Poppler-utils version:"
  pdfimages -v
  pdftoppm -v
  pdffonts -v
  pdfseparate -v
  echo "--------------------------------"
  echo "unpaper version:"
  # unpaper is an optional dependency (only required together with -c),
  # therefore it may legitimately be missing at this point
  if command -v unpaper > /dev/null; then
    unpaper --version
  else
    echo "unpaper is not installed"
  fi
  echo "--------------------------------"
  echo "tesseract version:"
  tesseract --version
  echo "--------------------------------"
  echo "python2 version:"
  python2 --version
  echo "--------------------------------"
  echo "Ghostscript version:"
  gs --version
  echo "--------------------------------"
  echo "Java version:"
  java -version
  echo "--------------------------------"
fi

# check if the languages passed to tesseract are all supported
# The list of installed languages is queried once and reused for every
# language of the '+' separated list given with -l.
tess_langs=$(tesseract --list-langs 2>&1)
for currentlan in $(echo "$LAN" | tr '+' ' '); do
  # -F -x: the language code must be equal to a complete line of the list
  # (it is compared literally, not interpreted as a regular expression)
  if ! printf '%s\n' "$tess_langs" | grep -F -x -q -- "$currentlan"; then
    echo "The language \"$currentlan\" is not supported by tesseract."
    printf '%s\n' "$tess_langs" | tr '\n' ' '; echo
    echo "Exiting..."
    exit $EXIT_BAD_ARGS
  fi
done

# Initialize path to temporary files using mktemp
# Goal: save tmp file in a sub-folder of the $TMPDIR environment variable (or in "/tmp" if unset)
# Unfortunately, Linux mktemp is not compatible with FreeBSD/OSX mktemp
# Linux version requires no arg
# FreeBSD requires '-t prefix' to be used so that $TMPDIR is taken into account
# But in Linux '-t template' is handled differently than in FreeBSD
# Therefore different calls must be used for Linux and for FreeBSD
prefix="com.github.ocrmypdf.$(date +"%Y%m%d_%H%M").$(basename "$FILE_INPUT_PDF" | sed 's/[.][^.]*$//')" # prefix made of date, time and pdf file name without extension
# try Linux syntax first, if it fails try FreeBSD/OSX
# (the status of the assignment is the status of the command substitution)
if ! TMP_FLD=$(mktemp -d 2>/dev/null || mktemp -d -t "${prefix}" 2>/dev/null); then
  if [ -z "$TMPDIR" ]; then
    echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"/tmp\" exists"
  else
    echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"$TMPDIR\" exists"
  fi
  exit $EXIT_FILE_ACCESS_ERROR
fi
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Created temporary folder: \"$TMP_FLD\""

FILE_TMP="${TMP_FLD}/tmp.txt"                         # temporary file with a very short lifetime (may be used for several things)
FILE_PAGES_INFO="${TMP_FLD}/pages-info.txt"           # for each page: page #; width in pt; height in pt
FILE_VALIDATION_LOG="${TMP_FLD}/pdf_validation.log"   # log file containing the results of the validation of the PDF/A file

# get the size of each pdf page (width / height) in pt (i.e. inch/72)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Input file: Extracting size of each page (in pt)"
if ! identify -format "%w %h\n" "$FILE_INPUT_PDF" > "$FILE_TMP"; then
  echo "Could not get size of PDF pages. Exiting..."
  exit $EXIT_BAD_INPUT_FILE
fi
# removing empty lines (last one should be) and add page # before each line
# (the page number is zero padded to 4 digits)
sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO"
# the page number found on the last line is the number of pages of the document
numpages=$(tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" ")

# process each page of the input pdf file
# GNU parallel starts one "python3 -m src.ocrpage" job per line of
# $FILE_PAGES_INFO; the line (page #, width, height) replaces "{}".
# -q is required to escape special characters contained in the arguments.
parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
  "$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
  "$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
ret_code="$?"
# stop with the exit code reported by parallel if the processing failed
if [ "$ret_code" -ne 0 ]; then
  exit "$ret_code"
fi

# concatenate all pages and convert the pdf file to match PDF/A format
# Ghostscript receives the PDF/A definition file (PDFA_def.ps, looked up in the
# current working directory, i.e. the script folder) followed by every file
# of the temporary folder matching *ocred*.pdf.
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Concatenating all pages to the final PDF/A file"
if ! gs -dQUIET -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -sColorConversionStrategy=/RGB \
  -sProcessColorModel=DeviceRGB -dPDFA -sPDFACompatibilityPolicy=2 \
  -sOutputICCProfile=srgb.icc \
  -sOutputFile="$FILE_OUTPUT_PDFA" "$(pwd)/PDFA_def.ps" "${TMP_FLD}/"*ocred*.pdf; then
  echo "Could not concatenate all pages to the final PDF/A file. Exiting..."
  exit $EXIT_OTHER_ERROR
fi

# validate generated pdf file (compliance to PDF/A)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Checking compliance to PDF/A standard"
# the JHOVE report is written to the validation log, its error output is dropped
if ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG"; then
  echo "Unexpected error while checking compliance to PDF/A file. Exiting..."
  exit $EXIT_OTHER_ERROR
fi
# summary of the validation
# (-E is required: in a basic regular expression "|" is an ordinary character,
# so that the pattern would only match the literal text "Status|Message")
grep -i -E "Status|Message" "$FILE_VALIDATION_LOG"
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
# check the validation results
# (the checks are quiet: the matching lines are already part of the summary above)
pdf_valid=1
grep -i -q 'ErrorMessage' "$FILE_VALIDATION_LOG" && pdf_valid=0
grep -i -q 'Status.*not valid' "$FILE_VALIDATION_LOG" && pdf_valid=0
grep -i -q 'Status.*Not well-formed' "$FILE_VALIDATION_LOG" && pdf_valid=0
if ! grep -i -q 'Profile:.*PDF/A-1' "$FILE_VALIDATION_LOG"; then
  echo "PDF file profile is not PDF/A-1"
  pdf_valid=0
fi
[ "$pdf_valid" -ne 1 ] && echo "Output file: The generated PDF/A file is INVALID"
[ "$pdf_valid" -eq 1 ] && [ "$VERBOSITY" -ge "$LOG_INFO" ] && echo "Output file: The generated PDF/A file is VALID"

# delete temporary files
# (skipped if the options -k or -g were given)
if [ "$KEEP_TMP" -eq 0 ]; then
  [ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Deleting temporary files"
  rm -r -f "${TMP_FLD}"
fi

END=$(date +%s)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Script took $((END-START)) seconds"

# the exit code reports whether the generated file passed the PDF/A validation
if [ "$pdf_valid" -ne 1 ]; then
  exit $EXIT_INVALID_OUTPUT_PDFA
fi
exit 0