Add rudimentary support for combining OCR layer with existing content

This approach appears to be very fragile due to weaknesses in PyPDF. A better
option is probably to use pdftk's watermark feature.
This commit is contained in:
Jim Barlow 2015-03-10 14:28:38 -07:00
parent 9229f7c6cc
commit a99ba3b696
2 changed files with 59 additions and 26 deletions

View File

@ -50,6 +50,7 @@ Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f] [-l languag
(which should not be the case for PDF files built from scanned images)
-s : If pages contain font data, do not perform processing on that page, but include the page in the final output.
-b : Skip big pages
-e : Use exact PDF pages with no changes other than inserting hidden OCR text layer (mutually exclusive with -d/-c/-i/-f)
-l : Set the language of the PDF file in order to improve OCR results (default "eng")
Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
Multiple languages may be specified, separated by '+' characters.
@ -93,10 +94,11 @@ PDF_NOIMG="0" # 0=no, 1=yes (generates each PDF page twice, with and without i
FORCE_OCR="0" # 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data)
SKIP_TEXT="0" # 0=do not skip text pages, 1=skip text pages
SKIP_BIG="0"
EXACT_IMAGE="0"
TESS_CFG_FILES="" # list of additional configuration files to be used by tesseract
# Parse optional command line arguments
while getopts ":hvgkdcio:fsbl:C:" opt; do
while getopts ":hvgkdcio:fsbel:C:" opt; do
case $opt in
h) usage ; exit 0 ;;
v) VERBOSITY=$(($VERBOSITY+1)) ;;
@ -109,6 +111,7 @@ while getopts ":hvgkdcio:fsbl:C:" opt; do
f) FORCE_OCR="1" ;;
s) SKIP_TEXT="1" ;;
b) SKIP_BIG="1" ;;
e) EXACT_IMAGE="1" ;;
l) LAN="$OPTARG" ;;
C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
\?)
@ -273,10 +276,10 @@ sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO
numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "`
# process each page of the input pdf file
parallel --gnu -q -k --halt-on-error 1 python3 -m src.ocrpage \
parallel --gnu -q -k --halt-on-error 1 -j2 python3 -m src.ocrpage \
"$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$SKIP_BIG" "$EXACT_IMAGE" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
ret_code="$?"
[ $ret_code -ne 0 ] && exit $ret_code

View File

@ -6,6 +6,8 @@ import os.path
import fileinput
import re
from parse import parse
import PyPDF2 as pypdf
import shutil
from subprocess import Popen, check_call, PIPE, CalledProcessError, \
TimeoutExpired
@ -73,6 +75,9 @@ parser.add_argument(
parser.add_argument(
'skip_big', type=int,
help="Skip OCR for pages that are very large")
parser.add_argument(
'exact_image', type=int,
help="Use original page from PDF without re-rendering")
parser.add_argument(
'tess_cfg_files', default='', nargs='*', # Implemented
help="Tesseract configuration")
@ -225,11 +230,11 @@ def setup_working_directory(input_file, soft_link_name):
pass
@active_if(not ocr_required)
@active_if(not ocr_required or (ocr_required and options.exact_image))
@transform(setup_working_directory,
formatter(),
os.path.join(options.tmp_fld, '%04i.skip.pdf' % pageno))
def skip_ocr(
os.path.join(options.tmp_fld, '%04i.page.pdf' % pageno))
def extract_single_page(
input_file,
output_file):
args_pdfseparate = [
@ -526,27 +531,25 @@ def ocr_tesseract(
raise CalledProcessError(p.returncode, args_tesseract)
if os.path.exists(output_file + '.html'):
# Tesseract 3.02 appends suffix ".html" on its own
re_symlink(output_file + ".html", output_file,
logger, logger_mutex)
# Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
shutil.move(output_file + '.html', output_file)
elif os.path.exists(output_file + '.hocr'):
# Tesseract 3.03 appends suffix ".hocr" on its own
re_symlink(output_file + ".hocr", output_file,
logger, logger_mutex)
# Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
shutil.move(output_file + '.hocr', output_file)
# The filename gets inserted to hocr
# but Tesseract does not verify that it is escaped XML
# it's not necessary so strip it out
regex_nested_single_quotes = re.compile(
r"""title='image "([^"]*)";""")
with fileinput.input(files=(output_file,), inplace=True) as f:
for line in f:
line = regex_nested_single_quotes.sub(
r"""title='image " ";""", line)
print(line, end='') # stdout is redirected here
# Tesseract inserts source filename into hocr file without escaping
# it. This could break the XML parser. Rewrite the hocr file,
# replacing the filename with a space.
regex_nested_single_quotes = re.compile(
r"""title='image "([^"]*)";""")
with fileinput.input(files=(output_file,), inplace=True) as f:
for line in f:
line = regex_nested_single_quotes.sub(
r"""title='image " ";""", line)
print(line, end='') # fileinput.input redirects stdout
@active_if(ocr_required)
@active_if(ocr_required and not options.exact_image)
@merge([unpack_with_ghostscript, convert_to_png,
deskew_imagemagick, deskew_leptonica, cleaned_to_png],
os.path.join(options.tmp_fld, "%04i.image_for_pdf" % pageno))
@ -567,7 +570,7 @@ def select_image_for_pdf(infiles, output_file):
re_symlink(input_file, output_file, logger, logger_mutex)
@active_if(ocr_required)
@active_if(ocr_required and not options.exact_image)
@merge([ocr_tesseract, select_image_for_pdf],
os.path.join(options.tmp_fld, '%04i.rendered.pdf' % pageno))
def render_page(infiles, output_file):
@ -590,7 +593,34 @@ def render_text_output_page(input_file, output_file):
showBoundingboxes=True, invisibleText=False)
@merge([render_page, skip_ocr],
@active_if(ocr_required and options.exact_image)
@transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf")
def render_hocr_blank_page(input_file, output_file):
    """Render an hOCR file to a PDF that carries only the invisible text.

    Only active when OCR is required and the exact-image option is set.

    input_file  -- path of the ".hocr" file produced by ocr_tesseract
    output_file -- path of the ".hocr.pdf" file to create
    """
    # A single resolution is passed on: the larger of the two axis
    # resolutions recorded in pageinfo, rounded to a whole number.
    resolution = max(pageinfo['xres'], pageinfo['yres'])
    renderer = HocrTransform(input_file, round(resolution))

    # No image is supplied and the text is marked invisible, so the page
    # holds nothing but the hidden text layer.
    renderer.to_pdf(
        output_file,
        imageFileName=None,
        showBoundingboxes=False,
        invisibleText=True)
@active_if(ocr_required and options.exact_image)
@merge([render_hocr_blank_page, extract_single_page],
       os.path.join(options.tmp_fld, "%04i.merged.pdf" % pageno))
def merge_hocr_with_original_page(infiles, output_file):
    """Combine the hidden-text PDF page with the original PDF page.

    Only active when OCR is required and the exact-image option is set.

    infiles     -- two paths: infiles[0] is read as the text-only PDF and
                   infiles[1] as the original single-page PDF
                   NOTE(review): this relies on the pipeline delivering the
                   inputs in the order the tasks are listed in @merge --
                   confirm.
    output_file -- path of the merged single-page PDF to write
    """
    # The page number is formatted into the file name *before* joining it
    # to the temporary folder, so a '%' in the folder path cannot be
    # mistaken for a format specifier.
    hocr_pdf, page_pdf = infiles[0], infiles[1]

    with open(hocr_pdf, 'rb') as hocr_input, \
            open(page_pdf, 'rb') as page_input:
        hocr_reader = pypdf.PdfFileReader(hocr_input)
        page_reader = pypdf.PdfFileReader(page_input)

        # Start from the text page and merge the original page into it.
        the_page = hocr_reader.getPage(0)
        the_page.mergePage(page_reader.getPage(0))

        writer = pypdf.PdfFileWriter()
        writer.addPage(the_page)

        # Open the output only once both inputs have been parsed, so a
        # failure while reading does not leave an empty output file behind.
        # The inputs stay open because the writer may still read from them.
        with open(output_file, 'wb') as output:
            writer.write(output)
@merge([render_page, merge_hocr_with_original_page, extract_single_page],
       os.path.join(options.tmp_fld, '%04i.ocred.pdf' % pageno))
def select_final_page(infiles, output_file):
    """Link the finished page for this page number to its final name.

    infiles     -- outputs of whichever upstream tasks were active
    output_file -- path of the '.ocred.pdf' link to create

    NOTE(review): the last entry of infiles is always chosen. When more
    than one upstream task is active, which file that is depends on the
    order in which the pipeline passes the inputs -- confirm it is the
    intended one.
    """
    chosen_page = infiles[-1]
    re_symlink(chosen_page, output_file, logger, logger_mutex)
@ -600,7 +630,7 @@ cmdline.run(options)
# parser.add_argument(
# 'tess_cfg_files',
# help="Specific configuration files to be used by Tesseract during OCRing")
# help="Specific configuration files to be used by Tesseract during OCRing")
def main():