2014-09-26 04:19:41 -07:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# Reimplement ocrPage.sh as Python
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import logging
|
|
|
|
import sys
|
2014-09-26 04:43:15 -07:00
|
|
|
import os.path
|
2014-09-26 04:19:41 -07:00
|
|
|
from parse import parse
|
|
|
|
from subprocess import Popen, PIPE, check_call
|
2014-09-27 15:03:07 -07:00
|
|
|
from tempfile import NamedTemporaryFile
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
# Module-wide logger for this script.
logger = logging.getLogger(__name__)

# Keyword arguments shared by the Popen calls that capture a tool's output:
# all three standard streams are piped and exchanged as text.
SUBPROC_PIPE = {
    'close_fds': True,
    'stdin': PIPE,
    'stdout': PIPE,
    'stderr': PIPE,
    'universal_newlines': True,
}
|
|
|
|
|
|
|
|
|
2014-09-26 04:43:15 -07:00
|
|
|
def pdf_get_pageinfo(infile, page, width_pt, height_pt):
    """Collect font and image facts about a single page of a PDF.

    Runs ``pdffonts`` and ``pdfimages -list``, both restricted to *page*,
    and folds their output into one dictionary.

    Args:
        infile: Path of the PDF file to inspect.
        page: Page number, passed to both tools as ``-f`` and ``-l``.
        width_pt: Page width; divided by 72 to obtain inches.
        height_pt: Page height; divided by 72 to obtain inches.

    Returns:
        A dict with the keys ``pageno``, ``width_inches``,
        ``height_inches``, ``has_text`` and ``images``. ``images`` holds one
        dict per parsed ``pdfimages`` row, extended with ``dpi_w``,
        ``dpi_h`` and ``dpi`` (their geometric mean). When the page has at
        least one image the dict also carries ``xres``, ``yres``,
        ``width_pixels`` and ``height_pixels``; a page without images has
        no resolution to derive, so those keys are then absent.
    """
    pageinfo = {
        'pageno': page,
        'width_inches': width_pt / 72.0,
        'height_inches': height_pt / 72.0,
        'images': [],
    }

    p_pdffonts = Popen(
        ['pdffonts', '-f', str(page), '-l', str(page), str(infile)],
        **SUBPROC_PIPE)
    pdffonts, _ = p_pdffonts.communicate()
    # More than two output lines is taken to mean that font entries follow
    # the listing header.
    pageinfo['has_text'] = len(pdffonts.splitlines()) > 2
    if pageinfo['has_text']:
        logger.info("Page already contains font data !!!")

    # pdfimages: get image dimensions
    p_pdfimages = Popen(
        ['pdfimages', '-list', '-f', str(page), '-l', str(page),
         str(infile)],
        **SUBPROC_PIPE)
    pdfimages, _ = p_pdfimages.communicate()

    row_format = ('{page:1d} {num:1d} {imtype:>} {width:1d} {height:1d} '
                  '{color:>} {comp:1d} {bpc:1d} {enc:>} {interp:>} '
                  '{pdfobject:1d} {pdfid:1d} {bad_dpi_w:1d} {bad_dpi_h:1d} '
                  '{size:>} {ratio:>}')

    # Skip first two lines (the listing header).
    for line in pdfimages.splitlines()[2:]:
        r = parse(row_format, line)
        if r is None:
            # parse() yields None for a row that does not match; skip it
            # instead of failing on the attribute access below.
            logger.warning("Ignoring unparsable pdfimages line: %r", line)
            continue
        image = r.named
        # pdfimages reports DPI columns of its own (as of 0.26.0), but its
        # vertical value appears to be off by one, so the DPI is computed
        # here from the pixel size and the page size instead.
        image['dpi_w'] = image['width'] / pageinfo['width_inches']
        image['dpi_h'] = image['height'] / pageinfo['height_inches']
        image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5
        pageinfo['images'].append(image)

    # max() of an empty sequence raises ValueError, which used to abort the
    # whole function for pages without images (e.g. text-only pages) before
    # the caller could look at 'has_text'.
    if pageinfo['images']:
        xres = max(image['dpi_w'] for image in pageinfo['images'])
        yres = max(image['dpi_h'] for image in pageinfo['images'])
        pageinfo['xres'], pageinfo['yres'] = xres, yres
        pageinfo['width_pixels'] = int(round(xres * pageinfo['width_inches']))
        pageinfo['height_pixels'] = int(
            round(yres * pageinfo['height_inches']))

    return pageinfo
|
|
|
|
|
|
|
|
|
2014-09-26 04:43:15 -07:00
|
|
|
def unpack_with_pdftoppm(pageinfo, infile, output_folder, prefix,
                         force_ppm=False):
    """Rasterize one PDF page with ``pdftoppm`` into a temporary file.

    The colour mode is chosen from the images listed in *pageinfo*: when
    every image has one component, the page is rendered ``-mono`` if all
    images are 1 bit per component, or ``-gray`` if none is reported with
    the colour ``'color'``. The container is JPEG when every image is
    JPEG-encoded and deflate-compressed TIFF otherwise, unless *force_ppm*
    suppresses the format options so that pdftoppm uses its default output.

    Args:
        pageinfo: Dict with ``pageno``, ``xres``, ``yres`` and ``images``
            (each image dict providing ``comp``, ``bpc``, ``color`` and
            ``enc``).
        infile: Path of the source PDF.
        output_folder: Directory in which the output file is created.
        prefix: Leading part of the output file name.
        force_ppm: When true, no ``-tiff``/``-jpeg`` option is passed.

    Returns:
        Path of the created file. It is not deleted automatically. Its
        extension is ``.ppm`` with *force_ppm*, else ``.tif`` or ``.jpg``
        to match the requested format.

    Raises:
        subprocess.CalledProcessError: If pdftoppm exits with an error.
    """
    images = pageinfo['images']

    colorspace = 'color'
    if all(image['comp'] == 1 for image in images):
        if all(image['bpc'] == 1 for image in images):
            colorspace = 'mono'
        elif not any(image['color'] == 'color' for image in images):
            colorspace = 'gray'

    output_format = 'tiff'
    if all(image['enc'] == 'jpeg' for image in images):
        output_format = 'jpeg'

    args_pdftoppm = [
        'pdftoppm',
        '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
        '-rx', str(pageinfo['xres']),
        '-ry', str(pageinfo['yres']),
    ]

    # Name the file after what pdftoppm is actually asked to produce; the
    # extension used to be '.ppm' even for TIFF and JPEG output.
    extension = '.ppm'
    if not force_ppm:
        if output_format == 'tiff':
            args_pdftoppm.extend(['-tiff', '-tiffcompression', 'deflate'])
            extension = '.tif'
        elif output_format == 'jpeg':
            args_pdftoppm.append('-jpeg')
            extension = '.jpg'

    if colorspace == 'mono':
        args_pdftoppm.append('-mono')
    elif colorspace == 'gray':
        args_pdftoppm.append('-gray')

    args_pdftoppm.append(str(infile))

    name_prefix = prefix + "%04i" % pageinfo['pageno'] + extension
    with NamedTemporaryFile(prefix=name_prefix, suffix=extension,
                            dir=output_folder, delete=False) as tmpfile:
        # No output root is given, so the image is captured from stdout.
        check_call(args_pdftoppm, close_fds=True, stdout=tmpfile)
        return tmpfile.name
|
|
|
|
|
|
|
|
|
|
|
|
def deskew_imagemagick(pageinfo, infile, prefix, output_folder):
    """Deskew a page image with ImageMagick's ``convert``.

    The image is deskewed with a 40% threshold and then centred on a canvas
    of ``width_pixels`` x ``height_pixels`` taken from *pageinfo*.

    Args:
        pageinfo: Dict providing ``pageno``, ``width_pixels`` and
            ``height_pixels``.
        infile: Path of the image to deskew.
        prefix: Leading part of the output file name.
        output_folder: Directory in which the output file is created.

    Returns:
        Path of the created file, which is not deleted automatically.

    Raises:
        subprocess.CalledProcessError: If convert exits with an error.
    """
    canvas = '{width_pixels}x{height_pixels}'.format(**pageinfo)
    name_prefix = prefix + "%04i.ppm" % pageinfo['pageno']
    command = ['convert', infile,
               '-deskew', '40%',
               '-gravity', 'center',
               '-extent', canvas]

    with NamedTemporaryFile(prefix=name_prefix, suffix='.ppm',
                            dir=output_folder, delete=False) as tmpfile:
        output_path = tmpfile.name
        # convert writes to the path itself; the open handle only reserves
        # the unique file name.
        check_call(command + [output_path], close_fds=True)
    return output_path
|
|
|
|
|
|
|
|
|
|
|
|
def clean_unpaper(pageinfo, infile, prefix, output_folder):
    """Clean a page image with ``unpaper``.

    unpaper is run with deskewing, the gray and black filters, mask
    centring and border alignment all switched off, and with a mask scan
    size of 100.

    Args:
        pageinfo: Dict providing ``pageno``, ``xres`` and ``yres``; the
            DPI handed to unpaper is the rounded geometric mean of the two
            resolutions.
        infile: Path of the image to clean.
        prefix: Leading part of the output file name.
        output_folder: Directory in which the output file is created.

    Returns:
        Path of the created file, which is not deleted automatically.

    Raises:
        subprocess.CalledProcessError: If unpaper exits with an error.
    """
    dpi = int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))
    args_unpaper = [
        'unpaper',
        # Every argv entry must be a string; passing the bare int made
        # subprocess raise TypeError before unpaper was even started.
        '--dpi', str(dpi),
        '--mask-scan-size', '100',
        '--no-deskew',
        '--no-grayfilter',
        '--no-blackfilter',
        '--no-mask-center',
        '--no-border-align',
        # The output file already exists (it is created below to reserve a
        # unique name), and unpaper refuses to write to an existing file
        # unless told to overwrite it.
        '--overwrite',
        infile,
    ]

    with NamedTemporaryFile(prefix=prefix + "%04i.ppm" % pageinfo['pageno'],
                            suffix='.ppm', dir=output_folder,
                            delete=False) as tmpfile:
        args_unpaper.append(tmpfile.name)
        check_call(args_unpaper, close_fds=True)
        return tmpfile.name
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
|
|
|
|
# Command-line interface. Every option is a required positional argument
# and is received as a plain string.
parser = argparse.ArgumentParser(
    prog="ocrpage",
    description="Run OCR and related jobs on a single page of a PDF file")

parser.add_argument(
    'input_pdf',
    help="PDF file containing the page to be OCRed")
parser.add_argument(
    'page_info',
    help="Various characteristics of the page to be OCRed")
parser.add_argument(
    'num_pages',
    help="Total number of pages of the PDF file (required for logger)")
parser.add_argument(
    'tmp_fld',
    help="Folder where the temporary files should be placed")
parser.add_argument(
    'verbosity',
    help="Requested verbosity")
parser.add_argument(
    'lan',
    help="Language of the file to be OCRed")
parser.add_argument(
    'keep_tmp',
    help="Keep the temporary files after processing (helpful for debugging)")
parser.add_argument(
    'preprocess_deskew',
    help="Deskew the page to be OCRed")
parser.add_argument(
    'preprocess_clean',
    help="Clean the page to be OCRed")
parser.add_argument(
    'preprocess_cleantopdf',
    help="Put the cleaned page in the OCRed PDF")
parser.add_argument(
    'oversampling_dpi',
    help="Oversampling resolution in dpi")
parser.add_argument(
    'pdf_noimg',
    help="Generate debug PDF pages with only the OCRed text and no image")
parser.add_argument(
    'force_ocr',
    help="Force to OCR, even if the page already contains fonts")
parser.add_argument(
    'skip_text',
    help="Skip OCR on pages that contain fonts and include the page anyway")
|
2014-09-26 04:43:15 -07:00
|
|
|
# parser.add_argument(
|
|
|
|
# 'tess_cfg_files',
|
|
|
|
# help="Specific configuration files to be used by Tesseract during OCRing")
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
|
|
|
|
def _flag_enabled(value):
    """Return True unless *value* is text that spells "off".

    All options reach this script as positional strings, and any non-empty
    string (including "0") is truthy, so the flags cannot be tested
    directly.
    NOTE(review): assumes the calling script passes 0/1-style values --
    confirm against the caller.
    """
    return str(value).strip().lower() not in ('', '0', 'false', 'no', 'off')


def main():
    """Process one page of a PDF as described by the command line.

    Steps: parse the arguments, gather page facts with pdf_get_pageinfo,
    exit with status 2 when the page already has text and OCR is not
    forced, rasterize the page, then optionally deskew and clean it. The
    path of each stage's output is recorded in a local ``tmpfiles`` dict;
    a stage that is switched off reuses the previous stage's path.
    """
    args = parser.parse_args()
    tmpfiles = {}

    # page_info is "<page number> <width> <height>". Splitting on any run
    # of whitespace tolerates repeated blanks, and the sizes are read as
    # floats so that fractional page dimensions are accepted as well.
    pageno_text, width_text, height_text = args.page_info.split()
    pageno = int(pageno_text)
    width_pt, height_pt = float(width_text), float(height_text)

    logger.name += '(page=%i)' % pageno

    # num_pages arrives as a string and must be a number for '%i'.
    logger.info("Processing page %i / %i", pageno, int(args.num_pages))

    pageinfo = pdf_get_pageinfo(args.input_pdf, pageno, width_pt, height_pt)

    if pageinfo['has_text']:
        if _flag_enabled(args.force_ocr):
            logger.info("Has text but forcing OCR (-f)")
        else:
            sys.exit(2)

    if len(pageinfo['images']) > 1:
        logger.warning("Page has more than one single image, proceeding anyway")

    tmpfiles['pixmap'] = unpack_with_pdftoppm(
        pageinfo, args.input_pdf, args.tmp_fld, prefix='', force_ppm=True)

    if _flag_enabled(args.preprocess_deskew):
        tmpfiles['deskew'] = deskew_imagemagick(
            pageinfo, tmpfiles['pixmap'],
            prefix='deskew', output_folder=args.tmp_fld)
    else:
        tmpfiles['deskew'] = tmpfiles['pixmap']

    if _flag_enabled(args.preprocess_clean):
        # clean_unpaper requires prefix and output_folder as well; they
        # were missing from this call, which raised TypeError.
        tmpfiles['clean'] = clean_unpaper(
            pageinfo, tmpfiles['deskew'],
            prefix='clean', output_folder=args.tmp_fld)
    else:
        tmpfiles['clean'] = tmpfiles['deskew']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the page pipeline only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()
|