Add leptonica deskew

This commit is contained in:
Jim Barlow 2014-11-13 16:53:26 -08:00
parent b92f8e43f2
commit 4dc0370c57
2 changed files with 27 additions and 12 deletions

View File

@ -13,8 +13,11 @@ import argparse
import ctypes as C
import sys
import os
import logging
from tempfile import TemporaryFile
logger = logging.getLogger(__name__)
def stderr(*objs):
"""Python 2/3 compatible print to stderr.
@ -115,13 +118,13 @@ class LeptonicaErrorTrap(object):
# If there are Python errors, let them bubble up
if exc_type:
stderr(leptonica_output)
logger.warning(leptonica_output)
return False
# If there are Leptonica errors, wrap them in Python exceptions
if 'Error' in leptonica_output:
if 'image file not found' in leptonica_output:
raise LeptonicaIOError()
raise FileNotFoundError()
if 'pixWrite: stream not opened' in leptonica_output:
raise LeptonicaIOError()
raise LeptonicaError(leptonica_output)
@ -231,24 +234,22 @@ def getLeptonicaVersion():
return lept.getLeptonicaVersion().decode()
def deskew(args):
def deskew(infile, outfile, dpi):
try:
pix_source = pixRead(args.infile)
pix_source = pixRead(infile)
except LeptonicaIOError:
stderr("Failed to open file: %s" % args.infile)
sys.exit(2)
raise LeptonicaIOError("Failed to open file: %s" % infile)
if args.dpi < 150:
if dpi < 150:
reduction_factor = 1 # Don't downsample too much if DPI is already low
else:
reduction_factor = 0 # Use default
pix_deskewed = pixDeskew(pix_source, reduction_factor)
try:
pixWriteImpliedFormat(args.outfile, pix_deskewed)
pixWriteImpliedFormat(outfile, pix_deskewed)
except LeptonicaIOError:
stderr("Failed to open destination file: %s" % args.outfile)
sys.exit(5)
raise LeptonicaIOError("Failed to open destination file: %s" % outfile)
pixDestroy(pix_source)
pixDestroy(pix_deskewed)

View File

@ -71,6 +71,9 @@ parser.add_argument(
parser.add_argument(
'tess_cfg_files', default='', nargs='*',
help="Tesseract configuration")
parser.add_argument(
'--deskew-provider', choices=['imagemagick', 'leptonica'],
default='leptonica')
options = parser.parse_args()
@ -246,7 +249,8 @@ def convert_to_tiff(input_file, output_file):
check_call(args_convert)
@active_if(options.preprocess_deskew != 0)
@active_if(options.preprocess_deskew != 0
and options.deskew_provider == 'imagemagick')
@transform(convert_to_tiff, suffix(".tif"), ".deskewed.tif")
def deskew_imagemagick(input_file, output_file):
args_convert = [
@ -273,6 +277,16 @@ def deskew_imagemagick(input_file, output_file):
raise CalledProcessError(p.returncode, args_convert)
@active_if(options.preprocess_deskew != 0
and options.deskew_provider == 'leptonica')
@transform(convert_to_tiff, suffix(".tif"), ".deskewed.tif")
def deskew_leptonica(input_file, output_file):
from .leptonica import deskew
with logger_mutex:
deskew(input_file, output_file,
min(pageinfo['xres'], pageinfo['yres']))
def clean_unpaper(pageinfo, infile, prefix, output_folder):
args_unpaper = [
'unpaper',
@ -294,7 +308,7 @@ def clean_unpaper(pageinfo, infile, prefix, output_folder):
return tmpfile.name
@merge([convert_to_tiff, deskew_imagemagick],
@merge([convert_to_tiff, deskew_imagemagick, deskew_leptonica],
os.path.join(options.tmp_fld, "%04i.for_ocr.tif" % pageno))
def select_ocr_image(infiles, output_file):
re_symlink(infiles[-1], output_file, logger, logger_mutex)