From 9f90b5cb0a31927617e23728ece2e162cadf09df Mon Sep 17 00:00:00 2001
From: Jim Barlow
Date: Sat, 25 Jul 2015 00:22:56 -0700
Subject: [PATCH] Modularize unpaper; get -d and -c working again

---
 src/ocrmypdf.py | 101 ++++++++++++++++++++++++------------------------
 src/unpaper.py  |  87 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+), 51 deletions(-)
 create mode 100644 src/unpaper.py

diff --git a/src/ocrmypdf.py b/src/ocrmypdf.py
index 9c0b84eb..4710b96e 100755
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@@ -153,6 +153,20 @@ if not set(options.language).issubset(tesseract.LANGUAGES):
     sys.exit(EXIT_BAD_ARGS)
 
 
+# ----------
+# Arguments
+
+
+if any((options.deskew, options.clean, options.clean_final)):
+    try:
+        from . import unpaper
+    except ImportError:
+        print("Install the 'unpaper' program to use the specified options",
+              file=sys.stderr)
+        sys.exit(EXIT_BAD_ARGS)
+else:
+    unpaper = None
+
 # ----------
 # Logging
 
@@ -384,69 +398,50 @@ def rasterize_with_ghostscript(
 @transform(
     input=rasterize_with_ghostscript,
     filter=suffix(".page.png"),
-    output=".pp.png",
+    output=".pp-deskew.png",
     extras=[_log, _pdfinfo, _pdfinfo_lock])
-def preprocess(
+def preprocess_deskew(
         input_file,
         output_file,
         log,
         pdfinfo,
         pdfinfo_lock):
 
-    if not options.deskew and not options.clean:
+    if not options.deskew:
         re_symlink(input_file, output_file, log)
         return
 
     pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
+    dpi = int(pageinfo['xres'])
 
-    # unpaper documentation:
-    # https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
-    args_unpaper = [
-        'unpaper',
-        '-v',
-        '--dpi', str(int(pageinfo['xres'])),
-        '--mask-scan-size', '100',  # don't blank out narrow columns
-        '--no-border-align',  # don't align visible content to borders
-        '--no-mask-center',  # don't center visible content within page
-        '--no-grayfilter',  # don't remove light gray areas
-        '--no-blackfilter',  # don't remove solid black areas
-    ]
-
-    if not options.clean:
-        args_unpaper.extend([
-            '--no-noisefilter',
-            '--no-blurfilter'])
-    if not options.deskew:
-        args_unpaper.extend([
-            '--no-deskew'])
-
-    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
-    suffix = ''
-
-    im = Image.open(input_file)
-    suffix = SUFFIXES[im.mode]
-    with NamedTemporaryFile(suffix=suffix) as input_pnm, \
-            NamedTemporaryFile(suffix=suffix, mode="r+b") as output_pnm:
-        im.save(input_pnm, format='PPM')
-        im.close()
-
-        os.unlink(output_pnm.name)
-
-        args_unpaper.extend([input_pnm.name, output_pnm.name])
-        p_unpaper = Popen(
-            args_unpaper, close_fds=True,
-            universal_newlines=True, stdout=PIPE, stderr=PIPE
-        )
-        out, err = p_unpaper.communicate()
-        log.debug(out)
-        log.debug(err)
-
-        Image.open(output_pnm.name).save(output_file)
+    unpaper.deskew(input_file, output_file, dpi, log)
 
 
 @transform(
-    input=preprocess,
-    filter=suffix(".pp.png"),
+    input=preprocess_deskew,
+    filter=suffix(".pp-deskew.png"),
+    output=".pp-clean.png",
+    extras=[_log, _pdfinfo, _pdfinfo_lock])
+def preprocess_clean(
+        input_file,
+        output_file,
+        log,
+        pdfinfo,
+        pdfinfo_lock):
+
+    if not options.clean:
+        re_symlink(input_file, output_file, log)
+        return
+
+    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
+    dpi = int(pageinfo['xres'])
+
+    unpaper.clean(input_file, output_file, dpi, log)
+
+
+@transform(
+    input=preprocess_clean,
+    filter=suffix(".pp-clean.png"),
     output=".hocr",
     extras=[_log, _pdfinfo, _pdfinfo_lock])
 def ocr_tesseract(
@@ -509,8 +504,8 @@ def ocr_tesseract(
 
 
 @collate(
-    input=[rasterize_with_ghostscript, ocr_tesseract],
-    filter=regex(r".*/(\d{6})(?:\.page\.png|\.hocr)"),
+    input=[preprocess_deskew, preprocess_clean, ocr_tesseract],
+    filter=regex(r".*/(\d{6})(?:\.pp-deskew\.png|\.pp-clean\.png|\.hocr)"),
     output=os.path.join(options.temp_folder, r'\1.rendered.pdf'),
     extras=[_log, _pdfinfo, _pdfinfo_lock])
 def render_page(
@@ -520,7 +515,11 @@ def render_page(
         pdfinfo,
         pdfinfo_lock):
     hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
-    image = next(ii for ii in infiles if ii.endswith('.page.png'))
+    if options.clean_final:
+        image_suffix = '.pp-clean.png'
+    else:
+        image_suffix = '.pp-deskew.png'
+    image = next(ii for ii in infiles if ii.endswith(image_suffix))
 
     pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
     dpi = round(max(pageinfo['xres'], pageinfo['yres']))
diff --git a/src/unpaper.py b/src/unpaper.py
new file mode 100644
index 00000000..f6707dc5
--- /dev/null
+++ b/src/unpaper.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+# unpaper documentation:
+# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
+
+from subprocess import Popen, PIPE
+from tempfile import NamedTemporaryFile
+import sys
+import os
+
+
+def _version():
+    args_unpaper = [
+        'unpaper',
+        '--version'
+    ]
+    p_unpaper = Popen(args_unpaper, close_fds=True, universal_newlines=True,
+                      stdout=PIPE, stderr=PIPE)
+    version, _ = p_unpaper.communicate(timeout=5)
+
+    return version.strip()
+
+
+try:
+    VERSION = _version()
+except FileNotFoundError:
+    print("Could not find 'unpaper' executable", file=sys.stderr)
+    raise
+
+try:
+    from PIL import Image
+except ImportError:
+    print("Could not find Python3 imaging library", file=sys.stderr)
+    raise
+
+
+def run(input_file, output_file, dpi, log, mode_args):
+    args_unpaper = [
+        'unpaper',
+        '-v',
+        '--dpi', str(dpi)
+    ] + mode_args
+
+    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
+    suffix = ''
+
+    im = Image.open(input_file)
+    suffix = SUFFIXES[im.mode]
+    with NamedTemporaryFile(suffix=suffix) as input_pnm, \
+            NamedTemporaryFile(suffix=suffix, mode="r+b") as output_pnm:
+        im.save(input_pnm, format='PPM')
+        im.close()
+
+        os.unlink(output_pnm.name)
+
+        args_unpaper.extend([input_pnm.name, output_pnm.name])
+        p_unpaper = Popen(
+            args_unpaper, close_fds=True,
+            universal_newlines=True, stdout=PIPE, stderr=PIPE
+        )
+        out, err = p_unpaper.communicate()
+        log.debug(out)
+        log.debug(err)
+
+        Image.open(output_pnm.name).save(output_file)
+
+
+def deskew(input_file, output_file, dpi, log):
+    run(input_file, output_file, dpi, log, [
+        '--mask-scan-size', '100',  # don't blank out narrow columns
+        '--no-border-align',  # don't align visible content to borders
+        '--no-mask-center',  # don't center visible content within page
+        '--no-grayfilter',  # don't remove light gray areas
+        '--no-blackfilter',  # don't remove solid black areas
+        '--no-noisefilter',  # don't remove salt and pepper noise
+        '--no-blurfilter'  # don't remove blurry objects/debris
+    ])
+
+
+def clean(input_file, output_file, dpi, log):
+    run(input_file, output_file, dpi, log, [
+        '--mask-scan-size', '100',  # don't blank out narrow columns
+        '--no-border-align',  # don't align visible content to borders
+        '--no-mask-center',  # don't center visible content within page
+        '--no-grayfilter',  # don't remove light gray areas
+        '--no-blackfilter',  # don't remove solid black areas
+        '--no-deskew',  # don't deskew
+    ])