OCRmyPDF/ocrmypdf/unpaper.py

88 lines
2.6 KiB
Python

#!/usr/bin/env python3
# unpaper documentation:
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
import sys
import os
def _version():
    """Return the version string reported by the ``unpaper`` executable.

    Runs ``unpaper --version`` and returns whatever it prints on standard
    output, with surrounding whitespace stripped. Standard error is captured
    and discarded.

    Raises:
        FileNotFoundError: if no ``unpaper`` executable can be started.
        subprocess.TimeoutExpired: if the process has not finished within
            five seconds. The child is killed and reaped before the
            exception propagates, so no stray process or pipe is left over.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list being extended.
    from subprocess import TimeoutExpired

    args_unpaper = ['unpaper', '--version']

    with Popen(args_unpaper, close_fds=True, universal_newlines=True,
               stdout=PIPE, stderr=PIPE) as p_unpaper:
        try:
            version, _ = p_unpaper.communicate(timeout=5)
        except TimeoutExpired:
            # communicate() does not terminate the child when the timeout
            # expires; kill it and drain the pipes before re-raising.
            p_unpaper.kill()
            p_unpaper.communicate()
            raise
    return version.strip()
# Probe for the unpaper executable once, at import time, and keep the
# version string it reports in VERSION. If the executable cannot be found,
# a short hint is written to stderr and the original FileNotFoundError is
# re-raised, so importing this module fails.
try:
    VERSION = _version()
except FileNotFoundError:
    print("Could not find 'unpaper' executable", file=sys.stderr)
    raise
# The PIL Image module is needed by run() below. When it cannot be
# imported, a short hint is written to stderr and the ImportError is
# re-raised, so importing this module fails.
try:
    from PIL import Image
except ImportError:
    print("Could not find Python3 imaging library", file=sys.stderr)
    raise
def run(input_file, output_file, dpi, log, mode_args):
    """Run unpaper on one image and save the processed result.

    The input image is written to a temporary PNM file, ``unpaper`` is
    invoked on it with ``-v``, ``--dpi <dpi>`` and the extra options in
    ``mode_args``, and the file unpaper produces is re-saved to
    ``output_file``.

    Args:
        input_file: anything ``Image.open`` accepts for the source image.
        output_file: anything ``Image.save`` accepts for the destination.
        dpi: resolution passed to unpaper; converted with ``str()``.
        log: object with a ``debug`` method; receives unpaper's captured
            stdout and stderr.
        mode_args: sequence of additional unpaper command-line arguments.

    Images whose mode is not ``1``, ``L`` or ``RGB`` are converted to RGB
    first instead of failing with ``KeyError``.

    The exit status of unpaper is only logged, not turned into a new
    exception type: if unpaper produced no output file, opening it fails
    just as it did before.
    """
    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    # list() lets callers pass any sequence (e.g. a tuple) and guarantees
    # the caller's object is never modified by the extend() below.
    args_unpaper = ['unpaper', '-v', '--dpi', str(dpi)] + list(mode_args)

    im = Image.open(input_file)
    try:
        if im.mode not in SUFFIXES:
            # Palette, alpha, CMYK, ... have no entry in SUFFIXES; fall
            # back to RGB, which does.
            log.debug("Converting image from mode {} to RGB for unpaper"
                      .format(im.mode))
            converted = im.convert('RGB')
            im.close()
            im = converted
        suffix = SUFFIXES[im.mode]

        with NamedTemporaryFile(suffix=suffix) as input_pnm, \
                NamedTemporaryFile(suffix=suffix, mode="r+b") as output_pnm:
            im.save(input_pnm, format='PPM')
            # unpaper opens the file by name in another process; make sure
            # nothing is still sitting in this process's write buffer.
            input_pnm.flush()
            # Release the source image before the external tool runs.
            im.close()

            # The second temporary file only reserves a unique name. It is
            # removed so that unpaper creates the file itself.
            # NOTE(review): presumably because unpaper refuses to write to
            # an existing file unless told to overwrite; confirm.
            os.unlink(output_pnm.name)

            args_unpaper.extend([input_pnm.name, output_pnm.name])
            p_unpaper = Popen(
                args_unpaper, close_fds=True,
                universal_newlines=True, stdout=PIPE, stderr=PIPE
            )
            out, err = p_unpaper.communicate()
            log.debug(out)
            log.debug(err)
            if p_unpaper.returncode != 0:
                log.debug("unpaper exited with status {}"
                          .format(p_unpaper.returncode))

            result = Image.open(output_pnm.name)
            try:
                result.save(output_file)
            finally:
                # Close explicitly so the handle on the temporary file is
                # released before the file is cleaned up.
                result.close()
    finally:
        # Safety net for failures before the early close above; closing
        # an already closed image is harmless.
        im.close()
def deskew(input_file, output_file, dpi, log):
    """Run unpaper on ``input_file`` with its cleanup filters switched off.

    The option list does not contain ``--no-deskew``, so presumably
    unpaper's deskewing stays active while the border alignment, mask
    centering and the gray, black, noise and blur filters are disabled.
    All arguments are forwarded unchanged to ``run``.
    """
    unpaper_options = [
        '--mask-scan-size', '100',  # keep narrow columns from being blanked
        '--no-border-align',        # leave content where it is relative to borders
        '--no-mask-center',         # leave content uncentered on the page
        '--no-grayfilter',          # keep light gray areas
        '--no-blackfilter',         # keep solid black areas
        '--no-noisefilter',         # keep salt-and-pepper noise
        '--no-blurfilter',          # keep blurry objects/debris
    ]
    run(input_file, output_file, dpi, log, unpaper_options)
def clean(input_file, output_file, dpi, log):
    """Run unpaper on ``input_file`` with deskewing switched off.

    Unlike ``deskew``, the option list passes ``--no-deskew`` and leaves
    out ``--no-noisefilter`` and ``--no-blurfilter``, so presumably the
    noise and blur filters are the ones that remain active. All arguments
    are forwarded unchanged to ``run``.
    """
    unpaper_options = [
        '--mask-scan-size', '100',  # keep narrow columns from being blanked
        '--no-border-align',        # leave content where it is relative to borders
        '--no-mask-center',         # leave content uncentered on the page
        '--no-grayfilter',          # keep light gray areas
        '--no-blackfilter',         # keep solid black areas
        '--no-deskew',              # leave page rotation untouched
    ]
    run(input_file, output_file, dpi, log, unpaper_options)