mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-02 10:50:29 +00:00
88 lines
2.6 KiB
Python
88 lines
2.6 KiB
Python
#!/usr/bin/env python3
|
|
# unpaper documentation:
|
|
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
|
|
|
from subprocess import Popen, PIPE
|
|
from tempfile import NamedTemporaryFile
|
|
import sys
|
|
import os
|
|
|
|
|
|
def _version():
|
|
args_unpaper = [
|
|
'unpaper',
|
|
'--version'
|
|
]
|
|
p_unpaper = Popen(args_unpaper, close_fds=True, universal_newlines=True,
|
|
stdout=PIPE, stderr=PIPE)
|
|
version, _ = p_unpaper.communicate(timeout=5)
|
|
|
|
return version.strip()
|
|
|
|
|
|
# Probe for the unpaper executable at import time so that a missing install
# is reported immediately; the original exception is still propagated.
try:
    VERSION = _version()
except FileNotFoundError:
    print("Could not find 'unpaper' executable", file=sys.stderr)
    raise

# PIL is required for image conversion; report its absence on stderr and
# let the ImportError propagate.
try:
    from PIL import Image
except ImportError:
    print("Could not find Python3 imaging library", file=sys.stderr)
    raise
|
|
|
|
|
|
def run(input_file, output_file, dpi, log, mode_args):
    """Run unpaper on one image and save the processed result.

    The input image is written to a temporary PNM file, unpaper is invoked
    on it with ``-v``, ``--dpi <dpi>`` and ``mode_args``, and the file that
    unpaper produces is re-saved to ``output_file``.

    Args:
        input_file: Image to process; passed to ``Image.open``.
        output_file: Destination; passed to ``Image.save``.
        dpi: Value for unpaper's ``--dpi`` option (converted with ``str``).
        log: Logger-like object; only ``log.debug`` is called, with
            unpaper's captured stdout and stderr.
        mode_args: List of extra unpaper command line arguments.

    Raises:
        KeyError: If the image mode is not one of ``'1'``, ``'L'``, ``'RGB'``.
    """
    args_unpaper = [
        'unpaper',
        '-v',
        '--dpi', str(dpi)
    ] + mode_args

    # Temp file suffix to use for each supported PIL image mode.
    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    im = Image.open(input_file)
    try:
        suffix = SUFFIXES[im.mode]
        with NamedTemporaryFile(suffix=suffix) as input_pnm, \
                NamedTemporaryFile(suffix=suffix, mode="r+b") as output_pnm:
            im.save(input_pnm, format='PPM')
            # unpaper reads the file by name, so make sure nothing is still
            # sitting in this process's write buffer.
            input_pnm.flush()

            # Remove the placeholder so unpaper can create the output file
            # itself under the reserved name.
            os.unlink(output_pnm.name)

            args_unpaper.extend([input_pnm.name, output_pnm.name])
            p_unpaper = Popen(
                args_unpaper, close_fds=True,
                universal_newlines=True, stdout=PIPE, stderr=PIPE
            )
            out, err = p_unpaper.communicate()
            log.debug(out)
            log.debug(err)

            # NOTE(review): unpaper's exit status is not checked; if it
            # produced no output file, the open below is what fails.
            result = Image.open(output_pnm.name)
            try:
                result.save(output_file)
            finally:
                result.close()
    finally:
        # Always release the input image, including on error paths.
        im.close()
|
|
|
|
|
|
def deskew(input_file, output_file, dpi, log):
    """Run unpaper on ``input_file`` with all of its filters switched off.

    Arguments are forwarded unchanged to ``run``. Deskewing is not disabled
    in the option list, so it is the processing step left enabled.
    """
    unpaper_options = [
        '--mask-scan-size', '100',  # don't blank out narrow columns
        '--no-border-align',        # don't align visible content to borders
        '--no-mask-center',         # don't center visible content within page
        '--no-grayfilter',          # don't remove light gray areas
        '--no-blackfilter',         # don't remove solid black areas
        '--no-noisefilter',         # don't remove salt and pepper noise
        '--no-blurfilter',          # don't remove blurry objects/debris
    ]
    run(input_file, output_file, dpi, log, unpaper_options)
|
|
|
|
|
|
def clean(input_file, output_file, dpi, log):
    """Run unpaper on ``input_file`` with deskewing switched off.

    Arguments are forwarded unchanged to ``run``. The noise and blur filters
    are not disabled in the option list; the gray and black filters are.
    """
    unpaper_options = [
        '--mask-scan-size', '100',  # don't blank out narrow columns
        '--no-border-align',        # don't align visible content to borders
        '--no-mask-center',         # don't center visible content within page
        '--no-grayfilter',          # don't remove light gray areas
        '--no-blackfilter',         # don't remove solid black areas
        '--no-deskew',              # don't deskew
    ]
    run(input_file, output_file, dpi, log, unpaper_options)
|