mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-01 10:19:58 +00:00
Modularize unpaper; get -d and -c working again
This commit is contained in:
parent
5adff94545
commit
9f90b5cb0a
101
src/ocrmypdf.py
101
src/ocrmypdf.py
@ -153,6 +153,20 @@ if not set(options.language).issubset(tesseract.LANGUAGES):
|
||||
sys.exit(EXIT_BAD_ARGS)
|
||||
|
||||
|
||||
# ----------
|
||||
# Arguments
|
||||
|
||||
|
||||
# Only load the unpaper wrapper when an option that needs it was requested.
# The wrapper module probes the 'unpaper' executable while it is being
# imported and re-raises FileNotFoundError when the program is missing, and
# re-raises ImportError when the imaging library is missing.  Both must be
# caught here, otherwise a missing executable surfaces as a raw traceback
# instead of the message and exit code below.
if any((options.deskew, options.clean, options.clean_final)):
    try:
        from . import unpaper
    except (ImportError, FileNotFoundError):
        print("Install the 'unpaper' program to use the specified options",
              file=sys.stderr)
        sys.exit(EXIT_BAD_ARGS)
else:
    # Sentinel: no unpaper-based step is enabled for this run.
    unpaper = None
|
||||
|
||||
# ----------
|
||||
# Logging
|
||||
|
||||
@ -384,69 +398,50 @@ def rasterize_with_ghostscript(
|
||||
@transform(
|
||||
input=rasterize_with_ghostscript,
|
||||
filter=suffix(".page.png"),
|
||||
output=".pp.png",
|
||||
output=".pp-deskew.png",
|
||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||
def preprocess(
|
||||
def preprocess_deskew(
|
||||
input_file,
|
||||
output_file,
|
||||
log,
|
||||
pdfinfo,
|
||||
pdfinfo_lock):
|
||||
|
||||
if not options.deskew and not options.clean:
|
||||
if not options.deskew:
|
||||
re_symlink(input_file, output_file, log)
|
||||
return
|
||||
|
||||
pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
|
||||
dpi = int(pageinfo['xres'])
|
||||
|
||||
# unpaper documentation:
|
||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||
args_unpaper = [
|
||||
'unpaper',
|
||||
'-v',
|
||||
'--dpi', str(int(pageinfo['xres'])),
|
||||
'--mask-scan-size', '100', # don't blank out narrow columns
|
||||
'--no-border-align', # don't align visible content to borders
|
||||
'--no-mask-center', # don't center visible content within page
|
||||
'--no-grayfilter', # don't remove light gray areas
|
||||
'--no-blackfilter', # don't remove solid black areas
|
||||
]
|
||||
|
||||
if not options.clean:
|
||||
args_unpaper.extend([
|
||||
'--no-noisefilter',
|
||||
'--no-blurfilter'])
|
||||
if not options.deskew:
|
||||
args_unpaper.extend([
|
||||
'--no-deskew'])
|
||||
|
||||
SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
|
||||
suffix = ''
|
||||
|
||||
im = Image.open(input_file)
|
||||
suffix = SUFFIXES[im.mode]
|
||||
with NamedTemporaryFile(suffix=suffix) as input_pnm, \
|
||||
NamedTemporaryFile(suffix=suffix, mode="r+b") as output_pnm:
|
||||
im.save(input_pnm, format='PPM')
|
||||
im.close()
|
||||
|
||||
os.unlink(output_pnm.name)
|
||||
|
||||
args_unpaper.extend([input_pnm.name, output_pnm.name])
|
||||
p_unpaper = Popen(
|
||||
args_unpaper, close_fds=True,
|
||||
universal_newlines=True, stdout=PIPE, stderr=PIPE
|
||||
)
|
||||
out, err = p_unpaper.communicate()
|
||||
log.debug(out)
|
||||
log.debug(err)
|
||||
|
||||
Image.open(output_pnm.name).save(output_file)
|
||||
unpaper.deskew(input_file, output_file, dpi, log)
|
||||
|
||||
|
||||
@transform(
|
||||
input=preprocess,
|
||||
filter=suffix(".pp.png"),
|
||||
input=preprocess_deskew,
|
||||
filter=suffix(".pp-deskew.png"),
|
||||
output=".pp-clean.png",
|
||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||
def preprocess_clean(
|
||||
input_file,
|
||||
output_file,
|
||||
log,
|
||||
pdfinfo,
|
||||
pdfinfo_lock):
|
||||
|
||||
if not options.clean:
|
||||
re_symlink(input_file, output_file, log)
|
||||
return
|
||||
|
||||
pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
|
||||
dpi = int(pageinfo['xres'])
|
||||
|
||||
unpaper.clean(input_file, output_file, dpi, log)
|
||||
|
||||
|
||||
@transform(
|
||||
input=preprocess_clean,
|
||||
filter=suffix(".pp-clean.png"),
|
||||
output=".hocr",
|
||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||
def ocr_tesseract(
|
||||
@ -509,8 +504,8 @@ def ocr_tesseract(
|
||||
|
||||
|
||||
@collate(
|
||||
input=[rasterize_with_ghostscript, ocr_tesseract],
|
||||
filter=regex(r".*/(\d{6})(?:\.page\.png|\.hocr)"),
|
||||
input=[preprocess_deskew, preprocess_clean, ocr_tesseract],
|
||||
filter=regex(r".*/(\d{6})(?:\.pp-deskew\.png|\.pp-clean\.png|\.hocr)"),
|
||||
output=os.path.join(options.temp_folder, r'\1.rendered.pdf'),
|
||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||
def render_page(
|
||||
@ -520,7 +515,11 @@ def render_page(
|
||||
pdfinfo,
|
||||
pdfinfo_lock):
|
||||
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
||||
image = next(ii for ii in infiles if ii.endswith('.page.png'))
|
||||
if options.clean_final:
|
||||
image_suffix = '.pp-clean.png'
|
||||
else:
|
||||
image_suffix = '.pp-deskew.png'
|
||||
image = next(ii for ii in infiles if ii.endswith(image_suffix))
|
||||
|
||||
pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
|
||||
dpi = round(max(pageinfo['xres'], pageinfo['yres']))
|
||||
|
||||
87
src/unpaper.py
Normal file
87
src/unpaper.py
Normal file
@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
# unpaper documentation:
|
||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||
|
||||
from subprocess import Popen, PIPE
|
||||
from tempfile import NamedTemporaryFile
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def _version():
    """Return the output of ``unpaper --version`` with whitespace stripped.

    Raises:
        FileNotFoundError: if the ``unpaper`` executable cannot be started.
        subprocess.TimeoutExpired: if unpaper does not answer within
            five seconds.  The child process is killed and reaped before
            the exception is re-raised.
    """
    from subprocess import TimeoutExpired

    args_unpaper = [
        'unpaper',
        '--version'
    ]
    p_unpaper = Popen(args_unpaper, close_fds=True, universal_newlines=True,
                      stdout=PIPE, stderr=PIPE)
    try:
        version, _ = p_unpaper.communicate(timeout=5)
    except TimeoutExpired:
        # communicate() does not terminate the child on timeout; kill it and
        # drain the pipes so no process or file descriptor is left behind.
        p_unpaper.kill()
        p_unpaper.communicate()
        raise

    return version.strip()
|
||||
|
||||
|
||||
# Probe the external program once, at import time, and remember its version.
# If the executable is missing, report it on stderr and let the
# FileNotFoundError propagate to whoever imported this module.
try:
    VERSION = _version()
except FileNotFoundError:
    sys.stderr.write("Could not find 'unpaper' executable" + "\n")
    raise

# The imaging library is required for the PNG <-> PNM conversions below.
# If it is missing, report it on stderr and propagate the ImportError.
try:
    from PIL import Image
except ImportError:
    sys.stderr.write("Could not find Python3 imaging library" + "\n")
    raise
|
||||
|
||||
|
||||
def run(input_file, output_file, dpi, log, mode_args):
    """Run unpaper on one image and save the result to ``output_file``.

    The input image is converted to a PNM file in a private temporary
    directory, unpaper is run on it, and the PNM it produces is re-saved
    as ``output_file``.

    Args:
        input_file: path of the image to process.
        output_file: path the processed image is saved to.
        dpi: resolution passed to unpaper via ``--dpi``.
        log: logger-like object; unpaper's stdout and stderr are sent to
            ``log.debug``.
        mode_args: sequence of extra unpaper command line arguments.

    Raises:
        KeyError: if the input image mode is not one of '1', 'L' or 'RGB'.
        subprocess.CalledProcessError: if unpaper exits with a non-zero
            status; ``output`` carries its stdout followed by its stderr.
    """
    from subprocess import CalledProcessError
    from tempfile import TemporaryDirectory

    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    args_unpaper = [
        'unpaper',
        '-v',
        '--dpi', str(dpi)
    ] + list(mode_args)

    # A private directory gives unpaper an output path that does not exist
    # yet, without unlinking a temporary file and reusing its name.
    with TemporaryDirectory() as tmpdir:
        im = Image.open(input_file)
        try:
            suffix = SUFFIXES[im.mode]
            input_pnm = os.path.join(tmpdir, 'input' + suffix)
            output_pnm = os.path.join(tmpdir, 'output' + suffix)
            im.save(input_pnm, format='PPM')
        finally:
            # Release the source image even when its mode is unsupported.
            im.close()

        args_unpaper.extend([input_pnm, output_pnm])
        p_unpaper = Popen(
            args_unpaper, close_fds=True,
            universal_newlines=True, stdout=PIPE, stderr=PIPE
        )
        out, err = p_unpaper.communicate()
        log.debug(out)
        log.debug(err)

        # Report a failed run explicitly instead of failing later on a
        # missing output file.
        if p_unpaper.returncode != 0:
            raise CalledProcessError(
                p_unpaper.returncode, args_unpaper, output=out + err)

        result = Image.open(output_pnm)
        try:
            result.save(output_file)
        finally:
            result.close()
|
||||
|
||||
|
||||
def deskew(input_file, output_file, dpi, log):
    """Run unpaper on ``input_file`` with its filters and alignment off.

    Deskewing is the one step not switched off by the arguments below.
    The result is written to ``output_file``; ``dpi`` and ``log`` are
    passed through to ``run``.
    """
    deskew_args = [
        '--mask-scan-size', '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',  # don't center visible content within page
        '--no-grayfilter',  # don't remove light gray areas
        '--no-blackfilter',  # don't remove solid black areas
        '--no-noisefilter',  # don't remove salt and pepper noise
        '--no-blurfilter',  # don't remove blurry objects/debris
    ]
    run(input_file, output_file, dpi, log, deskew_args)
|
||||
|
||||
|
||||
def clean(input_file, output_file, dpi, log):
    """Run unpaper on ``input_file`` with deskewing switched off.

    The noise and blur filters are not disabled here, while the gray and
    black filters and the alignment steps are.  The result is written to
    ``output_file``; ``dpi`` and ``log`` are passed through to ``run``.
    """
    clean_args = [
        '--mask-scan-size', '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',  # don't center visible content within page
        '--no-grayfilter',  # don't remove light gray areas
        '--no-blackfilter',  # don't remove solid black areas
        '--no-deskew',  # don't deskew
    ]
    run(input_file, output_file, dpi, log, clean_args)
|
||||
Loading…
x
Reference in New Issue
Block a user