mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-01 10:19:58 +00:00
Modularize unpaper; get -d and -c working again
This commit is contained in:
parent
5adff94545
commit
9f90b5cb0a
101
src/ocrmypdf.py
101
src/ocrmypdf.py
@ -153,6 +153,20 @@ if not set(options.language).issubset(tesseract.LANGUAGES):
|
|||||||
sys.exit(EXIT_BAD_ARGS)
|
sys.exit(EXIT_BAD_ARGS)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------
|
||||||
|
# Arguments
|
||||||
|
|
||||||
|
|
||||||
|
# The unpaper wrapper is only imported when an option that needs it was
# requested; otherwise the name is bound to None.
if any((options.deskew, options.clean, options.clean_final)):
    try:
        from . import unpaper
    except (ImportError, FileNotFoundError):
        # Importing the wrapper launches the unpaper executable, so a missing
        # program (FileNotFoundError) is handled the same way as a failed
        # Python import: explain what to install and exit with a usage error.
        print("Install the 'unpaper' program to use the specified options",
              file=sys.stderr)
        sys.exit(EXIT_BAD_ARGS)
else:
    unpaper = None
|
||||||
|
|
||||||
# ----------
|
# ----------
|
||||||
# Logging
|
# Logging
|
||||||
|
|
||||||
@ -384,69 +398,50 @@ def rasterize_with_ghostscript(
|
|||||||
@transform(
|
@transform(
|
||||||
input=rasterize_with_ghostscript,
|
input=rasterize_with_ghostscript,
|
||||||
filter=suffix(".page.png"),
|
filter=suffix(".page.png"),
|
||||||
output=".pp.png",
|
output=".pp-deskew.png",
|
||||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||||
def preprocess(
|
def preprocess_deskew(
|
||||||
input_file,
|
input_file,
|
||||||
output_file,
|
output_file,
|
||||||
log,
|
log,
|
||||||
pdfinfo,
|
pdfinfo,
|
||||||
pdfinfo_lock):
|
pdfinfo_lock):
|
||||||
|
|
||||||
if not options.deskew and not options.clean:
|
if not options.deskew:
|
||||||
re_symlink(input_file, output_file, log)
|
re_symlink(input_file, output_file, log)
|
||||||
return
|
return
|
||||||
|
|
||||||
pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
|
pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
|
||||||
|
dpi = int(pageinfo['xres'])
|
||||||
|
|
||||||
# unpaper documentation:
|
unpaper.deskew(input_file, output_file, dpi, log)
|
||||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
|
||||||
args_unpaper = [
|
|
||||||
'unpaper',
|
|
||||||
'-v',
|
|
||||||
'--dpi', str(int(pageinfo['xres'])),
|
|
||||||
'--mask-scan-size', '100', # don't blank out narrow columns
|
|
||||||
'--no-border-align', # don't align visible content to borders
|
|
||||||
'--no-mask-center', # don't center visible content within page
|
|
||||||
'--no-grayfilter', # don't remove light gray areas
|
|
||||||
'--no-blackfilter', # don't remove solid black areas
|
|
||||||
]
|
|
||||||
|
|
||||||
if not options.clean:
|
|
||||||
args_unpaper.extend([
|
|
||||||
'--no-noisefilter',
|
|
||||||
'--no-blurfilter'])
|
|
||||||
if not options.deskew:
|
|
||||||
args_unpaper.extend([
|
|
||||||
'--no-deskew'])
|
|
||||||
|
|
||||||
SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
|
|
||||||
suffix = ''
|
|
||||||
|
|
||||||
im = Image.open(input_file)
|
|
||||||
suffix = SUFFIXES[im.mode]
|
|
||||||
with NamedTemporaryFile(suffix=suffix) as input_pnm, \
|
|
||||||
NamedTemporaryFile(suffix=suffix, mode="r+b") as output_pnm:
|
|
||||||
im.save(input_pnm, format='PPM')
|
|
||||||
im.close()
|
|
||||||
|
|
||||||
os.unlink(output_pnm.name)
|
|
||||||
|
|
||||||
args_unpaper.extend([input_pnm.name, output_pnm.name])
|
|
||||||
p_unpaper = Popen(
|
|
||||||
args_unpaper, close_fds=True,
|
|
||||||
universal_newlines=True, stdout=PIPE, stderr=PIPE
|
|
||||||
)
|
|
||||||
out, err = p_unpaper.communicate()
|
|
||||||
log.debug(out)
|
|
||||||
log.debug(err)
|
|
||||||
|
|
||||||
Image.open(output_pnm.name).save(output_file)
|
|
||||||
|
|
||||||
|
|
||||||
@transform(
|
@transform(
|
||||||
input=preprocess,
|
input=preprocess_deskew,
|
||||||
filter=suffix(".pp.png"),
|
filter=suffix(".pp-deskew.png"),
|
||||||
|
output=".pp-clean.png",
|
||||||
|
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||||
|
def preprocess_clean(
|
||||||
|
input_file,
|
||||||
|
output_file,
|
||||||
|
log,
|
||||||
|
pdfinfo,
|
||||||
|
pdfinfo_lock):
|
||||||
|
|
||||||
|
if not options.clean:
|
||||||
|
re_symlink(input_file, output_file, log)
|
||||||
|
return
|
||||||
|
|
||||||
|
pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
|
||||||
|
dpi = int(pageinfo['xres'])
|
||||||
|
|
||||||
|
unpaper.clean(input_file, output_file, dpi, log)
|
||||||
|
|
||||||
|
|
||||||
|
@transform(
|
||||||
|
input=preprocess_clean,
|
||||||
|
filter=suffix(".pp-clean.png"),
|
||||||
output=".hocr",
|
output=".hocr",
|
||||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||||
def ocr_tesseract(
|
def ocr_tesseract(
|
||||||
@ -509,8 +504,8 @@ def ocr_tesseract(
|
|||||||
|
|
||||||
|
|
||||||
@collate(
|
@collate(
|
||||||
input=[rasterize_with_ghostscript, ocr_tesseract],
|
input=[preprocess_deskew, preprocess_clean, ocr_tesseract],
|
||||||
filter=regex(r".*/(\d{6})(?:\.page\.png|\.hocr)"),
|
filter=regex(r".*/(\d{6})(?:\.pp-deskew\.png|\.pp-clean\.png|\.hocr)"),
|
||||||
output=os.path.join(options.temp_folder, r'\1.rendered.pdf'),
|
output=os.path.join(options.temp_folder, r'\1.rendered.pdf'),
|
||||||
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
||||||
def render_page(
|
def render_page(
|
||||||
@ -520,7 +515,11 @@ def render_page(
|
|||||||
pdfinfo,
|
pdfinfo,
|
||||||
pdfinfo_lock):
|
pdfinfo_lock):
|
||||||
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
||||||
image = next(ii for ii in infiles if ii.endswith('.page.png'))
|
if options.clean_final:
|
||||||
|
image_suffix = '.pp-clean.png'
|
||||||
|
else:
|
||||||
|
image_suffix = '.pp-deskew.png'
|
||||||
|
image = next(ii for ii in infiles if ii.endswith(image_suffix))
|
||||||
|
|
||||||
pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
|
pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
|
||||||
dpi = round(max(pageinfo['xres'], pageinfo['yres']))
|
dpi = round(max(pageinfo['xres'], pageinfo['yres']))
|
||||||
|
|||||||
87
src/unpaper.py
Normal file
87
src/unpaper.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# unpaper documentation:
|
||||||
|
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||||
|
|
||||||
|
from subprocess import Popen, PIPE
|
||||||
|
from tempfile import NamedTemporaryFile
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def _version():
    """Return the output of ``unpaper --version`` with whitespace stripped.

    Raises:
        FileNotFoundError: if the ``unpaper`` executable cannot be launched.
        subprocess.TimeoutExpired: if unpaper does not finish within
            5 seconds; the child process is killed before re-raising.
    """
    # Function-scope import: the module header only imports Popen and PIPE.
    from subprocess import TimeoutExpired

    args_unpaper = [
        'unpaper',
        '--version'
    ]
    p_unpaper = Popen(args_unpaper, close_fds=True, universal_newlines=True,
                      stdout=PIPE, stderr=PIPE)
    try:
        version, _ = p_unpaper.communicate(timeout=5)
    except TimeoutExpired:
        # communicate() does not kill the child when the timeout expires;
        # terminate it and drain the pipes so no process is left behind.
        p_unpaper.kill()
        p_unpaper.communicate()
        raise

    return version.strip()
|
||||||
|
|
||||||
|
|
||||||
|
# Probe for the unpaper executable once, at import time, and keep the
# version string it reports.
try:
    VERSION = _version()
except FileNotFoundError as e:
    print("Could not find 'unpaper' executable", file=sys.stderr)
    # Report a missing program as ImportError (chained to the original
    # error) so that code guarding the import of this module with
    # ``except ImportError`` also handles an absent executable.
    raise ImportError("Could not find 'unpaper' executable") from e

# The imaging library is needed to convert images to and from PNM.
try:
    from PIL import Image
except ImportError:
    print("Could not find Python3 imaging library", file=sys.stderr)
    raise
|
||||||
|
|
||||||
|
|
||||||
|
def run(input_file, output_file, dpi, log, mode_args):
    """Run unpaper on one image and save the processed result.

    The input image is converted to a PNM file, unpaper is invoked on it with
    ``-v --dpi <dpi>`` followed by ``mode_args``, and unpaper's output is
    re-saved to ``output_file`` (format chosen by the imaging library from
    the file name).

    Args:
        input_file: path of the image to process.
        output_file: path the processed image is written to.
        dpi: resolution passed to unpaper and stored in the output image.
            Callers are expected to pass a number.
        log: logger-like object; unpaper's stdout and stderr are sent to
            ``log.debug``.
        mode_args: list of additional unpaper command-line arguments.

    Raises:
        KeyError: if the image mode is not one of '1', 'L' or 'RGB'.
        subprocess.CalledProcessError: if unpaper exits with a non-zero
            status.
    """
    # Function-scope imports: the module header does not import these names.
    from subprocess import CalledProcessError
    from tempfile import TemporaryDirectory

    args_unpaper = [
        'unpaper',
        '-v',
        '--dpi', str(dpi)
    ] + mode_args

    # PNM file extension for each supported image mode.
    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    # A private scratch directory gives unpaper an output path that does not
    # exist yet and is always cleaned up, whether or not unpaper creates it.
    with TemporaryDirectory() as workdir:
        im = Image.open(input_file)
        try:
            suffix = SUFFIXES[im.mode]
            input_pnm = os.path.join(workdir, 'input' + suffix)
            # Saving to a path (not an open file object) guarantees the data
            # is fully written and closed before unpaper reads it.
            im.save(input_pnm, format='PPM')
        finally:
            im.close()

        output_pnm = os.path.join(workdir, 'output' + suffix)

        command = args_unpaper + [input_pnm, output_pnm]
        p_unpaper = Popen(
            command, close_fds=True,
            universal_newlines=True, stdout=PIPE, stderr=PIPE
        )
        out, err = p_unpaper.communicate()
        log.debug(out)
        log.debug(err)

        # Fail with the real cause instead of a confusing "file not found"
        # when trying to open output that unpaper never produced.
        if p_unpaper.returncode != 0:
            raise CalledProcessError(p_unpaper.returncode, command,
                                     output=out)

        result = Image.open(output_pnm)
        try:
            # PNM carries no resolution, so record it explicitly in the
            # output image.
            result.save(output_file, dpi=(dpi, dpi))
        finally:
            result.close()
|
||||||
|
|
||||||
|
|
||||||
|
def deskew(input_file, output_file, dpi, log):
    """Run unpaper on ``input_file`` with its filters disabled.

    Deskewing is the one step not switched off by the arguments below.
    All arguments are forwarded unchanged to ``run``.
    """
    deskew_only_args = [
        '--mask-scan-size', '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',   # don't center visible content within page
        '--no-grayfilter',    # don't remove light gray areas
        '--no-blackfilter',   # don't remove solid black areas
        '--no-noisefilter',   # don't remove salt and pepper noise
        '--no-blurfilter'     # don't remove blurry objects/debris
    ]
    run(input_file, output_file, dpi, log, deskew_only_args)
|
||||||
|
|
||||||
|
|
||||||
|
def clean(input_file, output_file, dpi, log):
    """Run unpaper on ``input_file`` with deskewing disabled.

    The noise and blur filters are not switched off by the arguments below.
    All arguments are forwarded unchanged to ``run``.
    """
    clean_only_args = [
        '--mask-scan-size', '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',   # don't center visible content within page
        '--no-grayfilter',    # don't remove light gray areas
        '--no-blackfilter',   # don't remove solid black areas
        '--no-deskew',        # don't deskew
    ]
    run(input_file, output_file, dpi, log, clean_only_args)
|
||||||
Loading…
x
Reference in New Issue
Block a user