OCRmyPDF/ocrmypdf/pipeline.py
James R. Barlow bad67c6dc5 Rename ‘tesstop’ to ‘tess4’
There’s no reason text-only PDF shouldn’t become the default for
tesseract 4.
2017-01-26 12:28:51 -08:00

1077 lines
35 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# © 2016 James R. Barlow: github.com/jbarlow83
from contextlib import suppress
from tempfile import mkdtemp
from functools import partial
import sys
import os
import re
import shutil
import warnings
import multiprocessing
import atexit
import textwrap
import img2pdf
import logging
import argparse
import PyPDF2 as pypdf
from PIL import Image
from ruffus import formatter, regex, Pipeline, suffix
from .hocrtransform import HocrTransform
from .pageinfo import pdf_get_all_pageinfo
from .pdfa import generate_pdfa_def, file_claims_pdfa
from .helpers import re_symlink, is_iterable_notstr, page_number
from .exec import ghostscript, tesseract, qpdf
from .exceptions import *
from . import leptonica
from . import PROGRAM_NAME, VERSION
VECTOR_PAGE_DPI = 400
# -------------
# Pipeline state manager
class JobContext:
    """Holds our context for a particular run of the pipeline

    A multiprocessing manager effectively creates a separate process
    that keeps the master job context object.  Other threads access
    job context via multiprocessing proxy objects.

    While this would naturally lend itself to @property's it seems to make
    a little more sense to use functions to make it explicit that the
    invocation requires marshalling data across a process boundary.

    """

    def __init__(self):
        # Define every attribute up front so that a getter called before the
        # matching setter returns an explicit "unset" value instead of
        # raising AttributeError.
        self.pdfinfo = []
        self.options = None
        self.work_folder = None

    def get_pdfinfo(self):
        "What we know about the input PDF"
        return self.pdfinfo

    def set_pdfinfo(self, pdfinfo):
        "Replace what we know about the input PDF"
        self.pdfinfo = pdfinfo

    def get_options(self):
        "The options object stored by set_options(), or None if unset"
        return self.options

    def set_options(self, options):
        "Store the options object for this run"
        self.options = options

    def get_work_folder(self):
        "The work folder stored by set_work_folder(), or None if unset"
        return self.work_folder

    def set_work_folder(self, work_folder):
        "Store the work folder for this run"
        self.work_folder = work_folder
from multiprocessing.managers import BaseManager
class JobContextManager(BaseManager):
    """BaseManager subclass for this module; it adds no behavior of its own."""
def cleanup_working_files(work_folder, options):
    """Delete the work folder, or report where it is when asked to keep it.

    When ``options.keep_temporary_files`` is set, the folder is left in place
    and its location is printed on stderr.  Otherwise the folder is removed
    recursively; a folder that no longer exists is not an error.
    """
    if not options.keep_temporary_files:
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
        return

    notice = "Temporary working files saved at:\n{0}".format(work_folder)
    print(notice, file=sys.stderr)
#
# The Pipeline
#
def triage_image_file(input_file, output_file, log, options):
    """Validate a non-PDF input as an image and convert it to a PDF.

    The file must open with Pillow, must carry a credible resolution (unless
    one is supplied through ``options.image_dpi``) and must not be a CMYK
    image without an embedded ICC profile.  The PDF produced by img2pdf is
    written to *output_file*.

    Raises:
        UnsupportedImageFormatError: the file cannot be opened or converted,
            or it is CMYK without an ICC profile.
        DpiError: the resolution is missing or not credible and
            ``options.image_dpi`` is not set.
    """
    try:
        log.info("Input file is not a PDF, checking if it is an image...")
        im = Image.open(input_file)
    except EnvironmentError as e:
        msg = str(e)

        # Recover the original filename
        realpath = ''
        if os.path.islink(input_file):
            realpath = os.path.realpath(input_file)
        elif os.path.isfile(input_file):
            realpath = '<stdin>'
        msg = msg.replace(input_file, realpath)
        log.error(msg)
        raise UnsupportedImageFormatError() from e

    # The image is only inspected here, so release it on every exit path,
    # including the validation failures raised below.
    try:
        log.info("Input file is an image")

        if 'dpi' in im.info:
            # NOTE: this is a tuple comparison, which Python evaluates
            # lexicographically (x resolution first).
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                raise DpiError()
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata.  Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi.")
            raise DpiError()

        # Pillow stores an embedded profile under the 'icc_profile' key
        if 'icc_profile' not in im.info:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                log.error('Input CMYK image has no ICC profile, not usable')
                raise UnsupportedImageFormatError()
    finally:
        im.close()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file,
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e
def triage(
        input_file,
        output_file,
        log,
        context):
    """Route the input: link PDFs through, hand anything else to image triage.

    A file whose first four bytes are ``%PDF`` is symlinked to *output_file*.
    Any other readable file is passed to triage_image_file().

    Raises:
        InputFileError: an EnvironmentError occurred while reading or linking
            the input.
    """
    pdf_magic = b'%PDF'
    try:
        with open(input_file, 'rb') as stream:
            looks_like_pdf = stream.read(len(pdf_magic)) == pdf_magic
            if looks_like_pdf:
                re_symlink(input_file, output_file, log)
                return
    except EnvironmentError as err:
        log.error(err)
        raise InputFileError() from err

    triage_image_file(input_file, output_file, log, context.get_options())
def repair_pdf(
        input_file,
        output_file,
        log,
        context):
    """Repair the PDF with qpdf, then store its page info in the context.

    The page information is gathered from the repaired *output_file*, stored
    with ``context.set_pdfinfo()`` and logged at debug level.
    """
    qpdf.repair(input_file, output_file, log)

    page_infos = pdf_get_all_pageinfo(output_file)
    context.set_pdfinfo(page_infos)
    log.debug(page_infos)
def get_pageinfo(input_file, context):
    """Return the pdfinfo entry for the page a work file belongs to.

    The first six characters of the file's basename are parsed as the
    1-based page number; the entry at that number minus one is returned.
    """
    basename = os.path.basename(input_file)
    index = int(basename[:6]) - 1
    return context.get_pdfinfo()[index]
def get_page_dpi(pageinfo, options):
    """Get the DPI when nonsquare DPI is tolerable

    Returns an ``(xres, yres)`` tuple of floats.  Each axis falls back to
    VECTOR_PAGE_DPI when the page info has no value for it, and is raised to
    ``options.oversample`` when that is larger.
    """
    floor = options.oversample or 0
    return tuple(
        float(max(pageinfo.get(axis, VECTOR_PAGE_DPI), floor))
        for axis in ('xres', 'yres'))
def get_page_square_dpi(pageinfo, options):
    """Get the DPI when we require xres == yres

    Returns, as a float, the largest of the page's x resolution, its y
    resolution (each defaulting to VECTOR_PAGE_DPI) and
    ``options.oversample``.
    """
    candidates = (
        pageinfo.get('xres', VECTOR_PAGE_DPI),
        pageinfo.get('yres', VECTOR_PAGE_DPI),
        options.oversample or 0,
    )
    return float(max(candidates))
def is_ocr_required(pageinfo, log, options):
    """Decide whether a page should go through OCR, logging the reason.

    Pages without images are skipped unless ``options.force_ocr`` is set.
    Pages that already have text abort the run unless ``force_ocr`` (OCR
    anyway) or ``skip_text`` (skip the page) is set.  A page still selected
    for OCR is skipped when its pixel count exceeds ``options.skip_big``
    megapixels.

    Raises:
        PriorOcrFoundError: the page has text and neither ``force_ocr`` nor
            ``skip_text`` was requested.
    """
    page = pageinfo['pageno'] + 1
    needs_ocr = True

    if not pageinfo['images']:
        if not options.force_ocr:
            log.info(
                "{0:4d}: page has no images - "
                "skipping all processing on this page".format(page))
            needs_ocr = False
        elif options.oversample:
            # The user really wants to reprocess this file
            log.info(
                "{0:4d}: page has no images - "
                "rasterizing at {1} DPI because "
                "--force-ocr --oversample was specified".format(
                    page, options.oversample))
        else:
            # Warn the user they might not want to do this
            log.warning(
                "{0:4d}: page has no images - "
                "all vector content will be "
                "rasterized at {1} DPI, losing some resolution and likely "
                "increasing file size. Use --oversample to adjust the "
                "DPI.".format(page, VECTOR_PAGE_DPI))
    elif pageinfo['has_text']:
        template = "{0:4d}: page already has text! {1}"
        if options.force_ocr:
            log.info(template.format(
                page, "rasterizing text and running OCR anyway"))
        elif options.skip_text:
            log.info(template.format(
                page, "skipping all processing on this page"))
            needs_ocr = False
        else:
            log.error(template.format(
                page, "aborting (use --force-ocr to force OCR)"))
            raise PriorOcrFoundError()

    if needs_ocr and options.skip_big:
        megapixel = 1000000
        pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
        if pixel_count > (options.skip_big * megapixel):
            needs_ocr = False
            log.warning(
                "{0:4d}: page too big, skipping OCR "
                "({1:.1f} MPixels > {2:.1f} MPixels --skip-big)".format(
                    page, pixel_count / megapixel, options.skip_big))

    return needs_ocr
def split_pages(
        input_files,
        output_files,
        log,
        context):
    """Split the repaired PDF into pages and tag each as OCR or skip.

    Existing *output_files* are removed, qpdf splits the input into the work
    folder, and every resulting ``*.page.pdf`` gets a symlink named
    ``NNNNNN.ocr.page.pdf`` or ``NNNNNN.skip.page.pdf`` according to
    is_ocr_required().

    Raises:
        InputFileError: no input file was supplied.
    """
    options = context.get_options()
    work_folder = context.get_work_folder()

    if is_iterable_notstr(input_files):
        # Take the first entry, tolerating an empty collection so that the
        # "no input" check below reports it instead of an IndexError here.
        input_file = next(iter(input_files), None)
    else:
        input_file = input_files

    for oo in output_files:
        with suppress(FileNotFoundError):
            os.unlink(oo)

    # If no files were repaired the input will be empty
    if not input_file:
        log.error("{0}: file not found or invalid argument".format(
            options.input_file))
        raise InputFileError()

    pdfinfo = context.get_pdfinfo()
    npages = len(pdfinfo)
    qpdf.split_pages(input_file, work_folder, npages)

    from glob import glob
    # Sort so pages are triaged, and their messages logged, in page order;
    # glob() itself returns names in arbitrary order.
    for filename in sorted(glob(os.path.join(work_folder, '*.page.pdf'))):
        pageinfo = get_pageinfo(filename, context)

        alt_suffix = \
            '.ocr.page.pdf' if is_ocr_required(pageinfo, log, options) \
            else '.skip.page.pdf'
        re_symlink(
            filename,
            os.path.join(
                work_folder,
                os.path.basename(filename)[0:6] + alt_suffix),
            log)
def rasterize_preview(
        input_file,
        output_file,
        log,
        context):
    """Rasterize a page to a 200 DPI image with Ghostscript's jpeggray device."""
    preview_dpi = 200
    ghostscript.rasterize_pdf(
        input_file=input_file,
        output_file=output_file,
        xres=preview_dpi,
        yres=preview_dpi,
        raster_device='jpeggray',
        log=log)
def orient_page(
        infiles,
        output_file,
        log,
        context):
    """Detect page orientation and, if confident, write a corrected page.

    When ``options.rotate_pages`` is off, or no correction is applied, the
    page PDF is symlinked to *output_file*.  Otherwise the page is rotated
    counterclockwise by the detected angle, written to *output_file*, and
    the angle is recorded under ``'rotated'`` in the page's pdfinfo entry.
    """
    options = context.get_options()
    page_pdf = next(ii for ii in infiles if ii.endswith('.page.pdf'))

    if not options.rotate_pages:
        re_symlink(page_pdf, output_file, log)
        return
    preview = next(ii for ii in infiles if ii.endswith('.preview.jpg'))

    orient_conf = tesseract.get_orientation(
        preview,
        language=options.language,
        engine_mode=options.tesseract_oem,
        timeout=options.tesseract_timeout,
        log=log)

    # Glyph shown in the log for each detected clockwise angle.  Written as
    # escapes so the table cannot silently degrade to empty strings if the
    # file passes through a tool that strips non-ASCII characters.
    direction = {
        0: '\u21e7',    # upwards arrow
        90: '\u21e8',   # rightwards arrow
        180: '\u21e9',  # downwards arrow
        270: '\u21e6',  # leftwards arrow
    }

    apply_correction = False
    description = ''
    if orient_conf.confidence >= options.rotate_pages_threshold:
        if orient_conf.angle != 0:
            apply_correction = True
            description = ' - will rotate'
        else:
            description = ' - rotation appears correct'
    else:
        if orient_conf.angle != 0:
            description = ' - confidence too low to rotate'
        else:
            description = ' - no change'

    log.info(
        '{0:4d}: page is facing {1}, confidence {2:.2f}{3}'.format(
            page_number(preview),
            direction.get(orient_conf.angle, '?'),
            orient_conf.confidence,
            description)
    )

    if not apply_correction:
        re_symlink(page_pdf, output_file, log)
    else:
        writer = pypdf.PdfFileWriter()
        reader = pypdf.PdfFileReader(page_pdf)
        page = reader.pages[0]

        # angle is a clockwise angle, so rotating ccw will correct the error
        rotated_page = page.rotateCounterClockwise(orient_conf.angle)
        writer.addPage(rotated_page)
        with open(output_file, 'wb') as out:
            writer.write(out)

        # Record the applied angle in the shared page info
        pageno = int(os.path.basename(page_pdf)[0:6]) - 1
        pdfinfo = context.get_pdfinfo()
        pdfinfo[pageno]['rotated'] = orient_conf.angle
        context.set_pdfinfo(pdfinfo)
def rasterize_with_ghostscript(
        input_file,
        output_file,
        log,
        context):
    """Rasterize a page with the cheapest Ghostscript PNG device that fits.

    The device defaults to 24-bit ``png16m``.  When every image on the page
    has a single component, it is narrowed to ``pngmono``, ``png256`` or
    ``pnggray`` according to the images' bit depth and color type.
    """
    options = context.get_options()
    pageinfo = get_pageinfo(input_file, context)

    def every(predicate):
        # True when the predicate holds for each image on the page
        return all(predicate(img) for img in pageinfo['images'])

    device = 'png16m'  # 24-bit
    if every(lambda img: img['comp'] == 1):
        if every(lambda img: img['bpc'] == 1):
            device = 'pngmono'
        elif every(lambda img: img['bpc'] > 1 and img['color'] == 'index'):
            device = 'png256'
        elif every(lambda img: img['bpc'] > 1 and img['color'] == 'gray'):
            device = 'pnggray'

    log.debug("Rasterize {0} with {1}".format(
        os.path.basename(input_file), device))

    # Produce the page image with square resolution or else deskew and OCR
    # will not work properly
    square_dpi = get_page_square_dpi(pageinfo, options)
    ghostscript.rasterize_pdf(
        input_file, output_file, xres=square_dpi, yres=square_dpi,
        raster_device=device, log=log)
def preprocess_remove_background(
        input_file,
        output_file,
        log,
        context):
    """Remove the page background with leptonica when requested.

    The input is symlinked through unchanged when
    ``options.remove_background`` is off, or when no image on the page has
    more than one bit per component.
    """
    options = context.get_options()
    if not options.remove_background:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, context)

    if any(image['bpc'] > 1 for image in pageinfo['images']):
        leptonica.remove_background(input_file, output_file)
    else:
        # pageinfo['pageno'] is 0-based; report the 1-based page number like
        # the other log messages in this module do.
        log.info("{0:4d}: background removal skipped on mono page".format(
            pageinfo['pageno'] + 1))
        re_symlink(input_file, output_file, log)
def preprocess_deskew(
        input_file,
        output_file,
        log,
        context):
    """Deskew the page image with leptonica, or link it through unchanged.

    Deskewing runs only when ``options.deskew`` is set and uses the page's
    square DPI.
    """
    options = context.get_options()
    if options.deskew:
        pageinfo = get_pageinfo(input_file, context)
        square_dpi = get_page_square_dpi(pageinfo, options)
        leptonica.deskew(input_file, output_file, square_dpi)
    else:
        re_symlink(input_file, output_file, log)
def preprocess_clean(
        input_file,
        output_file,
        log,
        context):
    """Clean the page image with unpaper, or link it through unchanged.

    Cleaning runs only when ``options.clean`` is set and uses the page's
    square DPI.  The unpaper wrapper is imported only when it is needed.
    """
    options = context.get_options()
    if options.clean:
        from .exec import unpaper
        pageinfo = get_pageinfo(input_file, context)
        square_dpi = get_page_square_dpi(pageinfo, options)
        unpaper.clean(input_file, output_file, square_dpi, log)
    else:
        re_symlink(input_file, output_file, log)
def select_ocr_image(
        infiles,
        output_file,
        log,
        contenxt):
    """Select the image we send for OCR by linking it to *output_file*.

    It may not be the same as the display image, depending on preprocessing.
    The last parameter is unused; its spelling is kept as-is because it is
    part of the signature.
    """
    # For the moment this is always the .pp-clean.png image
    ocr_source = infiles[0]
    re_symlink(ocr_source, output_file, log)
def ocr_tesseract_hocr(
        input_file,
        output_file,
        log,
        context):
    """Run Tesseract on the page image and write hOCR to *output_file*.

    Language, engine mode, config, timeout and page segmentation mode are
    taken from the job options.
    """
    options = context.get_options()
    tesseract_settings = dict(
        language=options.language,
        engine_mode=options.tesseract_oem,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
        pagesegmode=options.tesseract_pagesegmode,
    )
    tesseract.generate_hocr(
        input_file=input_file,
        output_hocr=output_file,
        log=log,
        **tesseract_settings)
def select_visible_page_image(
        infiles,
        output_file,
        log,
        context):
    """Selects a whole page image that we can show the user (if necessary)

    The candidate is chosen by suffix from the preprocessing options.  If
    every original image on the page was JPEG-encoded, the candidate is
    re-saved as JPEG to *output_file*; otherwise it is symlinked.
    """
    options = context.get_options()
    if options.clean_final:
        image_suffix = '.pp-clean.png'
    elif options.deskew:
        image_suffix = '.pp-deskew.png'
    elif options.remove_background:
        image_suffix = '.pp-background.png'
    else:
        image_suffix = '.page.png'
    image = next(ii for ii in infiles if ii.endswith(image_suffix))

    pageinfo = get_pageinfo(image, context)
    if all(orig_image['enc'] == 'jpeg' for orig_image in pageinfo['images']):
        # If all images were JPEGs originally, produce a JPEG as output
        im = Image.open(image)
        try:
            # At this point the image should be a .png, but deskew, unpaper
            # might have removed the DPI information.  In this case, fall
            # back to square DPI used to rasterize.  When the preview image
            # was rasterized, it was also converted to square resolution,
            # which is what we want to give tesseract, so keep it square.
            fallback_dpi = get_page_square_dpi(pageinfo, options)
            dpi = im.info.get('dpi', (fallback_dpi, fallback_dpi))

            # Pillow requires integer DPI
            dpi = round(dpi[0]), round(dpi[1])
            im.save(output_file, format='JPEG', dpi=dpi)
        finally:
            # Release the image file handle even if saving fails
            im.close()
    else:
        re_symlink(image, output_file, log)
def select_image_layer(
        infiles,
        output_file,
        log,
        context):
    """Selects the image layer for the output page.

    With ``options.lossless_reconstruction`` the orientation-corrected input
    page is linked through.  Otherwise the whole-page image is converted to
    a PDF with img2pdf at the page's DPI.
    """
    options = context.get_options()
    page_pdf = next(ii for ii in infiles if ii.endswith('.ocr.oriented.pdf'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    if not options.lossless_reconstruction:
        pageinfo = get_pageinfo(image, context)
        xres, yres = get_page_dpi(pageinfo, options)
        layout_fun = img2pdf.get_fixed_dpi_layout_fun(
            (float(xres), float(yres)))

        with open(image, 'rb') as imfile, \
                open(output_file, 'wb') as pdf:
            rawdata = imfile.read()
            log.debug('{:4d}: convert'.format(page_number(page_pdf)))
            img2pdf.convert(
                rawdata, with_pdfrw=False,
                layout_fun=layout_fun, outputstream=pdf)
            log.debug('{:4d}: convert done'.format(page_number(page_pdf)))
        return

    log.debug("{:4d}: page eligible for lossless reconstruction".format(
        page_number(page_pdf)))
    re_symlink(page_pdf, output_file, log)
def render_hocr_page(
        input_file,
        output_file,
        log,
        context):
    """Render an hOCR file to a PDF with invisible text and no image."""
    options = context.get_options()
    pageinfo = get_pageinfo(input_file, context)
    square_dpi = get_page_square_dpi(pageinfo, options)

    transform = HocrTransform(input_file, square_dpi)
    transform.to_pdf(
        output_file,
        imageFileName=None,
        showBoundingboxes=False,
        invisibleText=True)
def render_hocr_debug_page(
        infiles,
        output_file,
        log,
        context):
    """Render an hOCR file to a PDF with visible text and bounding boxes.

    The page info is looked up from the ``.image`` input; the text comes
    from the ``.hocr`` input.
    """
    options = context.get_options()
    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    square_dpi = get_page_square_dpi(get_pageinfo(image, context), options)

    transform = HocrTransform(hocr, square_dpi)
    transform.to_pdf(
        output_file,
        imageFileName=None,
        showBoundingboxes=True,
        invisibleText=False)
def combine_layers(
        infiles,
        output_file,
        log,
        context):
    """Merge the image layer onto the text layer and write the result.

    The image page's ``/Rotate`` is undone and its media box is translated
    so that the rotated image sits in the first quadrant at (0, 0), matching
    the text page.

    Raises:
        PdfMergeFailedError: the image page has a rotation that is not a
            multiple of 90 degrees, or PyPDF2 fails with one of the known
            merge errors.
    """
    text = next(ii for ii in infiles if ii.endswith('.text.pdf'))
    image = next(ii for ii in infiles if ii.endswith('.image-layer.pdf'))

    # Keep both inputs open until the output has been written (the readers
    # may still need the streams at that point), then close them reliably.
    with open(text, "rb") as text_stream, \
            open(image, "rb") as image_stream:
        pdf_text = pypdf.PdfFileReader(text_stream)
        pdf_image = pypdf.PdfFileReader(image_stream)

        page_text = pdf_text.getPage(0)

        # The text page always will be oriented up by this stage
        # but if lossless_reconstruction, pdf_image may have a rotation
        # applied.  We have to eliminate the /Rotate tag (because it applies
        # to the whole page) and rotate the image layer to match the text
        # page.  Also, pdf_image may not have its mediabox nailed to (0, 0),
        # so may need translation
        page_image = pdf_image.getPage(0)
        try:
            # pypdf DictionaryObject.get() does not resolve indirect objects
            # but __getitem__ does
            rotation = page_image['/Rotate']
        except KeyError:
            rotation = 0

        # /Rotate is a clockwise rotation: 90 means page facing "east"
        # The negative of this value is the angle that eliminates that
        # rotation
        rotation = -rotation % 360

        x1 = page_image.mediaBox.getLowerLeft_x()
        x2 = page_image.mediaBox.getUpperRight_x()
        y1 = page_image.mediaBox.getLowerLeft_y()
        y2 = page_image.mediaBox.getUpperRight_y()

        # Rotation occurs about the page's (0, 0). Most pages will have the
        # media box at (0, 0) with all content in the first quadrant but
        # some cropped files may have an offset mediabox. We translate the
        # page so that its bottom left corner after rotation is pinned to
        # (0, 0) with the image in the first quadrant.
        if rotation == 0:
            tx, ty = -x1, -y1
        elif rotation == 90:
            tx, ty = y2, -x1
        elif rotation == 180:
            tx, ty = x2, y2
        elif rotation == 270:
            tx, ty = -y1, x2
        else:
            # No translation is defined for other angles; fail explicitly
            # instead of hitting an unbound tx/ty further down.
            log.error(
                "{0:4d}: image layer has unsupported rotation of "
                "{1} degrees".format(page_number(image), rotation))
            raise PdfMergeFailedError()

        if rotation != 0:
            log.info("{0:4d}: rotating image layer {1} degrees".format(
                page_number(image), rotation))

        try:
            page_text.mergeRotatedScaledTranslatedPage(
                page_image, rotation, 1.0, tx, ty, expand=False)
        except (AttributeError, ValueError) as e:
            if 'writeToStream' in str(e) or 'invalid literal' in str(e):
                raise PdfMergeFailedError() from e
            # Anything else is unexpected: do not silently write a page
            # that is missing its image layer.
            raise

        pdf_output = pypdf.PdfFileWriter()
        pdf_output.addPage(page_text)

        with open(output_file, "wb") as out:
            pdf_output.write(out)
def ocr_tesseract_and_render_pdf(
        infiles,
        output_file,
        log,
        context):
    """OCR the page image with Tesseract and let it render the PDF.

    When the inputs contain no ``.image`` file the page is being skipped,
    so its PDF is linked through unchanged.
    """
    options = context.get_options()
    page_images = (name for name in infiles if name.endswith('.image'))
    page_pdfs = (name for name in infiles if name.endswith('.pdf'))
    input_image = next(page_images, '')
    input_pdf = next(page_pdfs)

    if input_image:
        tesseract.generate_pdf(
            input_image=input_image,
            skip_pdf=input_pdf,
            output_pdf=output_file,
            language=options.language,
            engine_mode=options.tesseract_oem,
            text_only=False,
            tessconfig=options.tesseract_config,
            timeout=options.tesseract_timeout,
            pagesegmode=options.tesseract_pagesegmode,
            log=log)
    else:
        # Skipping this page
        re_symlink(input_pdf, output_file, log)
def ocr_tesseract_textonly_pdf(
        infiles,
        output_file,
        log,
        context):
    """OCR the ``.ocr.png`` image with Tesseract into a text-only PDF.

    Raises:
        ValueError: the inputs contain no ``.ocr.png`` image.
    """
    options = context.get_options()

    ocr_images = (name for name in infiles if name.endswith('.ocr.png'))
    input_image = next(ocr_images, '')
    if not input_image:
        raise ValueError("No image rendered?")

    page_pdfs = (name for name in infiles if name.endswith('.pdf'))
    skip_pdf = next(page_pdfs)

    tesseract.generate_pdf(
        input_image=input_image,
        skip_pdf=skip_pdf,
        output_pdf=output_file,
        language=options.language,
        engine_mode=options.tesseract_oem,
        text_only=True,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
        pagesegmode=options.tesseract_pagesegmode,
        log=log)
def get_pdfmark(base_pdf, options):
    """Build the document metadata dict for the output file.

    Title, author, keywords and subject are read from *base_pdf*'s document
    info (empty string when absent) and overridden by any non-empty value in
    *options*.  ``/Creator`` names this program and the Tesseract version.
    """
    def from_document_info(key):
        # pdf.documentInfo.get() DOES NOT behave as expected for a dict-like
        # object, so call with precautions.  TypeError may occur if the PDF
        # is missing the optional document info section.
        try:
            return str(base_pdf.documentInfo[key])
        except (KeyError, TypeError):
            return ''

    info_keys = ('/Title', '/Author', '/Keywords', '/Subject')
    pdfmark = {key: from_document_info(key) for key in info_keys}

    overrides = (
        ('/Title', options.title),
        ('/Author', options.author),
        ('/Keywords', options.keywords),
        ('/Subject', options.subject),
    )
    for key, value in overrides:
        if value:
            pdfmark[key] = value

    renderer_tag = '+PDF' if options.pdf_renderer == 'tesseract' else ''
    pdfmark['/Creator'] = '{0} {1} / Tesseract OCR{2} {3}'.format(
        PROGRAM_NAME, VERSION, renderer_tag, tesseract.version())
    return pdfmark
def generate_postscript_stub(
        input_file,
        output_file,
        log,
        context):
    """Write the PDF/A definition file carrying the input's metadata."""
    reader = pypdf.PdfFileReader(input_file)
    metadata = get_pdfmark(reader, context.get_options())
    generate_pdfa_def(output_file, metadata)
def skip_page(
        input_file,
        output_file,
        log):
    """Link a skipped page through to *output_file* unchanged.

    The purpose of this step is its filter to forward only the skipped
    files (.skip.oriented.pdf) while disregarding the processed ones
    (.ocr.oriented.pdf).  Alternative would be for merge_pages to filter
    pages itself if it gets multiple copies of a page.
    """
    re_symlink(input_file, output_file, log)
def merge_pages_ghostscript(
        input_files,
        output_file,
        log,
        context):
    """Order the page files and have Ghostscript generate the PDF/A."""
    options = context.get_options()

    def sort_key(path):
        '''Sort order: All rendered pages followed
        by their debug page, if any, followed by Postscript stub.
        Ghostscript documentation has the Postscript stub at the
        beginning, but it works at the end and also gets document info
        right that way.'''
        if path.endswith('.ps'):
            return 99999999
        name = os.path.basename(path)
        debug_offset = 1 if 'debug' in name else 0
        return int(name[0:6]) * 10 + debug_offset

    ordered = sorted(input_files, key=sort_key)
    log.debug("Final pages: " + "\n".join(ordered))
    ghostscript.generate_pdfa(ordered, output_file, log, options.jobs or 1)
def merge_pages_qpdf(
        input_files,
        output_file,
        log,
        context):
    """Merge the page files with qpdf, attaching metadata to the first page.

    The ``.repaired.pdf`` input supplies document metadata only.  A copy of
    the first page with that metadata added is written next to it as
    ``*.metadata.pdf`` and used in its place for the merge.
    """
    options = context.get_options()

    metadata_file = next(
        (ii for ii in input_files if ii.endswith('.repaired.pdf')))
    # Work on a copy so the caller's list is not modified
    page_files = list(input_files)
    page_files.remove(metadata_file)

    def input_file_order(s):
        '''Sort order: All rendered pages followed
        by their debug page.'''
        key = int(os.path.basename(s)[0:6]) * 10
        if 'debug' in os.path.basename(s):
            key += 1
        return key

    pdf_pages = sorted(page_files, key=input_file_order)
    log.debug("Final pages: " + "\n".join(pdf_pages))

    reader_metadata = pypdf.PdfFileReader(metadata_file)
    pdfmark = get_pdfmark(reader_metadata, options)
    pdfmark['/Producer'] = 'qpdf ' + qpdf.version()

    first_page = pypdf.PdfFileReader(pdf_pages[0])

    writer = pypdf.PdfFileWriter()
    writer.appendPagesFromReader(first_page)
    writer.addMetadata(pdfmark)

    # Change only the file extension; str.replace() would also rewrite any
    # '.pdf' that happens to occur earlier in the path.
    root, ext = os.path.splitext(pdf_pages[0])
    writer_file = root + '.metadata' + ext
    with open(writer_file, 'wb') as f:
        writer.write(f)

    pdf_pages[0] = writer_file

    qpdf.merge(pdf_pages, output_file)
def copy_final(
        input_files,
        output_file,
        log,
        context):
    """Deliver the first ``.pdf`` input to its final destination.

    An *output_file* of ``'-'`` streams the PDF to standard output;
    any other value is treated as a path to copy to.
    """
    merged_pdf = next(name for name in input_files if name.endswith('.pdf'))

    if output_file != '-':
        shutil.copy(merged_pdf, output_file)
        return

    with open(merged_pdf, 'rb') as input_stream:
        shutil.copyfileobj(input_stream, sys.stdout.buffer)
        sys.stdout.flush()
def build_pipeline(options, work_folder, log, context):
    """Register every task of the OCR pipeline on the 'main' ruffus Pipeline.

    Tasks are wired together by file suffix inside *work_folder*.  Which
    rendering branch is active depends on ``options.pdf_renderer``
    ('hocr', 'tess4' or 'tesseract'); which merge step is active depends on
    ``options.output_type`` ('pdfa' or 'pdf').  *log* and *context* are
    handed to the task functions as extra arguments.
    """
    main_pipeline = Pipeline.pipelines['main']

    # Triage
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context])

    task_repair_pdf = main_pipeline.transform(
        task_func=repair_pdf,
        input=task_triage,
        filter=suffix('.pdf'),
        output='.repaired.pdf',
        output_dir=work_folder,
        extras=[log, context])

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_split_pages = main_pipeline.split(
        split_pages,
        task_repair_pdf,
        os.path.join(work_folder, '*.page.pdf'),
        extras=[log, context])

    # Rasterize preview
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_split_pages,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context])
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_split_pages, task_rasterize_preview],
        filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context])

    # Rasterize actual
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context])

    # Preprocessing subpipeline
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context])

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context])

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context])

    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context])

    # HOCR OCR
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=".hocr",
        extras=[log, context])
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
    if tesseract.v4():
        task_ocr_tesseract_hocr.jobs_limit(1)  # Uses multi-core on its own

    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[task_rasterize_with_ghostscript,
               task_preprocess_remove_background,
               task_preprocess_deskew,
               task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context])
    task_select_visible_page_image.graphviz(shape='diamond')

    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context])
    task_select_image_layer.graphviz(
        fillcolor='"#00cc66"', shape='diamond')
    task_select_image_layer.active_if(
        options.pdf_renderer == 'hocr' or options.pdf_renderer == 'tess4')

    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=suffix('.hocr'),
        output='.text.pdf',
        extras=[log, context])
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    task_render_hocr_debug_page = main_pipeline.collate(
        task_func=render_hocr_debug_page,
        input=[task_select_visible_page_image, task_ocr_tesseract_hocr],
        filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
        output=os.path.join(work_folder, r'\1.debug.pdf'),
        extras=[log, context])
    task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr')
    task_render_hocr_debug_page.active_if(options.debug_rendering)

    # Tesseract OCR + text only PDF
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image, task_orient_page],
        # Both dots of '.ocr.png' are escaped so they match literally
        filter=regex(r".*/(\d{6})(?:\.ocr\.png|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context])
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4')
    if tesseract.v4():
        task_ocr_tesseract_textonly_pdf.jobs_limit(1)

    task_combine_layers = main_pipeline.collate(
        task_func=combine_layers,
        input=[task_render_hocr_page,
               task_ocr_tesseract_textonly_pdf,
               task_select_image_layer],
        filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_combine_layers.graphviz(fillcolor='"#00cc66"')
    task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'tess4')

    # Tesseract OCR+PDF
    task_ocr_tesseract_and_render_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_and_render_pdf,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
    task_ocr_tesseract_and_render_pdf.active_if(options.pdf_renderer == 'tesseract')
    if tesseract.v4():
        task_ocr_tesseract_and_render_pdf.jobs_limit(1)  # Uses multi-core

    # PDF/A
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa_def.ps'),
        extras=[log, context])
    task_generate_postscript_stub.active_if(options.output_type == 'pdfa')

    # Bypass valve
    task_skip_page = main_pipeline.transform(
        task_func=skip_page,
        input=task_orient_page,
        filter=suffix('.skip.oriented.pdf'),
        output='.done.pdf',
        output_dir=work_folder,
        extras=[log])

    # Merge pages
    task_merge_pages_ghostscript = main_pipeline.merge(
        task_func=merge_pages_ghostscript,
        input=[task_combine_layers,
               task_render_hocr_debug_page,
               task_skip_page,
               task_ocr_tesseract_and_render_pdf,
               task_generate_postscript_stub],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_ghostscript.active_if(options.output_type == 'pdfa')

    task_merge_pages_qpdf = main_pipeline.merge(
        task_func=merge_pages_qpdf,
        input=[task_combine_layers,
               task_render_hocr_debug_page,
               task_skip_page,
               task_ocr_tesseract_and_render_pdf,
               task_repair_pdf],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_qpdf.active_if(options.output_type == 'pdf')

    # Finalize
    task_copy_final = main_pipeline.merge(
        task_func=copy_final,
        input=[task_merge_pages_ghostscript, task_merge_pages_qpdf],
        output=options.output_file,
        extras=[log, context])