2014-09-26 04:19:41 -07:00
|
|
|
|
#!/usr/bin/env python3
|
2015-07-28 04:36:58 -07:00
|
|
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
from contextlib import suppress
|
2015-07-25 01:45:26 -07:00
|
|
|
|
from tempfile import NamedTemporaryFile, mkdtemp
|
2014-09-26 04:19:41 -07:00
|
|
|
|
import sys
|
2015-07-23 04:57:31 -07:00
|
|
|
|
import os
|
2015-02-13 13:41:14 -08:00
|
|
|
|
import re
|
2015-03-10 14:28:38 -07:00
|
|
|
|
import shutil
|
2015-07-23 04:57:31 -07:00
|
|
|
|
import warnings
|
|
|
|
|
|
import multiprocessing
|
2015-07-25 01:45:26 -07:00
|
|
|
|
import atexit
|
2015-07-28 02:25:50 -07:00
|
|
|
|
import textwrap
|
2015-07-23 04:57:31 -07:00
|
|
|
|
|
|
|
|
|
|
import PyPDF2 as pypdf
|
2015-07-24 15:19:37 -07:00
|
|
|
|
from PIL import Image
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2015-12-16 17:36:11 -08:00
|
|
|
|
from functools import partial
|
|
|
|
|
|
|
2014-11-14 02:06:23 -08:00
|
|
|
|
from subprocess import Popen, check_call, PIPE, CalledProcessError, \
|
2015-08-11 02:19:46 -07:00
|
|
|
|
TimeoutExpired, check_output, STDOUT
|
2014-10-08 03:21:28 -07:00
|
|
|
|
try:
|
|
|
|
|
|
from subprocess import DEVNULL
|
|
|
|
|
|
except ImportError:
|
|
|
|
|
|
DEVNULL = open(os.devnull, 'wb')
|
|
|
|
|
|
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
2014-10-10 00:35:49 -07:00
|
|
|
|
from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
|
2015-07-25 02:58:34 -07:00
|
|
|
|
formatter, follows, split, collate, check_if_uptodate
|
2015-08-11 02:19:46 -07:00
|
|
|
|
import ruffus.ruffus_exceptions as ruffus_exceptions
|
2014-10-08 03:21:28 -07:00
|
|
|
|
import ruffus.cmdline as cmdline
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
2015-02-20 17:20:48 -08:00
|
|
|
|
from .hocrtransform import HocrTransform
|
2015-07-23 02:39:42 -07:00
|
|
|
|
from .pageinfo import pdf_get_all_pageinfo
|
2015-07-23 04:57:31 -07:00
|
|
|
|
from .pdfa import generate_pdfa_def
|
2015-07-28 02:25:50 -07:00
|
|
|
|
from . import ghostscript
|
2015-07-23 18:38:59 -07:00
|
|
|
|
from . import tesseract
|
2015-12-17 08:19:53 -08:00
|
|
|
|
from . import qpdf
|
2015-08-11 00:17:02 -07:00
|
|
|
|
from . import ExitCode
|
2015-04-09 03:12:04 -07:00
|
|
|
|
|
2015-04-09 14:06:55 -07:00
|
|
|
|
# PyPDF2 raises PdfReadWarning for slightly malformed files; suppress these
# because the pipeline repairs input PDFs with qpdf before processing.
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)

# Directory containing this module (used to locate bundled resources).
BASEDIR = os.path.dirname(os.path.realpath(__file__))
VERSION = '3.1'


# -------------
# External dependencies

# Oldest Tesseract release this pipeline supports.
MINIMUM_TESS_VERSION = '3.02.02'
|
|
|
|
|
|
|
2015-07-28 02:25:50 -07:00
|
|
|
|
|
|
|
|
|
|
def complain(message):
    """Write *message* to stderr, re-wrapped to the default line width."""
    wrapped_lines = textwrap.wrap(message)
    sys.stderr.write(' '.join(wrapped_lines))
    sys.stderr.write('\n')
|
2015-07-28 02:25:50 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-28 01:00:29 -07:00
|
|
|
|
# Refuse to start if the installed Tesseract is older than we support.
# NOTE(review): this is a lexicographic string comparison; it works within
# the 3.x series but would mis-order a hypothetical '10.0' against
# '3.02.02' -- confirm the format of tesseract.version() before relying on
# this for later major versions.
if tesseract.version() < MINIMUM_TESS_VERSION:
    complain(
        "Please install tesseract {0} or newer "
        "(currently installed version is {1})".format(
            MINIMUM_TESS_VERSION, tesseract.version()))
    sys.exit(ExitCode.missing_dependency)
|
2015-07-23 17:06:00 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-08-16 00:54:03 -07:00
|
|
|
|
# Prefer Pillow's official feature-detection API when available; fall back
# to probing the C extension module directly on older Pillow/PIL versions
# that lack PIL.features.
try:
    import PIL.features
    check_codec = PIL.features.check_codec
except (ImportError, AttributeError):
    def check_codec(codec_name):
        # Fallback: the presence of the encoder symbol in Image.core
        # indicates the codec was compiled in.
        if codec_name == 'jpg':
            return 'jpeg_encoder' in dir(Image.core)
        elif codec_name == 'zlib':
            return 'zip_encoder' in dir(Image.core)
        raise NotImplementedError(codec_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_pil_encoder(codec_name, friendly_name):
    """Exit with an error message if Pillow lacks support for a codec.

    codec_name -- Pillow feature name to probe, e.g. 'jpg' or 'zlib'
    friendly_name -- human-readable codec name used in the error message

    Exits with ExitCode.missing_dependency when the codec is unavailable.
    """
    try:
        if check_codec(codec_name):
            return
    except Exception:
        # Any failure while probing is treated the same as a missing codec.
        pass

    complain(
        "ERROR: Your version of the Python imaging library (Pillow) was "
        "compiled without support for " + friendly_name + " encoding/decoding."
        "\n"
        "You will need to uninstall Pillow and reinstall it with PNG and JPEG "
        "support (libjpeg and zlib)."
        "\n"
        "See installation instructions for your platform here:\n"
        "    https://pillow.readthedocs.org/installation.html"
    )
    sys.exit(ExitCode.missing_dependency)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fail fast at import time if Pillow cannot handle the image formats the
# pipeline produces (JPEG output pages and PNG intermediates).
check_pil_encoder('jpg', 'JPEG')
check_pil_encoder('zlib', 'PNG')
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 17:06:00 -07:00
|
|
|
|
# -------------
# Parser

# Build the command-line parser on top of ruffus' standard pipeline
# arguments, hiding ruffus options that are not useful to ocrmypdf users.
parser = cmdline.get_argparse(
    prog="ocrmypdf",
    description="Generate searchable PDF file from an image-only PDF file.",
    version=VERSION,
    fromfile_prefix_chars='@',
    ignored_args=[
        'touch_files_only', 'recreate_database', 'checksum_file_name',
        'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format',
        'forced_tasks', 'target_tasks', 'use_threads'])
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
|
|
|
|
|
# Positional arguments: input and output PDF paths.
parser.add_argument(
    'input_file',
    help="PDF file containing the images to be OCRed")
parser.add_argument(
    'output_file',
    help="output searchable PDF file")
# May be given multiple times; "eng+deu" shorthand is split later.
parser.add_argument(
    '-l', '--language', action='append',
    help="languages of the file to be OCRed")
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-25 18:12:25 -07:00
|
|
|
|
# Options that set output PDF/A document metadata.
metadata = parser.add_argument_group(
    "Metadata options",
    "Set output PDF/A metadata (default: use input document's title)")
metadata.add_argument(
    '--title', type=str,
    help="set document title (place multiple words in quotes)")
metadata.add_argument(
    '--author', type=str,
    help="set document author")
metadata.add_argument(
    '--subject', type=str,
    help="set document subject")  # help text was truncated to "set document"
metadata.add_argument(
    '--keywords', type=str,
    help="set document keywords")
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Image preprocessing options (implemented with unpaper and oversampling).
preprocessing = parser.add_argument_group(
    "Preprocessing options",
    "Improve OCR quality and final image")
preprocessing.add_argument(
    '-d', '--deskew', action='store_true',
    help="deskew each page before performing OCR")
preprocessing.add_argument(
    '-c', '--clean', action='store_true',
    help="clean pages from scanning artifacts before performing OCR")
preprocessing.add_argument(
    '-i', '--clean-final', action='store_true',
    help="incorporate the cleaned image in the final PDF file")
preprocessing.add_argument(
    '--oversample', metavar='DPI', type=int, default=0,
    help="oversample images to at least the specified DPI, to improve OCR "
         "results slightly")
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
# Options controlling which pages receive OCR.
parser.add_argument(
    '-f', '--force-ocr', action='store_true',
    help="rasterize any fonts or vector images on each page and apply OCR")
parser.add_argument(
    '-s', '--skip-text', action='store_true',
    help="skip OCR on any pages that already contain text, but include the"
         " page in final output")
parser.add_argument(
    '--skip-big', type=float, metavar='MPixels',
    help="skip OCR on pages larger than the specified amount of megapixels, "
         "but include skipped pages in final output")
# parser.add_argument(
#     '--exact-image', action='store_true',
#     help="Use original page from PDF without re-rendering")
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Advanced options for power users.
advanced = parser.add_argument_group(
    "Advanced",
    "Advanced options for power users")
advanced.add_argument(
    # BUG FIX: this argument previously used type=list, which makes argparse
    # call list() on the string and split a filename like "cfg" into
    # ['c', 'f', 'g']; type=str appends each config filename intact.
    '--tesseract-config', default=[], type=str, action='append',
    help="additional Tesseract configuration files")
advanced.add_argument(
    '--pdf-renderer', choices=['auto', 'tesseract', 'hocr'], default='auto',
    help='choose OCR PDF renderer')
advanced.add_argument(
    '--tesseract-timeout', default=180.0, type=float,
    help='give up on OCR after the timeout, but copy the preprocessed page '
         'into the final output')
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Options to aid troubleshooting.
debugging = parser.add_argument_group(
    "Debugging",
    "Arguments to help with troubleshooting and debugging")
debugging.add_argument(
    '-k', '--keep-temporary-files', action='store_true',
    help="keep temporary files (helpful for debugging)")
debugging.add_argument(
    '-g', '--debug-rendering', action='store_true',
    help="render each page twice with debug information on second page")
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2015-07-28 04:46:21 -07:00
|
|
|
|
options = parser.parse_args()


# ----------
# Languages

# Default to English when no language was requested.
if not options.language:
    options.language = ['eng']  # Enforce English hegemony

# Support v2.x "eng+deu" language syntax
if '+' in options.language[0]:
    options.language = options.language[0].split('+')
|
|
|
|
|
|
|
2015-07-28 01:00:29 -07:00
|
|
|
|
# Refuse to run if tesseract lacks trained data for any requested language.
if not set(options.language).issubset(tesseract.languages()):
    complain(
        "The installed version of tesseract does not have language "
        "data for the following requested languages: ")
    for lang in (set(options.language) - tesseract.languages()):
        complain(lang)
    sys.exit(ExitCode.bad_args)
|
2015-07-23 18:38:59 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 00:22:56 -07:00
|
|
|
|
# ----------
# Arguments

# 'auto' currently always resolves to the hOCR renderer.
if options.pdf_renderer == 'auto':
    options.pdf_renderer = 'hocr'

# unpaper is only required when a preprocessing option was requested.
if any((options.deskew, options.clean, options.clean_final)):
    try:
        from . import unpaper
    except ImportError:
        complain(
            "Install the 'unpaper' program to use --deskew or --clean.")
        sys.exit(ExitCode.bad_args)
else:
    unpaper = None
|
|
|
|
|
|
|
2015-07-28 02:25:50 -07:00
|
|
|
|
# Validate mutually incompatible or ineffective option combinations.
if options.debug_rendering and options.pdf_renderer == 'tesseract':
    complain(
        # BUG FIX: the two string fragments previously joined without a
        # space, producing "...supported with--pdf-renderer=tesseract."
        "Ignoring --debug-rendering because it is not supported with "
        "--pdf-renderer=tesseract.")

if options.force_ocr and options.skip_text:
    complain(
        "Error: --force-ocr and --skip-text are mutually incompatible.")
    sys.exit(ExitCode.bad_args)

# NOTE(review): the message says --clean-final is assumed, but nothing here
# sets options.clean_final = True -- confirm the pipeline honors this.
if options.clean and not options.clean_final \
        and options.pdf_renderer == 'tesseract':
    complain(
        "Tesseract PDF renderer cannot render --clean pages without "
        "also performing --clean-final, so --clean-final is assumed.")
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:39:42 -07:00
|
|
|
|
# ----------
# Logging

# ruffus returns a logger plus a mutex; the mutex must guard every write
# to the logger from concurrently running pipeline tasks.
_logger, _logger_mutex = cmdline.setup_logging(__name__, options.log_file,
                                               options.verbose)
|
2015-03-24 22:46:33 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:26:09 -07:00
|
|
|
|
class WrappedLogger:
    """Thread-safe proxy around a logger.

    Every logging call acquires the supplied mutex before delegating to the
    wrapped logger, so pipeline tasks running concurrently do not interleave
    their log output.
    """

    def __init__(self, my_logger, my_mutex):
        self.logger = my_logger
        self.mutex = my_mutex

    def _locked_call(self, method_name, args, kwargs):
        # Single choke point: acquire the mutex, then forward the call.
        with self.mutex:
            getattr(self.logger, method_name)(*args, **kwargs)

    def log(self, *args, **kwargs):
        self._locked_call('log', args, kwargs)

    def debug(self, *args, **kwargs):
        self._locked_call('debug', args, kwargs)

    def info(self, *args, **kwargs):
        self._locked_call('info', args, kwargs)

    def warning(self, *args, **kwargs):
        self._locked_call('warning', args, kwargs)

    def error(self, *args, **kwargs):
        self._locked_call('error', args, kwargs)

    def critical(self, *args, **kwargs):
        self._locked_call('critical', args, kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
# Shared thread-safe logger used by all pipeline tasks.
_log = WrappedLogger(_logger, _logger_mutex)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def re_symlink(input_file, soft_link_name, log=_log):
    """
    Helper function: relinks soft symbolic link if necessary

    Creates (or re-creates) *soft_link_name* as an absolute-path symlink
    pointing at *input_file*.  Raises if the destination exists and is a
    real file (never clobbers data), or if the source does not exist.
    """
    # Guard against soft linking to oneself
    if input_file == soft_link_name:
        log.debug("Warning: No symbolic link made. You are using " +
                  "the original data directory as the working directory.")
        return

    # Soft link already exists: delete for relink?
    if os.path.lexists(soft_link_name):
        # do not delete or overwrite real (non-soft link) file
        if not os.path.islink(soft_link_name):
            raise Exception("%s exists and is not a link" % soft_link_name)
        try:
            os.unlink(soft_link_name)
        except OSError:
            # BUG FIX: was a bare 'except:' which would also swallow
            # KeyboardInterrupt/SystemExit; only filesystem errors are
            # expected from os.unlink()
            log.debug("Can't unlink %s" % (soft_link_name))

    if not os.path.exists(input_file):
        raise Exception("trying to create a broken symlink to %s" % input_file)

    log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))

    # Create symbolic link using absolute path
    os.symlink(
        os.path.abspath(input_file),
        soft_link_name
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:39:42 -07:00
|
|
|
|
# -------------
# The Pipeline

# Page metadata gathered by repair_pdf is shared with worker processes
# through a manager-backed list, guarded by a manager lock.
manager = multiprocessing.Manager()
_pdfinfo = manager.list()
_pdfinfo_lock = manager.Lock()

# All intermediate files live in a private temporary directory.
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@atexit.register
def cleanup_working_files(*args):
    """At exit, remove the temporary work folder unless -k was given."""
    if not options.keep_temporary_files:
        # Folder may already be gone (e.g. nothing was ever written).
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
        return
    print("Temporary working files saved at:")
    print(work_folder)
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 01:16:05 -07:00
|
|
|
|
@transform(
    input=options.input_file,
    filter=formatter('(?i)\.pdf'),
    output=work_folder + '{basename[0]}.repaired.pdf',
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """First pipeline stage: normalize the input PDF with qpdf, then cache
    per-page metadata in the shared pdfinfo list for later stages."""
    qpdf.repair(input_file, output_file, log)
    with pdfinfo_lock:
        pdfinfo.extend(pdf_get_all_pageinfo(output_file))
        log.info(pdfinfo)
|
2015-07-23 01:16:05 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
def get_pageinfo(input_file, pdfinfo, pdfinfo_lock):
    """Return a copy of the cached page-info dict for *input_file*.

    The 1-based page number is encoded in the first six characters of the
    file's basename; the copy is taken while holding *pdfinfo_lock* so the
    shared list can be read safely across worker processes.
    """
    basename = os.path.basename(input_file)
    page_index = int(basename[:6]) - 1
    with pdfinfo_lock:
        return pdfinfo[page_index].copy()
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
|
|
|
|
|
|
def is_ocr_required(pageinfo, log):
    """Decide whether a page should be OCRed.

    Pages with no images are skipped (rasterizing vector content rarely
    yields useful text).  Pages that already contain text abort the run,
    are forced, or are skipped according to --force-ocr/--skip-text.
    --skip-big excludes very large pages.  May call sys.exit() when a page
    already has text and neither override option was given.
    """
    page = pageinfo['pageno'] + 1
    ocr_required = True
    if not pageinfo['images']:
        # If the page has no images, then it contains vector content or text
        # or both. It seems quite unlikely that one would find meaningful text
        # from rasterizing vector content. So skip the page.
        log.info(
            "Page {0} has no images - skipping OCR".format(page)
        )
        ocr_required = False
    elif pageinfo['has_text']:
        s = "Page {0} already has text! – {1}"

        if not options.force_ocr and not options.skip_text:
            log.error(s.format(page,
                               "aborting (use --force-ocr to force OCR)"))
            sys.exit(ExitCode.already_done_ocr)
        elif options.force_ocr:
            log.info(s.format(page,
                              "rasterizing text and running OCR anyway"))
            ocr_required = True
        elif options.skip_text:
            log.info(s.format(page,
                              "skipping all processing on this page"))
            ocr_required = False

    if ocr_required and options.skip_big:
        pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
        if pixel_count > (options.skip_big * 1000000):
            ocr_required = False
            log.info(
                "Page {0} is very large; skipping due to -b".format(page))

    return ocr_required
|
2015-07-23 01:16:05 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 02:58:34 -07:00
|
|
|
|
@split(
    repair_pdf,
    os.path.join(work_folder, '*.page.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def split_pages(
        input_file,
        output_files,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Split the repaired PDF into one file per page, then tag each page
    as '.ocr.page.pdf' or '.skip.page.pdf' via symlinks so downstream
    stages know which pages need OCR."""
    # Remove stale outputs from a previous run before re-splitting.
    for oo in output_files:
        with suppress(FileNotFoundError):
            os.unlink(oo)

    npages = qpdf.get_npages(input_file)
    qpdf.split_pages(input_file, work_folder, npages)

    from glob import glob
    for filename in glob(os.path.join(work_folder, '*.page.pdf')):
        pageinfo = get_pageinfo(filename, pdfinfo, pdfinfo_lock)

        alt_suffix = '.ocr.page.pdf' if is_ocr_required(pageinfo, log) \
            else '.skip.page.pdf'
        re_symlink(
            filename,
            os.path.join(
                work_folder,
                os.path.basename(filename)[0:6] + alt_suffix))
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@transform(
    input=split_pages,
    filter=suffix('.ocr.page.pdf'),
    output='.page.png',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def rasterize_with_ghostscript(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Render a single PDF page to PNG with Ghostscript, choosing the
    cheapest PNG device that can still represent the page's images."""
    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)

    device = 'png16m'  # 24-bit
    if all(image['comp'] == 1 for image in pageinfo['images']):
        if all(image['bpc'] == 1 for image in pageinfo['images']):
            device = 'pngmono'
        elif all(image['bpc'] > 1 and image['color'] == 'index'
                 for image in pageinfo['images']):
            device = 'png256'
        elif all(image['bpc'] > 1 and image['color'] == 'gray'
                 for image in pageinfo['images']):
            device = 'pnggray'

    log.debug("Rendering {0} with {1}".format(
        os.path.basename(input_file), device))
    # Oversample low-resolution pages up to --oversample DPI if requested.
    xres = max(pageinfo['xres'], options.oversample or 0)
    yres = max(pageinfo['yres'], options.oversample or 0)

    ghostscript.rasterize_pdf(input_file, output_file, xres, yres, device, log)
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@transform(
    input=rasterize_with_ghostscript,
    filter=suffix(".page.png"),
    output=".pp-deskew.png",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def preprocess_deskew(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Deskew the page image with unpaper, or pass it through unchanged
    (via symlink) when --deskew was not requested."""
    if not options.deskew:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
    dpi = int(pageinfo['xres'])

    unpaper.deskew(input_file, output_file, dpi, log)
|
2015-07-24 15:19:37 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 00:22:56 -07:00
|
|
|
|
@transform(
    input=preprocess_deskew,
    filter=suffix(".pp-deskew.png"),
    output=".pp-clean.png",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def preprocess_clean(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Remove scanning artifacts with unpaper, or pass the image through
    unchanged (via symlink) when --clean was not requested."""
    if not options.clean:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
    dpi = int(pageinfo['xres'])

    unpaper.clean(input_file, output_file, dpi, log)
|
2015-07-24 15:19:37 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@transform(
    input=preprocess_clean,
    filter=suffix(".pp-clean.png"),
    output=".hocr",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def ocr_tesseract_hocr(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Run Tesseract in hOCR mode on the preprocessed page image
    (hOCR-renderer path only)."""
    tesseract.generate_hocr(
        input_file=input_file,
        output_hocr=output_file,
        language=options.language,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
        # Deferred so page info is only fetched if tesseract needs it.
        pageinfo_getter=partial(get_pageinfo, input_file, pdfinfo,
                                pdfinfo_lock),
        log=log
    )
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[rasterize_with_ghostscript, preprocess_deskew, preprocess_clean],
    filter=regex(r".*/(\d{6})(?:\.page|\.pp-deskew|\.pp-clean)\.png"),
    output=os.path.join(work_folder, r'\1.image'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def select_image_for_pdf(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Pick which preprocessing stage's image goes into the final PDF:
    cleaned if --clean-final, else deskewed if --deskew, else the raw
    rasterization."""
    if options.clean_final:
        image_suffix = '.pp-clean.png'
    elif options.deskew:
        image_suffix = '.pp-deskew.png'
    else:
        image_suffix = '.page.png'
    image = next(ii for ii in infiles if ii.endswith(image_suffix))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
        # If all images were JPEGs originally, produce a JPEG as output
        Image.open(image).save(output_file, format='JPEG')
    else:
        re_symlink(image, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[select_image_for_pdf, ocr_tesseract_hocr],
    filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
    output=os.path.join(work_folder, r'\1.rendered.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def render_hocr_page(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Combine the selected page image and its hOCR output into a PDF page
    with an invisible text layer."""
    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    dpi = round(max(pageinfo['xres'], pageinfo['yres'], options.oversample))

    hocrtransform = HocrTransform(hocr, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=image,
                         showBoundingboxes=False, invisibleText=True)
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
|
2015-07-25 14:14:02 -07:00
|
|
|
|
@active_if(options.debug_rendering)
|
|
|
|
|
|
@collate(
|
2015-07-27 04:20:49 -07:00
|
|
|
|
input=[select_image_for_pdf, ocr_tesseract_hocr],
|
2015-07-25 14:14:02 -07:00
|
|
|
|
filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
|
|
|
|
|
|
output=os.path.join(work_folder, r'\1.debug.pdf'),
|
|
|
|
|
|
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
2015-07-27 04:20:49 -07:00
|
|
|
|
def render_hocr_debug_page(
|
2015-07-25 14:14:02 -07:00
|
|
|
|
infiles,
|
|
|
|
|
|
output_file,
|
|
|
|
|
|
log,
|
|
|
|
|
|
pdfinfo,
|
|
|
|
|
|
pdfinfo_lock):
|
|
|
|
|
|
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
|
|
|
|
|
image = next(ii for ii in infiles if ii.endswith('.image'))
|
|
|
|
|
|
|
|
|
|
|
|
pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
|
2015-07-27 17:18:02 -07:00
|
|
|
|
dpi = round(max(pageinfo['xres'], pageinfo['yres'], options.oversample))
|
2015-07-25 14:14:02 -07:00
|
|
|
|
|
|
|
|
|
|
hocrtransform = HocrTransform(hocr, dpi)
|
|
|
|
|
|
hocrtransform.to_pdf(output_file, imageFileName=None,
|
|
|
|
|
|
showBoundingboxes=True, invisibleText=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'tesseract')
@collate(
    input=[preprocess_clean, split_pages],
    filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.page\.pdf)"),
    output=os.path.join(work_folder, r'\1.rendered.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def tesseract_ocr_and_render_pdf(
        input_files,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """OCR the page image and have Tesseract render the output PDF itself."""
    image = next((f for f in input_files if f.endswith('.png')), '')
    page_pdf = next(f for f in input_files if f.endswith('.pdf'))

    if not image:
        # No preprocessed image was produced for this page, so there is
        # nothing to OCR; pass the original page PDF through unchanged.
        # NOTE(review): unlike skip_page, re_symlink is called here without
        # the log argument — confirm re_symlink's log parameter is optional.
        re_symlink(page_pdf, output_file)
        return

    tesseract.generate_pdf(
        input_image=image,
        skip_pdf=page_pdf,
        output_pdf=output_file,
        language=options.language,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
        log=log)
|
2015-07-27 04:20:49 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@transform(
    input=repair_pdf,
    filter=suffix('.repaired.pdf'),
    output='.pdfa_def.ps',
    output_dir=work_folder,
    extras=[_log])
def generate_postscript_stub(
        input_file,
        output_file,
        log):
    """Write the PostScript pdfmark stub that carries document metadata.

    Metadata is taken from the input PDF's document info dictionary,
    overridden by any values supplied on the command line, and handed to
    generate_pdfa_def for Ghostscript's PDF/A conversion.
    """
    reader = pypdf.PdfFileReader(input_file)

    def doc_info(key):
        # pdf.documentInfo.get() DOES NOT behave as expected for a dict-like
        # object, so subscript with precautions.  TypeError may occur if the
        # PDF is missing the optional document info section.
        try:
            return str(reader.documentInfo[key])
        except (KeyError, TypeError):
            return ''

    pdfmark = {
        'title': doc_info('/Title'),
        'author': doc_info('/Author'),
        'keywords': doc_info('/Keywords'),
        'subject': doc_info('/Subject'),
    }

    # Values given on the command line win over the input PDF's metadata.
    for field in ('title', 'author', 'keywords', 'subject'):
        override = getattr(options, field)
        if override:
            pdfmark[field] = override

    renderer_tag = '+PDF' if options.pdf_renderer == 'tesseract' else ''
    pdfmark['creator'] = '{0} {1} / Tesseract OCR{2} {3}'.format(
        parser.prog, VERSION,
        renderer_tag,
        tesseract.version())

    generate_pdfa_def(output_file, pdfmark)
|
2015-07-22 22:46:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
@transform(
    input=split_pages,
    filter=suffix('.skip.page.pdf'),
    output='.done.pdf',
    output_dir=work_folder,
    extras=[_log])
def skip_page(
        input_file,
        output_file,
        log):
    """Mark a page that requires no OCR as done.

    The page's existing PDF is linked straight through to the output name
    so it participates in the final merge unmodified.
    """
    re_symlink(input_file, output_file, log)
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@merge(
    input=[render_hocr_page, render_hocr_debug_page, skip_page,
           tesseract_ocr_and_render_pdf, generate_postscript_stub],
    output=os.path.join(work_folder, 'merged.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def merge_pages(
        input_files,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Combine every per-page PDF (and the PostScript stub) into one PDF/A."""

    def input_file_order(s):
        '''Sort order: All rendered pages followed
        by their debug page, if any, followed by Postscript stub.
        Ghostscript documentation has the Postscript stub at the
        beginning, but it works at the end and also gets document info
        right that way.'''
        if s.endswith('.ps'):
            return 99999999
        name = os.path.basename(s)
        # Page number occupies the first six characters of the filename;
        # multiplying by 10 leaves room to slot the debug page after it.
        order = int(name[0:6]) * 10
        return order + 1 if 'debug' in name else order

    pdf_pages = sorted(input_files, key=input_file_order)
    log.info(pdf_pages)
    ghostscript.generate_pdfa(pdf_pages, output_file, options.jobs or 1)
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 14:48:46 -07:00
|
|
|
|
@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def copy_final(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Copy the merged PDF from the work folder to the user's output path."""
    shutil.copy(input_file, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def validate_pdfa(
        input_file,
        log):
    """Run ``qpdf --check`` on input_file and report whether it passed.

    Returns True when qpdf raises no complaints; returns False for any
    nonzero qpdf exit status (structural damage, unrepairable files, or
    warnings), after printing/logging the details.
    """
    args_qpdf = ['qpdf', '--check', input_file]

    try:
        check_output(args_qpdf, stderr=STDOUT, universal_newlines=True)
    except CalledProcessError as e:
        # qpdf exit codes: 2 = errors (possibly unrepairable), 3 = warnings
        if e.returncode == 2:
            print("{0}: not a valid PDF, and could not repair it.".format(
                options.input_file))
            print("Details:")
            print(e.output)
        elif e.returncode == 3:
            log.info("qpdf --check returned warnings:")
            log.info(e.output)
        else:
            print(e.output)
        return False

    return True
|
2015-07-23 14:48:46 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# @active_if(ocr_required and options.exact_image)
|
|
|
|
|
|
# @merge([render_hocr_blank_page, extract_single_page],
|
2015-07-25 01:45:26 -07:00
|
|
|
|
# os.path.join(work_folder, "%04i.merged.pdf") % pageno)
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# def merge_hocr_with_original_page(infiles, output_file):
|
|
|
|
|
|
# with open(infiles[0], 'rb') as hocr_input, \
|
|
|
|
|
|
# open(infiles[1], 'rb') as page_input, \
|
|
|
|
|
|
# open(output_file, 'wb') as output:
|
|
|
|
|
|
# hocr_reader = pypdf.PdfFileReader(hocr_input)
|
|
|
|
|
|
# page_reader = pypdf.PdfFileReader(page_input)
|
|
|
|
|
|
# writer = pypdf.PdfFileWriter()
|
|
|
|
|
|
|
|
|
|
|
|
# the_page = hocr_reader.getPage(0)
|
|
|
|
|
|
# the_page.mergePage(page_reader.getPage(0))
|
|
|
|
|
|
# writer.addPage(the_page)
|
|
|
|
|
|
# writer.write(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 01:10:14 -07:00
|
|
|
|
def available_cpu_count():
    """Return the number of available CPUs, falling back to 1.

    Tries :func:`multiprocessing.cpu_count` first, then ``psutil`` if it is
    installed; when neither can report a count, warns the user and assumes
    a single CPU.
    """
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        pass

    try:
        import psutil
        return psutil.cpu_count()
    except (ImportError, AttributeError):
        pass

    # BUG FIX: the two implicitly-concatenated string literals had no
    # separating space, producing "...Assuming one (1) CPU.Use -j N...".
    complain(
        "Could not get CPU count.  Assuming one (1) CPU. "
        "Use -j N to set manually.")
    return 1
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-12-04 03:07:53 -08:00
|
|
|
|
def cleanup_ruffus_error_message(msg):
    """Condense a multi-line ruffus error message to a single clean line.

    Collapses every run of whitespace to one space, strips the parentheses
    ruffus wraps around the interesting part of the message, and trims
    surrounding whitespace.
    """
    # BUG FIX: re.MULTILINE was previously passed as the positional *count*
    # argument of re.sub (re.MULTILINE == 8), silently capping the
    # replacement at 8 occurrences.  It must be passed via flags= (and is
    # actually irrelevant to \s+, but is kept for fidelity).
    msg = re.sub(r'\s+', r' ', msg, flags=re.MULTILINE)
    msg = re.sub(r"\((.+?)\)", r'\1', msg)
    msg = msg.strip()
    return msg
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-26 01:52:08 -07:00
|
|
|
|
def run_pipeline():
    """Run the ruffus pipeline and translate failures into exit codes.

    Returns a value suitable for sys.exit(): an ExitCode constant on the
    recognized paths (or, if a SystemExit payload names an unknown
    ExitCode attribute, the literal string 'other_error').
    """
    # Use all available CPUs when the user did not request a specific job
    # count (jobs is falsy or left at 1).
    if not options.jobs or options.jobs == 1:
        options.jobs = available_cpu_count()
    try:
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            print(e)

        # Yuck. Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        for exc in e.args:
            # Each rethrown job error is a 5-tuple describing one failure.
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                # exc_value is the repr of the SystemExit; pull the ExitCode
                # attribute name out of text shaped like "...(ExitCode.<name>)".
                # NOTE(review): assumes the regex always matches — if it did
                # not, match.groups() would raise AttributeError; confirm.
                match = re.search(r"\.(.+?)\)", exc_value)
                exit_code_name = match.groups()[0]
                # Unknown names fall back to the string 'other_error'.
                exit_code = getattr(ExitCode, exit_code_name, 'other_error')
                return exit_code
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                print(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    print("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file

        # None of the failures matched a recognized cause.
        return ExitCode.other_error

    # Pipeline succeeded; verify the produced file really passes qpdf's check.
    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    return ExitCode.ok
|
2015-08-10 16:05:00 -07:00
|
|
|
|
|
2015-07-26 01:52:08 -07:00
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate the pipeline's ExitCode to the shell.
    sys.exit(run_pipeline())
|