2014-09-26 04:19:41 -07:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
from contextlib import suppress
|
2015-07-25 01:45:26 -07:00
|
|
|
|
from tempfile import NamedTemporaryFile, mkdtemp
|
2014-09-26 04:19:41 -07:00
|
|
|
|
import sys
|
2015-07-23 04:57:31 -07:00
|
|
|
|
import os
|
2015-02-13 13:41:14 -08:00
|
|
|
|
import fileinput
|
|
|
|
|
import re
|
2015-03-10 14:28:38 -07:00
|
|
|
|
import shutil
|
2015-07-23 04:57:31 -07:00
|
|
|
|
import warnings
|
|
|
|
|
import multiprocessing
|
2015-07-25 01:45:26 -07:00
|
|
|
|
import atexit
|
2015-07-23 04:57:31 -07:00
|
|
|
|
|
|
|
|
|
import PyPDF2 as pypdf
|
2015-07-24 15:19:37 -07:00
|
|
|
|
from PIL import Image
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2014-11-14 02:06:23 -08:00
|
|
|
|
from subprocess import Popen, check_call, PIPE, CalledProcessError, \
|
|
|
|
|
TimeoutExpired
|
2014-10-08 03:21:28 -07:00
|
|
|
|
try:
|
|
|
|
|
from subprocess import DEVNULL
|
|
|
|
|
except ImportError:
|
|
|
|
|
DEVNULL = open(os.devnull, 'wb')
|
|
|
|
|
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
2014-10-10 00:35:49 -07:00
|
|
|
|
from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
|
2015-07-25 02:58:34 -07:00
|
|
|
|
formatter, follows, split, collate, check_if_uptodate
|
2014-10-08 03:21:28 -07:00
|
|
|
|
import ruffus.cmdline as cmdline
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
2015-02-20 17:20:48 -08:00
|
|
|
|
from .hocrtransform import HocrTransform
|
2015-07-23 02:39:42 -07:00
|
|
|
|
from .pageinfo import pdf_get_all_pageinfo
|
2015-07-23 04:57:31 -07:00
|
|
|
|
from .pdfa import generate_pdfa_def
|
2015-07-27 15:22:00 -07:00
|
|
|
|
from .ghostscript import rasterize_pdf, generate_pdfa
|
2015-07-23 18:38:59 -07:00
|
|
|
|
from . import tesseract
|
2015-02-20 17:20:48 -08:00
|
|
|
|
|
2015-04-09 03:12:04 -07:00
|
|
|
|
|
2015-04-09 14:06:55 -07:00
|
|
|
|
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
|
2015-04-09 03:12:04 -07:00
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2015-07-23 14:48:46 -07:00
|
|
|
|
# Locations of bundled resources, resolved relative to this module.
BASEDIR = os.path.dirname(os.path.realpath(__file__))
JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, 'jhove'))
JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')

# Process exit codes reported to the shell.
EXIT_BAD_ARGS = 1
EXIT_BAD_INPUT_FILE = 2
EXIT_MISSING_DEPENDENCY = 3
EXIT_INVALID_OUTPUT_PDFA = 4
EXIT_FILE_ACCESS_ERROR = 5
EXIT_ALREADY_DONE_OCR = 6
EXIT_OTHER_ERROR = 15

# -------------
# External dependencies

# Oldest tesseract release this program supports.
MINIMUM_TESS_VERSION = '3.02.02'
|
|
|
|
|
|
2015-07-23 18:38:59 -07:00
|
|
|
|
def _version_tuple(version):
    """Convert a dotted version string to a tuple of ints for comparison.

    Plain string comparison of versions is wrong: '10.0' < '3.02.02'
    lexicographically.  Extract all numeric components instead.
    """
    return tuple(int(part) for part in re.findall(r'\d+', version))


# Refuse to run with a tesseract older than the supported minimum.
if _version_tuple(tesseract.VERSION) < _version_tuple(MINIMUM_TESS_VERSION):
    print(
        "Please install tesseract {0} or newer "
        "(currently installed version is {1})".format(
            MINIMUM_TESS_VERSION, tesseract.VERSION),
        file=sys.stderr)
    sys.exit(EXIT_MISSING_DEPENDENCY)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -------------
# Parser

# ruffus supplies the base argument parser; suppress the ruffus options
# that make no sense for this pipeline.
parser = cmdline.get_argparse(
    prog="ocrmypdf",
    description="Generate searchable PDF file from an image-only PDF file.",
    version='3.0rc1',
    fromfile_prefix_chars='@',
    ignored_args=[
        'touch_files_only', 'recreate_database', 'checksum_file_name',
        'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format',
        'forced_tasks', 'target_tasks'])

parser.add_argument(
    'input_file',
    help="PDF file containing the images to be OCRed")
parser.add_argument(
    'output_file',
    help="output searchable PDF file")
parser.add_argument(
    '-l', '--language', action='append',
    help="language of the file to be OCRed")
|
|
|
|
|
|
2015-07-25 18:12:25 -07:00
|
|
|
|
# Output metadata options; each defaults to the input document's value.
metadata = parser.add_argument_group(
    "Metadata options",
    "Set output PDF/A metadata (default: use input document's title)")
metadata.add_argument(
    '--title', type=str,
    help="set document title (place multiple words in quotes)")
metadata.add_argument(
    '--author', type=str,
    help="set document author")
metadata.add_argument(
    '--subject', type=str,
    # Bug fix: help text was truncated ("set document")
    help="set document subject")
metadata.add_argument(
    '--keywords', type=str,
    help="set document keywords")
|
|
|
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Image preprocessing options (all implemented via unpaper).
preprocessing = parser.add_argument_group(
    "Preprocessing options",
    "Improve OCR quality and final image")
preprocessing.add_argument(
    '-d', '--deskew', action='store_true',
    help="deskew each page before performing OCR")
preprocessing.add_argument(
    '-c', '--clean', action='store_true',
    help="clean pages with unpaper before performing OCR")
preprocessing.add_argument(
    '-i', '--clean-final', action='store_true',
    help="incorporate the cleaned image in the final PDF file")
preprocessing.add_argument(
    '--oversample', metavar='DPI', type=int,
    help="oversample images to improve OCR results slightly")
|
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
# Options controlling what happens when a page already has text.
parser.add_argument(
    '-f', '--force-ocr', action='store_true',
    help="Force to OCR, even if the page already contains fonts")
parser.add_argument(
    '-s', '--skip-text', action='store_true',
    help="Skip OCR on pages that contain fonts and include the page anyway")
parser.add_argument(
    '--skip-big', action='store_true',
    help="Skip OCR for pages that are very large")
# parser.add_argument(
#     '--exact-image', action='store_true',
#     help="Use original page from PDF without re-rendering")
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Advanced options for power users.
advanced = parser.add_argument_group(
    "Advanced",
    "Advanced options for power users")
advanced.add_argument(
    # Bug fix: type=list split each argument string into a list of single
    # characters (list('cfg') == ['c', 'f', 'g']); type=str with
    # action='append' accumulates whole config names as intended.
    '--tesseract-config', default=[], type=str, action='append',
    help="Tesseract configuration")
advanced.add_argument(
    '--pdf-renderer', choices=['tesseract', 'hocr'], default='hocr',
    help='choose OCR PDF renderer')
advanced.add_argument(
    '--tesseract-timeout', default=180.0, type=float,
    help='give up on OCR after timeout')
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Troubleshooting/debugging options.
debugging = parser.add_argument_group(
    "Debugging",
    "Arguments to help with troubleshooting and debugging")
debugging.add_argument(
    '-k', '--keep-temporary-files', action='store_true',
    help="keep temporary files (helpful for debugging)")
debugging.add_argument(
    '-g', '--debug-rendering', action='store_true',
    help="render each page twice with debug information on second page")
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-27 15:39:54 -07:00
|
|
|
|
# Fiddle with arguments to support unittest.mock: strip the interpreter
# name and/or the script name so that argparse sees only real arguments.
# NOTE(review): an absolute interpreter path such as '/usr/bin/python3'
# does not match startswith('python') — confirm intended invocation styles.
_argv = sys.argv
if _argv[0].startswith('python'):
    _argv = _argv[1:]
if _argv[0].endswith('.py'):
    _argv = _argv[1:]
options = parser.parse_args(_argv)
|
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2015-07-23 18:38:59 -07:00
|
|
|
|
# ----------
# Languages

if not options.language:
    options.language = ['eng']  # Enforce English hegemony

# Support v2.x "eng+deu" language syntax
if '+' in options.language[0]:
    options.language = options.language[0].split('+')

# Fail early if tesseract lacks data for any requested language.
missing_languages = set(options.language) - tesseract.LANGUAGES
if missing_languages:
    print(
        "The installed version of tesseract does not have language "
        "data for the following requested languages: ",
        file=sys.stderr)
    for lang in missing_languages:
        print(lang, file=sys.stderr)
    sys.exit(EXIT_BAD_ARGS)
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 00:22:56 -07:00
|
|
|
|
# ----------
# Arguments

# unpaper is only required when one of the preprocessing options is used;
# import it lazily and abort with a clear message if it is unavailable.
if any((options.deskew, options.clean, options.clean_final)):
    try:
        from . import unpaper
    except ImportError:
        print("Install the 'unpaper' program to use the specified options",
              file=sys.stderr)
        sys.exit(EXIT_BAD_ARGS)
else:
    unpaper = None
|
|
|
|
|
|
2015-07-23 02:39:42 -07:00
|
|
|
|
# ----------
# Logging

# ruffus provides a logger plus a mutex that must be held while logging
# from concurrently running tasks.
_logger, _logger_mutex = cmdline.setup_logging(
    __name__, options.log_file, options.verbose)
|
2015-03-24 22:46:33 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:26:09 -07:00
|
|
|
|
class WrappedLogger:
    """Serialize access to a shared logger.

    ruffus's ``setup_logging`` returns a logger and a mutex that must be
    held while logging from parallel tasks; this proxy acquires the mutex
    around every logging call so callers do not have to.

    The six copy-pasted method bodies of the original are deduplicated
    through one locked-delegate helper; the public interface is unchanged.
    """

    def __init__(self, my_logger, my_mutex):
        self.logger = my_logger
        self.mutex = my_mutex

    def _locked(self, method_name, *args, **kwargs):
        # Single implementation of "hold the mutex, then delegate".
        with self.mutex:
            getattr(self.logger, method_name)(*args, **kwargs)

    def log(self, *args, **kwargs):
        self._locked('log', *args, **kwargs)

    def debug(self, *args, **kwargs):
        self._locked('debug', *args, **kwargs)

    def info(self, *args, **kwargs):
        self._locked('info', *args, **kwargs)

    def warning(self, *args, **kwargs):
        self._locked('warning', *args, **kwargs)

    def error(self, *args, **kwargs):
        self._locked('error', *args, **kwargs)

    def critical(self, *args, **kwargs):
        self._locked('critical', *args, **kwargs)
|
|
|
|
|
|
|
|
|
|
_log = WrappedLogger(_logger, _logger_mutex)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def re_symlink(input_file, soft_link_name, log=None):
    """Create or refresh a symbolic link to *input_file* at *soft_link_name*.

    If the link target equals the link name, do nothing.  An existing
    symlink at *soft_link_name* is removed and recreated; an existing
    regular file is never overwritten.

    ``log`` defaults to the module-level wrapped logger; the lookup is
    deferred to call time (instead of the original ``log=_log`` default,
    which froze the logger object at definition time).

    :raises Exception: if *soft_link_name* exists and is not a symlink,
        or if *input_file* does not exist (broken link).
    """
    if log is None:
        log = _log

    # Guard against soft linking to oneself
    if input_file == soft_link_name:
        log.debug("Warning: No symbolic link made. You are using " +
                  "the original data directory as the working directory.")
        return

    # Soft link already exists: delete for relink?
    if os.path.lexists(soft_link_name):
        # do not delete or overwrite real (non-soft link) file
        if not os.path.islink(soft_link_name):
            raise Exception("%s exists and is not a link" % soft_link_name)
        try:
            os.unlink(soft_link_name)
        except OSError:
            # Narrowed from a bare except: only filesystem errors expected
            log.debug("Can't unlink %s" % (soft_link_name))

    if not os.path.exists(input_file):
        raise Exception("trying to create a broken symlink to %s" % input_file)

    log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))

    # Create symbolic link using absolute path
    os.symlink(
        os.path.abspath(input_file),
        soft_link_name
    )
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:39:42 -07:00
|
|
|
|
# -------------
# The Pipeline

# Shared state for ruffus tasks running in separate processes: a managed
# list of per-page info dicts, guarded by a managed lock.
manager = multiprocessing.Manager()
_pdfinfo = manager.list()
_pdfinfo_lock = manager.Lock()

# Scratch directory for all intermediate page files.
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")


@atexit.register
def cleanup_working_files(*args):
    # Remove the scratch directory on exit unless the user asked to keep it.
    if not options.keep_temporary_files:
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
    else:
        print("Temporary working files saved at:")
        print(work_folder)
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-23 01:16:05 -07:00
|
|
|
|
@transform(
    input=options.input_file,
    filter=suffix('.pdf'),
    output='.repaired.pdf',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Normalize the input PDF with mutool and cache its page info.

    'mutool clean' rewrites the PDF, repairing structural damage.  The
    per-page info for the repaired file is appended to the shared pdfinfo
    list under the lock.
    """
    check_call(['mutool', 'clean', input_file, output_file])

    with pdfinfo_lock:
        pdfinfo.extend(pdf_get_all_pageinfo(output_file))
        log.info(pdfinfo)
|
2015-07-23 01:16:05 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
def get_pageinfo(input_file, pdfinfo, pdfinfo_lock):
    """Return a copy of the cached page-info dict for *input_file*.

    Page files are named 'NNNNNN.<suffix>' where the six leading digits
    are the 1-based page number; that prefix indexes the shared pdfinfo
    list.  A copy is returned so callers can read it outside the lock.
    """
    index = int(os.path.basename(input_file)[0:6]) - 1
    with pdfinfo_lock:
        return pdfinfo[index].copy()
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
|
|
|
|
|
def is_ocr_required(pageinfo, log):
    """Decide whether a page should be OCRed, based on its page info.

    May terminate the program (EXIT_ALREADY_DONE_OCR) when a page already
    has text and neither --force-ocr nor --skip-text was given.
    """
    page = pageinfo['pageno'] + 1
    ocr_required = True

    if not pageinfo['images']:
        # If the page has no images, then it contains vector content or text
        # or both. It seems quite unlikely that one would find meaningful text
        # from rasterizing vector content. So skip the page.
        log.info(
            "Page {0} has no images - skipping OCR".format(page)
        )
        ocr_required = False
    elif pageinfo['has_text']:
        s = "Page {0} already has text! – {1}"

        if not options.force_ocr and not options.skip_text:
            log.error(s.format(page,
                               "aborting (use --force-ocr to force OCR)"))
            sys.exit(EXIT_ALREADY_DONE_OCR)
        elif options.force_ocr:
            log.info(s.format(page,
                              "rasterizing text and running OCR anyway"))
            ocr_required = True
        elif options.skip_text:
            log.info(s.format(page,
                              "skipping all processing on this page"))
            ocr_required = False

    if ocr_required and options.skip_big:
        # Skip pages larger than tabloid (11"x17") or its 300 dpi pixel count.
        area = pageinfo['width_inches'] * pageinfo['height_inches']
        pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
        if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
            ocr_required = False
            log.info(
                "Page {0} is very large; skipping due to -b".format(page))

    return ocr_required
|
2015-07-23 01:16:05 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-25 02:58:34 -07:00
|
|
|
|
@split(
    repair_pdf,
    os.path.join(work_folder, '*.page.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def split_pages(
        input_file,
        output_files,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Split the repaired PDF into one file per page with pdfseparate.

    Each page is then symlinked to either '<page>.ocr.page.pdf' or
    '<page>.skip.page.pdf' depending on whether it needs OCR, so that
    downstream tasks can select pages by suffix.
    """
    # Remove any stale outputs from an earlier run.
    for stale in output_files:
        with suppress(FileNotFoundError):
            os.unlink(stale)

    check_call([
        'pdfseparate',
        input_file,
        os.path.join(work_folder, '%06d.page.pdf')
    ])

    from glob import glob
    for filename in glob(os.path.join(work_folder, '*.page.pdf')):
        pageinfo = get_pageinfo(filename, pdfinfo, pdfinfo_lock)

        if is_ocr_required(pageinfo, log):
            alt_suffix = '.ocr.page.pdf'
        else:
            alt_suffix = '.skip.page.pdf'
        re_symlink(
            filename,
            os.path.join(
                work_folder,
                os.path.basename(filename)[0:6] + alt_suffix))
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@transform(
    input=split_pages,
    filter=suffix('.ocr.page.pdf'),
    output='.page.png',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def rasterize_with_ghostscript(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Render a single PDF page to PNG at the resolution needed for OCR.

    The Ghostscript output device is chosen from the page's image
    properties: 1-bit mono, grayscale, or 24-bit color.
    """
    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)

    # Default to 24-bit color; downgrade when every image on the page
    # is single-component (and possibly 1 bit per component).
    device = 'png16m'  # 24-bit
    page_images = pageinfo['images']
    if all(img['comp'] == 1 for img in page_images):
        if all(img['bpc'] == 1 for img in page_images):
            device = 'pngmono'
        elif not any(img['color'] == 'color' for img in page_images):
            device = 'pnggray'

    # Honor --oversample by rasterizing at the larger of the page's
    # native resolution and the requested DPI.
    oversample = options.oversample or 0
    xres = max(pageinfo['xres'], oversample)
    yres = max(pageinfo['yres'], oversample)

    rasterize_pdf(input_file, output_file, xres, yres, device, log)
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@transform(
    input=rasterize_with_ghostscript,
    filter=suffix(".page.png"),
    output=".pp-deskew.png",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def preprocess_deskew(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Deskew the page image with unpaper, or pass it through unchanged.

    When --deskew is not requested, the output is just a symlink to the
    input so the pipeline stages stay uniform.
    """
    if not options.deskew:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
    unpaper.deskew(input_file, output_file, int(pageinfo['xres']), log)
|
2015-07-24 15:19:37 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-25 00:22:56 -07:00
|
|
|
|
@transform(
    input=preprocess_deskew,
    filter=suffix(".pp-deskew.png"),
    output=".pp-clean.png",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def preprocess_clean(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Clean the page image with unpaper, or pass it through unchanged.

    When --clean is not requested, the output is just a symlink to the
    input so the pipeline stages stay uniform.
    """
    if not options.clean:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
    unpaper.clean(input_file, output_file, int(pageinfo['xres']), log)
|
2015-07-24 15:19:37 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@transform(
    input=preprocess_clean,
    filter=suffix(".pp-clean.png"),
    output=".hocr",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def ocr_tesseract_hocr(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Run tesseract in hOCR mode on a page image.

    On timeout a valid but empty hOCR file is written so downstream
    rendering still works.  Works around tesseract 3.02/3.03 output
    filename quirks and the 3.03 unescaped-filename XML bug.
    """
    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)

    args_tesseract = [
        'tesseract',
        '-l', '+'.join(options.language),
        input_file,
        output_file,
        'hocr'
    ] + options.tesseract_config
    proc = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
                 universal_newlines=True)
    try:
        stdout, stderr = proc.communicate(timeout=options.tesseract_timeout)
    except TimeoutExpired:
        proc.kill()
        stdout, stderr = proc.communicate()
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        with open(output_file, 'w', encoding="utf-8") as f:
            f.write(tesseract.HOCR_TEMPLATE.format(
                pageinfo['width_pixels'],
                pageinfo['height_pixels']))
    else:
        if stdout:
            log.info(stdout)
        if stderr:
            log.error(stderr)

        if proc.returncode != 0:
            raise CalledProcessError(proc.returncode, args_tesseract)

        if os.path.exists(output_file + '.html'):
            # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
            shutil.move(output_file + '.html', output_file)
        elif os.path.exists(output_file + '.hocr'):
            # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
            shutil.move(output_file + '.hocr', output_file)

        # Tesseract 3.03 inserts source filename into hocr file without
        # escaping it, creating invalid XML and breaking the parser.
        # As a workaround, rewrite the hocr file, replacing the filename
        # with a space.
        regex_nested_single_quotes = re.compile(
            r"""title='image "([^"]*)";""")
        with fileinput.input(files=(output_file,), inplace=True) as f:
            for line in f:
                line = regex_nested_single_quotes.sub(
                    r"""title='image " ";""", line)
                print(line, end='')  # fileinput.input redirects stdout
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[rasterize_with_ghostscript, preprocess_deskew, preprocess_clean],
    filter=regex(r".*/(\d{6})(?:\.page|\.pp-deskew|\.pp-clean)\.png"),
    output=os.path.join(work_folder, r'\1.image'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def select_image_for_pdf(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Pick which rendition of the page image goes into the final PDF.

    Prefers the cleaned image (with --clean-final), then the deskewed
    image (with --deskew), otherwise the raw rasterization.
    """
    if options.clean_final:
        image_suffix = '.pp-clean.png'
    elif options.deskew:
        image_suffix = '.pp-deskew.png'
    else:
        image_suffix = '.page.png'
    image = next(ii for ii in infiles if ii.endswith(image_suffix))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    if all(src['enc'] == 'jpeg' for src in pageinfo['images']):
        # If all images were JPEGs originally, produce a JPEG as output
        Image.open(image).save(output_file, format='JPEG')
    else:
        re_symlink(image, output_file)
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[select_image_for_pdf, ocr_tesseract_hocr],
    filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
    output=os.path.join(work_folder, r'\1.rendered.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def render_hocr_page(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Combine a page image and its hOCR text into one searchable PDF page.

    The OCR text is placed invisibly over the image so the page looks
    unchanged but is selectable and searchable.
    """
    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    dpi = round(max(pageinfo['xres'], pageinfo['yres']))

    hocrtransform = HocrTransform(hocr, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=image,
                         showBoundingboxes=False, invisibleText=True)
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@active_if(options.debug_rendering)
@collate(
    input=[select_image_for_pdf, ocr_tesseract_hocr],
    filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
    output=os.path.join(work_folder, r'\1.debug.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def render_hocr_debug_page(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Render a diagnostic page: visible OCR text and bounding boxes.

    Unlike render_hocr_page, the page image is omitted and the text is
    drawn visibly so OCR placement can be inspected (-g option).
    """
    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    dpi = round(max(pageinfo['xres'], pageinfo['yres']))

    hocrtransform = HocrTransform(hocr, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=None,
                         showBoundingboxes=True, invisibleText=False)
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'tesseract')
@transform(
    input=preprocess_clean,
    filter=suffix(".pp-clean.png"),
    output=".rendered.pdf",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def tesseract_ocr_and_render_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """OCR a page image and let tesseract itself render the PDF page.

    Fixes vs. original: on timeout the tesseract child process is killed
    before the exception propagates (it previously leaked), and a nonzero
    exit status now raises CalledProcessError, consistent with
    ocr_tesseract_hocr.
    """
    args_tesseract = [
        'tesseract',
        '-l', '+'.join(options.language),
        input_file,
        os.path.splitext(output_file)[0],  # Tesseract appends suffix
        'pdf'
    ] + options.tesseract_config
    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)

    try:
        stdout, stderr = p.communicate(timeout=options.tesseract_timeout)
    except TimeoutExpired:
        # Reap the child so it does not linger, then surface the timeout.
        p.kill()
        p.communicate()
        raise

    if stdout:
        log.info(stdout)
    if stderr:
        log.error(stderr)

    if p.returncode != 0:
        raise CalledProcessError(p.returncode, args_tesseract)
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@transform(
    input=repair_pdf,
    filter=suffix('.repaired.pdf'),
    output='.pdfa_def.ps',
    output_dir=work_folder,
    extras=[_log])
def generate_postscript_stub(
        input_file,
        output_file,
        log):
    """Write the PostScript pdfmark definition used to build the PDF/A.

    Metadata is taken from the input document and overridden by any of
    the --title/--author/--keywords/--subject command line options.
    """
    pdf = pypdf.PdfFileReader(input_file)

    def from_document_info(key):
        # pdf.documentInfo.get() DOES NOT work as expected
        try:
            return str(pdf.documentInfo[key])
        except KeyError:
            return ''

    pdfmark = {
        'title': from_document_info('/Title'),
        'author': from_document_info('/Author'),
        'keywords': from_document_info('/Keywords'),
        'subject': from_document_info('/Subject'),
    }
    # Command-line values win over the document's own metadata.
    for field in ('title', 'author', 'keywords', 'subject'):
        override = getattr(options, field)
        if override:
            pdfmark[field] = override

    generate_pdfa_def(output_file, pdfmark)
|
2015-07-22 22:46:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
@transform(
    input=split_pages,
    filter=suffix('.skip.page.pdf'),
    output='.done.pdf',
    output_dir=work_folder,
    extras=[_log])
def skip_page(input_file, output_file, log):
    """Pass a page marked 'skip' through untouched.

    No OCR is performed; the input page is simply symlinked to the name
    the merge step expects.
    """
    re_symlink(input_file, output_file, log)
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@merge(
    input=[render_hocr_page, render_hocr_debug_page, skip_page,
           tesseract_ocr_and_render_pdf, generate_postscript_stub],
    output=os.path.join(work_folder, 'merged.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def merge_pages(
        input_files,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Assemble all per-page PDFs (and the PostScript stub) into one PDF/A."""

    def input_file_order(s):
        '''Sort order: All rendered pages followed
        by their debug page, if any, followed by Postscript stub.
        Ghostscript documentation has the Postscript stub at the
        beginning, but it works at the end and also gets document info
        right that way.'''
        if s.endswith('.ps'):
            return 99999999
        basename = os.path.basename(s)
        page_key = int(basename[:6]) * 10
        return page_key + 1 if 'debug' in basename else page_key

    pdf_pages = sorted(input_files, key=input_file_order)
    log.info(pdf_pages)
    generate_pdfa(pdf_pages, output_file)
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-23 14:48:46 -07:00
|
|
|
|
@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def validate_pdfa(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Run JHOVE on the merged PDF and deliver it to the output path.

    The file is copied to the user's destination regardless of the verdict;
    validation problems are only reported via the log.

    Raises:
        RuntimeError: if JHOVE itself exits with a nonzero status.
    """

    args_jhove = [
        'java',
        '-jar', JHOVE_JAR,
        '-c', JHOVE_CFG,
        '-m', 'PDF-hul',
        input_file
    ]
    p_jhove = Popen(args_jhove, close_fds=True, universal_newlines=True,
                    stdout=PIPE, stderr=DEVNULL)
    stdout, _ = p_jhove.communicate()

    log.debug(stdout)
    if p_jhove.returncode != 0:
        log.error(stdout)
        raise RuntimeError(
            "Unexpected error while checking compliance to PDF/A file.")

    # Any of these markers in JHOVE's report means the PDF is not valid.
    failure_patterns = (
        r'ErrorMessage',
        r'^\s+Status.*not valid',
        r'^\s+Status.*Not well-formed',
    )
    pdf_is_valid = not any(
        re.search(pattern, stdout, re.IGNORECASE | re.MULTILINE)
        for pattern in failure_patterns)

    pdf_is_pdfa = bool(re.search(r'^\s+Profile:.*PDF/A-1', stdout,
                                 re.IGNORECASE | re.MULTILINE))

    if not pdf_is_valid:
        log.warning('Output file: The generated PDF/A file is INVALID')
    elif not pdf_is_pdfa:
        log.warning('Output file: Generated file is a VALID PDF but not PDF/A')
    else:
        log.info('Output file: The generated PDF/A file is VALID')
    shutil.copy(input_file, output_file)
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# @active_if(ocr_required and options.exact_image)
|
|
|
|
|
# @merge([render_hocr_blank_page, extract_single_page],
|
2015-07-25 01:45:26 -07:00
|
|
|
|
# os.path.join(work_folder, "%04i.merged.pdf") % pageno)
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# def merge_hocr_with_original_page(infiles, output_file):
|
|
|
|
|
# with open(infiles[0], 'rb') as hocr_input, \
|
|
|
|
|
# open(infiles[1], 'rb') as page_input, \
|
|
|
|
|
# open(output_file, 'wb') as output:
|
|
|
|
|
# hocr_reader = pypdf.PdfFileReader(hocr_input)
|
|
|
|
|
# page_reader = pypdf.PdfFileReader(page_input)
|
|
|
|
|
# writer = pypdf.PdfFileWriter()
|
|
|
|
|
|
|
|
|
|
# the_page = hocr_reader.getPage(0)
|
|
|
|
|
# the_page.mergePage(page_reader.getPage(0))
|
|
|
|
|
# writer.addPage(the_page)
|
|
|
|
|
# writer.write(output)
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 01:10:14 -07:00
|
|
|
|
def available_cpu_count():
    """Return the number of available CPUs, falling back to 1.

    Tries the standard library first, then psutil if it is installed.  If
    neither yields a usable count, warns on stderr and assumes one CPU.
    """
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        pass

    try:
        import psutil
        # psutil.cpu_count() may return None when the count cannot be
        # determined; fall through to the default in that case.
        cpus = psutil.cpu_count()
        if cpus:
            return cpus
    except (ImportError, AttributeError):
        pass

    # Note trailing space: the adjacent literals are concatenated into one
    # message, so without it the text would read "...CPU.Use -j N...".
    print(
        "Could not get CPU count. Assuming one (1) CPU. "
        "Use -j N to set manually.", file=sys.stderr)
    return 1
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-26 01:52:08 -07:00
|
|
|
|
def run_pipeline():
    """Execute the ruffus pipeline with one worker per available CPU."""
    worker_count = available_cpu_count()
    cmdline.run(options, multiprocess=worker_count)
|
2015-07-26 01:52:08 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Allow running this module directly as a script in addition to the
# package entry point.
if __name__ == '__main__':
    run_pipeline()
|