OCRmyPDF/src/ocrpage.py

#!/usr/bin/env python3
# Reimplement ocrPage.sh as Python

import argparse
import logging
import sys
import os.path
from parse import parse

from subprocess import Popen, check_call, PIPE, CalledProcessError
try:
    from subprocess import DEVNULL
except ImportError:
    import os
    DEVNULL = open(os.devnull, 'wb')

from tempfile import NamedTemporaryFile

from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
    mkdir, formatter
import ruffus.cmdline as cmdline

basedir = os.path.dirname(os.path.realpath(__file__))

# Start from the argument parser provided by ruffus.cmdline and add this
# script's own positional arguments to it.
parser = cmdline.get_argparse(
    description="Run OCR and related jobs on a single page of a PDF file",
    prog="ocrpage")

# One (name, type converter, help text) row per positional argument, in the
# order they must appear on the command line.  A converter of None leaves the
# value as the raw string, which is argparse's default.
_positional_arguments = (
    ('input_pdf', None,
     "PDF file containing the page to be OCRed"),
    ('page_info', None,
     "Various characteristics of the page to be OCRed"),
    ('num_pages', None,
     "Total number of page of the PDF file (required for logger)"),
    ('tmp_fld', None,
     "Folder where the temporary files should be placed"),
    ('verbosity', int,
     "Requested verbosity"),
    ('language', None,
     "Language of the file to be OCRed"),
    ('keep_tmp', int,
     "Keep the temporary files after processing (helpful for debugging)"),
    ('preprocess_deskew', int,
     "Deskew the page to be OCRed"),
    ('preprocess_clean', int,
     "Clean the page to be OCRed"),
    ('preprocess_cleantopdf', int,
     "Put the cleaned paged in the OCRed PDF"),
    ('oversampling_dpi', int,
     "Oversampling resolution in dpi"),
    ('pdf_noimg', int,
     "Generate debug PDF pages with only the OCRed text and no image"),
    ('force_ocr', int,
     "Force to OCR, even if the page already contains fonts"),
    ('skip_text', int,
     "Skip OCR on pages that contain fonts and include the page anyway"),
    ('tess_cfg_files', None,
     "Tesseract configuration"),
)

for _arg_name, _arg_type, _arg_help in _positional_arguments:
    parser.add_argument(_arg_name, type=_arg_type, help=_arg_help)


# Parsed once at import time; the pipeline decorators below read `options`.
options = parser.parse_args()

# log_file and verbose are not declared above -- presumably they come from
# the parser returned by cmdline.get_argparse().
logger, logger_mutex = cmdline.setup_logging(
    __name__, options.log_file, options.verbose)


def pdf_get_pageinfo(infile, page, width_pt, height_pt):
    """Collect font and image characteristics for one page of a PDF.

    Runs ``pdffonts`` and ``pdfimages -list`` restricted to ``page`` and
    summarises what they print.

    Args:
        infile: path of the PDF file.
        page: page number, passed to the tools' ``-f``/``-l`` options.
        width_pt: page width; divided by 72 to obtain inches.
        height_pt: page height; divided by 72 to obtain inches.

    Returns:
        A dict with the keys ``pageno``, ``width_inches``, ``height_inches``,
        ``has_text``, ``images`` (one dict per parsed ``pdfimages`` row, each
        extended with ``dpi_w``, ``dpi_h`` and ``dpi``), ``xres``, ``yres``,
        ``width_pixels`` and ``height_pixels``.

    Raises:
        ValueError: if no image row could be parsed for the page, so no
            resolution can be derived.
    """
    pageinfo = {}
    pageinfo['pageno'] = page
    pageinfo['width_inches'] = width_pt / 72.0
    pageinfo['height_inches'] = height_pt / 72.0
    pageinfo['images'] = []

    p_pdffonts = Popen(['pdffonts', '-f', str(page), '-l', str(page), infile],
                       close_fds=True, stdout=PIPE, stderr=PIPE,
                       universal_newlines=True)
    pdffonts, _ = p_pdffonts.communicate()
    # The first two lines are treated as a header; anything beyond them
    # counts as a font entry.
    pageinfo['has_text'] = len(pdffonts.splitlines()) > 2
    if pageinfo['has_text']:
        with logger_mutex:
            logger.info("Page already contains font data!")

    # pdfimages: get image dimensions
    p_pdfimages = Popen(['pdfimages', '-list', '-f', str(page), '-l',
                        str(page), str(infile)], close_fds=True, stdout=PIPE,
                        stderr=PIPE, universal_newlines=True)
    pdfimages, _ = p_pdfimages.communicate()

    row_format = ('{page:1d} {num:1d} {imtype:>} {width:1d} {height:1d} ' +
                  '{color:>} {comp:1d} {bpc:1d} {enc:>} {interp:>} ' +
                  '{pdfobject:1d} {pdfid:1d} {bad_dpi_w:1d} {bad_dpi_h:1d} ' +
                  '{size:>} {ratio:>}')

    # Skip first two lines (table header)
    for line in pdfimages.splitlines()[2:]:
        r = parse(row_format, line)
        if r is None:
            # parse() returns None when the line does not match the format;
            # ignore such rows instead of crashing on `None.named`.
            with logger_mutex:
                logger.warning("Ignoring unrecognised pdfimages line: %r",
                               line)
            continue
        image = r.named
        # pdfimages (as of 0.26.0) reports DPI itself but adds +1 to the
        # vertical value -- apparently a bug -- so calculate explicitly
        image['dpi_w'] = image['width'] / pageinfo['width_inches']
        image['dpi_h'] = image['height'] / pageinfo['height_inches']
        image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5
        pageinfo['images'].append(image)

    if not pageinfo['images']:
        # Same exception type max() would raise on an empty sequence, but
        # with a message that says what actually went wrong.
        raise ValueError(
            "pdfimages reported no usable images on page %s of %s"
            % (page, infile))

    xres = max(image['dpi_w'] for image in pageinfo['images'])
    yres = max(image['dpi_h'] for image in pageinfo['images'])
    pageinfo['xres'], pageinfo['yres'] = xres, yres
    pageinfo['width_pixels'] = int(round(xres * pageinfo['width_inches']))
    pageinfo['height_pixels'] = int(round(yres * pageinfo['height_inches']))

    return pageinfo

# page_info is a space-separated "page-number width height" triple.  Only the
# page number has to be an integer: the sizes are merely divided by 72.0 in
# pdf_get_pageinfo(), and PDF page sizes are often fractional (A4 is about
# 595.28 x 841.89 pt), so accept decimals for them.
_pageno_field, _width_field, _height_field = options.page_info.split(' ', 3)
pageno = int(_pageno_field)
width_pt, height_pt = float(_width_field), float(_height_field)
pageinfo = pdf_get_pageinfo(options.input_pdf, pageno, width_pt, height_pt)


def re_symlink(input_file, soft_link_name, logger, logger_mutex):
    """
    Helper function: relinks soft symbolic link if necessary

    Makes ``soft_link_name`` a symbolic link to ``input_file``, replacing a
    symbolic link that already has that name.  The link target is stored
    relative to the link's own directory.

    Args:
        input_file: path the link should point to.
        soft_link_name: path of the link to create.
        logger: logger used for debug messages.
        logger_mutex: context manager held around every logger call.

    Raises:
        Exception: if ``soft_link_name`` exists and is not a symbolic link.
        OSError: if the link cannot be created.
    """
    # Guard against soft linking to oneself
    if input_file == soft_link_name:
        with logger_mutex:
            logger.debug("Warning: No symbolic link made. You are using " +
                         "the original data directory as the working directory.")
        return

    # Soft link already exists: delete for relink?
    if os.path.lexists(soft_link_name):
        # do not delete or overwrite real (non-soft link) file
        if not os.path.islink(soft_link_name):
            raise Exception("%s exists and is not a link" % soft_link_name)
        try:
            os.unlink(soft_link_name)
        except OSError:
            # Best effort: only filesystem errors are tolerated here.  If the
            # old link is still present, os.symlink() below reports it.
            with logger_mutex:
                logger.debug("Can't unlink %s" % (soft_link_name))
    with logger_mutex:
        logger.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))

    # Create symbolic link relative to original directory, so that the entire
    # path can be moved around
    link_directory = os.path.abspath(os.path.dirname(soft_link_name))
    relative_target = os.path.relpath(os.path.abspath(input_file),
                                      link_directory)
    os.symlink(relative_target, soft_link_name)


@jobs_limit(1)
@mkdir(options.tmp_fld)
@transform([options.input_pdf],
           formatter(),
           os.path.join(options.tmp_fld, "original{ext[0]}"))
def setup_working_directory(input_file, soft_link_name):
    """Symlink the input PDF into the temporary folder.

    The decorators above create the temporary folder and name the link
    "original" plus the input file's extension.
    """
    message = "Linking %(input_file)s -> %(soft_link_name)s" % {
        'input_file': input_file,
        'soft_link_name': soft_link_name,
    }
    with logger_mutex:
        logger.debug(message)
    re_symlink(input_file, soft_link_name, logger, logger_mutex)


@transform(setup_working_directory,
           formatter(),
           "{path[0]}/%04i.pnm" % pageno)
def unpack_with_pdftoppm(
        input_file,
        output_file):
    """Rasterise the current page with pdftoppm into ``output_file``.

    The page number and the x/y resolution come from the module-level
    ``pageinfo``.  pdftoppm's standard output is redirected into
    ``output_file``; anything it prints on stderr is logged as an error.

    Args:
        input_file: PDF to rasterise.
        output_file: image file to write.

    Raises:
        CalledProcessError: if pdftoppm exits with a non-zero status.
    """
    # While this is True none of the -tiff/-jpeg flags are passed, so the
    # output_format/compression choices below have no effect.
    force_ppm = True

    colorspace = 'color'
    compression = 'deflate'
    output_format = 'tiff'
    if all(image['comp'] == 1 for image in pageinfo['images']):
        if all(image['bpc'] == 1 for image in pageinfo['images']):
            colorspace = 'mono'
        elif not any(image['color'] == 'color'
                     for image in pageinfo['images']):
            colorspace = 'gray'

    if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
        output_format = 'jpeg'

    args_pdftoppm = [
        'pdftoppm',
        '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
        '-rx', str(pageinfo['xres']),
        '-ry', str(pageinfo['yres'])
    ]

    if not force_ppm:
        if output_format == 'tiff':
            args_pdftoppm.append('-tiff')
            if compression:
                args_pdftoppm.append('-tiffcompression')
                args_pdftoppm.append(compression)
        elif output_format == 'jpeg':
            args_pdftoppm.append('-jpeg')

    if colorspace == 'mono':
        args_pdftoppm.append('-mono')
    elif colorspace == 'gray':
        args_pdftoppm.append('-gray')

    args_pdftoppm.append(str(input_file))

    # Close the output file deterministically once pdftoppm has finished.
    with open(output_file, 'wb') as raster:
        p = Popen(args_pdftoppm, close_fds=True, stdout=raster,
                  stderr=PIPE, universal_newlines=False)
        _, stderr = p.communicate()

    if stderr:
        # stderr is bytes here (universal_newlines=False); decode it so the
        # log shows the message itself.
        with logger_mutex:
            logger.error(stderr.decode(sys.getdefaultencoding(),
                                       errors='ignore'))
    if p.returncode != 0:
        # CalledProcessError takes (returncode, cmd)
        raise CalledProcessError(p.returncode, args_pdftoppm)


@active_if(options.preprocess_deskew != 0)
@transform(unpack_with_pdftoppm, suffix(".pnm"), ".deskewed.pnm")
def deskew_imagemagick(input_file, output_file):
    """Deskew the page image with ImageMagick's ``convert``.

    Only active when the ``preprocess_deskew`` option is non-zero.  The
    result is centred and extended to the pixel size stored in the
    module-level ``pageinfo``.

    Args:
        input_file: image to deskew.
        output_file: deskewed image to write.

    Raises:
        CalledProcessError: if ``convert`` exits with a non-zero status.
    """
    args_convert = [
        'convert',
        input_file,
        '-deskew', '40%',
        '-gravity', 'center',
        '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo),
        output_file
    ]

    p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    stdout, stderr = p.communicate()

    with logger_mutex:
        if stdout:
            logger.info(stdout)
        if stderr:
            logger.error(stderr)

    if p.returncode != 0:
        # CalledProcessError takes (returncode, cmd), as the tesseract and
        # hocrTransform steps already pass it.
        raise CalledProcessError(p.returncode, args_convert)


def clean_unpaper(pageinfo, infile, prefix, output_folder):
    """Clean a page image with ``unpaper`` and return the cleaned file's path.

    Args:
        pageinfo: dict providing ``xres``, ``yres`` and ``pageno``.
        infile: image to clean.
        prefix: leading part of the temporary output file's name.
        output_folder: directory in which the output file is created.

    Returns:
        Path of the file unpaper was asked to write.  The file is not
        deleted when this function returns.

    Raises:
        CalledProcessError: if unpaper exits with a non-zero status.
    """
    # Geometric mean of the two resolutions.  Every argv entry must be a
    # string, so convert the rounded value explicitly.
    dpi = int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))
    args_unpaper = [
        'unpaper',
        '--dpi', str(dpi),
        '--mask-scan-size', '100',
        '--no-deskew',
        '--no-grayfilter',
        '--no-blackfilter',
        '--no-mask-center',
        '--no-border-align',
        infile
    ]

    # NOTE(review): the temporary file already exists when unpaper runs;
    # confirm unpaper accepts an existing output path (it may require
    # --overwrite).
    with NamedTemporaryFile(prefix=prefix + "%04i.pnm" % pageinfo['pageno'],
                            suffix='.pnm', dir=output_folder,
                            delete=False) as tmpfile:
        args_unpaper.append(tmpfile.name)
        check_call(args_unpaper, close_fds=True)
        return tmpfile.name


@merge([unpack_with_pdftoppm, deskew_imagemagick],
       os.path.join(options.tmp_fld, "%04i.for_ocr.pnm" % pageno))
def select_ocr_image(infiles, output_file):
    """Link ``output_file`` to the last image among the upstream outputs."""
    chosen_image = infiles[-1]
    re_symlink(chosen_image, output_file, logger, logger_mutex)


@transform(select_ocr_image, suffix(".for_ocr.pnm"), ".hocr")
def ocr_tesseract(
        input_file,
        output_file):
    """Run tesseract on the selected page image and produce hOCR output.

    Args:
        input_file: image to recognise.
        output_file: name handed to tesseract as its output base; afterwards
            it is made a symlink to the ".html" file tesseract writes.

    Raises:
        CalledProcessError: if tesseract exits with a non-zero status.
    """
    import shlex

    args_tesseract = [
        'tesseract',
        '-l', options.language,
        input_file,
        output_file,
        'hocr'
    ]
    # tess_cfg_files may name several configuration files, or none at all.
    # Each one has to be its own argv entry, and an empty value must not
    # turn into an empty argument.
    args_tesseract.extend(shlex.split(options.tess_cfg_files))

    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    stdout, stderr = p.communicate()

    with logger_mutex:
        if stdout:
            logger.info(stdout)
        if stderr:
            logger.error(stderr)

    if p.returncode != 0:
        raise CalledProcessError(p.returncode, args_tesseract)

    # Tesseract appends suffix ".html" on its own
    re_symlink(output_file + ".html", output_file, logger, logger_mutex)


@merge([ocr_tesseract, select_ocr_image],
       os.path.join(options.tmp_fld, '%04i.ocred.pdf' % pageno))
def render_page(infiles, output_file):
    """Combine the hOCR text and the page image into the OCRed PDF page.

    ``infiles[0]`` is the hOCR file and ``infiles[1]`` the image; both are
    handed to hocrTransform.py together with the larger of the page's two
    resolutions, rounded.
    """
    # hocrTransform.py is run through a separate python2 interpreter because:
    #  - it is python2 code while this script is python3
    #  - it is a standalone script, not meant for import yet
    hocr_file = infiles[0]
    image_file = infiles[1]
    script_path = os.path.join(basedir, 'hocrTransform.py')
    resolution = str(round(max(pageinfo['xres'], pageinfo['yres'])))

    args_hocrTransform = ['python2', script_path]
    args_hocrTransform += ['-r', resolution]
    args_hocrTransform += ['-i', image_file]
    args_hocrTransform += [hocr_file, output_file]

    p = Popen(args_hocrTransform, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    captured_out, captured_err = p.communicate()

    with logger_mutex:
        if captured_out:
            logger.info(captured_out)
        if captured_err:
            logger.error(captured_err)

    if p.returncode == 0:
        return
    raise CalledProcessError(p.returncode, args_hocrTransform)


# Hand the parsed options to ruffus.cmdline, which presumably runs the
# pipeline tasks declared above.
# NOTE(review): this executes at import time, before main() below is defined,
# and main() is not called anywhere in the visible code.
cmdline.run(options)

# parser.add_argument(
#     'tess_cfg_files',
#     help="Specific configuration files to be used by Tesseract during OCRing")


def main():
    """Parse the command line, tag the logger with the page and inspect it.

    Re-parses the arguments, appends the page number to the module-level
    logger's name, logs a progress line and calls ``pdf_get_pageinfo`` for
    the page.  Returns nothing.
    """
    args = parser.parse_args()

    # "page-number width height"; the sizes may be fractional, only the page
    # number has to be an integer.
    pageno_field, width_field, height_field = args.page_info.split(' ', 3)
    pageno = int(pageno_field)
    width_pt, height_pt = float(width_field), float(height_field)

    logger.name += '(page=%i)' % pageno

    # num_pages is parsed as a string, so convert it before using it with
    # the %i placeholder.
    logger.info("Processing page %i / %i", pageno, int(args.num_pages))

    # The returned page description is not used here yet.
    pdf_get_pageinfo(args.input_pdf, pageno, width_pt, height_pt)
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`#!/usr/bin/env python3`
			`# Reimplement ocrPage.sh as Python`

			`import argparse`
			`import logging`
			`import sys`
Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`import os.path`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`from parse import parse`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00
Basic error handling 2014-10-10 01:07:46 -07:00			`from subprocess import Popen, check_call, PIPE, CalledProcessError`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`try:`
			`from subprocess import DEVNULL`
			`except ImportError:`
			`import os`
			`DEVNULL = open(os.devnull, 'wb')`

deskew and clean 2014-09-27 15:03:07 -07:00			`from tempfile import NamedTemporaryFile`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \`
			`mkdir, formatter`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`import ruffus.cmdline as cmdline`

Now produces a finished OCR-PDF page 2014-10-08 03:54:06 -07:00			`basedir = os.path.dirname(os.path.realpath(__file__))`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00
			`parser = cmdline.get_argparse(`
			`prog="ocrpage",`
			`description="Run OCR and related jobs on a single page of a PDF file")`

			`parser.add_argument(`
			`'input_pdf',`
			`help="PDF file containing the page to be OCRed")`
			`parser.add_argument(`
			`'page_info',`
			`help="Various characteristics of the page to be OCRed")`
			`parser.add_argument(`
			`'num_pages',`
			`help="Total number of page of the PDF file (required for logger)")`
			`parser.add_argument(`
			`'tmp_fld',`
			`help="Folder where the temporary files should be placed")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'verbosity', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Requested verbosity")`
			`parser.add_argument(`
			`'language',`
			`help="Language of the file to be OCRed")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'keep_tmp', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Keep the temporary files after processing (helpful for debugging)")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'preprocess_deskew', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Deskew the page to be OCRed")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'preprocess_clean', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Clean the page to be OCRed")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'preprocess_cleantopdf', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Put the cleaned paged in the OCRed PDF")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'oversampling_dpi', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Oversampling resolution in dpi")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'pdf_noimg', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Generate debug PDF pages with only the OCRed text and no image")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'force_ocr', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Force to OCR, even if the page already contains fonts")`
			`parser.add_argument(`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'skip_text', type=int,`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`help="Skip OCR on pages that contain fonts and include the page anyway")`
			`parser.add_argument(`
			`'tess_cfg_files',`
			`help="Tesseract configuration")`


			`options = parser.parse_args()`

			`logger, logger_mutex = cmdline.setup_logging(__name__, options.log_file,`
			`options.verbose)`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00

Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`def pdf_get_pageinfo(infile, page, width_pt, height_pt):`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`pageinfo = {}`
Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`pageinfo['pageno'] = page`
			`pageinfo['width_inches'] = width_pt / 72.0`
			`pageinfo['height_inches'] = height_pt / 72.0`
			`pageinfo['images'] = []`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00
			`p_pdffonts = Popen(['pdffonts', '-f', str(page), '-l', str(page), infile],`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`close_fds=True, stdout=PIPE, stderr=PIPE,`
			`universal_newlines=True)`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`pdffonts, _ = p_pdffonts.communicate()`
			`if len(pdffonts.splitlines()) > 2:`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`logger.info("Page already contains font data!")`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`pageinfo['has_text'] = True`
			`else:`
			`pageinfo['has_text'] = False`

			`# pdfimages: get image dimensions`
			`p_pdfimages = Popen(['pdfimages', '-list', '-f', str(page), '-l',`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`str(page), str(infile)], close_fds=True, stdout=PIPE,`
			`stderr=PIPE, universal_newlines=True)`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`pdfimages, _ = p_pdfimages.communicate()`
			`for n, line in enumerate(pdfimages.splitlines()):`
			`if n <= 1:`
			`continue # Skip first two lines`

			`r = parse('{page:1d} {num:1d} {imtype:>} {width:1d} {height:1d} ' +`
			`'{color:>} {comp:1d} {bpc:1d} {enc:>} {interp:>} ' +`
			`'{pdfobject:1d} {pdfid:1d} {bad_dpi_w:1d} {bad_dpi_h:1d} ' +`
			`'{size:>} {ratio:>}', line)`
			`image = r.named`
			`# pdfimages calculates DPI as 0.26.0, but adds +1 to dpi_h`
			`# apparent bug, so calculate explicitly`
			`image['dpi_w'] = image['width'] / pageinfo['width_inches']`
			`image['dpi_h'] = image['height'] / pageinfo['height_inches']`
			`image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5`
			`pageinfo['images'].append(image)`

deskew and clean 2014-09-27 15:03:07 -07:00			`xres = max(image['dpi_w'] for image in pageinfo['images'])`
			`yres = max(image['dpi_h'] for image in pageinfo['images'])`
			`pageinfo['xres'], pageinfo['yres'] = xres, yres`
			`pageinfo['width_pixels'] = int(round(xres * pageinfo['width_inches']))`
			`pageinfo['height_pixels'] = int(round(yres * pageinfo['height_inches']))`

Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`return pageinfo`

First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3))`
			`pageinfo = pdf_get_pageinfo(options.input_pdf, pageno, width_pt, height_pt)`

Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`def re_symlink(input_file, soft_link_name, logger, logger_mutex):`
			`"""`
			`Helper function: relinks soft symbolic link if necessary`
			`"""`
			`# Guard against soft linking to oneself`
			`if input_file == soft_link_name:`
			`logger.debug("Warning: No symbolic link made. You are using " +`
			`"the original data directory as the working directory.")`
			`return`

			`# Soft link already exists: delete for relink?`
			`if os.path.lexists(soft_link_name):`
			`# do not delete or overwrite real (non-soft link) file`
			`if not os.path.islink(soft_link_name):`
			`raise Exception("%s exists and is not a link" % soft_link_name)`
			`try:`
			`os.unlink(soft_link_name)`
			`except:`
			`with logger_mutex:`
			`logger.debug("Can't unlink %s" % (soft_link_name))`
			`with logger_mutex:`
			`logger.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))`

			`# Create symbolic link relative to original directory, so that the entire`
			`# path can be moved around`
			`os.symlink(`
			`os.path.relpath(os.path.abspath(input_file),`
			`os.path.abspath(os.path.dirname(soft_link_name))),`
			`soft_link_name`
			`)`


			`@jobs_limit(1)`
			`@mkdir(options.tmp_fld)`
			`@transform([options.input_pdf],`
			`formatter(),`
Standardize intermediate filenames better convert .pnm -deskew <...> .pnm seems to have a bug that produces an invalid .pnm file which later causes tesseract (specifically, leptonica) to choke (using 3.02/1.71 as versions, respectively). Will change pipeline to use tiffs internally since they are less stupid. 2014-10-10 01:30:19 -07:00			`os.path.join(options.tmp_fld, "original{ext[0]}"))`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`def setup_working_directory(input_file, soft_link_name):`
			`with logger_mutex:`
			`logger.debug("Linking %(input_file)s -> %(soft_link_name)s" % locals())`
			`re_symlink(input_file, soft_link_name, logger, logger_mutex)`


Standardize intermediate filenames better convert .pnm -deskew <...> .pnm seems to have a bug that produces an invalid .pnm file which later causes tesseract (specifically, leptonica) to choke (using 3.02/1.71 as versions, respectively). Will change pipeline to use tiffs internally since they are less stupid. 2014-10-10 01:30:19 -07:00			`@transform(setup_working_directory,`
			`formatter(),`
			`"{path[0]}/%04i.pnm" % pageno)`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`def unpack_with_pdftoppm(`
			`input_file,`
			`output_file):`
			`force_ppm = True`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00
			`colorspace = 'color'`
			`compression = 'deflate'`
			`output_format = 'tiff'`
			`if all(image['comp'] == 1 for image in pageinfo['images']):`
			`if all(image['bpc'] == 1 for image in pageinfo['images']):`
			`colorspace = 'mono'`
			`compression = 'deflate'`
Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`elif not any(image['color'] == 'color'`
			`for image in pageinfo['images']):`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`colorspace = 'gray'`

Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`if all(image['enc'] == 'jpeg' for image in pageinfo['images']):`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`output_format = 'jpeg'`

			`args_pdftoppm = [`
			`'pdftoppm',`
Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`'-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),`
deskew and clean 2014-09-27 15:03:07 -07:00			`'-rx', str(pageinfo['xres']),`
			`'-ry', str(pageinfo['yres'])`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00			`]`

			`if not force_ppm:`
			`if output_format == 'tiff':`
			`args_pdftoppm.append('-tiff')`
			`if compression:`
			`args_pdftoppm.append('-tiffcompression')`
			`args_pdftoppm.append(compression)`
			`elif output_format == 'jpeg':`
			`args_pdftoppm.append('-jpeg')`

			`if colorspace == 'mono':`
			`args_pdftoppm.append('-mono')`
			`elif colorspace == 'gray':`
			`args_pdftoppm.append('-gray')`

First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`args_pdftoppm.extend([str(input_file)])`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'),`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`stderr=PIPE, universal_newlines=False)`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`_, stderr = p.communicate()`
			`if stderr:`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`from codecs import iterdecode`
			`with logger_mutex:`
			`logger.error(iterdecode(stderr, sys.getdefaultencoding(),`
			`errors='ignore'))`
Basic error handling 2014-10-10 01:07:46 -07:00			`if p.returncode != 0:`
			`raise CalledProcessError(' '.join(args_pdftoppm))`
deskew and clean 2014-09-27 15:03:07 -07:00

Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`@active_if(options.preprocess_deskew != 0)`
Standardize intermediate filenames better convert .pnm -deskew <...> .pnm seems to have a bug that produces an invalid .pnm file which later causes tesseract (specifically, leptonica) to choke (using 3.02/1.71 as versions, respectively). Will change pipeline to use tiffs internally since they are less stupid. 2014-10-10 01:30:19 -07:00			`@transform(unpack_with_pdftoppm, suffix(".pnm"), ".deskewed.pnm")`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`def deskew_imagemagick(input_file, output_file):`
deskew and clean 2014-09-27 15:03:07 -07:00			`args_convert = [`
			`'convert',`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`input_file,`
deskew and clean 2014-09-27 15:03:07 -07:00			`'-deskew', '40%',`
			`'-gravity', 'center',`
Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`'-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo),`
			`output_file`
deskew and clean 2014-09-27 15:03:07 -07:00			`]`

Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE,`
			`universal_newlines=True)`
			`stdout, stderr = p.communicate()`

			`with logger_mutex:`
			`if stdout:`
			`logger.info(stdout)`
			`if stderr:`
			`logger.error(stderr)`
deskew and clean 2014-09-27 15:03:07 -07:00
Basic error handling 2014-10-10 01:07:46 -07:00			`if p.returncode != 0:`
			`raise CalledProcessError(' '.join(args_convert))`

deskew and clean 2014-09-27 15:03:07 -07:00
			`def clean_unpaper(pageinfo, infile, prefix, output_folder):`
			`args_unpaper = [`
			`'unpaper',`
			`'--dpi', int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5)),`
			`'--mask-scan-size', '100',`
			`'--no-deskew',`
			`'--no-grayfilter',`
			`'--no-blackfilter',`
			`'--no-mask-center',`
			`'--no-border-align',`
			`infile`
			`]`

Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`with NamedTemporaryFile(prefix=prefix + "%04i.pnm" % pageinfo['pageno'],`
			`suffix='.pnm', dir=output_folder,`
deskew and clean 2014-09-27 15:03:07 -07:00			`delete=False) as tmpfile:`
			`args_unpaper.append(tmpfile.name)`
			`check_call(args_unpaper, close_fds=True)`
			`return tmpfile.name`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00

Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`@merge([unpack_with_pdftoppm, deskew_imagemagick],`
			`os.path.join(options.tmp_fld, "%04i.for_ocr.pnm" % pageno))`
			`def select_ocr_image(infiles, output_file):`
			`re_symlink(infiles[-1], output_file, logger, logger_mutex)`


			`@transform(select_ocr_image, suffix(".for_ocr.pnm"), ".hocr")`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`def ocr_tesseract(`
			`input_file,`
			`output_file):`

			`args_tesseract = [`
			`'tesseract',`
			`'-l', options.language,`
			`input_file,`
			`output_file,`
			`'hocr',`
			`options.tess_cfg_files`
			`]`
			`p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,`
			`universal_newlines=True)`
			`stdout, stderr = p.communicate()`

Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`with logger_mutex:`
			`if stdout:`
			`logger.info(stdout)`
			`if stderr:`
			`logger.error(stderr)`

Basic error handling 2014-10-10 01:07:46 -07:00			`if p.returncode != 0:`
			`raise CalledProcessError(p.returncode, args_tesseract)`

Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`# Tesseract appends suffix ".html" on its own`
			`re_symlink(output_file + ".html", output_file, logger, logger_mutex)`
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00

Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`@merge([ocr_tesseract, select_ocr_image],`
			`os.path.join(options.tmp_fld, '%04i.ocred.pdf' % pageno))`
Now produces a finished OCR-PDF page 2014-10-08 03:54:06 -07:00			`def render_page(infiles, output_file):`
			`# Call python in a subprocess because:`
			`# -That is python2 and this is python3`
			`# -It is written as a standalone script; not meant for import yet`
			`args_hocrTransform = [`
			`'python2',`
			`os.path.join(basedir, 'hocrTransform.py'),`
			`'-r', str(round(max(pageinfo['xres'], pageinfo['yres']))),`
			`'-i', infiles[1],`
			`infiles[0],`
			`output_file`
			`]`
			`p = Popen(args_hocrTransform, close_fds=True, stdout=PIPE, stderr=PIPE,`
			`universal_newlines=True)`
			`stdout, stderr = p.communicate()`

Sort of working, but fragile; uses tmp folder properly now 2014-10-10 00:35:49 -07:00			`with logger_mutex:`
			`if stdout:`
			`logger.info(stdout)`
			`if stderr:`
			`logger.error(stderr)`
Now produces a finished OCR-PDF page 2014-10-08 03:54:06 -07:00
Basic error handling 2014-10-10 01:07:46 -07:00			`if p.returncode != 0:`
			`raise CalledProcessError(p.returncode, args_hocrTransform)`

Now produces a finished OCR-PDF page 2014-10-08 03:54:06 -07:00
First crack at Ruffus, working well 2014-10-08 03:21:28 -07:00			`cmdline.run(options)`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00
Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`# parser.add_argument(`
			`# 'tess_cfg_files',`
			`# help="Specific configuration files to be used by Tesseract during OCRing")`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00

			`def main():`
			`args = parser.parse_args()`

Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`pageno, width_pt, height_pt = map(int, args.page_info.split(' ', 3))`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00
			`logger.name += '(page=%i)' % pageno`

			`logger.info("Processing page %i / %i", pageno, args.num_pages)`

Moving quickly - we can now output .ppm files at correct resolution 2014-09-26 04:43:15 -07:00			`pageinfo = pdf_get_pageinfo(args.input_pdf, pageno, width_pt, height_pt)`
Initial ocrpage.py rewrite into python3 2014-09-26 04:19:41 -07:00