#!/usr/bin/env python3
# Reimplement ocrPage.sh as Python

import sys
import os.path
import fileinput
import re
from parse import parse
import PyPDF2 as pypdf
import shutil

from subprocess import Popen, check_call, PIPE, CalledProcessError, \
    TimeoutExpired
try:
    from subprocess import DEVNULL
except ImportError:
    # Python < 3.3 has no subprocess.DEVNULL; emulate it
    import os
    DEVNULL = open(os.devnull, 'wb')


from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
    mkdir, formatter
import ruffus.cmdline as cmdline

from .hocrtransform import HocrTransform


basedir = os.path.dirname(os.path.realpath(__file__))

parser = cmdline.get_argparse(
    prog="ocrpage",
    description="Run OCR and related jobs on a single page of a PDF file")

parser.add_argument(
    'input_pdf',  # Implemented
    help="PDF file containing the page to be OCRed")
parser.add_argument(
    'page_info',  # Implemented
    help="Various characteristics of the page to be OCRed")
parser.add_argument(
    'num_pages',  # Unused
    help="Total number of pages of the PDF file (required for logger)")
parser.add_argument(
    'tmp_fld',  # Implemented
    help="Folder where the temporary files should be placed")
parser.add_argument(
    'verbosity', type=int,  # Superseded
    help="Requested verbosity")
parser.add_argument(
    'language',  # Implemented
    help="Language of the file to be OCRed")
parser.add_argument(
    'keep_tmp', type=int,  # Not implemented
    help="Keep the temporary files after processing (helpful for debugging)")
parser.add_argument(
    'preprocess_deskew', type=int,  # Implemented
    help="Deskew the page to be OCRed")
parser.add_argument(
    'preprocess_clean', type=int,  # Implemented
    help="Clean the page to be OCRed")
parser.add_argument(
    'preprocess_cleantopdf', type=int,  # Implemented
    help="Put the cleaned page in the OCRed PDF")
parser.add_argument(
    'oversampling_dpi', type=int,  # Implemented
    help="Oversampling resolution in dpi")
parser.add_argument(
    'pdf_noimg', type=int,  # Implemented
    help="Generate debug PDF pages with only the OCRed text and no image")
parser.add_argument(
    'force_ocr', type=int,  # Implemented
    help="Force OCR, even if the page already contains fonts")
parser.add_argument(
    'skip_text', type=int,  # Implemented
    help="Skip OCR on pages that contain fonts and include the page anyway")
parser.add_argument(
    'skip_big', type=int,
    help="Skip OCR for pages that are very large")
parser.add_argument(
    'exact_image', type=int,
    help="Use original page from PDF without re-rendering")
parser.add_argument(
    'tess_cfg_files', default='', nargs='*',  # Implemented
    help="Tesseract configuration")
parser.add_argument(
    '--deskew-provider', choices=['imagemagick', 'leptonica'],
    default='leptonica')
parser.add_argument(
    '--page-renderer', choices=['pdftoppm', 'ghostscript'],
    default='ghostscript')


options = parser.parse_args()

_logger, _logger_mutex = cmdline.setup_logging(__name__, options.log_file,
                                               options.verbose)


class WrappedLogger:

    def __init__(self, my_logger, my_mutex):
        self.logger = my_logger
        self.mutex = my_mutex

    def log(self, *args, **kwargs):
        with self.mutex:
            self.logger.log(*args, **kwargs)

    def debug(self, *args, **kwargs):
        with self.mutex:
            self.logger.debug(*args, **kwargs)

    def info(self, *args, **kwargs):
        with self.mutex:
            self.logger.info(*args, **kwargs)

    def warning(self, *args, **kwargs):
        with self.mutex:
            self.logger.warning(*args, **kwargs)

    def error(self, *args, **kwargs):
        with self.mutex:
            self.logger.error(*args, **kwargs)

    def critical(self, *args, **kwargs):
        with self.mutex:
            self.logger.critical(*args, **kwargs)


log = WrappedLogger(_logger, _logger_mutex)
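
# Note: cmdline.setup_logging() returns a logger together with a mutex; the
# mutex is presumably there to serialize writes when ruffus runs tasks in
# parallel, which is why every WrappedLogger method takes the lock before
# delegating to the underlying logger.  Example (safe to call from any task):
#     log.info("Page %d rendered", pageno)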


def pdf_get_pageinfo(infile, page, width_pt, height_pt):
    pageinfo = {}
    pageinfo['pageno'] = page
    pageinfo['width_inches'] = width_pt / 72.0
    pageinfo['height_inches'] = height_pt / 72.0
    pageinfo['images'] = []

    p_pdftotext = Popen(['pdftotext', '-f', str(page), '-l', str(page),
                         '-raw', '-nopgbrk', infile, '-'],
                        close_fds=True, stdout=PIPE, stderr=PIPE,
                        universal_newlines=True)
    text, _ = p_pdftotext.communicate()
    if len(text.strip()) > 0:
        pageinfo['has_text'] = True
    else:
        pageinfo['has_text'] = False

    pdf = pypdf.PdfFileReader(infile)
    page = pdf.pages[page - 1]

    if '/XObject' not in page['/Resources']:
        # Missing /XObject means no images or possibly corrupt PDF
        return pageinfo

    for xobj in page['/Resources']['/XObject']:
        # PyPDF2 returns the keys as an iterator
        pdfimage = page['/Resources']['/XObject'][xobj]
        if pdfimage['/Subtype'] != '/Image':
            continue
        image = {}
        image['width'] = pdfimage['/Width']
        image['height'] = pdfimage['/Height']
        image['dpi_w'] = image['width'] / pageinfo['width_inches']
        image['dpi_h'] = image['height'] / pageinfo['height_inches']
        image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5
        pageinfo['images'].append(image)

    if pageinfo['images']:
        xres = max(image['dpi_w'] for image in pageinfo['images'])
        yres = max(image['dpi_h'] for image in pageinfo['images'])
        pageinfo['xres'], pageinfo['yres'] = xres, yres
        pageinfo['width_pixels'] = \
            int(round(xres * pageinfo['width_inches']))
        pageinfo['height_pixels'] = \
            int(round(yres * pageinfo['height_inches']))

        if options.oversampling_dpi > 0:
            rx, ry = options.oversampling_dpi, options.oversampling_dpi
        else:
            rx, ry = pageinfo['xres'], pageinfo['yres']
        pageinfo['xres_render'], pageinfo['yres_render'] = rx, ry

    return pageinfo


pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3))
pageinfo = pdf_get_pageinfo(options.input_pdf, pageno, width_pt, height_pt)
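
# page_info is assumed to arrive as a space-separated string such as
# "3 612 792": the 1-based page number followed by the page width and height
# in PostScript points (1 pt = 1/72 inch), the units pdf_get_pageinfo() expects.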

if not pageinfo['images']:
    # If the page has no images, then it contains vector content or text
    # or both. It seems quite unlikely that one would find meaningful text
    # from rasterizing vector content. So skip the page.
    log.info(
        "Page {0} has no images - skipping OCR".format(pageno)
    )
elif pageinfo['has_text']:
    s = "Page {0} already has text! – {1}"

    if not options.force_ocr and not options.skip_text:
        log.error(s.format(pageno,
                           "aborting (use -f or -s to force OCR)"))
        sys.exit(1)
    elif options.force_ocr:
        log.info(s.format(pageno,
                          "rasterizing text and running OCR anyway"))
    elif options.skip_text:
        log.info(s.format(pageno,
                          "skipping all processing on this page"))

ocr_required = pageinfo['images'] and \
    (options.force_ocr or
     (not (pageinfo['has_text'] and options.skip_text)))
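
# The expression above reduces to: OCR this page only if it has at least one
# raster image, and it is not being skipped because it already carries a text
# layer (skip_text); force_ocr overrides that check.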

if ocr_required and options.skip_big:
    area = pageinfo['width_inches'] * pageinfo['height_inches']
    pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
    if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
        ocr_required = False
        log.info(
            "Page {0} is very large; skipping due to -b".format(pageno))


def re_symlink(input_file, soft_link_name, log=log):
    """
    Helper function: relinks soft symbolic link if necessary
    """
    # Guard against soft linking to oneself
    if input_file == soft_link_name:
        log.debug("Warning: No symbolic link made. You are using " +
                  "the original data directory as the working directory.")
        return

    # Soft link already exists: delete for relink?
    if os.path.lexists(soft_link_name):
        # do not delete or overwrite real (non-soft link) file
        if not os.path.islink(soft_link_name):
            raise Exception("%s exists and is not a link" % soft_link_name)
        try:
            os.unlink(soft_link_name)
        except OSError:
            log.debug("Can't unlink %s" % (soft_link_name))

    if not os.path.exists(input_file):
        raise Exception("trying to create a broken symlink to %s" % input_file)

    log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))

    # Create symbolic link using absolute path
    os.symlink(
        os.path.abspath(input_file),
        soft_link_name
    )


@jobs_limit(1)
@mkdir(options.tmp_fld)
@transform([options.input_pdf],
           formatter(),
           os.path.join(options.tmp_fld, "original{ext[0]}"))
def setup_working_directory(input_file, soft_link_name):
    log.debug("Linking %(input_file)s -> %(soft_link_name)s" % locals())
    try:
        re_symlink(input_file, soft_link_name)
    except FileExistsError:
        pass


@active_if(not ocr_required or (ocr_required and options.exact_image))
@transform(setup_working_directory,
           formatter(),
           os.path.join(options.tmp_fld, '%04i.page.pdf' % pageno))
def extract_single_page(
        input_file,
        output_file):
    args_pdfseparate = [
        'pdfseparate',
        '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
        input_file,
        output_file
    ]
    check_call(args_pdfseparate)


@active_if(ocr_required)
@active_if(options.page_renderer == 'pdftoppm')
@transform(setup_working_directory,
           formatter(),
           "{path[0]}/%04i.pnm" % pageno)
def unpack_with_pdftoppm(
        input_file,
        output_file):
    force_ppm = True
    allow_jpeg = False

    colorspace = 'color'
    compression = 'deflate'
    output_format = 'tiff'
    if all(image['comp'] == 1 for image in pageinfo['images']):
        if all(image['bpc'] == 1 for image in pageinfo['images']):
            colorspace = 'mono'
            compression = 'deflate'
        elif not any(image['color'] == 'color'
                     for image in pageinfo['images']):
            colorspace = 'gray'

    if allow_jpeg and \
            all(image['enc'] == 'jpeg' for image in pageinfo['images']):
        output_format = 'jpeg'

    args_pdftoppm = [
        'pdftoppm',
        '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
        '-rx', str(pageinfo['xres_render']),
        '-ry', str(pageinfo['yres_render'])
    ]

    if not force_ppm:
        if output_format == 'tiff':
            args_pdftoppm.append('-tiff')
            if False and compression:
                args_pdftoppm.append('-tiffcompression')
                args_pdftoppm.append(compression)
        elif output_format == 'jpeg':
            args_pdftoppm.append('-jpeg')

    if colorspace == 'mono':
        args_pdftoppm.append('-mono')
    elif colorspace == 'gray':
        args_pdftoppm.append('-gray')

    args_pdftoppm.extend([str(input_file)])

    # Ask pdftoppm to write the binary output to stdout; therefore set
    # universal_newlines=False
    p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'),
              stderr=PIPE, universal_newlines=False)
    _, stderr = p.communicate()
    if stderr:
        # Because universal_newlines=False, stderr is bytes(), so we must
        # manually convert it to str for logging
        from codecs import decode
        log.error(decode(stderr, sys.getdefaultencoding(), 'ignore'))
    if p.returncode != 0:
        raise CalledProcessError(p.returncode, args_pdftoppm)


@active_if(ocr_required)
@transform(unpack_with_pdftoppm, suffix(".pnm"), ".png")
def convert_to_png(input_file, output_file):
    args_convert = [
        'convert',
        input_file,
        output_file
    ]
    check_call(args_convert)


@active_if(ocr_required)
@active_if(options.page_renderer == 'ghostscript')
@transform(setup_working_directory,
           formatter(),
           "{path[0]}/%04i.png" % pageno)
def unpack_with_ghostscript(
        input_file,
        output_file):
    device = 'png16m'  # 24-bit
    if all(image['comp'] == 1 for image in pageinfo['images']):
        if all(image['bpc'] == 1 for image in pageinfo['images']):
            device = 'pngmono'
        elif not any(image['color'] == 'color'
                     for image in pageinfo['images']):
            device = 'pnggray'

    args_gs = [
        'gs',
        '-dBATCH', '-dNOPAUSE',
        '-dFirstPage=%i' % pageno,
        '-dLastPage=%i' % pageno,
        '-sDEVICE=%s' % device,
        '-o', output_file,
        '-r{0}x{1}'.format(
            str(pageinfo['xres_render']), str(pageinfo['yres_render'])),
        input_file
    ]

    p = Popen(args_gs, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    stdout, stderr = p.communicate()
    if stdout:
        log.info(stdout)
    if stderr:
        log.error(stderr)

    # Verify that Ghostscript actually produced the output file
    try:
        f = open(output_file)
    except FileNotFoundError:
        raise
    else:
        f.close()


@active_if(ocr_required)
@active_if(options.preprocess_deskew != 0
           and options.deskew_provider == 'imagemagick')
@transform(convert_to_png, suffix(".png"), ".deskewed.png")
def deskew_imagemagick(input_file, output_file):
    args_convert = [
        'convert',
        input_file,
        '-deskew', '40%',
        '-gravity', 'center',
        '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo),
        '+repage',
        output_file
    ]

    p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    stdout, stderr = p.communicate()

    if stdout:
        log.info(stdout)
    if stderr:
        log.error(stderr)

    if p.returncode != 0:
        raise CalledProcessError(p.returncode, args_convert)


@active_if(ocr_required)
@active_if(options.preprocess_deskew != 0
           and options.deskew_provider == 'leptonica')
@transform(convert_to_png, suffix(".png"), ".deskewed.png")
def deskew_leptonica(input_file, output_file):
    from .leptonica import deskew
    deskew(input_file, output_file,
           min(pageinfo['xres'], pageinfo['yres']))


@active_if(ocr_required)
@active_if(options.preprocess_clean != 0)
@merge([unpack_with_pdftoppm, unpack_with_ghostscript,
        deskew_imagemagick, deskew_leptonica],
       os.path.join(options.tmp_fld, "%04i.for_clean.pnm" % pageno))
def select_image_for_cleaning(infiles, output_file):
    input_file = infiles[-1]
    args_convert = [
        'convert',
        input_file,
        output_file
    ]
    check_call(args_convert)


@active_if(ocr_required)
@active_if(options.preprocess_clean != 0)
@transform(select_image_for_cleaning, suffix(".pnm"), ".cleaned.pnm")
def clean_unpaper(input_file, output_file):
    args_unpaper = [
        'unpaper',
        '--dpi', str(int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))),
        '--mask-scan-size', '100',
        '--no-deskew',
        '--no-grayfilter',
        '--no-blackfilter',
        '--no-mask-center',
        '--no-border-align',
        input_file,
        output_file
    ]

    p = Popen(args_unpaper, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    stdout, stderr = p.communicate()

    if stdout:
        log.info(stdout)
    if stderr:
        log.error(stderr)

    if p.returncode != 0:
        raise CalledProcessError(p.returncode, args_unpaper)


@active_if(ocr_required)
@transform(clean_unpaper, suffix(".cleaned.pnm"), ".cleaned.png")
def cleaned_to_png(input_file, output_file):
    args_convert = [
        'convert',
        input_file,
        output_file
    ]
    check_call(args_convert)


@active_if(ocr_required)
@merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick,
        deskew_leptonica, cleaned_to_png],
       os.path.join(options.tmp_fld, "%04i.for_ocr.png" % pageno))
def select_ocr_image(infiles, output_file):
    re_symlink(infiles[-1], output_file)
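
# Note: @merge passes in the outputs of the upstream tasks in the order they
# are listed, and tasks disabled by @active_if contribute nothing, so
# infiles[-1] should be the most heavily processed image actually produced.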


hocr_template = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.02.02' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
<span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
</span>
</p>
</div>
</div>
</body>
</html>'''
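
# The {0} and {1} slots are filled with the page width and height in pixels
# (see ocr_tesseract below), producing a structurally valid hOCR page that
# contains a single blank word; it serves as a stand-in when Tesseract
# times out.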


@active_if(ocr_required)
@transform(select_ocr_image, suffix(".for_ocr.png"), ".hocr")
def ocr_tesseract(
        input_file,
        output_file):

    args_tesseract = [
        'tesseract',
        '-l', options.language,
        input_file,
        output_file,
        'hocr',
        options.tess_cfg_files
    ]
    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    try:
        stdout, stderr = p.communicate(timeout=180)
    except TimeoutExpired:
        p.kill()
        stdout, stderr = p.communicate()
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        with open(output_file, 'w', encoding="utf-8") as f:
            f.write(hocr_template.format(pageinfo['width_pixels'],
                                         pageinfo['height_pixels']))
    else:
        if stdout:
            log.info(stdout)
        if stderr:
            log.error(stderr)

        if p.returncode != 0:
            raise CalledProcessError(p.returncode, args_tesseract)

        if os.path.exists(output_file + '.html'):
            # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
            shutil.move(output_file + '.html', output_file)
        elif os.path.exists(output_file + '.hocr'):
            # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
            shutil.move(output_file + '.hocr', output_file)

        # Tesseract inserts source filename into hocr file without escaping
        # it. This could break the XML parser. Rewrite the hocr file,
        # replacing the filename with a space.
        regex_nested_single_quotes = re.compile(
            r"""title='image "([^"]*)";""")
        with fileinput.input(files=(output_file,), inplace=True) as f:
            for line in f:
                line = regex_nested_single_quotes.sub(
                    r"""title='image " ";""", line)
                print(line, end='')  # fileinput.input redirects stdout


@active_if(ocr_required and not options.exact_image)
@merge([unpack_with_ghostscript, convert_to_png,
        deskew_imagemagick, deskew_leptonica, cleaned_to_png],
       os.path.join(options.tmp_fld, "%04i.image_for_pdf" % pageno))
def select_image_for_pdf(infiles, output_file):
    if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0:
        input_file = infiles[-1]
    elif options.preprocess_deskew != 0 and options.preprocess_clean != 0:
        input_file = infiles[-2]
    elif options.preprocess_deskew != 0 and options.preprocess_clean == 0:
        input_file = infiles[-1]
    else:
        input_file = infiles[0]

    if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
        # If all images were JPEGs originally, produce a JPEG as output
        check_call(['convert', input_file, 'jpg:' + output_file])
    else:
        re_symlink(input_file, output_file)


@active_if(ocr_required and not options.exact_image)
@merge([ocr_tesseract, select_image_for_pdf],
       os.path.join(options.tmp_fld, '%04i.rendered.pdf' % pageno))
def render_page(infiles, output_file):
    hocr, image = infiles[0], infiles[1]

    dpi = round(max(pageinfo['xres'], pageinfo['yres']))

    hocrtransform = HocrTransform(hocr, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=image,
                         showBoundingboxes=False, invisibleText=True)


@active_if(ocr_required and options.pdf_noimg)
@transform(ocr_tesseract, suffix(".hocr"), ".ocred.todebug.pdf")
def render_text_output_page(input_file, output_file):
    dpi = round(max(pageinfo['xres'], pageinfo['yres']))

    hocrtransform = HocrTransform(input_file, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=None,
                         showBoundingboxes=True, invisibleText=False)


@active_if(ocr_required and options.exact_image)
@transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf")
def render_hocr_blank_page(input_file, output_file):
    dpi = round(max(pageinfo['xres'], pageinfo['yres']))

    hocrtransform = HocrTransform(input_file, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=None,
                         showBoundingboxes=False, invisibleText=True)


@active_if(ocr_required and options.exact_image)
@merge([render_hocr_blank_page, extract_single_page],
       os.path.join(options.tmp_fld, "%04i.merged.pdf") % pageno)
def merge_hocr_with_original_page(infiles, output_file):
    with open(infiles[0], 'rb') as hocr_input, \
            open(infiles[1], 'rb') as page_input, \
            open(output_file, 'wb') as output:
        hocr_reader = pypdf.PdfFileReader(hocr_input)
        page_reader = pypdf.PdfFileReader(page_input)
        writer = pypdf.PdfFileWriter()

        the_page = hocr_reader.getPage(0)
        the_page.mergePage(page_reader.getPage(0))
        writer.addPage(the_page)
        writer.write(output)


@merge([render_page, merge_hocr_with_original_page, extract_single_page],
       os.path.join(options.tmp_fld, '%04i.ocred.pdf' % pageno))
def select_final_page(infiles, output_file):
    re_symlink(infiles[-1], output_file)


if __name__ == '__main__':
    cmdline.run(options)