From e89f482c3dc09c654317ca42bff51dfb23f835c6 Mon Sep 17 00:00:00 2001
From: Jim Barlow <jim@purplerock.ca>
Date: Wed, 22 Jul 2015 22:51:38 -0700
Subject: [PATCH] Fixes from early testing of new pipeline

---
 src/ocrmypdf.py | 1082 +++++++++++++++++++++++------------------------
 1 file changed, 541 insertions(+), 541 deletions(-)

diff --git a/src/ocrmypdf.py b/src/ocrmypdf.py
index 7aefb8e4..cd7031da 100755
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@@ -18,7 +18,7 @@ except ImportError:
 
 
 from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
-    mkdir, formatter
+    mkdir, formatter, follows, split
 import ruffus.cmdline as cmdline
 from .hocrtransform import HocrTransform
 
@@ -40,10 +40,10 @@ parser.add_argument(
     'outputfile',
     help="output searchable PDF file")
 parser.add_argument(
-    '-l', '--language', nargs='*', default=['eng']
+    '-l', '--language', nargs='*', default=['eng'],
     help="language of the file to be OCRed")
 
-preprocessing = parser.add_group(
+preprocessing = parser.add_argument_group(
     "Preprocessing options",
     "Improve OCR quality and final image")
 preprocessing.add_argument(
@@ -72,7 +72,7 @@ parser.add_argument(
     '--exact-image', action='store_true',
     help="Use original page from PDF without re-rendering")
 
-advanced = parser.add_group(
+advanced = parser.add_argument_group(
     "Advanced",
     "Advanced options for power users and debugging")
 advanced.add_argument(
@@ -88,7 +88,7 @@ advanced.add_argument(
     '--tesseract-config', default='', nargs='*',    # Implemented
     help="Tesseract configuration")
 
-debugging = parser.add_group(
+debugging = parser.add_argument_group(
     "Debugging",
     "Arguments to help with troubleshooting and debugging")
 debugging.add_argument(
@@ -188,541 +188,541 @@ def split_pages(
     check_call(args_pdfseparate)
 
 
-FRIENDLY_COLORSPACE = {
-    '/DeviceGray': 'gray',
-    '/CalGray': 'gray',
-    '/DeviceRGB': 'rgb',
-    '/CalRGB': 'rgb',
-    '/DeviceCMYK': 'cmyk',
-    '/Lab': 'lab',
-    '/ICCBased': 'icc',
-    '/Indexed': 'index',
-    '/Separation': 'sep',
-    '/DeviceN': 'devn',
-    '/Pattern': '-'
-}
-
-FRIENDLY_ENCODING = {
-    '/CCITTFaxDecode': 'ccitt',
-    '/DCTDecode': 'jpeg',
-    '/JPXDecode': 'jpx',
-    '/JBIG2Decode': 'jbig2',
-}
-
-FRIENDLY_COMP = {
-    'gray': 1,
-    'rgb': 3,
-    'cmyk': 4,
-    'lab': 3,
-}
-
-
-def pdf_get_pageinfo(infile, page, width_pt, height_pt):
-    pageinfo = {}
-    pageinfo['pageno'] = page
-    pageinfo['width_inches'] = width_pt / 72.0
-    pageinfo['height_inches'] = height_pt / 72.0
-    pageinfo['images'] = []
-
-    p_pdftotext = Popen(['pdftotext', '-f', str(page), '-l', str(page),
-                        '-raw', '-nopgbrk', infile, '-'],
-                        close_fds=True, stdout=PIPE, stderr=PIPE,
-                        universal_newlines=True)
-    text, _ = p_pdftotext.communicate()
-    if len(text.strip()) > 0:
-        pageinfo['has_text'] = True
-    else:
-        pageinfo['has_text'] = False
-
-    pdf = pypdf.PdfFileReader(infile)
-    page = pdf.pages[page - 1]
-
-    if not '/XObject' in page['/Resources']:
-        # Missing /XObject means no images or possibly corrupt PDF
-        return pageinfo
-
-    for xobj in page['/Resources']['/XObject']:
-        # PyPDF2 returns the keys as an iterator
-        pdfimage = page['/Resources']['/XObject'][xobj]
-        if pdfimage['/Subtype'] != '/Image':
-            continue
-        if '/ImageMask' in pdfimage:
-            if pdfimage['/ImageMask']:
-                continue
-        image = {}
-        image['width'] = pdfimage['/Width']
-        image['height'] = pdfimage['/Height']
-        image['bpc'] = pdfimage['/BitsPerComponent']
-        if '/Filter' in pdfimage:
-            filter_ = pdfimage['/Filter']
-            if isinstance(filter_, pypdf.generic.ArrayObject):
-                filter_ = filter_[0]
-            image['enc'] = FRIENDLY_ENCODING.get(filter_, 'image')
-        else:
-            image['enc'] = 'image'
-        if '/ColorSpace' in pdfimage:
-            cs = pdfimage['/ColorSpace']
-            if isinstance(cs, pypdf.generic.ArrayObject):
-                cs = cs[0]
-            image['color'] = FRIENDLY_COLORSPACE.get(cs, '-')
-        else:
-            image['color'] = 'jpx' if image['enc'] == 'jpx' else '?'
-
-        image['comp'] = FRIENDLY_COMP.get(image['color'], '?')
-        image['dpi_w'] = image['width'] / pageinfo['width_inches']
-        image['dpi_h'] = image['height'] / pageinfo['height_inches']
-        image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5
-        pageinfo['images'].append(image)
-
-    if pageinfo['images']:
-        xres = max(image['dpi_w'] for image in pageinfo['images'])
-        yres = max(image['dpi_h'] for image in pageinfo['images'])
-        pageinfo['xres'], pageinfo['yres'] = xres, yres
-        pageinfo['width_pixels'] = \
-            int(round(xres * pageinfo['width_inches']))
-        pageinfo['height_pixels'] = \
-            int(round(yres * pageinfo['height_inches']))
-
-        if options.oversampling_dpi > 0:
-            rx, ry = options.oversampling_dpi, options.oversampling_dpi
-        else:
-            rx, ry = pageinfo['xres'], pageinfo['yres']
-        pageinfo['xres_render'], pageinfo['yres_render'] = rx, ry
-
-    return pageinfo
-
-pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3))
-pageinfo = pdf_get_pageinfo(options.inputfile, pageno, width_pt, height_pt)
-
-if not pageinfo['images']:
-    # If the page has no images, then it contains vector content or text
-    # or both. It seems quite unlikely that one would find meaningful text
-    # from rasterizing vector content. So skip the page.
-    log.info(
-        "Page {0} has no images - skipping OCR".format(pageno)
-    )
-elif pageinfo['has_text']:
-    s = "Page {0} already has text! – {1}"
-
-    if not options.force_ocr and not options.skip_text:
-        log.error(s.format(pageno,
-                     "aborting (use -f or -s to force OCR)"))
-        sys.exit(1)
-    elif options.force_ocr:
-        log.info(s.format(pageno,
-                    "rasterizing text and running OCR anyway"))
-    elif options.skip_text:
-        log.info(s.format(pageno,
-                    "skipping all processing on this page"))
-
-ocr_required = pageinfo['images'] and \
-    (options.force_ocr or
-        (not (pageinfo['has_text'] and options.skip_text)))
-
-if ocr_required and options.skip_big:
-    area = pageinfo['width_inches'] * pageinfo['height_inches']
-    pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
-    if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
-        ocr_required = False
-        log.info(
-            "Page {0} is very large; skipping due to -b".format(pageno))
-
-
-
-
-
-@active_if(not ocr_required or (ocr_required and options.exact_image))
-@transform(setup_working_directory,
-           formatter(),
-           os.path.join(options.temp_folder, '%04i.page.pdf' % pageno))
-def extract_single_page(
-        input_file,
-        output_file):
-    args_pdfseparate = [
-        'pdfseparate',
-        '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
-        input_file,
-        output_file
-    ]
-    check_call(args_pdfseparate)
-
-
-@active_if(ocr_required)
-@active_if(options.page_renderer == 'pdftoppm')
-@transform(setup_working_directory,
-           formatter(),
-           "{path[0]}/%04i.pnm" % pageno)
-def unpack_with_pdftoppm(
-        input_file,
-        output_file):
-    force_ppm = True
-    allow_jpeg = False
-
-    colorspace = 'color'
-    compression = 'deflate'
-    output_format = 'tiff'
-    if all(image['comp'] == 1 for image in pageinfo['images']):
-        if all(image['bpc'] == 1 for image in pageinfo['images']):
-            colorspace = 'mono'
-            compression = 'deflate'
-        elif not any(image['color'] == 'color'
-                     for image in pageinfo['images']):
-            colorspace = 'gray'
-
-    if allow_jpeg and \
-            all(image['enc'] == 'jpeg' for image in pageinfo['images']):
-        output_format = 'jpeg'
-
-    args_pdftoppm = [
-        'pdftoppm',
-        '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
-        '-rx', str(pageinfo['xres_render']),
-        '-ry', str(pageinfo['yres_render'])
-    ]
-
-    if not force_ppm:
-        if output_format == 'tiff':
-            args_pdftoppm.append('-tiff')
-            if False and compression:
-                args_pdftoppm.append('-tiffcompression')
-                args_pdftoppm.append(compression)
-        elif output_format == 'jpeg':
-            args_pdftoppm.append('-jpeg')
-
-    if colorspace == 'mono':
-        args_pdftoppm.append('-mono')
-    elif colorspace == 'gray':
-        args_pdftoppm.append('-gray')
-
-    args_pdftoppm.extend([str(input_file)])
-
-    # Ask pdftoppm to write the binary output to stdout; therefore set
-    # universal_newlines=False
-    p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'),
-              stderr=PIPE, universal_newlines=False)
-    _, stderr = p.communicate()
-    if stderr:
-        # Because universal_newlines=False, stderr is bytes(), so we must
-        # manually convert it to str for logging
-        from codecs import decode
-        log.error(decode(stderr, sys.getdefaultencoding(), 'ignore'))
-    if p.returncode != 0:
-        raise CalledProcessError(p.returncode, args_pdftoppm)
-
-
-@active_if(ocr_required)
-@transform(unpack_with_pdftoppm, suffix(".pnm"), ".png")
-def convert_to_png(input_file, output_file):
-    args_convert = [
-        'convert',
-        input_file,
-        output_file
-    ]
-    check_call(args_convert)
-
-
-@active_if(ocr_required)
-@active_if(options.page_renderer == 'ghostscript')
-@transform(setup_working_directory,
-           formatter(),
-           "{path[0]}/%04i.png" % pageno)
-def unpack_with_ghostscript(
-        input_file,
-        output_file):
-    device = 'png16m'  # 24-bit
-    if all(image['comp'] == 1 for image in pageinfo['images']):
-        if all(image['bpc'] == 1 for image in pageinfo['images']):
-            device = 'pngmono'
-        elif not any(image['color'] == 'color'
-                     for image in pageinfo['images']):
-            device = 'pnggray'
-
-    args_gs = [
-        'gs',
-        '-dBATCH', '-dNOPAUSE',
-        '-dFirstPage=%i' % pageno,
-        '-dLastPage=%i' % pageno,
-        '-sDEVICE=%s' % device,
-        '-o', output_file,
-        '-r{0}x{1}'.format(
-            str(pageinfo['xres_render']), str(pageinfo['yres_render'])),
-        input_file
-    ]
-
-    p = Popen(args_gs, close_fds=True, stdout=PIPE, stderr=PIPE,
-              universal_newlines=True)
-    stdout, stderr = p.communicate()
-    if stdout:
-        log.info(stdout)
-    if stderr:
-        log.error(stderr)
-
-    try:
-        f = open(output_file)
-    except FileNotFoundError:
-        raise
-    else:
-        f.close()
-
-
-@active_if(ocr_required)
-@active_if(options.preprocess_deskew != 0
-           and options.deskew_provider == 'imagemagick')
-@transform(convert_to_png, suffix(".png"), ".deskewed.png")
-def deskew_imagemagick(input_file, output_file):
-    args_convert = [
-        'convert',
-        input_file,
-        '-deskew', '40%',
-        '-gravity', 'center',
-        '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo),
-        '+repage',
-        output_file
-    ]
-
-    p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE,
-              universal_newlines=True)
-    stdout, stderr = p.communicate()
-
-    if stdout:
-        log.info(stdout)
-    if stderr:
-        log.error(stderr)
-
-    if p.returncode != 0:
-        raise CalledProcessError(p.returncode, args_convert)
-
-
-@active_if(ocr_required)
-@active_if(options.preprocess_deskew != 0
-           and options.deskew_provider == 'leptonica')
-@transform(convert_to_png, suffix(".png"), ".deskewed.png")
-def deskew_leptonica(input_file, output_file):
-    from .leptonica import deskew
-    deskew(input_file, output_file,
-           min(pageinfo['xres'], pageinfo['yres']))
-
-
-@active_if(ocr_required)
-@active_if(options.preprocess_clean != 0)
-@merge([unpack_with_pdftoppm, unpack_with_ghostscript,
-        deskew_imagemagick, deskew_leptonica],
-       os.path.join(options.temp_folder, "%04i.for_clean.pnm" % pageno))
-def select_image_for_cleaning(infiles, output_file):
-    input_file = infiles[-1]
-    args_convert = [
-        'convert',
-        input_file,
-        output_file
-    ]
-    check_call(args_convert)
-
-
-@active_if(ocr_required)
-@active_if(options.preprocess_clean != 0)
-@transform(select_image_for_cleaning, suffix(".pnm"), ".cleaned.pnm")
-def clean_unpaper(input_file, output_file):
-    args_unpaper = [
-        'unpaper',
-        '--dpi', str(int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))),
-        '--mask-scan-size', '100',
-        '--no-deskew',
-        '--no-grayfilter',
-        '--no-blackfilter',
-        '--no-mask-center',
-        '--no-border-align',
-        input_file,
-        output_file
-    ]
-
-    p = Popen(args_unpaper, close_fds=True, stdout=PIPE, stderr=PIPE,
-              universal_newlines=True)
-    stdout, stderr = p.communicate()
-
-    if stdout:
-        log.info(stdout)
-    if stderr:
-        log.error(stderr)
-
-    if p.returncode != 0:
-        raise CalledProcessError(p.returncode, args_unpaper)
-
-
-@active_if(ocr_required)
-@transform(clean_unpaper, suffix(".cleaned.pnm"), ".cleaned.png")
-def cleaned_to_png(input_file, output_file):
-    args_convert = [
-        'convert',
-        input_file,
-        output_file
-    ]
-    check_call(args_convert)
-
-
-@active_if(ocr_required)
-@merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick,
-        deskew_leptonica, cleaned_to_png],
-       os.path.join(options.temp_folder, "%04i.for_ocr.png" % pageno))
-def select_ocr_image(infiles, output_file):
-    re_symlink(infiles[-1], output_file)
-
-
-hocr_template = '''<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
- <head>
-  <title></title>
-  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  <meta name='ocr-system' content='tesseract 3.02.02' />
-  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
- </head>
- <body>
-  <div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
-   <div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
-    <p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
-     <span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
-     </span>
-    </p>
-   </div>
-  </div>
- </body>
-</html>'''
-
-
-@active_if(ocr_required)
-@transform(select_ocr_image, suffix(".for_ocr.png"), ".hocr")
-def ocr_tesseract(
-        input_file,
-        output_file):
-
-    args_tesseract = [
-        'tesseract',
-        '-l', options.language,
-        input_file,
-        output_file,
-        'hocr',
-        options.tess_cfg_files
-    ]
-    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
-              universal_newlines=True)
-    try:
-        stdout, stderr = p.communicate(timeout=180)
-    except TimeoutExpired:
-        p.kill()
-        stdout, stderr = p.communicate()
-        # Generate a HOCR file with no recognized text if tesseract times out
-        # Temporary workaround to hocrTransform not being able to function if
-        # it does not have a valid hOCR file.
-        with open(output_file, 'w', encoding="utf-8") as f:
-            f.write(hocr_template.format(pageinfo['width_pixels'],
-                                         pageinfo['height_pixels']))
-    else:
-        if stdout:
-            log.info(stdout)
-        if stderr:
-            log.error(stderr)
-
-        if p.returncode != 0:
-            raise CalledProcessError(p.returncode, args_tesseract)
-
-        if os.path.exists(output_file + '.html'):
-            # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
-            shutil.move(output_file + '.html', output_file)
-        elif os.path.exists(output_file + '.hocr'):
-            # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
-            shutil.move(output_file + '.hocr', output_file)
-
-        # Tesseract inserts source filename into hocr file without escaping
-        # it. This could break the XML parser. Rewrite the hocr file,
-        # replacing the filename with a space.
-        regex_nested_single_quotes = re.compile(
-            r"""title='image "([^"]*)";""")
-        with fileinput.input(files=(output_file,), inplace=True) as f:
-            for line in f:
-                line = regex_nested_single_quotes.sub(
-                    r"""title='image " ";""", line)
-                print(line, end='')  # fileinput.input redirects stdout
-
-
-@active_if(ocr_required and not options.exact_image)
-@merge([unpack_with_ghostscript, convert_to_png,
-        deskew_imagemagick, deskew_leptonica, cleaned_to_png],
-       os.path.join(options.temp_folder, "%04i.image_for_pdf" % pageno))
-def select_image_for_pdf(infiles, output_file):
-    if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0:
-        input_file = infiles[-1]
-    elif options.preprocess_deskew != 0 and options.preprocess_clean != 0:
-        input_file = infiles[-2]
-    elif options.preprocess_deskew != 0 and options.preprocess_clean == 0:
-        input_file = infiles[-1]
-    else:
-        input_file = infiles[0]
-
-    if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
-        # If all images were JPEGs originally, produce a JPEG as output
-        check_call(['convert', input_file, 'jpg:' + output_file])
-    else:
-        re_symlink(input_file, output_file)
-
-
-@active_if(ocr_required and not options.exact_image)
-@merge([ocr_tesseract, select_image_for_pdf],
-       os.path.join(options.temp_folder, '%04i.rendered.pdf' % pageno))
-def render_page(infiles, output_file):
-    hocr, image = infiles[0], infiles[1]
-
-    dpi = round(max(pageinfo['xres'], pageinfo['yres']))
-
-    hocrtransform = HocrTransform(hocr, dpi)
-    hocrtransform.to_pdf(output_file, imageFileName=image,
-                         showBoundingboxes=False, invisibleText=True)
-
-
-@active_if(ocr_required and options.pdf_noimg)
-@transform(ocr_tesseract, suffix(".hocr"), ".ocred.todebug.pdf")
-def render_text_output_page(input_file, output_file):
-    dpi = round(max(pageinfo['xres'], pageinfo['yres']))
-
-    hocrtransform = HocrTransform(input_file, dpi)
-    hocrtransform.to_pdf(output_file, imageFileName=None,
-                         showBoundingboxes=True, invisibleText=False)
-
-
-@active_if(ocr_required and options.exact_image)
-@transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf")
-def render_hocr_blank_page(input_file, output_file):
-    dpi = round(max(pageinfo['xres'], pageinfo['yres']))
-
-    hocrtransform = HocrTransform(input_file, dpi)
-    hocrtransform.to_pdf(output_file, imageFileName=None,
-                         showBoundingboxes=False, invisibleText=True)
-
-
-@active_if(ocr_required and options.exact_image)
-@merge([render_hocr_blank_page, extract_single_page],
-       os.path.join(options.temp_folder, "%04i.merged.pdf") % pageno)
-def merge_hocr_with_original_page(infiles, output_file):
-    with open(infiles[0], 'rb') as hocr_input, \
-            open(infiles[1], 'rb') as page_input, \
-            open(output_file, 'wb') as output:
-        hocr_reader = pypdf.PdfFileReader(hocr_input)
-        page_reader = pypdf.PdfFileReader(page_input)
-        writer = pypdf.PdfFileWriter()
-
-        the_page = hocr_reader.getPage(0)
-        the_page.mergePage(page_reader.getPage(0))
-        writer.addPage(the_page)
-        writer.write(output)
-
-
-@merge([render_page, merge_hocr_with_original_page, extract_single_page],
-       os.path.join(options.temp_folder, '%04i.ocred.pdf' % pageno))
-def select_final_page(infiles, output_file):
-    re_symlink(infiles[-1], output_file)
-
-
-if __name__ == '__main__':
-    cmdline.run(options)
+# FRIENDLY_COLORSPACE = {
+#     '/DeviceGray': 'gray',
+#     '/CalGray': 'gray',
+#     '/DeviceRGB': 'rgb',
+#     '/CalRGB': 'rgb',
+#     '/DeviceCMYK': 'cmyk',
+#     '/Lab': 'lab',
+#     '/ICCBased': 'icc',
+#     '/Indexed': 'index',
+#     '/Separation': 'sep',
+#     '/DeviceN': 'devn',
+#     '/Pattern': '-'
+# }
+
+# FRIENDLY_ENCODING = {
+#     '/CCITTFaxDecode': 'ccitt',
+#     '/DCTDecode': 'jpeg',
+#     '/JPXDecode': 'jpx',
+#     '/JBIG2Decode': 'jbig2',
+# }
+
+# FRIENDLY_COMP = {
+#     'gray': 1,
+#     'rgb': 3,
+#     'cmyk': 4,
+#     'lab': 3,
+# }
+
+
+# def pdf_get_pageinfo(infile, page, width_pt, height_pt):
+#     pageinfo = {}
+#     pageinfo['pageno'] = page
+#     pageinfo['width_inches'] = width_pt / 72.0
+#     pageinfo['height_inches'] = height_pt / 72.0
+#     pageinfo['images'] = []
+
+#     p_pdftotext = Popen(['pdftotext', '-f', str(page), '-l', str(page),
+#                         '-raw', '-nopgbrk', infile, '-'],
+#                         close_fds=True, stdout=PIPE, stderr=PIPE,
+#                         universal_newlines=True)
+#     text, _ = p_pdftotext.communicate()
+#     if len(text.strip()) > 0:
+#         pageinfo['has_text'] = True
+#     else:
+#         pageinfo['has_text'] = False
+
+#     pdf = pypdf.PdfFileReader(infile)
+#     page = pdf.pages[page - 1]
+
+#     if not '/XObject' in page['/Resources']:
+#         # Missing /XObject means no images or possibly corrupt PDF
+#         return pageinfo
+
+#     for xobj in page['/Resources']['/XObject']:
+#         # PyPDF2 returns the keys as an iterator
+#         pdfimage = page['/Resources']['/XObject'][xobj]
+#         if pdfimage['/Subtype'] != '/Image':
+#             continue
+#         if '/ImageMask' in pdfimage:
+#             if pdfimage['/ImageMask']:
+#                 continue
+#         image = {}
+#         image['width'] = pdfimage['/Width']
+#         image['height'] = pdfimage['/Height']
+#         image['bpc'] = pdfimage['/BitsPerComponent']
+#         if '/Filter' in pdfimage:
+#             filter_ = pdfimage['/Filter']
+#             if isinstance(filter_, pypdf.generic.ArrayObject):
+#                 filter_ = filter_[0]
+#             image['enc'] = FRIENDLY_ENCODING.get(filter_, 'image')
+#         else:
+#             image['enc'] = 'image'
+#         if '/ColorSpace' in pdfimage:
+#             cs = pdfimage['/ColorSpace']
+#             if isinstance(cs, pypdf.generic.ArrayObject):
+#                 cs = cs[0]
+#             image['color'] = FRIENDLY_COLORSPACE.get(cs, '-')
+#         else:
+#             image['color'] = 'jpx' if image['enc'] == 'jpx' else '?'
+
+#         image['comp'] = FRIENDLY_COMP.get(image['color'], '?')
+#         image['dpi_w'] = image['width'] / pageinfo['width_inches']
+#         image['dpi_h'] = image['height'] / pageinfo['height_inches']
+#         image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5
+#         pageinfo['images'].append(image)
+
+#     if pageinfo['images']:
+#         xres = max(image['dpi_w'] for image in pageinfo['images'])
+#         yres = max(image['dpi_h'] for image in pageinfo['images'])
+#         pageinfo['xres'], pageinfo['yres'] = xres, yres
+#         pageinfo['width_pixels'] = \
+#             int(round(xres * pageinfo['width_inches']))
+#         pageinfo['height_pixels'] = \
+#             int(round(yres * pageinfo['height_inches']))
+
+#         if options.oversampling_dpi > 0:
+#             rx, ry = options.oversampling_dpi, options.oversampling_dpi
+#         else:
+#             rx, ry = pageinfo['xres'], pageinfo['yres']
+#         pageinfo['xres_render'], pageinfo['yres_render'] = rx, ry
+
+#     return pageinfo
+
+# pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3))
+# pageinfo = pdf_get_pageinfo(options.inputfile, pageno, width_pt, height_pt)
+
+# if not pageinfo['images']:
+#     # If the page has no images, then it contains vector content or text
+#     # or both. It seems quite unlikely that one would find meaningful text
+#     # from rasterizing vector content. So skip the page.
+#     log.info(
+#         "Page {0} has no images - skipping OCR".format(pageno)
+#     )
+# elif pageinfo['has_text']:
+#     s = "Page {0} already has text! – {1}"
+
+#     if not options.force_ocr and not options.skip_text:
+#         log.error(s.format(pageno,
+#                      "aborting (use -f or -s to force OCR)"))
+#         sys.exit(1)
+#     elif options.force_ocr:
+#         log.info(s.format(pageno,
+#                     "rasterizing text and running OCR anyway"))
+#     elif options.skip_text:
+#         log.info(s.format(pageno,
+#                     "skipping all processing on this page"))
+
+# ocr_required = pageinfo['images'] and \
+#     (options.force_ocr or
+#         (not (pageinfo['has_text'] and options.skip_text)))
+
+# if ocr_required and options.skip_big:
+#     area = pageinfo['width_inches'] * pageinfo['height_inches']
+#     pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
+#     if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
+#         ocr_required = False
+#         log.info(
+#             "Page {0} is very large; skipping due to -b".format(pageno))
+
+
+
+
+
+# @active_if(not ocr_required or (ocr_required and options.exact_image))
+# @transform(setup_working_directory,
+#            formatter(),
+#            os.path.join(options.temp_folder, '%04i.page.pdf' % pageno))
+# def extract_single_page(
+#         input_file,
+#         output_file):
+#     args_pdfseparate = [
+#         'pdfseparate',
+#         '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
+#         input_file,
+#         output_file
+#     ]
+#     check_call(args_pdfseparate)
+
+
+# @active_if(ocr_required)
+# @active_if(options.page_renderer == 'pdftoppm')
+# @transform(setup_working_directory,
+#            formatter(),
+#            "{path[0]}/%04i.pnm" % pageno)
+# def unpack_with_pdftoppm(
+#         input_file,
+#         output_file):
+#     force_ppm = True
+#     allow_jpeg = False
+
+#     colorspace = 'color'
+#     compression = 'deflate'
+#     output_format = 'tiff'
+#     if all(image['comp'] == 1 for image in pageinfo['images']):
+#         if all(image['bpc'] == 1 for image in pageinfo['images']):
+#             colorspace = 'mono'
+#             compression = 'deflate'
+#         elif not any(image['color'] == 'color'
+#                      for image in pageinfo['images']):
+#             colorspace = 'gray'
+
+#     if allow_jpeg and \
+#             all(image['enc'] == 'jpeg' for image in pageinfo['images']):
+#         output_format = 'jpeg'
+
+#     args_pdftoppm = [
+#         'pdftoppm',
+#         '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
+#         '-rx', str(pageinfo['xres_render']),
+#         '-ry', str(pageinfo['yres_render'])
+#     ]
+
+#     if not force_ppm:
+#         if output_format == 'tiff':
+#             args_pdftoppm.append('-tiff')
+#             if False and compression:
+#                 args_pdftoppm.append('-tiffcompression')
+#                 args_pdftoppm.append(compression)
+#         elif output_format == 'jpeg':
+#             args_pdftoppm.append('-jpeg')
+
+#     if colorspace == 'mono':
+#         args_pdftoppm.append('-mono')
+#     elif colorspace == 'gray':
+#         args_pdftoppm.append('-gray')
+
+#     args_pdftoppm.extend([str(input_file)])
+
+#     # Ask pdftoppm to write the binary output to stdout; therefore set
+#     # universal_newlines=False
+#     p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'),
+#               stderr=PIPE, universal_newlines=False)
+#     _, stderr = p.communicate()
+#     if stderr:
+#         # Because universal_newlines=False, stderr is bytes(), so we must
+#         # manually convert it to str for logging
+#         from codecs import decode
+#         log.error(decode(stderr, sys.getdefaultencoding(), 'ignore'))
+#     if p.returncode != 0:
+#         raise CalledProcessError(p.returncode, args_pdftoppm)
+
+
+# @active_if(ocr_required)
+# @transform(unpack_with_pdftoppm, suffix(".pnm"), ".png")
+# def convert_to_png(input_file, output_file):
+#     args_convert = [
+#         'convert',
+#         input_file,
+#         output_file
+#     ]
+#     check_call(args_convert)
+
+
+# @active_if(ocr_required)
+# @active_if(options.page_renderer == 'ghostscript')
+# @transform(setup_working_directory,
+#            formatter(),
+#            "{path[0]}/%04i.png" % pageno)
+# def unpack_with_ghostscript(
+#         input_file,
+#         output_file):
+#     device = 'png16m'  # 24-bit
+#     if all(image['comp'] == 1 for image in pageinfo['images']):
+#         if all(image['bpc'] == 1 for image in pageinfo['images']):
+#             device = 'pngmono'
+#         elif not any(image['color'] == 'color'
+#                      for image in pageinfo['images']):
+#             device = 'pnggray'
+
+#     args_gs = [
+#         'gs',
+#         '-dBATCH', '-dNOPAUSE',
+#         '-dFirstPage=%i' % pageno,
+#         '-dLastPage=%i' % pageno,
+#         '-sDEVICE=%s' % device,
+#         '-o', output_file,
+#         '-r{0}x{1}'.format(
+#             str(pageinfo['xres_render']), str(pageinfo['yres_render'])),
+#         input_file
+#     ]
+
+#     p = Popen(args_gs, close_fds=True, stdout=PIPE, stderr=PIPE,
+#               universal_newlines=True)
+#     stdout, stderr = p.communicate()
+#     if stdout:
+#         log.info(stdout)
+#     if stderr:
+#         log.error(stderr)
+
+#     try:
+#         f = open(output_file)
+#     except FileNotFoundError:
+#         raise
+#     else:
+#         f.close()
+
+
+# @active_if(ocr_required)
+# @active_if(options.preprocess_deskew != 0
+#            and options.deskew_provider == 'imagemagick')
+# @transform(convert_to_png, suffix(".png"), ".deskewed.png")
+# def deskew_imagemagick(input_file, output_file):
+#     args_convert = [
+#         'convert',
+#         input_file,
+#         '-deskew', '40%',
+#         '-gravity', 'center',
+#         '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo),
+#         '+repage',
+#         output_file
+#     ]
+
+#     p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE,
+#               universal_newlines=True)
+#     stdout, stderr = p.communicate()
+
+#     if stdout:
+#         log.info(stdout)
+#     if stderr:
+#         log.error(stderr)
+
+#     if p.returncode != 0:
+#         raise CalledProcessError(p.returncode, args_convert)
+
+
+# @active_if(ocr_required)
+# @active_if(options.preprocess_deskew != 0
+#            and options.deskew_provider == 'leptonica')
+# @transform(convert_to_png, suffix(".png"), ".deskewed.png")
+# def deskew_leptonica(input_file, output_file):
+#     from .leptonica import deskew
+#     deskew(input_file, output_file,
+#            min(pageinfo['xres'], pageinfo['yres']))
+
+
+# @active_if(ocr_required)
+# @active_if(options.preprocess_clean != 0)
+# @merge([unpack_with_pdftoppm, unpack_with_ghostscript,
+#         deskew_imagemagick, deskew_leptonica],
+#        os.path.join(options.temp_folder, "%04i.for_clean.pnm" % pageno))
+# def select_image_for_cleaning(infiles, output_file):
+#     input_file = infiles[-1]
+#     args_convert = [
+#         'convert',
+#         input_file,
+#         output_file
+#     ]
+#     check_call(args_convert)
+
+
+# @active_if(ocr_required)
+# @active_if(options.preprocess_clean != 0)
+# @transform(select_image_for_cleaning, suffix(".pnm"), ".cleaned.pnm")
+# def clean_unpaper(input_file, output_file):
+#     args_unpaper = [
+#         'unpaper',
+#         '--dpi', str(int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))),
+#         '--mask-scan-size', '100',
+#         '--no-deskew',
+#         '--no-grayfilter',
+#         '--no-blackfilter',
+#         '--no-mask-center',
+#         '--no-border-align',
+#         input_file,
+#         output_file
+#     ]
+
+#     p = Popen(args_unpaper, close_fds=True, stdout=PIPE, stderr=PIPE,
+#               universal_newlines=True)
+#     stdout, stderr = p.communicate()
+
+#     if stdout:
+#         log.info(stdout)
+#     if stderr:
+#         log.error(stderr)
+
+#     if p.returncode != 0:
+#         raise CalledProcessError(p.returncode, args_unpaper)
+
+
+# @active_if(ocr_required)
+# @transform(clean_unpaper, suffix(".cleaned.pnm"), ".cleaned.png")
+# def cleaned_to_png(input_file, output_file):
+#     args_convert = [
+#         'convert',
+#         input_file,
+#         output_file
+#     ]
+#     check_call(args_convert)
+
+
+# @active_if(ocr_required)
+# @merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick,
+#         deskew_leptonica, cleaned_to_png],
+#        os.path.join(options.temp_folder, "%04i.for_ocr.png" % pageno))
+# def select_ocr_image(infiles, output_file):
+#     re_symlink(infiles[-1], output_file)
+
+
+# hocr_template = '''<?xml version="1.0" encoding="UTF-8"?>
+# <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+#     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+# <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+#  <head>
+#   <title></title>
+#   <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+#   <meta name='ocr-system' content='tesseract 3.02.02' />
+#   <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
+#  </head>
+#  <body>
+#   <div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
+#    <div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
+#     <p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
+#      <span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
+#      </span>
+#     </p>
+#    </div>
+#   </div>
+#  </body>
+# </html>'''
+
+
+# @active_if(ocr_required)
+# @transform(select_ocr_image, suffix(".for_ocr.png"), ".hocr")
+# def ocr_tesseract(
+#         input_file,
+#         output_file):
+
+#     args_tesseract = [
+#         'tesseract',
+#         '-l', options.language,
+#         input_file,
+#         output_file,
+#         'hocr',
+#         options.tess_cfg_files
+#     ]
+#     p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
+#               universal_newlines=True)
+#     try:
+#         stdout, stderr = p.communicate(timeout=180)
+#     except TimeoutExpired:
+#         p.kill()
+#         stdout, stderr = p.communicate()
+#         # Generate a HOCR file with no recognized text if tesseract times out
+#         # Temporary workaround to hocrTransform not being able to function if
+#         # it does not have a valid hOCR file.
+#         with open(output_file, 'w', encoding="utf-8") as f:
+#             f.write(hocr_template.format(pageinfo['width_pixels'],
+#                                          pageinfo['height_pixels']))
+#     else:
+#         if stdout:
+#             log.info(stdout)
+#         if stderr:
+#             log.error(stderr)
+
+#         if p.returncode != 0:
+#             raise CalledProcessError(p.returncode, args_tesseract)
+
+#         if os.path.exists(output_file + '.html'):
+#             # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
+#             shutil.move(output_file + '.html', output_file)
+#         elif os.path.exists(output_file + '.hocr'):
+#             # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
+#             shutil.move(output_file + '.hocr', output_file)
+
+#         # Tesseract inserts source filename into hocr file without escaping
+#         # it. This could break the XML parser. Rewrite the hocr file,
+#         # replacing the filename with a space.
+#         regex_nested_single_quotes = re.compile(
+#             r"""title='image "([^"]*)";""")
+#         with fileinput.input(files=(output_file,), inplace=True) as f:
+#             for line in f:
+#                 line = regex_nested_single_quotes.sub(
+#                     r"""title='image " ";""", line)
+#                 print(line, end='')  # fileinput.input redirects stdout
+
+
+# @active_if(ocr_required and not options.exact_image)
+# @merge([unpack_with_ghostscript, convert_to_png,
+#         deskew_imagemagick, deskew_leptonica, cleaned_to_png],
+#        os.path.join(options.temp_folder, "%04i.image_for_pdf" % pageno))
+# def select_image_for_pdf(infiles, output_file):
+#     if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0:
+#         input_file = infiles[-1]
+#     elif options.preprocess_deskew != 0 and options.preprocess_clean != 0:
+#         input_file = infiles[-2]
+#     elif options.preprocess_deskew != 0 and options.preprocess_clean == 0:
+#         input_file = infiles[-1]
+#     else:
+#         input_file = infiles[0]
+
+#     if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
+#         # If all images were JPEGs originally, produce a JPEG as output
+#         check_call(['convert', input_file, 'jpg:' + output_file])
+#     else:
+#         re_symlink(input_file, output_file)
+
+
+# @active_if(ocr_required and not options.exact_image)
+# @merge([ocr_tesseract, select_image_for_pdf],
+#        os.path.join(options.temp_folder, '%04i.rendered.pdf' % pageno))
+# def render_page(infiles, output_file):
+#     hocr, image = infiles[0], infiles[1]
+
+#     dpi = round(max(pageinfo['xres'], pageinfo['yres']))
+
+#     hocrtransform = HocrTransform(hocr, dpi)
+#     hocrtransform.to_pdf(output_file, imageFileName=image,
+#                          showBoundingboxes=False, invisibleText=True)
+
+
+# @active_if(ocr_required and options.pdf_noimg)
+# @transform(ocr_tesseract, suffix(".hocr"), ".ocred.todebug.pdf")
+# def render_text_output_page(input_file, output_file):
+#     dpi = round(max(pageinfo['xres'], pageinfo['yres']))
+
+#     hocrtransform = HocrTransform(input_file, dpi)
+#     hocrtransform.to_pdf(output_file, imageFileName=None,
+#                          showBoundingboxes=True, invisibleText=False)
+
+
+# @active_if(ocr_required and options.exact_image)
+# @transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf")
+# def render_hocr_blank_page(input_file, output_file):
+#     dpi = round(max(pageinfo['xres'], pageinfo['yres']))
+
+#     hocrtransform = HocrTransform(input_file, dpi)
+#     hocrtransform.to_pdf(output_file, imageFileName=None,
+#                          showBoundingboxes=False, invisibleText=True)
+
+
+# @active_if(ocr_required and options.exact_image)
+# @merge([render_hocr_blank_page, extract_single_page],
+#        os.path.join(options.temp_folder, "%04i.merged.pdf") % pageno)
+# def merge_hocr_with_original_page(infiles, output_file):
+#     with open(infiles[0], 'rb') as hocr_input, \
+#             open(infiles[1], 'rb') as page_input, \
+#             open(output_file, 'wb') as output:
+#         hocr_reader = pypdf.PdfFileReader(hocr_input)
+#         page_reader = pypdf.PdfFileReader(page_input)
+#         writer = pypdf.PdfFileWriter()
+
+#         the_page = hocr_reader.getPage(0)
+#         the_page.mergePage(page_reader.getPage(0))
+#         writer.addPage(the_page)
+#         writer.write(output)
+
+
+# @merge([render_page, merge_hocr_with_original_page, extract_single_page],
+#        os.path.join(options.temp_folder, '%04i.ocred.pdf' % pageno))
+# def select_final_page(infiles, output_file):
+#     re_symlink(infiles[-1], output_file)
+
+
+# if __name__ == '__main__':
+#     cmdline.run(options)