diff --git a/src/ocrmypdf.py b/src/ocrmypdf.py
index 7aefb8e4..cd7031da 100755
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@@ -18,7 +18,7 @@ except ImportError:
from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
- mkdir, formatter
+ mkdir, formatter, follows, split
import ruffus.cmdline as cmdline
from .hocrtransform import HocrTransform
@@ -40,10 +40,10 @@ parser.add_argument(
'outputfile',
help="output searchable PDF file")
parser.add_argument(
- '-l', '--language', nargs='*', default=['eng']
+ '-l', '--language', nargs='*', default=['eng'],
help="language of the file to be OCRed")
-preprocessing = parser.add_group(
+preprocessing = parser.add_argument_group(
"Preprocessing options",
"Improve OCR quality and final image")
preprocessing.add_argument(
@@ -72,7 +72,7 @@ parser.add_argument(
'--exact-image', action='store_true',
help="Use original page from PDF without re-rendering")
-advanced = parser.add_group(
+advanced = parser.add_argument_group(
"Advanced",
"Advanced options for power users and debugging")
advanced.add_argument(
@@ -88,7 +88,7 @@ advanced.add_argument(
'--tesseract-config', default='', nargs='*', # Implemented
help="Tesseract configuration")
-debugging = parser.add_group(
+debugging = parser.add_argument_group(
"Debugging",
"Arguments to help with troubleshooting and debugging")
debugging.add_argument(
@@ -188,541 +188,541 @@ def split_pages(
check_call(args_pdfseparate)
-FRIENDLY_COLORSPACE = {
- '/DeviceGray': 'gray',
- '/CalGray': 'gray',
- '/DeviceRGB': 'rgb',
- '/CalRGB': 'rgb',
- '/DeviceCMYK': 'cmyk',
- '/Lab': 'lab',
- '/ICCBased': 'icc',
- '/Indexed': 'index',
- '/Separation': 'sep',
- '/DeviceN': 'devn',
- '/Pattern': '-'
-}
-
-FRIENDLY_ENCODING = {
- '/CCITTFaxDecode': 'ccitt',
- '/DCTDecode': 'jpeg',
- '/JPXDecode': 'jpx',
- '/JBIG2Decode': 'jbig2',
-}
-
-FRIENDLY_COMP = {
- 'gray': 1,
- 'rgb': 3,
- 'cmyk': 4,
- 'lab': 3,
-}
-
-
-def pdf_get_pageinfo(infile, page, width_pt, height_pt):
- pageinfo = {}
- pageinfo['pageno'] = page
- pageinfo['width_inches'] = width_pt / 72.0
- pageinfo['height_inches'] = height_pt / 72.0
- pageinfo['images'] = []
-
- p_pdftotext = Popen(['pdftotext', '-f', str(page), '-l', str(page),
- '-raw', '-nopgbrk', infile, '-'],
- close_fds=True, stdout=PIPE, stderr=PIPE,
- universal_newlines=True)
- text, _ = p_pdftotext.communicate()
- if len(text.strip()) > 0:
- pageinfo['has_text'] = True
- else:
- pageinfo['has_text'] = False
-
- pdf = pypdf.PdfFileReader(infile)
- page = pdf.pages[page - 1]
-
- if not '/XObject' in page['/Resources']:
- # Missing /XObject means no images or possibly corrupt PDF
- return pageinfo
-
- for xobj in page['/Resources']['/XObject']:
- # PyPDF2 returns the keys as an iterator
- pdfimage = page['/Resources']['/XObject'][xobj]
- if pdfimage['/Subtype'] != '/Image':
- continue
- if '/ImageMask' in pdfimage:
- if pdfimage['/ImageMask']:
- continue
- image = {}
- image['width'] = pdfimage['/Width']
- image['height'] = pdfimage['/Height']
- image['bpc'] = pdfimage['/BitsPerComponent']
- if '/Filter' in pdfimage:
- filter_ = pdfimage['/Filter']
- if isinstance(filter_, pypdf.generic.ArrayObject):
- filter_ = filter_[0]
- image['enc'] = FRIENDLY_ENCODING.get(filter_, 'image')
- else:
- image['enc'] = 'image'
- if '/ColorSpace' in pdfimage:
- cs = pdfimage['/ColorSpace']
- if isinstance(cs, pypdf.generic.ArrayObject):
- cs = cs[0]
- image['color'] = FRIENDLY_COLORSPACE.get(cs, '-')
- else:
- image['color'] = 'jpx' if image['enc'] == 'jpx' else '?'
-
- image['comp'] = FRIENDLY_COMP.get(image['color'], '?')
- image['dpi_w'] = image['width'] / pageinfo['width_inches']
- image['dpi_h'] = image['height'] / pageinfo['height_inches']
- image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5
- pageinfo['images'].append(image)
-
- if pageinfo['images']:
- xres = max(image['dpi_w'] for image in pageinfo['images'])
- yres = max(image['dpi_h'] for image in pageinfo['images'])
- pageinfo['xres'], pageinfo['yres'] = xres, yres
- pageinfo['width_pixels'] = \
- int(round(xres * pageinfo['width_inches']))
- pageinfo['height_pixels'] = \
- int(round(yres * pageinfo['height_inches']))
-
- if options.oversampling_dpi > 0:
- rx, ry = options.oversampling_dpi, options.oversampling_dpi
- else:
- rx, ry = pageinfo['xres'], pageinfo['yres']
- pageinfo['xres_render'], pageinfo['yres_render'] = rx, ry
-
- return pageinfo
-
-pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3))
-pageinfo = pdf_get_pageinfo(options.inputfile, pageno, width_pt, height_pt)
-
-if not pageinfo['images']:
- # If the page has no images, then it contains vector content or text
- # or both. It seems quite unlikely that one would find meaningful text
- # from rasterizing vector content. So skip the page.
- log.info(
- "Page {0} has no images - skipping OCR".format(pageno)
- )
-elif pageinfo['has_text']:
- s = "Page {0} already has text! – {1}"
-
- if not options.force_ocr and not options.skip_text:
- log.error(s.format(pageno,
- "aborting (use -f or -s to force OCR)"))
- sys.exit(1)
- elif options.force_ocr:
- log.info(s.format(pageno,
- "rasterizing text and running OCR anyway"))
- elif options.skip_text:
- log.info(s.format(pageno,
- "skipping all processing on this page"))
-
-ocr_required = pageinfo['images'] and \
- (options.force_ocr or
- (not (pageinfo['has_text'] and options.skip_text)))
-
-if ocr_required and options.skip_big:
- area = pageinfo['width_inches'] * pageinfo['height_inches']
- pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
- if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
- ocr_required = False
- log.info(
- "Page {0} is very large; skipping due to -b".format(pageno))
-
-
-
-
-
-@active_if(not ocr_required or (ocr_required and options.exact_image))
-@transform(setup_working_directory,
- formatter(),
- os.path.join(options.temp_folder, '%04i.page.pdf' % pageno))
-def extract_single_page(
- input_file,
- output_file):
- args_pdfseparate = [
- 'pdfseparate',
- '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
- input_file,
- output_file
- ]
- check_call(args_pdfseparate)
-
-
-@active_if(ocr_required)
-@active_if(options.page_renderer == 'pdftoppm')
-@transform(setup_working_directory,
- formatter(),
- "{path[0]}/%04i.pnm" % pageno)
-def unpack_with_pdftoppm(
- input_file,
- output_file):
- force_ppm = True
- allow_jpeg = False
-
- colorspace = 'color'
- compression = 'deflate'
- output_format = 'tiff'
- if all(image['comp'] == 1 for image in pageinfo['images']):
- if all(image['bpc'] == 1 for image in pageinfo['images']):
- colorspace = 'mono'
- compression = 'deflate'
- elif not any(image['color'] == 'color'
- for image in pageinfo['images']):
- colorspace = 'gray'
-
- if allow_jpeg and \
- all(image['enc'] == 'jpeg' for image in pageinfo['images']):
- output_format = 'jpeg'
-
- args_pdftoppm = [
- 'pdftoppm',
- '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
- '-rx', str(pageinfo['xres_render']),
- '-ry', str(pageinfo['yres_render'])
- ]
-
- if not force_ppm:
- if output_format == 'tiff':
- args_pdftoppm.append('-tiff')
- if False and compression:
- args_pdftoppm.append('-tiffcompression')
- args_pdftoppm.append(compression)
- elif output_format == 'jpeg':
- args_pdftoppm.append('-jpeg')
-
- if colorspace == 'mono':
- args_pdftoppm.append('-mono')
- elif colorspace == 'gray':
- args_pdftoppm.append('-gray')
-
- args_pdftoppm.extend([str(input_file)])
-
- # Ask pdftoppm to write the binary output to stdout; therefore set
- # universal_newlines=False
- p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'),
- stderr=PIPE, universal_newlines=False)
- _, stderr = p.communicate()
- if stderr:
- # Because universal_newlines=False, stderr is bytes(), so we must
- # manually convert it to str for logging
- from codecs import decode
- log.error(decode(stderr, sys.getdefaultencoding(), 'ignore'))
- if p.returncode != 0:
- raise CalledProcessError(p.returncode, args_pdftoppm)
-
-
-@active_if(ocr_required)
-@transform(unpack_with_pdftoppm, suffix(".pnm"), ".png")
-def convert_to_png(input_file, output_file):
- args_convert = [
- 'convert',
- input_file,
- output_file
- ]
- check_call(args_convert)
-
-
-@active_if(ocr_required)
-@active_if(options.page_renderer == 'ghostscript')
-@transform(setup_working_directory,
- formatter(),
- "{path[0]}/%04i.png" % pageno)
-def unpack_with_ghostscript(
- input_file,
- output_file):
- device = 'png16m' # 24-bit
- if all(image['comp'] == 1 for image in pageinfo['images']):
- if all(image['bpc'] == 1 for image in pageinfo['images']):
- device = 'pngmono'
- elif not any(image['color'] == 'color'
- for image in pageinfo['images']):
- device = 'pnggray'
-
- args_gs = [
- 'gs',
- '-dBATCH', '-dNOPAUSE',
- '-dFirstPage=%i' % pageno,
- '-dLastPage=%i' % pageno,
- '-sDEVICE=%s' % device,
- '-o', output_file,
- '-r{0}x{1}'.format(
- str(pageinfo['xres_render']), str(pageinfo['yres_render'])),
- input_file
- ]
-
- p = Popen(args_gs, close_fds=True, stdout=PIPE, stderr=PIPE,
- universal_newlines=True)
- stdout, stderr = p.communicate()
- if stdout:
- log.info(stdout)
- if stderr:
- log.error(stderr)
-
- try:
- f = open(output_file)
- except FileNotFoundError:
- raise
- else:
- f.close()
-
-
-@active_if(ocr_required)
-@active_if(options.preprocess_deskew != 0
- and options.deskew_provider == 'imagemagick')
-@transform(convert_to_png, suffix(".png"), ".deskewed.png")
-def deskew_imagemagick(input_file, output_file):
- args_convert = [
- 'convert',
- input_file,
- '-deskew', '40%',
- '-gravity', 'center',
- '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo),
- '+repage',
- output_file
- ]
-
- p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE,
- universal_newlines=True)
- stdout, stderr = p.communicate()
-
- if stdout:
- log.info(stdout)
- if stderr:
- log.error(stderr)
-
- if p.returncode != 0:
- raise CalledProcessError(p.returncode, args_convert)
-
-
-@active_if(ocr_required)
-@active_if(options.preprocess_deskew != 0
- and options.deskew_provider == 'leptonica')
-@transform(convert_to_png, suffix(".png"), ".deskewed.png")
-def deskew_leptonica(input_file, output_file):
- from .leptonica import deskew
- deskew(input_file, output_file,
- min(pageinfo['xres'], pageinfo['yres']))
-
-
-@active_if(ocr_required)
-@active_if(options.preprocess_clean != 0)
-@merge([unpack_with_pdftoppm, unpack_with_ghostscript,
- deskew_imagemagick, deskew_leptonica],
- os.path.join(options.temp_folder, "%04i.for_clean.pnm" % pageno))
-def select_image_for_cleaning(infiles, output_file):
- input_file = infiles[-1]
- args_convert = [
- 'convert',
- input_file,
- output_file
- ]
- check_call(args_convert)
-
-
-@active_if(ocr_required)
-@active_if(options.preprocess_clean != 0)
-@transform(select_image_for_cleaning, suffix(".pnm"), ".cleaned.pnm")
-def clean_unpaper(input_file, output_file):
- args_unpaper = [
- 'unpaper',
- '--dpi', str(int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))),
- '--mask-scan-size', '100',
- '--no-deskew',
- '--no-grayfilter',
- '--no-blackfilter',
- '--no-mask-center',
- '--no-border-align',
- input_file,
- output_file
- ]
-
- p = Popen(args_unpaper, close_fds=True, stdout=PIPE, stderr=PIPE,
- universal_newlines=True)
- stdout, stderr = p.communicate()
-
- if stdout:
- log.info(stdout)
- if stderr:
- log.error(stderr)
-
- if p.returncode != 0:
- raise CalledProcessError(p.returncode, args_unpaper)
-
-
-@active_if(ocr_required)
-@transform(clean_unpaper, suffix(".cleaned.pnm"), ".cleaned.png")
-def cleaned_to_png(input_file, output_file):
- args_convert = [
- 'convert',
- input_file,
- output_file
- ]
- check_call(args_convert)
-
-
-@active_if(ocr_required)
-@merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick,
- deskew_leptonica, cleaned_to_png],
- os.path.join(options.temp_folder, "%04i.for_ocr.png" % pageno))
-def select_ocr_image(infiles, output_file):
- re_symlink(infiles[-1], output_file)
-
-
-hocr_template = '''
-
-
-
-
-
-
-
-
-
-
-
-'''
-
-
-@active_if(ocr_required)
-@transform(select_ocr_image, suffix(".for_ocr.png"), ".hocr")
-def ocr_tesseract(
- input_file,
- output_file):
-
- args_tesseract = [
- 'tesseract',
- '-l', options.language,
- input_file,
- output_file,
- 'hocr',
- options.tess_cfg_files
- ]
- p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
- universal_newlines=True)
- try:
- stdout, stderr = p.communicate(timeout=180)
- except TimeoutExpired:
- p.kill()
- stdout, stderr = p.communicate()
- # Generate a HOCR file with no recognized text if tesseract times out
- # Temporary workaround to hocrTransform not being able to function if
- # it does not have a valid hOCR file.
- with open(output_file, 'w', encoding="utf-8") as f:
- f.write(hocr_template.format(pageinfo['width_pixels'],
- pageinfo['height_pixels']))
- else:
- if stdout:
- log.info(stdout)
- if stderr:
- log.error(stderr)
-
- if p.returncode != 0:
- raise CalledProcessError(p.returncode, args_tesseract)
-
- if os.path.exists(output_file + '.html'):
- # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
- shutil.move(output_file + '.html', output_file)
- elif os.path.exists(output_file + '.hocr'):
- # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
- shutil.move(output_file + '.hocr', output_file)
-
- # Tesseract inserts source filename into hocr file without escaping
- # it. This could break the XML parser. Rewrite the hocr file,
- # replacing the filename with a space.
- regex_nested_single_quotes = re.compile(
- r"""title='image "([^"]*)";""")
- with fileinput.input(files=(output_file,), inplace=True) as f:
- for line in f:
- line = regex_nested_single_quotes.sub(
- r"""title='image " ";""", line)
- print(line, end='') # fileinput.input redirects stdout
-
-
-@active_if(ocr_required and not options.exact_image)
-@merge([unpack_with_ghostscript, convert_to_png,
- deskew_imagemagick, deskew_leptonica, cleaned_to_png],
- os.path.join(options.temp_folder, "%04i.image_for_pdf" % pageno))
-def select_image_for_pdf(infiles, output_file):
- if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0:
- input_file = infiles[-1]
- elif options.preprocess_deskew != 0 and options.preprocess_clean != 0:
- input_file = infiles[-2]
- elif options.preprocess_deskew != 0 and options.preprocess_clean == 0:
- input_file = infiles[-1]
- else:
- input_file = infiles[0]
-
- if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
- # If all images were JPEGs originally, produce a JPEG as output
- check_call(['convert', input_file, 'jpg:' + output_file])
- else:
- re_symlink(input_file, output_file)
-
-
-@active_if(ocr_required and not options.exact_image)
-@merge([ocr_tesseract, select_image_for_pdf],
- os.path.join(options.temp_folder, '%04i.rendered.pdf' % pageno))
-def render_page(infiles, output_file):
- hocr, image = infiles[0], infiles[1]
-
- dpi = round(max(pageinfo['xres'], pageinfo['yres']))
-
- hocrtransform = HocrTransform(hocr, dpi)
- hocrtransform.to_pdf(output_file, imageFileName=image,
- showBoundingboxes=False, invisibleText=True)
-
-
-@active_if(ocr_required and options.pdf_noimg)
-@transform(ocr_tesseract, suffix(".hocr"), ".ocred.todebug.pdf")
-def render_text_output_page(input_file, output_file):
- dpi = round(max(pageinfo['xres'], pageinfo['yres']))
-
- hocrtransform = HocrTransform(input_file, dpi)
- hocrtransform.to_pdf(output_file, imageFileName=None,
- showBoundingboxes=True, invisibleText=False)
-
-
-@active_if(ocr_required and options.exact_image)
-@transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf")
-def render_hocr_blank_page(input_file, output_file):
- dpi = round(max(pageinfo['xres'], pageinfo['yres']))
-
- hocrtransform = HocrTransform(input_file, dpi)
- hocrtransform.to_pdf(output_file, imageFileName=None,
- showBoundingboxes=False, invisibleText=True)
-
-
-@active_if(ocr_required and options.exact_image)
-@merge([render_hocr_blank_page, extract_single_page],
- os.path.join(options.temp_folder, "%04i.merged.pdf") % pageno)
-def merge_hocr_with_original_page(infiles, output_file):
- with open(infiles[0], 'rb') as hocr_input, \
- open(infiles[1], 'rb') as page_input, \
- open(output_file, 'wb') as output:
- hocr_reader = pypdf.PdfFileReader(hocr_input)
- page_reader = pypdf.PdfFileReader(page_input)
- writer = pypdf.PdfFileWriter()
-
- the_page = hocr_reader.getPage(0)
- the_page.mergePage(page_reader.getPage(0))
- writer.addPage(the_page)
- writer.write(output)
-
-
-@merge([render_page, merge_hocr_with_original_page, extract_single_page],
- os.path.join(options.temp_folder, '%04i.ocred.pdf' % pageno))
-def select_final_page(infiles, output_file):
- re_symlink(infiles[-1], output_file)
-
-
-if __name__ == '__main__':
- cmdline.run(options)
+# FRIENDLY_COLORSPACE = {
+# '/DeviceGray': 'gray',
+# '/CalGray': 'gray',
+# '/DeviceRGB': 'rgb',
+# '/CalRGB': 'rgb',
+# '/DeviceCMYK': 'cmyk',
+# '/Lab': 'lab',
+# '/ICCBased': 'icc',
+# '/Indexed': 'index',
+# '/Separation': 'sep',
+# '/DeviceN': 'devn',
+# '/Pattern': '-'
+# }
+
+# FRIENDLY_ENCODING = {
+# '/CCITTFaxDecode': 'ccitt',
+# '/DCTDecode': 'jpeg',
+# '/JPXDecode': 'jpx',
+# '/JBIG2Decode': 'jbig2',
+# }
+
+# FRIENDLY_COMP = {
+# 'gray': 1,
+# 'rgb': 3,
+# 'cmyk': 4,
+# 'lab': 3,
+# }
+
+
+# def pdf_get_pageinfo(infile, page, width_pt, height_pt):
+# pageinfo = {}
+# pageinfo['pageno'] = page
+# pageinfo['width_inches'] = width_pt / 72.0
+# pageinfo['height_inches'] = height_pt / 72.0
+# pageinfo['images'] = []
+
+# p_pdftotext = Popen(['pdftotext', '-f', str(page), '-l', str(page),
+# '-raw', '-nopgbrk', infile, '-'],
+# close_fds=True, stdout=PIPE, stderr=PIPE,
+# universal_newlines=True)
+# text, _ = p_pdftotext.communicate()
+# if len(text.strip()) > 0:
+# pageinfo['has_text'] = True
+# else:
+# pageinfo['has_text'] = False
+
+# pdf = pypdf.PdfFileReader(infile)
+# page = pdf.pages[page - 1]
+
+# if not '/XObject' in page['/Resources']:
+# # Missing /XObject means no images or possibly a corrupt PDF
+# return pageinfo
+
+# for xobj in page['/Resources']['/XObject']:
+# # PyPDF2 returns the keys as an iterator
+# pdfimage = page['/Resources']['/XObject'][xobj]
+# if pdfimage['/Subtype'] != '/Image':
+# continue
+# if '/ImageMask' in pdfimage:
+# if pdfimage['/ImageMask']:
+# continue
+# image = {}
+# image['width'] = pdfimage['/Width']
+# image['height'] = pdfimage['/Height']
+# image['bpc'] = pdfimage['/BitsPerComponent']
+# if '/Filter' in pdfimage:
+# filter_ = pdfimage['/Filter']
+# if isinstance(filter_, pypdf.generic.ArrayObject):
+# filter_ = filter_[0]
+# image['enc'] = FRIENDLY_ENCODING.get(filter_, 'image')
+# else:
+# image['enc'] = 'image'
+# if '/ColorSpace' in pdfimage:
+# cs = pdfimage['/ColorSpace']
+# if isinstance(cs, pypdf.generic.ArrayObject):
+# cs = cs[0]
+# image['color'] = FRIENDLY_COLORSPACE.get(cs, '-')
+# else:
+# image['color'] = 'jpx' if image['enc'] == 'jpx' else '?'
+
+# image['comp'] = FRIENDLY_COMP.get(image['color'], '?')
+# image['dpi_w'] = image['width'] / pageinfo['width_inches']
+# image['dpi_h'] = image['height'] / pageinfo['height_inches']
+# image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5
+# pageinfo['images'].append(image)
+
+# if pageinfo['images']:
+# xres = max(image['dpi_w'] for image in pageinfo['images'])
+# yres = max(image['dpi_h'] for image in pageinfo['images'])
+# pageinfo['xres'], pageinfo['yres'] = xres, yres
+# pageinfo['width_pixels'] = \
+# int(round(xres * pageinfo['width_inches']))
+# pageinfo['height_pixels'] = \
+# int(round(yres * pageinfo['height_inches']))
+
+# if options.oversampling_dpi > 0:
+# rx, ry = options.oversampling_dpi, options.oversampling_dpi
+# else:
+# rx, ry = pageinfo['xres'], pageinfo['yres']
+# pageinfo['xres_render'], pageinfo['yres_render'] = rx, ry
+
+# return pageinfo
+
+# pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3))
+# pageinfo = pdf_get_pageinfo(options.inputfile, pageno, width_pt, height_pt)
+
+# if not pageinfo['images']:
+# # If the page has no images, then it contains vector content or text
+# # or both. It seems quite unlikely that one would find meaningful text
+# # from rasterizing vector content. So skip the page.
+# log.info(
+# "Page {0} has no images - skipping OCR".format(pageno)
+# )
+# elif pageinfo['has_text']:
+# s = "Page {0} already has text! – {1}"
+
+# if not options.force_ocr and not options.skip_text:
+# log.error(s.format(pageno,
+# "aborting (use -f or -s to force OCR)"))
+# sys.exit(1)
+# elif options.force_ocr:
+# log.info(s.format(pageno,
+# "rasterizing text and running OCR anyway"))
+# elif options.skip_text:
+# log.info(s.format(pageno,
+# "skipping all processing on this page"))
+
+# ocr_required = pageinfo['images'] and \
+# (options.force_ocr or
+# (not (pageinfo['has_text'] and options.skip_text)))
+
+# if ocr_required and options.skip_big:
+# area = pageinfo['width_inches'] * pageinfo['height_inches']
+# pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
+# if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
+# ocr_required = False
+# log.info(
+# "Page {0} is very large; skipping due to -b".format(pageno))
+
+
+
+
+
+# @active_if(not ocr_required or (ocr_required and options.exact_image))
+# @transform(setup_working_directory,
+# formatter(),
+# os.path.join(options.temp_folder, '%04i.page.pdf' % pageno))
+# def extract_single_page(
+# input_file,
+# output_file):
+# args_pdfseparate = [
+# 'pdfseparate',
+# '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
+# input_file,
+# output_file
+# ]
+# check_call(args_pdfseparate)
+
+
+# @active_if(ocr_required)
+# @active_if(options.page_renderer == 'pdftoppm')
+# @transform(setup_working_directory,
+# formatter(),
+# "{path[0]}/%04i.pnm" % pageno)
+# def unpack_with_pdftoppm(
+# input_file,
+# output_file):
+# force_ppm = True
+# allow_jpeg = False
+
+# colorspace = 'color'
+# compression = 'deflate'
+# output_format = 'tiff'
+# if all(image['comp'] == 1 for image in pageinfo['images']):
+# if all(image['bpc'] == 1 for image in pageinfo['images']):
+# colorspace = 'mono'
+# compression = 'deflate'
+# elif not any(image['color'] == 'color'
+# for image in pageinfo['images']):
+# colorspace = 'gray'
+
+# if allow_jpeg and \
+# all(image['enc'] == 'jpeg' for image in pageinfo['images']):
+# output_format = 'jpeg'
+
+# args_pdftoppm = [
+# 'pdftoppm',
+# '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
+# '-rx', str(pageinfo['xres_render']),
+# '-ry', str(pageinfo['yres_render'])
+# ]
+
+# if not force_ppm:
+# if output_format == 'tiff':
+# args_pdftoppm.append('-tiff')
+# if False and compression:
+# args_pdftoppm.append('-tiffcompression')
+# args_pdftoppm.append(compression)
+# elif output_format == 'jpeg':
+# args_pdftoppm.append('-jpeg')
+
+# if colorspace == 'mono':
+# args_pdftoppm.append('-mono')
+# elif colorspace == 'gray':
+# args_pdftoppm.append('-gray')
+
+# args_pdftoppm.extend([str(input_file)])
+
+# # pdftoppm writes its binary output to stdout (redirected to output_file),
+# # so universal_newlines must be False
+# p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'),
+# stderr=PIPE, universal_newlines=False)
+# _, stderr = p.communicate()
+# if stderr:
+# # Because universal_newlines=False, stderr is bytes(), so we must
+# # manually convert it to str for logging
+# from codecs import decode
+# log.error(decode(stderr, sys.getdefaultencoding(), 'ignore'))
+# if p.returncode != 0:
+# raise CalledProcessError(p.returncode, args_pdftoppm)
+
+
+# @active_if(ocr_required)
+# @transform(unpack_with_pdftoppm, suffix(".pnm"), ".png")
+# def convert_to_png(input_file, output_file):
+# args_convert = [
+# 'convert',
+# input_file,
+# output_file
+# ]
+# check_call(args_convert)
+
+
+# @active_if(ocr_required)
+# @active_if(options.page_renderer == 'ghostscript')
+# @transform(setup_working_directory,
+# formatter(),
+# "{path[0]}/%04i.png" % pageno)
+# def unpack_with_ghostscript(
+# input_file,
+# output_file):
+# device = 'png16m' # 24-bit
+# if all(image['comp'] == 1 for image in pageinfo['images']):
+# if all(image['bpc'] == 1 for image in pageinfo['images']):
+# device = 'pngmono'
+# elif not any(image['color'] == 'color'
+# for image in pageinfo['images']):
+# device = 'pnggray'
+
+# args_gs = [
+# 'gs',
+# '-dBATCH', '-dNOPAUSE',
+# '-dFirstPage=%i' % pageno,
+# '-dLastPage=%i' % pageno,
+# '-sDEVICE=%s' % device,
+# '-o', output_file,
+# '-r{0}x{1}'.format(
+# str(pageinfo['xres_render']), str(pageinfo['yres_render'])),
+# input_file
+# ]
+
+# p = Popen(args_gs, close_fds=True, stdout=PIPE, stderr=PIPE,
+# universal_newlines=True)
+# stdout, stderr = p.communicate()
+# if stdout:
+# log.info(stdout)
+# if stderr:
+# log.error(stderr)
+
+# try:
+# f = open(output_file)
+# except FileNotFoundError:
+# raise
+# else:
+# f.close()
+
+
+# @active_if(ocr_required)
+# @active_if(options.preprocess_deskew != 0
+# and options.deskew_provider == 'imagemagick')
+# @transform(convert_to_png, suffix(".png"), ".deskewed.png")
+# def deskew_imagemagick(input_file, output_file):
+# args_convert = [
+# 'convert',
+# input_file,
+# '-deskew', '40%',
+# '-gravity', 'center',
+# '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo),
+# '+repage',
+# output_file
+# ]
+
+# p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE,
+# universal_newlines=True)
+# stdout, stderr = p.communicate()
+
+# if stdout:
+# log.info(stdout)
+# if stderr:
+# log.error(stderr)
+
+# if p.returncode != 0:
+# raise CalledProcessError(p.returncode, args_convert)
+
+
+# @active_if(ocr_required)
+# @active_if(options.preprocess_deskew != 0
+# and options.deskew_provider == 'leptonica')
+# @transform(convert_to_png, suffix(".png"), ".deskewed.png")
+# def deskew_leptonica(input_file, output_file):
+# from .leptonica import deskew
+# deskew(input_file, output_file,
+# min(pageinfo['xres'], pageinfo['yres']))
+
+
+# @active_if(ocr_required)
+# @active_if(options.preprocess_clean != 0)
+# @merge([unpack_with_pdftoppm, unpack_with_ghostscript,
+# deskew_imagemagick, deskew_leptonica],
+# os.path.join(options.temp_folder, "%04i.for_clean.pnm" % pageno))
+# def select_image_for_cleaning(infiles, output_file):
+# input_file = infiles[-1]
+# args_convert = [
+# 'convert',
+# input_file,
+# output_file
+# ]
+# check_call(args_convert)
+
+
+# @active_if(ocr_required)
+# @active_if(options.preprocess_clean != 0)
+# @transform(select_image_for_cleaning, suffix(".pnm"), ".cleaned.pnm")
+# def clean_unpaper(input_file, output_file):
+# args_unpaper = [
+# 'unpaper',
+# '--dpi', str(int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))),
+# '--mask-scan-size', '100',
+# '--no-deskew',
+# '--no-grayfilter',
+# '--no-blackfilter',
+# '--no-mask-center',
+# '--no-border-align',
+# input_file,
+# output_file
+# ]
+
+# p = Popen(args_unpaper, close_fds=True, stdout=PIPE, stderr=PIPE,
+# universal_newlines=True)
+# stdout, stderr = p.communicate()
+
+# if stdout:
+# log.info(stdout)
+# if stderr:
+# log.error(stderr)
+
+# if p.returncode != 0:
+# raise CalledProcessError(p.returncode, args_unpaper)
+
+
+# @active_if(ocr_required)
+# @transform(clean_unpaper, suffix(".cleaned.pnm"), ".cleaned.png")
+# def cleaned_to_png(input_file, output_file):
+# args_convert = [
+# 'convert',
+# input_file,
+# output_file
+# ]
+# check_call(args_convert)
+
+
+# @active_if(ocr_required)
+# @merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick,
+# deskew_leptonica, cleaned_to_png],
+# os.path.join(options.temp_folder, "%04i.for_ocr.png" % pageno))
+# def select_ocr_image(infiles, output_file):
+# re_symlink(infiles[-1], output_file)
+
+
+# hocr_template = '''
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+# '''
+
+
+# @active_if(ocr_required)
+# @transform(select_ocr_image, suffix(".for_ocr.png"), ".hocr")
+# def ocr_tesseract(
+# input_file,
+# output_file):
+
+# args_tesseract = [
+# 'tesseract',
+# '-l', options.language,
+# input_file,
+# output_file,
+# 'hocr',
+# options.tess_cfg_files
+# ]
+# p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
+# universal_newlines=True)
+# try:
+# stdout, stderr = p.communicate(timeout=180)
+# except TimeoutExpired:
+# p.kill()
+# stdout, stderr = p.communicate()
+# # Generate an hOCR file with no recognized text if tesseract times out.
+# # Temporary workaround for hocrTransform being unable to function
+# # without a valid hOCR file.
+# with open(output_file, 'w', encoding="utf-8") as f:
+# f.write(hocr_template.format(pageinfo['width_pixels'],
+# pageinfo['height_pixels']))
+# else:
+# if stdout:
+# log.info(stdout)
+# if stderr:
+# log.error(stderr)
+
+# if p.returncode != 0:
+# raise CalledProcessError(p.returncode, args_tesseract)
+
+# if os.path.exists(output_file + '.html'):
+# # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
+# shutil.move(output_file + '.html', output_file)
+# elif os.path.exists(output_file + '.hocr'):
+# # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
+# shutil.move(output_file + '.hocr', output_file)
+
+# # Tesseract inserts source filename into hocr file without escaping
+# # it. This could break the XML parser. Rewrite the hocr file,
+# # replacing the filename with a space.
+# regex_nested_single_quotes = re.compile(
+# r"""title='image "([^"]*)";""")
+# with fileinput.input(files=(output_file,), inplace=True) as f:
+# for line in f:
+# line = regex_nested_single_quotes.sub(
+# r"""title='image " ";""", line)
+# print(line, end='') # fileinput.input redirects stdout
+
+
+# @active_if(ocr_required and not options.exact_image)
+# @merge([unpack_with_ghostscript, convert_to_png,
+# deskew_imagemagick, deskew_leptonica, cleaned_to_png],
+# os.path.join(options.temp_folder, "%04i.image_for_pdf" % pageno))
+# def select_image_for_pdf(infiles, output_file):
+# if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0:
+# input_file = infiles[-1]
+# elif options.preprocess_deskew != 0 and options.preprocess_clean != 0:
+# input_file = infiles[-2]
+# elif options.preprocess_deskew != 0 and options.preprocess_clean == 0:
+# input_file = infiles[-1]
+# else:
+# input_file = infiles[0]
+
+# if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
+# # If all images were JPEGs originally, produce a JPEG as output
+# check_call(['convert', input_file, 'jpg:' + output_file])
+# else:
+# re_symlink(input_file, output_file)
+
+
+# @active_if(ocr_required and not options.exact_image)
+# @merge([ocr_tesseract, select_image_for_pdf],
+# os.path.join(options.temp_folder, '%04i.rendered.pdf' % pageno))
+# def render_page(infiles, output_file):
+# hocr, image = infiles[0], infiles[1]
+
+# dpi = round(max(pageinfo['xres'], pageinfo['yres']))
+
+# hocrtransform = HocrTransform(hocr, dpi)
+# hocrtransform.to_pdf(output_file, imageFileName=image,
+# showBoundingboxes=False, invisibleText=True)
+
+
+# @active_if(ocr_required and options.pdf_noimg)
+# @transform(ocr_tesseract, suffix(".hocr"), ".ocred.todebug.pdf")
+# def render_text_output_page(input_file, output_file):
+# dpi = round(max(pageinfo['xres'], pageinfo['yres']))
+
+# hocrtransform = HocrTransform(input_file, dpi)
+# hocrtransform.to_pdf(output_file, imageFileName=None,
+# showBoundingboxes=True, invisibleText=False)
+
+
+# @active_if(ocr_required and options.exact_image)
+# @transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf")
+# def render_hocr_blank_page(input_file, output_file):
+# dpi = round(max(pageinfo['xres'], pageinfo['yres']))
+
+# hocrtransform = HocrTransform(input_file, dpi)
+# hocrtransform.to_pdf(output_file, imageFileName=None,
+# showBoundingboxes=False, invisibleText=True)
+
+
+# @active_if(ocr_required and options.exact_image)
+# @merge([render_hocr_blank_page, extract_single_page],
+# os.path.join(options.temp_folder, "%04i.merged.pdf") % pageno)
+# def merge_hocr_with_original_page(infiles, output_file):
+# with open(infiles[0], 'rb') as hocr_input, \
+# open(infiles[1], 'rb') as page_input, \
+# open(output_file, 'wb') as output:
+# hocr_reader = pypdf.PdfFileReader(hocr_input)
+# page_reader = pypdf.PdfFileReader(page_input)
+# writer = pypdf.PdfFileWriter()
+
+# the_page = hocr_reader.getPage(0)
+# the_page.mergePage(page_reader.getPage(0))
+# writer.addPage(the_page)
+# writer.write(output)
+
+
+# @merge([render_page, merge_hocr_with_original_page, extract_single_page],
+# os.path.join(options.temp_folder, '%04i.ocred.pdf' % pageno))
+# def select_final_page(infiles, output_file):
+# re_symlink(infiles[-1], output_file)
+
+
+# if __name__ == '__main__':
+# cmdline.run(options)