From e89f482c3dc09c654317ca42bff51dfb23f835c6 Mon Sep 17 00:00:00 2001 From: Jim Barlow Date: Wed, 22 Jul 2015 22:51:38 -0700 Subject: [PATCH] Fixes from early testing of new pipeline --- src/ocrmypdf.py | 1082 +++++++++++++++++++++++------------------------ 1 file changed, 541 insertions(+), 541 deletions(-) diff --git a/src/ocrmypdf.py b/src/ocrmypdf.py index 7aefb8e4..cd7031da 100755 --- a/src/ocrmypdf.py +++ b/src/ocrmypdf.py @@ -18,7 +18,7 @@ except ImportError: from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \ - mkdir, formatter + mkdir, formatter, follows, split import ruffus.cmdline as cmdline from .hocrtransform import HocrTransform @@ -40,10 +40,10 @@ parser.add_argument( 'outputfile', help="output searchable PDF file") parser.add_argument( - '-l', '--language', nargs='*', default=['eng'] + '-l', '--language', nargs='*', default=['eng'], help="language of the file to be OCRed") -preprocessing = parser.add_group( +preprocessing = parser.add_argument_group( "Preprocessing options", "Improve OCR quality and final image") preprocessing.add_argument( @@ -72,7 +72,7 @@ parser.add_argument( '--exact-image', action='store_true', help="Use original page from PDF without re-rendering") -advanced = parser.add_group( +advanced = parser.add_argument_group( "Advanced", "Advanced options for power users and debugging") advanced.add_argument( @@ -88,7 +88,7 @@ advanced.add_argument( '--tesseract-config', default='', nargs='*', # Implemented help="Tesseract configuration") -debugging = parser.add_group( +debugging = parser.add_argument_group( "Debugging", "Arguments to help with troubleshooting and debugging") debugging.add_argument( @@ -188,541 +188,541 @@ def split_pages( check_call(args_pdfseparate) -FRIENDLY_COLORSPACE = { - '/DeviceGray': 'gray', - '/CalGray': 'gray', - '/DeviceRGB': 'rgb', - '/CalRGB': 'rgb', - '/DeviceCMYK': 'cmyk', - '/Lab': 'lab', - '/ICCBased': 'icc', - '/Indexed': 'index', - '/Separation': 'sep', - '/DeviceN': 'devn', - '/Pattern': '-' -} - -FRIENDLY_ENCODING = { - '/CCITTFaxDecode': 'ccitt', - '/DCTDecode': 'jpeg', - '/JPXDecode': 'jpx', - '/JBIG2Decode': 'jbig2', -} - -FRIENDLY_COMP = { - 'gray': 1, - 'rgb': 3, - 'cmyk': 4, - 'lab': 3, -} - - -def pdf_get_pageinfo(infile, page, width_pt, height_pt): - pageinfo = {} - pageinfo['pageno'] = page - pageinfo['width_inches'] = width_pt / 72.0 - pageinfo['height_inches'] = height_pt / 72.0 - pageinfo['images'] = [] - - p_pdftotext = Popen(['pdftotext', '-f', str(page), '-l', str(page), - '-raw', '-nopgbrk', infile, '-'], - close_fds=True, stdout=PIPE, stderr=PIPE, - universal_newlines=True) - text, _ = p_pdftotext.communicate() - if len(text.strip()) > 0: - pageinfo['has_text'] = True - else: - pageinfo['has_text'] = False - - pdf = pypdf.PdfFileReader(infile) - page = pdf.pages[page - 1] - - if not '/XObject' in page['/Resources']: - # Missing /XObject means no images or possibly corrupt PDF - return pageinfo - - for xobj in page['/Resources']['/XObject']: - # PyPDF2 returns the keys as an iterator - pdfimage = page['/Resources']['/XObject'][xobj] - if pdfimage['/Subtype'] != '/Image': - continue - if '/ImageMask' in pdfimage: - if pdfimage['/ImageMask']: - continue - image = {} - image['width'] = pdfimage['/Width'] - image['height'] = pdfimage['/Height'] - image['bpc'] = pdfimage['/BitsPerComponent'] - if '/Filter' in pdfimage: - filter_ = pdfimage['/Filter'] - if isinstance(filter_, pypdf.generic.ArrayObject): - filter_ = filter_[0] - image['enc'] = FRIENDLY_ENCODING.get(filter_, 'image') - else: - image['enc'] = 'image' - if '/ColorSpace' in pdfimage: - cs = pdfimage['/ColorSpace'] - if isinstance(cs, pypdf.generic.ArrayObject): - cs = cs[0] - image['color'] = FRIENDLY_COLORSPACE.get(cs, '-') - else: - image['color'] = 'jpx' if image['enc'] == 'jpx' else '?' - - image['comp'] = FRIENDLY_COMP.get(image['color'], '?') - image['dpi_w'] = image['width'] / pageinfo['width_inches'] - image['dpi_h'] = image['height'] / pageinfo['height_inches'] - image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5 - pageinfo['images'].append(image) - - if pageinfo['images']: - xres = max(image['dpi_w'] for image in pageinfo['images']) - yres = max(image['dpi_h'] for image in pageinfo['images']) - pageinfo['xres'], pageinfo['yres'] = xres, yres - pageinfo['width_pixels'] = \ - int(round(xres * pageinfo['width_inches'])) - pageinfo['height_pixels'] = \ - int(round(yres * pageinfo['height_inches'])) - - if options.oversampling_dpi > 0: - rx, ry = options.oversampling_dpi, options.oversampling_dpi - else: - rx, ry = pageinfo['xres'], pageinfo['yres'] - pageinfo['xres_render'], pageinfo['yres_render'] = rx, ry - - return pageinfo - -pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3)) -pageinfo = pdf_get_pageinfo(options.inputfile, pageno, width_pt, height_pt) - -if not pageinfo['images']: - # If the page has no images, then it contains vector content or text - # or both. It seems quite unlikely that one would find meaningful text - # from rasterizing vector content. So skip the page. - log.info( - "Page {0} has no images - skipping OCR".format(pageno) - ) -elif pageinfo['has_text']: - s = "Page {0} already has text! – {1}" - - if not options.force_ocr and not options.skip_text: - log.error(s.format(pageno, - "aborting (use -f or -s to force OCR)")) - sys.exit(1) - elif options.force_ocr: - log.info(s.format(pageno, - "rasterizing text and running OCR anyway")) - elif options.skip_text: - log.info(s.format(pageno, - "skipping all processing on this page")) - -ocr_required = pageinfo['images'] and \ - (options.force_ocr or - (not (pageinfo['has_text'] and options.skip_text))) - -if ocr_required and options.skip_big: - area = pageinfo['width_inches'] * pageinfo['height_inches'] - pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels'] - if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17): - ocr_required = False - log.info( - "Page {0} is very large; skipping due to -b".format(pageno)) - - - - - -@active_if(not ocr_required or (ocr_required and options.exact_image)) -@transform(setup_working_directory, - formatter(), - os.path.join(options.temp_folder, '%04i.page.pdf' % pageno)) -def extract_single_page( - input_file, - output_file): - args_pdfseparate = [ - 'pdfseparate', - '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']), - input_file, - output_file - ] - check_call(args_pdfseparate) - - -@active_if(ocr_required) -@active_if(options.page_renderer == 'pdftoppm') -@transform(setup_working_directory, - formatter(), - "{path[0]}/%04i.pnm" % pageno) -def unpack_with_pdftoppm( - input_file, - output_file): - force_ppm = True - allow_jpeg = False - - colorspace = 'color' - compression = 'deflate' - output_format = 'tiff' - if all(image['comp'] == 1 for image in pageinfo['images']): - if all(image['bpc'] == 1 for image in pageinfo['images']): - colorspace = 'mono' - compression = 'deflate' - elif not any(image['color'] == 'color' - for image in pageinfo['images']): - colorspace = 'gray' - - if allow_jpeg and \ - all(image['enc'] == 'jpeg' for image in pageinfo['images']): - output_format = 'jpeg' - - args_pdftoppm = [ - 'pdftoppm', - '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']), - '-rx', str(pageinfo['xres_render']), - '-ry', str(pageinfo['yres_render']) - ] - - if not force_ppm: - if output_format == 'tiff': - args_pdftoppm.append('-tiff') - if False and compression: - args_pdftoppm.append('-tiffcompression') - args_pdftoppm.append(compression) - elif output_format == 'jpeg': - args_pdftoppm.append('-jpeg') - - if colorspace == 'mono': - args_pdftoppm.append('-mono') - elif colorspace == 'gray': - args_pdftoppm.append('-gray') - - args_pdftoppm.extend([str(input_file)]) - - # Ask pdftoppm to write the binary output to stdout; therefore set - # universal_newlines=False - p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'), - stderr=PIPE, universal_newlines=False) - _, stderr = p.communicate() - if stderr: - # Because universal_newlines=False, stderr is bytes(), so we must - # manually convert it to str for logging - from codecs import decode - log.error(decode(stderr, sys.getdefaultencoding(), 'ignore')) - if p.returncode != 0: - raise CalledProcessError(p.returncode, args_pdftoppm) - - -@active_if(ocr_required) -@transform(unpack_with_pdftoppm, suffix(".pnm"), ".png") -def convert_to_png(input_file, output_file): - args_convert = [ - 'convert', - input_file, - output_file - ] - check_call(args_convert) - - -@active_if(ocr_required) -@active_if(options.page_renderer == 'ghostscript') -@transform(setup_working_directory, - formatter(), - "{path[0]}/%04i.png" % pageno) -def unpack_with_ghostscript( - input_file, - output_file): - device = 'png16m' # 24-bit - if all(image['comp'] == 1 for image in pageinfo['images']): - if all(image['bpc'] == 1 for image in pageinfo['images']): - device = 'pngmono' - elif not any(image['color'] == 'color' - for image in pageinfo['images']): - device = 'pnggray' - - args_gs = [ - 'gs', - '-dBATCH', '-dNOPAUSE', - '-dFirstPage=%i' % pageno, - '-dLastPage=%i' % pageno, - '-sDEVICE=%s' % device, - '-o', output_file, - '-r{0}x{1}'.format( - str(pageinfo['xres_render']), str(pageinfo['yres_render'])), - input_file - ] - - p = Popen(args_gs, close_fds=True, stdout=PIPE, stderr=PIPE, - universal_newlines=True) - stdout, stderr = p.communicate() - if stdout: - log.info(stdout) - if stderr: - log.error(stderr) - - try: - f = open(output_file) - except FileNotFoundError: - raise - else: - f.close() - - -@active_if(ocr_required) -@active_if(options.preprocess_deskew != 0 - and options.deskew_provider == 'imagemagick') -@transform(convert_to_png, suffix(".png"), ".deskewed.png") -def deskew_imagemagick(input_file, output_file): - args_convert = [ - 'convert', - input_file, - '-deskew', '40%', - '-gravity', 'center', - '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo), - '+repage', - output_file - ] - - p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE, - universal_newlines=True) - stdout, stderr = p.communicate() - - if stdout: - log.info(stdout) - if stderr: - log.error(stderr) - - if p.returncode != 0: - raise CalledProcessError(p.returncode, args_convert) - - -@active_if(ocr_required) -@active_if(options.preprocess_deskew != 0 - and options.deskew_provider == 'leptonica') -@transform(convert_to_png, suffix(".png"), ".deskewed.png") -def deskew_leptonica(input_file, output_file): - from .leptonica import deskew - deskew(input_file, output_file, - min(pageinfo['xres'], pageinfo['yres'])) - - -@active_if(ocr_required) -@active_if(options.preprocess_clean != 0) -@merge([unpack_with_pdftoppm, unpack_with_ghostscript, - deskew_imagemagick, deskew_leptonica], - os.path.join(options.temp_folder, "%04i.for_clean.pnm" % pageno)) -def select_image_for_cleaning(infiles, output_file): - input_file = infiles[-1] - args_convert = [ - 'convert', - input_file, - output_file - ] - check_call(args_convert) - - -@active_if(ocr_required) -@active_if(options.preprocess_clean != 0) -@transform(select_image_for_cleaning, suffix(".pnm"), ".cleaned.pnm") -def clean_unpaper(input_file, output_file): - args_unpaper = [ - 'unpaper', - '--dpi', str(int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))), - '--mask-scan-size', '100', - '--no-deskew', - '--no-grayfilter', - '--no-blackfilter', - '--no-mask-center', - '--no-border-align', - input_file, - output_file - ] - - p = Popen(args_unpaper, close_fds=True, stdout=PIPE, stderr=PIPE, - universal_newlines=True) - stdout, stderr = p.communicate() - - if stdout: - log.info(stdout) - if stderr: - log.error(stderr) - - if p.returncode != 0: - raise CalledProcessError(p.returncode, args_unpaper) - - -@active_if(ocr_required) -@transform(clean_unpaper, suffix(".cleaned.pnm"), ".cleaned.png") -def cleaned_to_png(input_file, output_file): - args_convert = [ - 'convert', - input_file, - output_file - ] - check_call(args_convert) - - -@active_if(ocr_required) -@merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick, - deskew_leptonica, cleaned_to_png], - os.path.join(options.temp_folder, "%04i.for_ocr.png" % pageno)) -def select_ocr_image(infiles, output_file): - re_symlink(infiles[-1], output_file) - - -hocr_template = ''' - - - - - - - - - -
-
-

- - -

-
-
- -''' - - -@active_if(ocr_required) -@transform(select_ocr_image, suffix(".for_ocr.png"), ".hocr") -def ocr_tesseract( - input_file, - output_file): - - args_tesseract = [ - 'tesseract', - '-l', options.language, - input_file, - output_file, - 'hocr', - options.tess_cfg_files - ] - p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE, - universal_newlines=True) - try: - stdout, stderr = p.communicate(timeout=180) - except TimeoutExpired: - p.kill() - stdout, stderr = p.communicate() - # Generate a HOCR file with no recognized text if tesseract times out - # Temporary workaround to hocrTransform not being able to function if - # it does not have a valid hOCR file. - with open(output_file, 'w', encoding="utf-8") as f: - f.write(hocr_template.format(pageinfo['width_pixels'], - pageinfo['height_pixels'])) - else: - if stdout: - log.info(stdout) - if stderr: - log.error(stderr) - - if p.returncode != 0: - raise CalledProcessError(p.returncode, args_tesseract) - - if os.path.exists(output_file + '.html'): - # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html) - shutil.move(output_file + '.html', output_file) - elif os.path.exists(output_file + '.hocr'): - # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr) - shutil.move(output_file + '.hocr', output_file) - - # Tesseract inserts source filename into hocr file without escaping - # it. This could break the XML parser. Rewrite the hocr file, - # replacing the filename with a space. - regex_nested_single_quotes = re.compile( - r"""title='image "([^"]*)";""") - with fileinput.input(files=(output_file,), inplace=True) as f: - for line in f: - line = regex_nested_single_quotes.sub( - r"""title='image " ";""", line) - print(line, end='') # fileinput.input redirects stdout - - -@active_if(ocr_required and not options.exact_image) -@merge([unpack_with_ghostscript, convert_to_png, - deskew_imagemagick, deskew_leptonica, cleaned_to_png], - os.path.join(options.temp_folder, "%04i.image_for_pdf" % pageno)) -def select_image_for_pdf(infiles, output_file): - if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0: - input_file = infiles[-1] - elif options.preprocess_deskew != 0 and options.preprocess_clean != 0: - input_file = infiles[-2] - elif options.preprocess_deskew != 0 and options.preprocess_clean == 0: - input_file = infiles[-1] - else: - input_file = infiles[0] - - if all(image['enc'] == 'jpeg' for image in pageinfo['images']): - # If all images were JPEGs originally, produce a JPEG as output - check_call(['convert', input_file, 'jpg:' + output_file]) - else: - re_symlink(input_file, output_file) - - -@active_if(ocr_required and not options.exact_image) -@merge([ocr_tesseract, select_image_for_pdf], - os.path.join(options.temp_folder, '%04i.rendered.pdf' % pageno)) -def render_page(infiles, output_file): - hocr, image = infiles[0], infiles[1] - - dpi = round(max(pageinfo['xres'], pageinfo['yres'])) - - hocrtransform = HocrTransform(hocr, dpi) - hocrtransform.to_pdf(output_file, imageFileName=image, - showBoundingboxes=False, invisibleText=True) - - -@active_if(ocr_required and options.pdf_noimg) -@transform(ocr_tesseract, suffix(".hocr"), ".ocred.todebug.pdf") -def render_text_output_page(input_file, output_file): - dpi = round(max(pageinfo['xres'], pageinfo['yres'])) - - hocrtransform = HocrTransform(input_file, dpi) - hocrtransform.to_pdf(output_file, imageFileName=None, - showBoundingboxes=True, invisibleText=False) - - -@active_if(ocr_required and options.exact_image) -@transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf") -def render_hocr_blank_page(input_file, output_file): - dpi = round(max(pageinfo['xres'], pageinfo['yres'])) - - hocrtransform = HocrTransform(input_file, dpi) - hocrtransform.to_pdf(output_file, imageFileName=None, - showBoundingboxes=False, invisibleText=True) - - -@active_if(ocr_required and options.exact_image) -@merge([render_hocr_blank_page, extract_single_page], - os.path.join(options.temp_folder, "%04i.merged.pdf") % pageno) -def merge_hocr_with_original_page(infiles, output_file): - with open(infiles[0], 'rb') as hocr_input, \ - open(infiles[1], 'rb') as page_input, \ - open(output_file, 'wb') as output: - hocr_reader = pypdf.PdfFileReader(hocr_input) - page_reader = pypdf.PdfFileReader(page_input) - writer = pypdf.PdfFileWriter() - - the_page = hocr_reader.getPage(0) - the_page.mergePage(page_reader.getPage(0)) - writer.addPage(the_page) - writer.write(output) - - -@merge([render_page, merge_hocr_with_original_page, extract_single_page], - os.path.join(options.temp_folder, '%04i.ocred.pdf' % pageno)) -def select_final_page(infiles, output_file): - re_symlink(infiles[-1], output_file) - - -if __name__ == '__main__': - cmdline.run(options) +# FRIENDLY_COLORSPACE = { +# '/DeviceGray': 'gray', +# '/CalGray': 'gray', +# '/DeviceRGB': 'rgb', +# '/CalRGB': 'rgb', +# '/DeviceCMYK': 'cmyk', +# '/Lab': 'lab', +# '/ICCBased': 'icc', +# '/Indexed': 'index', +# '/Separation': 'sep', +# '/DeviceN': 'devn', +# '/Pattern': '-' +# } + +# FRIENDLY_ENCODING = { +# '/CCITTFaxDecode': 'ccitt', +# '/DCTDecode': 'jpeg', +# '/JPXDecode': 'jpx', +# '/JBIG2Decode': 'jbig2', +# } + +# FRIENDLY_COMP = { +# 'gray': 1, +# 'rgb': 3, +# 'cmyk': 4, +# 'lab': 3, +# } + + +# def pdf_get_pageinfo(infile, page, width_pt, height_pt): +# pageinfo = {} +# pageinfo['pageno'] = page +# pageinfo['width_inches'] = width_pt / 72.0 +# pageinfo['height_inches'] = height_pt / 72.0 +# pageinfo['images'] = [] + +# p_pdftotext = Popen(['pdftotext', '-f', str(page), '-l', str(page), +# '-raw', '-nopgbrk', infile, '-'], +# close_fds=True, stdout=PIPE, stderr=PIPE, +# universal_newlines=True) +# text, _ = p_pdftotext.communicate() +# if len(text.strip()) > 0: +# pageinfo['has_text'] = True +# else: +# pageinfo['has_text'] = False + +# pdf = pypdf.PdfFileReader(infile) +# page = pdf.pages[page - 1] + +# if not '/XObject' in page['/Resources']: +# # Missing /XObject means no images or possibly corrupt PDF +# return pageinfo + +# for xobj in page['/Resources']['/XObject']: +# # PyPDF2 returns the keys as an iterator +# pdfimage = page['/Resources']['/XObject'][xobj] +# if pdfimage['/Subtype'] != '/Image': +# continue +# if '/ImageMask' in pdfimage: +# if pdfimage['/ImageMask']: +# continue +# image = {} +# image['width'] = pdfimage['/Width'] +# image['height'] = pdfimage['/Height'] +# image['bpc'] = pdfimage['/BitsPerComponent'] +# if '/Filter' in pdfimage: +# filter_ = pdfimage['/Filter'] +# if isinstance(filter_, pypdf.generic.ArrayObject): +# filter_ = filter_[0] +# image['enc'] = FRIENDLY_ENCODING.get(filter_, 'image') +# else: +# image['enc'] = 'image' +# if '/ColorSpace' in pdfimage: +# cs = pdfimage['/ColorSpace'] +# if isinstance(cs, pypdf.generic.ArrayObject): +# cs = cs[0] +# image['color'] = FRIENDLY_COLORSPACE.get(cs, '-') +# else: +# image['color'] = 'jpx' if image['enc'] == 'jpx' else '?' + +# image['comp'] = FRIENDLY_COMP.get(image['color'], '?') +# image['dpi_w'] = image['width'] / pageinfo['width_inches'] +# image['dpi_h'] = image['height'] / pageinfo['height_inches'] +# image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** 0.5 +# pageinfo['images'].append(image) + +# if pageinfo['images']: +# xres = max(image['dpi_w'] for image in pageinfo['images']) +# yres = max(image['dpi_h'] for image in pageinfo['images']) +# pageinfo['xres'], pageinfo['yres'] = xres, yres +# pageinfo['width_pixels'] = \ +# int(round(xres * pageinfo['width_inches'])) +# pageinfo['height_pixels'] = \ +# int(round(yres * pageinfo['height_inches'])) + +# if options.oversampling_dpi > 0: +# rx, ry = options.oversampling_dpi, options.oversampling_dpi +# else: +# rx, ry = pageinfo['xres'], pageinfo['yres'] +# pageinfo['xres_render'], pageinfo['yres_render'] = rx, ry + +# return pageinfo + +# pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3)) +# pageinfo = pdf_get_pageinfo(options.inputfile, pageno, width_pt, height_pt) + +# if not pageinfo['images']: +# # If the page has no images, then it contains vector content or text +# # or both. It seems quite unlikely that one would find meaningful text +# # from rasterizing vector content. So skip the page. +# log.info( +# "Page {0} has no images - skipping OCR".format(pageno) +# ) +# elif pageinfo['has_text']: +# s = "Page {0} already has text! – {1}" + +# if not options.force_ocr and not options.skip_text: +# log.error(s.format(pageno, +# "aborting (use -f or -s to force OCR)")) +# sys.exit(1) +# elif options.force_ocr: +# log.info(s.format(pageno, +# "rasterizing text and running OCR anyway")) +# elif options.skip_text: +# log.info(s.format(pageno, +# "skipping all processing on this page")) + +# ocr_required = pageinfo['images'] and \ +# (options.force_ocr or +# (not (pageinfo['has_text'] and options.skip_text))) + +# if ocr_required and options.skip_big: +# area = pageinfo['width_inches'] * pageinfo['height_inches'] +# pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels'] +# if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17): +# ocr_required = False +# log.info( +# "Page {0} is very large; skipping due to -b".format(pageno)) + + + + + +# @active_if(not ocr_required or (ocr_required and options.exact_image)) +# @transform(setup_working_directory, +# formatter(), +# os.path.join(options.temp_folder, '%04i.page.pdf' % pageno)) +# def extract_single_page( +# input_file, +# output_file): +# args_pdfseparate = [ +# 'pdfseparate', +# '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']), +# input_file, +# output_file +# ] +# check_call(args_pdfseparate) + + +# @active_if(ocr_required) +# @active_if(options.page_renderer == 'pdftoppm') +# @transform(setup_working_directory, +# formatter(), +# "{path[0]}/%04i.pnm" % pageno) +# def unpack_with_pdftoppm( +# input_file, +# output_file): +# force_ppm = True +# allow_jpeg = False + +# colorspace = 'color' +# compression = 'deflate' +# output_format = 'tiff' +# if all(image['comp'] == 1 for image in pageinfo['images']): +# if all(image['bpc'] == 1 for image in pageinfo['images']): +# colorspace = 'mono' +# compression = 'deflate' +# elif not any(image['color'] == 'color' +# for image in pageinfo['images']): +# colorspace = 'gray' + +# if allow_jpeg and \ +# all(image['enc'] == 'jpeg' for image in pageinfo['images']): +# output_format = 'jpeg' + +# args_pdftoppm = [ +# 'pdftoppm', +# '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']), +# '-rx', str(pageinfo['xres_render']), +# '-ry', str(pageinfo['yres_render']) +# ] + +# if not force_ppm: +# if output_format == 'tiff': +# args_pdftoppm.append('-tiff') +# if False and compression: +# args_pdftoppm.append('-tiffcompression') +# args_pdftoppm.append(compression) +# elif output_format == 'jpeg': +# args_pdftoppm.append('-jpeg') + +# if colorspace == 'mono': +# args_pdftoppm.append('-mono') +# elif colorspace == 'gray': +# args_pdftoppm.append('-gray') + +# args_pdftoppm.extend([str(input_file)]) + +# # Ask pdftoppm to write the binary output to stdout; therefore set +# # universal_newlines=False +# p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'), +# stderr=PIPE, universal_newlines=False) +# _, stderr = p.communicate() +# if stderr: +# # Because universal_newlines=False, stderr is bytes(), so we must +# # manually convert it to str for logging +# from codecs import decode +# log.error(decode(stderr, sys.getdefaultencoding(), 'ignore')) +# if p.returncode != 0: +# raise CalledProcessError(p.returncode, args_pdftoppm) + + +# @active_if(ocr_required) +# @transform(unpack_with_pdftoppm, suffix(".pnm"), ".png") +# def convert_to_png(input_file, output_file): +# args_convert = [ +# 'convert', +# input_file, +# output_file +# ] +# check_call(args_convert) + + +# @active_if(ocr_required) +# @active_if(options.page_renderer == 'ghostscript') +# @transform(setup_working_directory, +# formatter(), +# "{path[0]}/%04i.png" % pageno) +# def unpack_with_ghostscript( +# input_file, +# output_file): +# device = 'png16m' # 24-bit +# if all(image['comp'] == 1 for image in pageinfo['images']): +# if all(image['bpc'] == 1 for image in pageinfo['images']): +# device = 'pngmono' +# elif not any(image['color'] == 'color' +# for image in pageinfo['images']): +# device = 'pnggray' + +# args_gs = [ +# 'gs', +# '-dBATCH', '-dNOPAUSE', +# '-dFirstPage=%i' % pageno, +# '-dLastPage=%i' % pageno, +# '-sDEVICE=%s' % device, +# '-o', output_file, +# '-r{0}x{1}'.format( +# str(pageinfo['xres_render']), str(pageinfo['yres_render'])), +# input_file +# ] + +# p = Popen(args_gs, close_fds=True, stdout=PIPE, stderr=PIPE, +# universal_newlines=True) +# stdout, stderr = p.communicate() +# if stdout: +# log.info(stdout) +# if stderr: +# log.error(stderr) + +# try: +# f = open(output_file) +# except FileNotFoundError: +# raise +# else: +# f.close() + + +# @active_if(ocr_required) +# @active_if(options.preprocess_deskew != 0 +# and options.deskew_provider == 'imagemagick') +# @transform(convert_to_png, suffix(".png"), ".deskewed.png") +# def deskew_imagemagick(input_file, output_file): +# args_convert = [ +# 'convert', +# input_file, +# '-deskew', '40%', +# '-gravity', 'center', +# '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo), +# '+repage', +# output_file +# ] + +# p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE, +# universal_newlines=True) +# stdout, stderr = p.communicate() + +# if stdout: +# log.info(stdout) +# if stderr: +# log.error(stderr) + +# if p.returncode != 0: +# raise CalledProcessError(p.returncode, args_convert) + + +# @active_if(ocr_required) +# @active_if(options.preprocess_deskew != 0 +# and options.deskew_provider == 'leptonica') +# @transform(convert_to_png, suffix(".png"), ".deskewed.png") +# def deskew_leptonica(input_file, output_file): +# from .leptonica import deskew +# deskew(input_file, output_file, +# min(pageinfo['xres'], pageinfo['yres'])) + + +# @active_if(ocr_required) +# @active_if(options.preprocess_clean != 0) +# @merge([unpack_with_pdftoppm, unpack_with_ghostscript, +# deskew_imagemagick, deskew_leptonica], +# os.path.join(options.temp_folder, "%04i.for_clean.pnm" % pageno)) +# def select_image_for_cleaning(infiles, output_file): +# input_file = infiles[-1] +# args_convert = [ +# 'convert', +# input_file, +# output_file +# ] +# check_call(args_convert) + + +# @active_if(ocr_required) +# @active_if(options.preprocess_clean != 0) +# @transform(select_image_for_cleaning, suffix(".pnm"), ".cleaned.pnm") +# def clean_unpaper(input_file, output_file): +# args_unpaper = [ +# 'unpaper', +# '--dpi', str(int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))), +# '--mask-scan-size', '100', +# '--no-deskew', +# '--no-grayfilter', +# '--no-blackfilter', +# '--no-mask-center', +# '--no-border-align', +# input_file, +# output_file +# ] + +# p = Popen(args_unpaper, close_fds=True, stdout=PIPE, stderr=PIPE, +# universal_newlines=True) +# stdout, stderr = p.communicate() + +# if stdout: +# log.info(stdout) +# if stderr: +# log.error(stderr) + +# if p.returncode != 0: +# raise CalledProcessError(p.returncode, args_unpaper) + + +# @active_if(ocr_required) +# @transform(clean_unpaper, suffix(".cleaned.pnm"), ".cleaned.png") +# def cleaned_to_png(input_file, output_file): +# args_convert = [ +# 'convert', +# input_file, +# output_file +# ] +# check_call(args_convert) + + +# @active_if(ocr_required) +# @merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick, +# deskew_leptonica, cleaned_to_png], +# os.path.join(options.temp_folder, "%04i.for_ocr.png" % pageno)) +# def select_ocr_image(infiles, output_file): +# re_symlink(infiles[-1], output_file) + + +# hocr_template = ''' +# +# +# +# +# +# +# +# +# +#
+#
+#

+# +# +#

+#
+#
+# +# ''' + + +# @active_if(ocr_required) +# @transform(select_ocr_image, suffix(".for_ocr.png"), ".hocr") +# def ocr_tesseract( +# input_file, +# output_file): + +# args_tesseract = [ +# 'tesseract', +# '-l', options.language, +# input_file, +# output_file, +# 'hocr', +# options.tess_cfg_files +# ] +# p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE, +# universal_newlines=True) +# try: +# stdout, stderr = p.communicate(timeout=180) +# except TimeoutExpired: +# p.kill() +# stdout, stderr = p.communicate() +# # Generate a HOCR file with no recognized text if tesseract times out +# # Temporary workaround to hocrTransform not being able to function if +# # it does not have a valid hOCR file. +# with open(output_file, 'w', encoding="utf-8") as f: +# f.write(hocr_template.format(pageinfo['width_pixels'], +# pageinfo['height_pixels'])) +# else: +# if stdout: +# log.info(stdout) +# if stderr: +# log.error(stderr) + +# if p.returncode != 0: +# raise CalledProcessError(p.returncode, args_tesseract) + +# if os.path.exists(output_file + '.html'): +# # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html) +# shutil.move(output_file + '.html', output_file) +# elif os.path.exists(output_file + '.hocr'): +# # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr) +# shutil.move(output_file + '.hocr', output_file) + +# # Tesseract inserts source filename into hocr file without escaping +# # it. This could break the XML parser. Rewrite the hocr file, +# # replacing the filename with a space. +# regex_nested_single_quotes = re.compile( +# r"""title='image "([^"]*)";""") +# with fileinput.input(files=(output_file,), inplace=True) as f: +# for line in f: +# line = regex_nested_single_quotes.sub( +# r"""title='image " ";""", line) +# print(line, end='') # fileinput.input redirects stdout + + +# @active_if(ocr_required and not options.exact_image) +# @merge([unpack_with_ghostscript, convert_to_png, +# deskew_imagemagick, deskew_leptonica, cleaned_to_png], +# os.path.join(options.temp_folder, "%04i.image_for_pdf" % pageno)) +# def select_image_for_pdf(infiles, output_file): +# if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0: +# input_file = infiles[-1] +# elif options.preprocess_deskew != 0 and options.preprocess_clean != 0: +# input_file = infiles[-2] +# elif options.preprocess_deskew != 0 and options.preprocess_clean == 0: +# input_file = infiles[-1] +# else: +# input_file = infiles[0] + +# if all(image['enc'] == 'jpeg' for image in pageinfo['images']): +# # If all images were JPEGs originally, produce a JPEG as output +# check_call(['convert', input_file, 'jpg:' + output_file]) +# else: +# re_symlink(input_file, output_file) + + +# @active_if(ocr_required and not options.exact_image) +# @merge([ocr_tesseract, select_image_for_pdf], +# os.path.join(options.temp_folder, '%04i.rendered.pdf' % pageno)) +# def render_page(infiles, output_file): +# hocr, image = infiles[0], infiles[1] + +# dpi = round(max(pageinfo['xres'], pageinfo['yres'])) + +# hocrtransform = HocrTransform(hocr, dpi) +# hocrtransform.to_pdf(output_file, imageFileName=image, +# showBoundingboxes=False, invisibleText=True) + + +# @active_if(ocr_required and options.pdf_noimg) +# @transform(ocr_tesseract, suffix(".hocr"), ".ocred.todebug.pdf") +# def render_text_output_page(input_file, output_file): +# dpi = round(max(pageinfo['xres'], pageinfo['yres'])) + +# hocrtransform = HocrTransform(input_file, dpi) +# hocrtransform.to_pdf(output_file, imageFileName=None, +# showBoundingboxes=True, invisibleText=False) + + +# @active_if(ocr_required and options.exact_image) +# @transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf") +# def render_hocr_blank_page(input_file, output_file): +# dpi = round(max(pageinfo['xres'], pageinfo['yres'])) + +# hocrtransform = HocrTransform(input_file, dpi) +# hocrtransform.to_pdf(output_file, imageFileName=None, +# showBoundingboxes=False, invisibleText=True) + + +# @active_if(ocr_required and options.exact_image) +# @merge([render_hocr_blank_page, extract_single_page], +# os.path.join(options.temp_folder, "%04i.merged.pdf") % pageno) +# def merge_hocr_with_original_page(infiles, output_file): +# with open(infiles[0], 'rb') as hocr_input, \ +# open(infiles[1], 'rb') as page_input, \ +# open(output_file, 'wb') as output: +# hocr_reader = pypdf.PdfFileReader(hocr_input) +# page_reader = pypdf.PdfFileReader(page_input) +# writer = pypdf.PdfFileWriter() + +# the_page = hocr_reader.getPage(0) +# the_page.mergePage(page_reader.getPage(0)) +# writer.addPage(the_page) +# writer.write(output) + + +# @merge([render_page, merge_hocr_with_original_page, extract_single_page], +# os.path.join(options.temp_folder, '%04i.ocred.pdf' % pageno)) +# def select_final_page(infiles, output_file): +# re_symlink(infiles[-1], output_file) + + +# if __name__ == '__main__': +# cmdline.run(options)