2014-09-26 04:19:41 -07:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
import sys
|
2014-09-26 04:43:15 -07:00
|
|
|
|
import os.path
|
2015-02-13 13:41:14 -08:00
|
|
|
|
import fileinput
|
|
|
|
|
import re
|
2014-09-26 04:19:41 -07:00
|
|
|
|
from parse import parse
|
2015-03-10 14:28:38 -07:00
|
|
|
|
import PyPDF2 as pypdf
|
|
|
|
|
import shutil
|
2015-07-23 01:16:05 -07:00
|
|
|
|
from contextlib import suppress
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2014-11-14 02:06:23 -08:00
|
|
|
|
from subprocess import Popen, check_call, PIPE, CalledProcessError, \
|
|
|
|
|
TimeoutExpired
|
2014-10-08 03:21:28 -07:00
|
|
|
|
try:
|
|
|
|
|
from subprocess import DEVNULL
|
|
|
|
|
except ImportError:
|
|
|
|
|
import os
|
|
|
|
|
DEVNULL = open(os.devnull, 'wb')
|
|
|
|
|
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
2014-10-10 00:35:49 -07:00
|
|
|
|
from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
|
2015-07-22 22:51:38 -07:00
|
|
|
|
mkdir, formatter, follows, split
|
2014-10-08 03:21:28 -07:00
|
|
|
|
import ruffus.cmdline as cmdline
|
2015-02-20 17:20:48 -08:00
|
|
|
|
from .hocrtransform import HocrTransform
|
|
|
|
|
|
2015-04-09 03:12:04 -07:00
|
|
|
|
import warnings
|
diff --git a/src/ocrmypdf.py b/src/ocrmypdf.py
index 68d1591..95afa8f 100755
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@@ -24,6 +24,7 @@ import ruffus.cmdline as cmdline
from .hocrtransform import HocrTransform
import warnings
+import multiprocessing
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
@@ -96,7 +97,7 @@ debugging.add_argument(
'-k', '--keep-temporary-files', action='store_true',
help="keep temporary files (helpful for debugging)")
debugging.add_argument(
- '-g' ,'--debug-rendering', action='store_true',
+ '-g', '--debug-rendering', action='store_true',
help="render each page twice with debug information on second page")
@@ -106,51 +107,19 @@ if not options.temp_folder:
options.temp_folder = 'tmp'
-_logger, _logger_mutex = cmdline.setup_logging(__name__, options.log_file,
- options.verbose)
+log, log_mutex = cmdline.setup_logging(__name__, options.log_file,
+ options.verbose)
-class WrappedLogger:
-
- def __init__(self, my_logger, my_mutex):
- self.logger = my_logger
- self.mutex = my_mutex
-
- def log(self, *args, **kwargs):
- with self.mutex:
- self.logger.log(*args, **kwargs)
-
- def debug(self, *args, **kwargs):
- with self.mutex:
- self.logger.debug(*args, **kwargs)
-
- def info(self, *args, **kwargs):
- with self.mutex:
- self.logger.info(*args, **kwargs)
-
- def warning(self, *args, **kwargs):
- with self.mutex:
- self.logger.warning(*args, **kwargs)
-
- def error(self, *args, **kwargs):
- with self.mutex:
- self.logger.error(*args, **kwargs)
-
- def critical(self, *args, **kwargs):
- with self.mutex:
- self.logger.critical(*args, **kwargs)
-
-log = WrappedLogger(_logger, _logger_mutex)
-
-
-def re_symlink(input_file, soft_link_name, log=log):
+def re_symlink(input_file, soft_link_name, log, mutex):
"""
Helper function: relinks soft symbolic link if necessary
"""
if input_file == soft_link_name:
- log.debug("Warning: No symbolic link made. You are using " +
- "the original data directory as the working directory.")
+ with mutex:
+ log.debug("Warning: No symbolic link made. You are using " +
+ "the original data directory as the working directory.")
return
@@ -161,12 +130,14 @@ def re_symlink(input_file, soft_link_name, log=log):
try:
os.unlink(soft_link_name)
except:
- log.debug("Can't unlink %s" % (soft_link_name))
+ with mutex:
+ log.debug("Can't unlink %s" % (soft_link_name))
if not os.path.exists(input_file):
raise Exception("trying to create a broken symlink to %s" % input_file)
- log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))
+ with mutex:
+ log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))
os.symlink(
2015-07-23 02:22:12 -07:00
|
|
|
|
import multiprocessing
|
2015-04-09 03:12:04 -07:00
|
|
|
|
|
2015-04-09 14:06:55 -07:00
|
|
|
|
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
|
2015-04-09 03:12:04 -07:00
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2014-10-08 03:54:06 -07:00
|
|
|
|
basedir = os.path.dirname(os.path.realpath(__file__))
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Command-line interface.
#
# The base parser comes from ruffus (cmdline.get_argparse); the arguments
# specific to this program are layered on top of it below.
# ---------------------------------------------------------------------------
parser = cmdline.get_argparse(
    prog="OCRmyPDF",
    description="Generate searchable PDF file from an image-only PDF file.")

# Positional arguments and the OCR language selection.
parser.add_argument(
    'input_file', help="PDF file containing the images to be OCRed")
parser.add_argument(
    'output_file', help="output searchable PDF file")
parser.add_argument(
    '-l', '--language', nargs='*', default=['eng'],
    help="language of the file to be OCRed")

# Image clean-up steps applied before OCR.
preprocessing = parser.add_argument_group(
    title="Preprocessing options",
    description="Improve OCR quality and final image")
preprocessing.add_argument(
    '-d', '--deskew', action='store_true',
    help="deskew each page before performing OCR")
preprocessing.add_argument(
    '-c', '--clean', action='store_true',
    help="clean pages with unpaper before performing OCR")
preprocessing.add_argument(
    '-i', '--clean-final', action='store_true',
    help="incorporate the cleaned image in the final PDF file")
preprocessing.add_argument(
    '--oversample', metavar='DPI', type=int,
    help="oversample images to improve OCR results slightly")

# Switches controlling which pages are OCRed and how the page is reused.
parser.add_argument(
    '--force-ocr', action='store_true',
    help="Force to OCR, even if the page already contains fonts")
parser.add_argument(
    '--skip-text', action='store_true',
    help="Skip OCR on pages that contain fonts and include the page anyway")
parser.add_argument(
    '--skip-big', action='store_true',
    help="Skip OCR for pages that are very large")
parser.add_argument(
    '--exact-image', action='store_true',
    help="Use original page from PDF without re-rendering")

# Tool selection and tuning knobs.
advanced = parser.add_argument_group(
    title="Advanced",
    description="Advanced options for power users and debugging")
advanced.add_argument(
    '--deskew-provider',
    choices=['imagemagick', 'leptonica'], default='leptonica')
advanced.add_argument(
    '--page-renderer',
    choices=['pdftoppm', 'ghostscript'], default='ghostscript')
advanced.add_argument(
    '--temp-folder', default='', type=str,
    help="folder where the temporary files should be placed")
# NOTE(review): the default is '' while nargs='*' yields a list when the
# option is given - confirm that consumers cope with both types.
advanced.add_argument(
    '--tesseract-config', default='', nargs='*',  # Implemented
    help="Tesseract configuration")

# Troubleshooting aids.
debugging = parser.add_argument_group(
    title="Debugging",
    description="Arguments to help with troubleshooting and debugging")
debugging.add_argument(
    '-k', '--keep-temporary-files', action='store_true',
    help="keep temporary files (helpful for debugging)")
debugging.add_argument(
    '-g', '--debug-rendering', action='store_true',
    help="render each page twice with debug information on second page")
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Parse the command line once, at import time; the rest of the module reads
# its configuration from `options`.
options = parser.parse_args()

# --temp-folder defaults to '', in which case a local 'tmp' folder is used.
options.temp_folder = options.temp_folder or 'tmp'

# setup_logging hands back the logger together with the mutex that callers
# are expected to hold while logging (see re_symlink's `with mutex:` usage).
# NOTE(review): options.log_file / options.verbose are presumably defined by
# the ruffus base parser - they are not added in this file's visible part.
log, log_mutex = cmdline.setup_logging(
    __name__,
    options.log_file,
    options.verbose)
|
2015-03-24 22:46:33 -07:00
|
|
|
|
|
|
|
|
|
|
diff --git a/src/ocrmypdf.py b/src/ocrmypdf.py
index 68d1591..95afa8f 100755
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@@ -24,6 +24,7 @@ import ruffus.cmdline as cmdline
from .hocrtransform import HocrTransform
import warnings
+import multiprocessing
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
@@ -96,7 +97,7 @@ debugging.add_argument(
'-k', '--keep-temporary-files', action='store_true',
help="keep temporary files (helpful for debugging)")
debugging.add_argument(
- '-g' ,'--debug-rendering', action='store_true',
+ '-g', '--debug-rendering', action='store_true',
help="render each page twice with debug information on second page")
@@ -106,51 +107,19 @@ if not options.temp_folder:
options.temp_folder = 'tmp'
-_logger, _logger_mutex = cmdline.setup_logging(__name__, options.log_file,
- options.verbose)
+log, log_mutex = cmdline.setup_logging(__name__, options.log_file,
+ options.verbose)
-class WrappedLogger:
-
- def __init__(self, my_logger, my_mutex):
- self.logger = my_logger
- self.mutex = my_mutex
-
- def log(self, *args, **kwargs):
- with self.mutex:
- self.logger.log(*args, **kwargs)
-
- def debug(self, *args, **kwargs):
- with self.mutex:
- self.logger.debug(*args, **kwargs)
-
- def info(self, *args, **kwargs):
- with self.mutex:
- self.logger.info(*args, **kwargs)
-
- def warning(self, *args, **kwargs):
- with self.mutex:
- self.logger.warning(*args, **kwargs)
-
- def error(self, *args, **kwargs):
- with self.mutex:
- self.logger.error(*args, **kwargs)
-
- def critical(self, *args, **kwargs):
- with self.mutex:
- self.logger.critical(*args, **kwargs)
-
-log = WrappedLogger(_logger, _logger_mutex)
-
-
-def re_symlink(input_file, soft_link_name, log=log):
+def re_symlink(input_file, soft_link_name, log, mutex):
"""
Helper function: relinks soft symbolic link if necessary
"""
if input_file == soft_link_name:
- log.debug("Warning: No symbolic link made. You are using " +
- "the original data directory as the working directory.")
+ with mutex:
+ log.debug("Warning: No symbolic link made. You are using " +
+ "the original data directory as the working directory.")
return
@@ -161,12 +130,14 @@ def re_symlink(input_file, soft_link_name, log=log):
try:
os.unlink(soft_link_name)
except:
- log.debug("Can't unlink %s" % (soft_link_name))
+ with mutex:
+ log.debug("Can't unlink %s" % (soft_link_name))
if not os.path.exists(input_file):
raise Exception("trying to create a broken symlink to %s" % input_file)
- log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))
+ with mutex:
+ log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))
os.symlink(
2015-07-23 02:22:12 -07:00
|
|
|
|
def re_symlink(input_file, soft_link_name, log, mutex):
    """
    Helper function: relinks soft symbolic link if necessary

    Makes ``soft_link_name`` a symbolic link to the absolute path of
    ``input_file``, replacing a symbolic link that already exists under
    that name. Nothing is done when both paths are equal.

    :param input_file: path of the existing file the link should point to
    :param soft_link_name: path of the symbolic link to create or replace
    :param log: object whose ``debug`` method receives progress messages
    :param mutex: context manager entered around every call to ``log``
    :raises Exception: if ``soft_link_name`` exists but is not a symbolic
        link, or if ``input_file`` does not exist
    :raises OSError: if ``os.symlink`` itself fails
    """
    # Guard against soft linking to oneself
    if input_file == soft_link_name:
        with mutex:
            log.debug("Warning: No symbolic link made. You are using " +
                      "the original data directory as the working directory.")
        return

    # Soft link already exists: delete for relink?
    if os.path.lexists(soft_link_name):
        # do not delete or overwrite real (non-soft link) file
        if not os.path.islink(soft_link_name):
            raise Exception("%s exists and is not a link" % soft_link_name)
        try:
            os.unlink(soft_link_name)
        except OSError:
            # Best effort only: a failure is logged and os.symlink below gets
            # the final say. OSError (rather than a bare except) keeps
            # KeyboardInterrupt and SystemExit from being swallowed here.
            with mutex:
                log.debug("Can't unlink %s" % (soft_link_name))

    if not os.path.exists(input_file):
        raise Exception("trying to create a broken symlink to %s" % input_file)

    with mutex:
        log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))

    # Create symbolic link using absolute path
    os.symlink(
        os.path.abspath(input_file),
        soft_link_name
    )
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 01:16:05 -07:00
|
|
|
|
# Remember the directory the program was started from, then do all further
# work from inside the temporary folder.
original_cwd = os.getcwd()
try:
    os.mkdir(options.temp_folder)
except FileExistsError:
    # The folder is already there; reuse it.
    pass
os.chdir(options.temp_folder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@transform(
    os.path.join(original_cwd, options.input_file),
    suffix('.pdf'),
    '.cleaned.pdf')
def clean_pdf(
        input_file,
        output_file):
    """Run ``mutool clean`` on *input_file*, writing *output_file*.

    check_call raises CalledProcessError when mutool exits with a non-zero
    status.
    """
    check_call(['mutool', 'clean', input_file, output_file])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Per-page analysis: decide whether the page named by options.page_info
# needs OCR, and log the reason whenever it does not.
# NOTE(review): neither a --page-info argument nor pdf_get_pageinfo is
# defined in the visible part of this file - confirm both still exist.
pageno, width_pt, height_pt = map(int, options.page_info.split(' ', 3))
pageinfo = pdf_get_pageinfo(options.input_file, pageno, width_pt, height_pt)

if not pageinfo['images']:
    # If the page has no images, then it contains vector content or text
    # or both. It seems quite unlikely that one would find meaningful text
    # from rasterizing vector content. So skip the page.
    log.info(
        "Page {0} has no images - skipping OCR".format(pageno)
    )
elif pageinfo['has_text']:
    s = "Page {0} already has text! – {1}"

    # The messages name the long options because the parser defines no
    # short forms for --force-ocr / --skip-text.
    if not options.force_ocr and not options.skip_text:
        log.error(s.format(pageno,
                           "aborting (use --force-ocr to OCR anyway or "
                           "--skip-text to skip this page)"))
        sys.exit(1)
    elif options.force_ocr:
        log.info(s.format(pageno,
                          "rasterizing text and running OCR anyway"))
    elif options.skip_text:
        log.info(s.format(pageno,
                          "skipping all processing on this page"))

# OCR is needed when the page has images, unless it already has text and
# --skip-text was given; --force-ocr overrides that exception. bool() keeps
# ocr_required a real boolean instead of the (possibly empty) image list.
ocr_required = bool(pageinfo['images']) and \
    (options.force_ocr or
        (not (pageinfo['has_text'] and options.skip_text)))

if ocr_required and options.skip_big:
    area = pageinfo['width_inches'] * pageinfo['height_inches']
    pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
    # "Very large" means bigger than 11 x 17 inches, or more pixels than an
    # 11 x 17 page holds at 300 pixels per inch in each direction.
    if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
        ocr_required = False
        log.info(
            "Page {0} is very large; skipping due to --skip-big".format(
                pageno))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@split(clean_pdf, '*.page.pdf')
def split_pages(
        input_file,
        output_files):
    """Split *input_file* into per-page PDFs with ``pdfseparate``.

    Every path in *output_files* is deleted first (missing files are
    ignored); presumably these are page files left over from an earlier
    run - TODO confirm against ruffus' @split semantics. pdfseparate is
    then given '%06d.page.pdf' as its output name pattern.
    """
    for stale_page in output_files:
        try:
            os.unlink(stale_page)
        except FileNotFoundError:
            pass

    check_call(['pdfseparate', input_file, '%06d.page.pdf'])
|
|
|
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(not ocr_required or (ocr_required and options.exact_image))
|
|
|
|
|
# @transform(setup_working_directory,
|
|
|
|
|
# formatter(),
|
|
|
|
|
# os.path.join(options.temp_folder, '%04i.page.pdf' % pageno))
|
|
|
|
|
# def extract_single_page(
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file):
|
|
|
|
|
# args_pdfseparate = [
|
|
|
|
|
# 'pdfseparate',
|
|
|
|
|
# '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file
|
|
|
|
|
# ]
|
|
|
|
|
# check_call(args_pdfseparate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @active_if(options.page_renderer == 'pdftoppm')
|
|
|
|
|
# @transform(setup_working_directory,
|
|
|
|
|
# formatter(),
|
|
|
|
|
# "{path[0]}/%04i.pnm" % pageno)
|
|
|
|
|
# def unpack_with_pdftoppm(
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file):
|
|
|
|
|
# force_ppm = True
|
|
|
|
|
# allow_jpeg = False
|
|
|
|
|
|
|
|
|
|
# colorspace = 'color'
|
|
|
|
|
# compression = 'deflate'
|
|
|
|
|
# output_format = 'tiff'
|
|
|
|
|
# if all(image['comp'] == 1 for image in pageinfo['images']):
|
|
|
|
|
# if all(image['bpc'] == 1 for image in pageinfo['images']):
|
|
|
|
|
# colorspace = 'mono'
|
|
|
|
|
# compression = 'deflate'
|
|
|
|
|
# elif not any(image['color'] == 'color'
|
|
|
|
|
# for image in pageinfo['images']):
|
|
|
|
|
# colorspace = 'gray'
|
|
|
|
|
|
|
|
|
|
# if allow_jpeg and \
|
|
|
|
|
# all(image['enc'] == 'jpeg' for image in pageinfo['images']):
|
|
|
|
|
# output_format = 'jpeg'
|
|
|
|
|
|
|
|
|
|
# args_pdftoppm = [
|
|
|
|
|
# 'pdftoppm',
|
|
|
|
|
# '-f', str(pageinfo['pageno']), '-l', str(pageinfo['pageno']),
|
|
|
|
|
# '-rx', str(pageinfo['xres_render']),
|
|
|
|
|
# '-ry', str(pageinfo['yres_render'])
|
|
|
|
|
# ]
|
|
|
|
|
|
|
|
|
|
# if not force_ppm:
|
|
|
|
|
# if output_format == 'tiff':
|
|
|
|
|
# args_pdftoppm.append('-tiff')
|
|
|
|
|
# if False and compression:
|
|
|
|
|
# args_pdftoppm.append('-tiffcompression')
|
|
|
|
|
# args_pdftoppm.append(compression)
|
|
|
|
|
# elif output_format == 'jpeg':
|
|
|
|
|
# args_pdftoppm.append('-jpeg')
|
|
|
|
|
|
|
|
|
|
# if colorspace == 'mono':
|
|
|
|
|
# args_pdftoppm.append('-mono')
|
|
|
|
|
# elif colorspace == 'gray':
|
|
|
|
|
# args_pdftoppm.append('-gray')
|
|
|
|
|
|
|
|
|
|
# args_pdftoppm.extend([str(input_file)])
|
|
|
|
|
|
|
|
|
|
# # Ask pdftoppm to write the binary output to stdout; therefore set
|
|
|
|
|
# # universal_newlines=False
|
|
|
|
|
# p = Popen(args_pdftoppm, close_fds=True, stdout=open(output_file, 'wb'),
|
|
|
|
|
# stderr=PIPE, universal_newlines=False)
|
|
|
|
|
# _, stderr = p.communicate()
|
|
|
|
|
# if stderr:
|
|
|
|
|
# # Because universal_newlines=False, stderr is bytes(), so we must
|
|
|
|
|
# # manually convert it to str for logging
|
|
|
|
|
# from codecs import decode
|
|
|
|
|
# log.error(decode(stderr, sys.getdefaultencoding(), 'ignore'))
|
|
|
|
|
# if p.returncode != 0:
|
|
|
|
|
# raise CalledProcessError(p.returncode, args_pdftoppm)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @transform(unpack_with_pdftoppm, suffix(".pnm"), ".png")
|
|
|
|
|
# def convert_to_png(input_file, output_file):
|
|
|
|
|
# args_convert = [
|
|
|
|
|
# 'convert',
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file
|
|
|
|
|
# ]
|
|
|
|
|
# check_call(args_convert)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @active_if(options.page_renderer == 'ghostscript')
|
|
|
|
|
# @transform(setup_working_directory,
|
|
|
|
|
# formatter(),
|
|
|
|
|
# "{path[0]}/%04i.png" % pageno)
|
|
|
|
|
# def unpack_with_ghostscript(
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file):
|
|
|
|
|
# device = 'png16m' # 24-bit
|
|
|
|
|
# if all(image['comp'] == 1 for image in pageinfo['images']):
|
|
|
|
|
# if all(image['bpc'] == 1 for image in pageinfo['images']):
|
|
|
|
|
# device = 'pngmono'
|
|
|
|
|
# elif not any(image['color'] == 'color'
|
|
|
|
|
# for image in pageinfo['images']):
|
|
|
|
|
# device = 'pnggray'
|
|
|
|
|
|
|
|
|
|
# args_gs = [
|
|
|
|
|
# 'gs',
|
|
|
|
|
# '-dBATCH', '-dNOPAUSE',
|
|
|
|
|
# '-dFirstPage=%i' % pageno,
|
|
|
|
|
# '-dLastPage=%i' % pageno,
|
|
|
|
|
# '-sDEVICE=%s' % device,
|
|
|
|
|
# '-o', output_file,
|
|
|
|
|
# '-r{0}x{1}'.format(
|
|
|
|
|
# str(pageinfo['xres_render']), str(pageinfo['yres_render'])),
|
|
|
|
|
# input_file
|
|
|
|
|
# ]
|
|
|
|
|
|
|
|
|
|
# p = Popen(args_gs, close_fds=True, stdout=PIPE, stderr=PIPE,
|
|
|
|
|
# universal_newlines=True)
|
|
|
|
|
# stdout, stderr = p.communicate()
|
|
|
|
|
# if stdout:
|
|
|
|
|
# log.info(stdout)
|
|
|
|
|
# if stderr:
|
|
|
|
|
# log.error(stderr)
|
|
|
|
|
|
|
|
|
|
# try:
|
|
|
|
|
# f = open(output_file)
|
|
|
|
|
# except FileNotFoundError:
|
|
|
|
|
# raise
|
|
|
|
|
# else:
|
|
|
|
|
# f.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @active_if(options.preprocess_deskew != 0
|
|
|
|
|
# and options.deskew_provider == 'imagemagick')
|
|
|
|
|
# @transform(convert_to_png, suffix(".png"), ".deskewed.png")
|
|
|
|
|
# def deskew_imagemagick(input_file, output_file):
|
|
|
|
|
# args_convert = [
|
|
|
|
|
# 'convert',
|
|
|
|
|
# input_file,
|
|
|
|
|
# '-deskew', '40%',
|
|
|
|
|
# '-gravity', 'center',
|
|
|
|
|
# '-extent', '{width_pixels}x{height_pixels}'.format(**pageinfo),
|
|
|
|
|
# '+repage',
|
|
|
|
|
# output_file
|
|
|
|
|
# ]
|
|
|
|
|
|
|
|
|
|
# p = Popen(args_convert, close_fds=True, stdout=PIPE, stderr=PIPE,
|
|
|
|
|
# universal_newlines=True)
|
|
|
|
|
# stdout, stderr = p.communicate()
|
|
|
|
|
|
|
|
|
|
# if stdout:
|
|
|
|
|
# log.info(stdout)
|
|
|
|
|
# if stderr:
|
|
|
|
|
# log.error(stderr)
|
|
|
|
|
|
|
|
|
|
# if p.returncode != 0:
|
|
|
|
|
# raise CalledProcessError(p.returncode, args_convert)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @active_if(options.preprocess_deskew != 0
|
|
|
|
|
# and options.deskew_provider == 'leptonica')
|
|
|
|
|
# @transform(convert_to_png, suffix(".png"), ".deskewed.png")
|
|
|
|
|
# def deskew_leptonica(input_file, output_file):
|
|
|
|
|
# from .leptonica import deskew
|
|
|
|
|
# deskew(input_file, output_file,
|
|
|
|
|
# min(pageinfo['xres'], pageinfo['yres']))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @active_if(options.preprocess_clean != 0)
|
|
|
|
|
# @merge([unpack_with_pdftoppm, unpack_with_ghostscript,
|
|
|
|
|
# deskew_imagemagick, deskew_leptonica],
|
|
|
|
|
# os.path.join(options.temp_folder, "%04i.for_clean.pnm" % pageno))
|
|
|
|
|
# def select_image_for_cleaning(infiles, output_file):
|
|
|
|
|
# input_file = infiles[-1]
|
|
|
|
|
# args_convert = [
|
|
|
|
|
# 'convert',
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file
|
|
|
|
|
# ]
|
|
|
|
|
# check_call(args_convert)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @active_if(options.preprocess_clean != 0)
|
|
|
|
|
# @transform(select_image_for_cleaning, suffix(".pnm"), ".cleaned.pnm")
|
|
|
|
|
# def clean_unpaper(input_file, output_file):
|
|
|
|
|
# args_unpaper = [
|
|
|
|
|
# 'unpaper',
|
|
|
|
|
# '--dpi', str(int(round((pageinfo['xres'] * pageinfo['yres']) ** 0.5))),
|
|
|
|
|
# '--mask-scan-size', '100',
|
|
|
|
|
# '--no-deskew',
|
|
|
|
|
# '--no-grayfilter',
|
|
|
|
|
# '--no-blackfilter',
|
|
|
|
|
# '--no-mask-center',
|
|
|
|
|
# '--no-border-align',
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file
|
|
|
|
|
# ]
|
|
|
|
|
|
|
|
|
|
# p = Popen(args_unpaper, close_fds=True, stdout=PIPE, stderr=PIPE,
|
|
|
|
|
# universal_newlines=True)
|
|
|
|
|
# stdout, stderr = p.communicate()
|
|
|
|
|
|
|
|
|
|
# if stdout:
|
|
|
|
|
# log.info(stdout)
|
|
|
|
|
# if stderr:
|
|
|
|
|
# log.error(stderr)
|
|
|
|
|
|
|
|
|
|
# if p.returncode != 0:
|
|
|
|
|
# raise CalledProcessError(p.returncode, args_unpaper)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @transform(clean_unpaper, suffix(".cleaned.pnm"), ".cleaned.png")
|
|
|
|
|
# def cleaned_to_png(input_file, output_file):
|
|
|
|
|
# args_convert = [
|
|
|
|
|
# 'convert',
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file
|
|
|
|
|
# ]
|
|
|
|
|
# check_call(args_convert)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @merge([unpack_with_ghostscript, convert_to_png, deskew_imagemagick,
|
|
|
|
|
# deskew_leptonica, cleaned_to_png],
|
|
|
|
|
# os.path.join(options.temp_folder, "%04i.for_ocr.png" % pageno))
|
|
|
|
|
# def select_ocr_image(infiles, output_file):
|
|
|
|
|
# re_symlink(infiles[-1], output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# hocr_template = '''<?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
|
# <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
|
|
|
# "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
|
|
|
# <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
|
|
|
|
# <head>
|
|
|
|
|
# <title></title>
|
|
|
|
|
# <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
|
|
|
# <meta name='ocr-system' content='tesseract 3.02.02' />
|
|
|
|
|
# <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
|
|
|
|
# </head>
|
|
|
|
|
# <body>
|
|
|
|
|
# <div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
|
|
|
|
|
# <div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
|
|
|
|
|
# <p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
|
|
|
|
|
# <span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
|
|
|
|
|
# </span>
|
|
|
|
|
# </p>
|
|
|
|
|
# </div>
|
|
|
|
|
# </div>
|
|
|
|
|
# </body>
|
|
|
|
|
# </html>'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required)
|
|
|
|
|
# @transform(select_ocr_image, suffix(".for_ocr.png"), ".hocr")
|
|
|
|
|
# def ocr_tesseract(
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file):
|
|
|
|
|
|
|
|
|
|
# args_tesseract = [
|
|
|
|
|
# 'tesseract',
|
|
|
|
|
# '-l', options.language,
|
|
|
|
|
# input_file,
|
|
|
|
|
# output_file,
|
|
|
|
|
# 'hocr',
|
|
|
|
|
# options.tess_cfg_files
|
|
|
|
|
# ]
|
|
|
|
|
# p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
|
|
|
|
|
# universal_newlines=True)
|
|
|
|
|
# try:
|
|
|
|
|
# stdout, stderr = p.communicate(timeout=180)
|
|
|
|
|
# except TimeoutExpired:
|
|
|
|
|
# p.kill()
|
|
|
|
|
# stdout, stderr = p.communicate()
|
|
|
|
|
# # Generate a HOCR file with no recognized text if tesseract times out
|
|
|
|
|
# # Temporary workaround to hocrTransform not being able to function if
|
|
|
|
|
# # it does not have a valid hOCR file.
|
|
|
|
|
# with open(output_file, 'w', encoding="utf-8") as f:
|
|
|
|
|
# f.write(hocr_template.format(pageinfo['width_pixels'],
|
|
|
|
|
# pageinfo['height_pixels']))
|
|
|
|
|
# else:
|
|
|
|
|
# if stdout:
|
|
|
|
|
# log.info(stdout)
|
|
|
|
|
# if stderr:
|
|
|
|
|
# log.error(stderr)
|
|
|
|
|
|
|
|
|
|
# if p.returncode != 0:
|
|
|
|
|
# raise CalledProcessError(p.returncode, args_tesseract)
|
|
|
|
|
|
|
|
|
|
# if os.path.exists(output_file + '.html'):
|
|
|
|
|
# # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
|
|
|
|
|
# shutil.move(output_file + '.html', output_file)
|
|
|
|
|
# elif os.path.exists(output_file + '.hocr'):
|
|
|
|
|
# # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
|
|
|
|
|
# shutil.move(output_file + '.hocr', output_file)
|
|
|
|
|
|
|
|
|
|
# # Tesseract inserts source filename into hocr file without escaping
|
|
|
|
|
# # it. This could break the XML parser. Rewrite the hocr file,
|
|
|
|
|
# # replacing the filename with a space.
|
|
|
|
|
# regex_nested_single_quotes = re.compile(
|
|
|
|
|
# r"""title='image "([^"]*)";""")
|
|
|
|
|
# with fileinput.input(files=(output_file,), inplace=True) as f:
|
|
|
|
|
# for line in f:
|
|
|
|
|
# line = regex_nested_single_quotes.sub(
|
|
|
|
|
# r"""title='image " ";""", line)
|
|
|
|
|
# print(line, end='') # fileinput.input redirects stdout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required and not options.exact_image)
|
|
|
|
|
# @merge([unpack_with_ghostscript, convert_to_png,
|
|
|
|
|
# deskew_imagemagick, deskew_leptonica, cleaned_to_png],
|
|
|
|
|
# os.path.join(options.temp_folder, "%04i.image_for_pdf" % pageno))
|
|
|
|
|
# def select_image_for_pdf(infiles, output_file):
|
|
|
|
|
# if options.preprocess_clean != 0 and options.preprocess_cleantopdf != 0:
|
|
|
|
|
# input_file = infiles[-1]
|
|
|
|
|
# elif options.preprocess_deskew != 0 and options.preprocess_clean != 0:
|
|
|
|
|
# input_file = infiles[-2]
|
|
|
|
|
# elif options.preprocess_deskew != 0 and options.preprocess_clean == 0:
|
|
|
|
|
# input_file = infiles[-1]
|
|
|
|
|
# else:
|
|
|
|
|
# input_file = infiles[0]
|
|
|
|
|
|
|
|
|
|
# if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
|
|
|
|
|
# # If all images were JPEGs originally, produce a JPEG as output
|
|
|
|
|
# check_call(['convert', input_file, 'jpg:' + output_file])
|
|
|
|
|
# else:
|
|
|
|
|
# re_symlink(input_file, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required and not options.exact_image)
|
|
|
|
|
# @merge([ocr_tesseract, select_image_for_pdf],
|
|
|
|
|
# os.path.join(options.temp_folder, '%04i.rendered.pdf' % pageno))
|
|
|
|
|
# def render_page(infiles, output_file):
|
|
|
|
|
# hocr, image = infiles[0], infiles[1]
|
|
|
|
|
|
|
|
|
|
# dpi = round(max(pageinfo['xres'], pageinfo['yres']))
|
|
|
|
|
|
|
|
|
|
# hocrtransform = HocrTransform(hocr, dpi)
|
|
|
|
|
# hocrtransform.to_pdf(output_file, imageFileName=image,
|
|
|
|
|
# showBoundingboxes=False, invisibleText=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required and options.pdf_noimg)
|
|
|
|
|
# @transform(ocr_tesseract, suffix(".hocr"), ".ocred.todebug.pdf")
|
|
|
|
|
# def render_text_output_page(input_file, output_file):
|
|
|
|
|
# dpi = round(max(pageinfo['xres'], pageinfo['yres']))
|
|
|
|
|
|
|
|
|
|
# hocrtransform = HocrTransform(input_file, dpi)
|
|
|
|
|
# hocrtransform.to_pdf(output_file, imageFileName=None,
|
|
|
|
|
# showBoundingboxes=True, invisibleText=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required and options.exact_image)
|
|
|
|
|
# @transform(ocr_tesseract, suffix(".hocr"), ".hocr.pdf")
|
|
|
|
|
# def render_hocr_blank_page(input_file, output_file):
|
|
|
|
|
# dpi = round(max(pageinfo['xres'], pageinfo['yres']))
|
|
|
|
|
|
|
|
|
|
# hocrtransform = HocrTransform(input_file, dpi)
|
|
|
|
|
# hocrtransform.to_pdf(output_file, imageFileName=None,
|
|
|
|
|
# showBoundingboxes=False, invisibleText=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @active_if(ocr_required and options.exact_image)
|
|
|
|
|
# @merge([render_hocr_blank_page, extract_single_page],
|
|
|
|
|
# os.path.join(options.temp_folder, "%04i.merged.pdf") % pageno)
|
|
|
|
|
# def merge_hocr_with_original_page(infiles, output_file):
|
|
|
|
|
# with open(infiles[0], 'rb') as hocr_input, \
|
|
|
|
|
# open(infiles[1], 'rb') as page_input, \
|
|
|
|
|
# open(output_file, 'wb') as output:
|
|
|
|
|
# hocr_reader = pypdf.PdfFileReader(hocr_input)
|
|
|
|
|
# page_reader = pypdf.PdfFileReader(page_input)
|
|
|
|
|
# writer = pypdf.PdfFileWriter()
|
|
|
|
|
|
|
|
|
|
# the_page = hocr_reader.getPage(0)
|
|
|
|
|
# the_page.mergePage(page_reader.getPage(0))
|
|
|
|
|
# writer.addPage(the_page)
|
|
|
|
|
# writer.write(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @merge([render_page, merge_hocr_with_original_page, extract_single_page],
|
|
|
|
|
# os.path.join(options.temp_folder, '%04i.ocred.pdf' % pageno))
|
|
|
|
|
# def select_final_page(infiles, output_file):
|
|
|
|
|
# re_symlink(infiles[-1], output_file)
|
|
|
|
|
|
|
|
|
|
|
2015-07-22 22:58:13 -07:00
|
|
|
|
if __name__ == '__main__':
    # Script entry point: pass the parsed command-line options to the
    # ruffus cmdline helper (imported above as `cmdline`).
    # NOTE(review): presumably this runs the pipeline tasks defined in this
    # module according to `options` -- confirm against the ruffus docs.
    cmdline.run(options)
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
|
|
|