2014-09-26 04:19:41 -07:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
from contextlib import suppress
|
2015-07-25 01:45:26 -07:00
|
|
|
|
from tempfile import NamedTemporaryFile, mkdtemp
|
2014-09-26 04:19:41 -07:00
|
|
|
|
import sys
|
2015-07-23 04:57:31 -07:00
|
|
|
|
import os
|
2015-02-13 13:41:14 -08:00
|
|
|
|
import fileinput
|
|
|
|
|
import re
|
2015-03-10 14:28:38 -07:00
|
|
|
|
import shutil
|
2015-07-23 04:57:31 -07:00
|
|
|
|
import warnings
|
|
|
|
|
import multiprocessing
|
2015-07-25 01:45:26 -07:00
|
|
|
|
import atexit
|
2015-07-23 04:57:31 -07:00
|
|
|
|
|
|
|
|
|
import PyPDF2 as pypdf
|
2015-07-24 15:19:37 -07:00
|
|
|
|
from PIL import Image
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2014-11-14 02:06:23 -08:00
|
|
|
|
from subprocess import Popen, check_call, PIPE, CalledProcessError, \
|
|
|
|
|
TimeoutExpired
|
2014-10-08 03:21:28 -07:00
|
|
|
|
try:
|
|
|
|
|
from subprocess import DEVNULL
|
|
|
|
|
except ImportError:
|
|
|
|
|
DEVNULL = open(os.devnull, 'wb')
|
|
|
|
|
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
2014-10-10 00:35:49 -07:00
|
|
|
|
from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
|
2015-07-25 02:58:34 -07:00
|
|
|
|
formatter, follows, split, collate, check_if_uptodate
|
2014-10-08 03:21:28 -07:00
|
|
|
|
import ruffus.cmdline as cmdline
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
2015-02-20 17:20:48 -08:00
|
|
|
|
from .hocrtransform import HocrTransform
|
2015-07-23 02:39:42 -07:00
|
|
|
|
from .pageinfo import pdf_get_all_pageinfo
|
2015-07-23 04:57:31 -07:00
|
|
|
|
from .pdfa import generate_pdfa_def
|
2015-07-27 15:22:00 -07:00
|
|
|
|
from .ghostscript import rasterize_pdf, generate_pdfa
|
2015-07-23 18:38:59 -07:00
|
|
|
|
from . import tesseract
|
2015-02-20 17:20:48 -08:00
|
|
|
|
|
2015-04-09 03:12:04 -07:00
|
|
|
|
|
2015-04-09 14:06:55 -07:00
|
|
|
|
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
|
2015-04-09 03:12:04 -07:00
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2015-07-23 14:48:46 -07:00
|
|
|
|
# Locations of bundled resources, resolved relative to this module.
BASEDIR = os.path.dirname(os.path.realpath(__file__))
JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, 'jhove'))
JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')

# Process exit codes reported to the shell.
EXIT_BAD_ARGS = 1
EXIT_BAD_INPUT_FILE = 2
EXIT_MISSING_DEPENDENCY = 3
EXIT_INVALID_OUTPUT_PDFA = 4
EXIT_FILE_ACCESS_ERROR = 5
EXIT_ALREADY_DONE_OCR = 6
EXIT_OTHER_ERROR = 15

# -------------
# External dependencies

# Oldest tesseract release this program supports.
MINIMUM_TESS_VERSION = '3.02.02'
|
|
|
|
|
|
2015-07-23 18:38:59 -07:00
|
|
|
|
def _version_tuple(version):
    """Convert a dotted version string to a tuple of ints for comparison.

    Plain string comparison of versions is wrong: '10.0' < '3.02.02'
    lexicographically.  Extract all numeric components instead.
    """
    return tuple(int(part) for part in re.findall(r'\d+', version))


# Refuse to run with a tesseract older than the supported minimum.
if _version_tuple(tesseract.VERSION) < _version_tuple(MINIMUM_TESS_VERSION):
    print(
        "Please install tesseract {0} or newer "
        "(currently installed version is {1})".format(
            MINIMUM_TESS_VERSION, tesseract.VERSION),
        file=sys.stderr)
    sys.exit(EXIT_MISSING_DEPENDENCY)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -------------
# Parser

# ruffus supplies the base argument parser; suppress the ruffus options
# that make no sense for this pipeline.
parser = cmdline.get_argparse(
    prog="ocrmypdf",
    description="Generate searchable PDF file from an image-only PDF file.",
    version='3.0rc1',
    fromfile_prefix_chars='@',
    ignored_args=[
        'touch_files_only', 'recreate_database', 'checksum_file_name',
        'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format',
        'forced_tasks', 'target_tasks'])

parser.add_argument(
    'input_file',
    help="PDF file containing the images to be OCRed")
parser.add_argument(
    'output_file',
    help="output searchable PDF file")
parser.add_argument(
    '-l', '--language', action='append',
    help="language of the file to be OCRed")
|
|
|
|
|
|
2015-07-25 18:12:25 -07:00
|
|
|
|
# Output metadata options; each defaults to the input document's value.
metadata = parser.add_argument_group(
    "Metadata options",
    "Set output PDF/A metadata (default: use input document's title)")
metadata.add_argument(
    '--title', type=str,
    help="set document title (place multiple words in quotes)")
metadata.add_argument(
    '--author', type=str,
    help="set document author")
metadata.add_argument(
    '--subject', type=str,
    # Bug fix: help text was truncated ("set document")
    help="set document subject")
metadata.add_argument(
    '--keywords', type=str,
    help="set document keywords")
|
|
|
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Image preprocessing options (all implemented via unpaper).
preprocessing = parser.add_argument_group(
    "Preprocessing options",
    "Improve OCR quality and final image")
preprocessing.add_argument(
    '-d', '--deskew', action='store_true',
    help="deskew each page before performing OCR")
preprocessing.add_argument(
    '-c', '--clean', action='store_true',
    help="clean pages with unpaper before performing OCR")
preprocessing.add_argument(
    '-i', '--clean-final', action='store_true',
    help="incorporate the cleaned image in the final PDF file")
preprocessing.add_argument(
    '--oversample', metavar='DPI', type=int,
    help="oversample images to improve OCR results slightly")
|
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
# Options controlling what happens when a page already has text.
parser.add_argument(
    '-f', '--force-ocr', action='store_true',
    help="Force to OCR, even if the page already contains fonts")
parser.add_argument(
    '-s', '--skip-text', action='store_true',
    help="Skip OCR on pages that contain fonts and include the page anyway")
parser.add_argument(
    '--skip-big', action='store_true',
    help="Skip OCR for pages that are very large")
# parser.add_argument(
#     '--exact-image', action='store_true',
#     help="Use original page from PDF without re-rendering")
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Advanced options for power users.
advanced = parser.add_argument_group(
    "Advanced",
    "Advanced options for power users")
advanced.add_argument(
    # Bug fix: type=list split each argument string into a list of single
    # characters (list('cfg') == ['c', 'f', 'g']); type=str with
    # action='append' accumulates whole config names as intended.
    '--tesseract-config', default=[], type=str, action='append',
    help="Tesseract configuration")
advanced.add_argument(
    '--pdf-renderer', choices=['tesseract', 'hocr'], default='hocr',
    help='choose OCR PDF renderer')
advanced.add_argument(
    '--tesseract-timeout', default=180.0, type=float,
    help='give up on OCR after timeout')
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Troubleshooting/debugging options.
debugging = parser.add_argument_group(
    "Debugging",
    "Arguments to help with troubleshooting and debugging")
debugging.add_argument(
    '-k', '--keep-temporary-files', action='store_true',
    help="keep temporary files (helpful for debugging)")
debugging.add_argument(
    '-g', '--debug-rendering', action='store_true',
    help="render each page twice with debug information on second page")
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-27 15:39:54 -07:00
|
|
|
|
# Fiddle with arguments to support unittest.mock: strip the interpreter
# name and/or the script name so that argparse sees only real arguments.
# NOTE(review): an absolute interpreter path such as '/usr/bin/python3'
# does not match startswith('python') — confirm intended invocation styles.
_argv = sys.argv
if _argv[0].startswith('python'):
    _argv = _argv[1:]
if _argv[0].endswith('.py'):
    _argv = _argv[1:]
options = parser.parse_args(_argv)
|
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2015-07-23 18:38:59 -07:00
|
|
|
|
# ----------
# Languages

if not options.language:
    options.language = ['eng']  # Enforce English hegemony

# Support v2.x "eng+deu" language syntax
if '+' in options.language[0]:
    options.language = options.language[0].split('+')

# Fail early if tesseract lacks data for any requested language.
missing_languages = set(options.language) - tesseract.LANGUAGES
if missing_languages:
    print(
        "The installed version of tesseract does not have language "
        "data for the following requested languages: ",
        file=sys.stderr)
    for lang in missing_languages:
        print(lang, file=sys.stderr)
    sys.exit(EXIT_BAD_ARGS)
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 00:22:56 -07:00
|
|
|
|
# ----------
# Arguments

# unpaper is only required when one of the preprocessing options is used;
# import it lazily and abort with a clear message if it is unavailable.
if any((options.deskew, options.clean, options.clean_final)):
    try:
        from . import unpaper
    except ImportError:
        print("Install the 'unpaper' program to use the specified options",
              file=sys.stderr)
        sys.exit(EXIT_BAD_ARGS)
else:
    unpaper = None
|
|
|
|
|
|
2015-07-23 02:39:42 -07:00
|
|
|
|
# ----------
# Logging

# ruffus provides a logger plus a mutex that must be held while logging
# from concurrently running tasks.
_logger, _logger_mutex = cmdline.setup_logging(
    __name__, options.log_file, options.verbose)
|
2015-03-24 22:46:33 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:26:09 -07:00
|
|
|
|
class WrappedLogger:
    """Serialize access to a shared logger.

    ruffus's ``setup_logging`` returns a logger and a mutex that must be
    held while logging from parallel tasks; this proxy acquires the mutex
    around every logging call so callers do not have to.

    The six copy-pasted method bodies of the original are deduplicated
    through one locked-delegate helper; the public interface is unchanged.
    """

    def __init__(self, my_logger, my_mutex):
        self.logger = my_logger
        self.mutex = my_mutex

    def _locked(self, method_name, *args, **kwargs):
        # Single implementation of "hold the mutex, then delegate".
        with self.mutex:
            getattr(self.logger, method_name)(*args, **kwargs)

    def log(self, *args, **kwargs):
        self._locked('log', *args, **kwargs)

    def debug(self, *args, **kwargs):
        self._locked('debug', *args, **kwargs)

    def info(self, *args, **kwargs):
        self._locked('info', *args, **kwargs)

    def warning(self, *args, **kwargs):
        self._locked('warning', *args, **kwargs)

    def error(self, *args, **kwargs):
        self._locked('error', *args, **kwargs)

    def critical(self, *args, **kwargs):
        self._locked('critical', *args, **kwargs)
|
|
|
|
|
|
|
|
|
|
_log = WrappedLogger(_logger, _logger_mutex)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def re_symlink(input_file, soft_link_name, log=None):
    """Create or refresh a symbolic link to *input_file* at *soft_link_name*.

    If the link target equals the link name, do nothing.  An existing
    symlink at *soft_link_name* is removed and recreated; an existing
    regular file is never overwritten.

    ``log`` defaults to the module-level wrapped logger; the lookup is
    deferred to call time (instead of the original ``log=_log`` default,
    which froze the logger object at definition time).

    :raises Exception: if *soft_link_name* exists and is not a symlink,
        or if *input_file* does not exist (broken link).
    """
    if log is None:
        log = _log

    # Guard against soft linking to oneself
    if input_file == soft_link_name:
        log.debug("Warning: No symbolic link made. You are using " +
                  "the original data directory as the working directory.")
        return

    # Soft link already exists: delete for relink?
    if os.path.lexists(soft_link_name):
        # do not delete or overwrite real (non-soft link) file
        if not os.path.islink(soft_link_name):
            raise Exception("%s exists and is not a link" % soft_link_name)
        try:
            os.unlink(soft_link_name)
        except OSError:
            # Narrowed from a bare except: only filesystem errors expected
            log.debug("Can't unlink %s" % (soft_link_name))

    if not os.path.exists(input_file):
        raise Exception("trying to create a broken symlink to %s" % input_file)

    log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))

    # Create symbolic link using absolute path
    os.symlink(
        os.path.abspath(input_file),
        soft_link_name
    )
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:39:42 -07:00
|
|
|
|
# -------------
# The Pipeline

# Shared state for ruffus tasks running in separate processes: a managed
# list of per-page info dicts, guarded by a managed lock.
manager = multiprocessing.Manager()
_pdfinfo = manager.list()
_pdfinfo_lock = manager.Lock()

# Scratch directory for all intermediate page files.
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")


@atexit.register
def cleanup_working_files(*args):
    # Remove the scratch directory on exit unless the user asked to keep it.
    if not options.keep_temporary_files:
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
    else:
        print("Temporary working files saved at:")
        print(work_folder)
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-23 01:16:05 -07:00
|
|
|
|
@transform(
    input=options.input_file,
    filter=suffix('.pdf'),
    output='.repaired.pdf',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Normalize the input PDF with mutool and cache its page info.

    'mutool clean' rewrites the PDF, repairing structural damage.  The
    per-page info for the repaired file is appended to the shared pdfinfo
    list under the lock.
    """
    check_call(['mutool', 'clean', input_file, output_file])

    with pdfinfo_lock:
        pdfinfo.extend(pdf_get_all_pageinfo(output_file))
        log.info(pdfinfo)
|
2015-07-23 01:16:05 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
def get_pageinfo(input_file, pdfinfo, pdfinfo_lock):
    """Return a copy of the cached page-info dict for *input_file*.

    Page files are named 'NNNNNN.<suffix>' where the six leading digits
    are the 1-based page number; that prefix indexes the shared pdfinfo
    list.  A copy is returned so callers can read it outside the lock.
    """
    index = int(os.path.basename(input_file)[0:6]) - 1
    with pdfinfo_lock:
        return pdfinfo[index].copy()
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
|
|
|
|
|
def is_ocr_required(pageinfo, log):
    """Decide whether a page should be OCRed, based on its page info.

    May terminate the program (EXIT_ALREADY_DONE_OCR) when a page already
    has text and neither --force-ocr nor --skip-text was given.
    """
    page = pageinfo['pageno'] + 1
    ocr_required = True

    if not pageinfo['images']:
        # If the page has no images, then it contains vector content or text
        # or both. It seems quite unlikely that one would find meaningful text
        # from rasterizing vector content. So skip the page.
        log.info(
            "Page {0} has no images - skipping OCR".format(page)
        )
        ocr_required = False
    elif pageinfo['has_text']:
        s = "Page {0} already has text! – {1}"

        if not options.force_ocr and not options.skip_text:
            log.error(s.format(page,
                               "aborting (use --force-ocr to force OCR)"))
            sys.exit(EXIT_ALREADY_DONE_OCR)
        elif options.force_ocr:
            log.info(s.format(page,
                              "rasterizing text and running OCR anyway"))
            ocr_required = True
        elif options.skip_text:
            log.info(s.format(page,
                              "skipping all processing on this page"))
            ocr_required = False

    if ocr_required and options.skip_big:
        # Skip pages larger than tabloid (11"x17") or its 300 dpi pixel count.
        area = pageinfo['width_inches'] * pageinfo['height_inches']
        pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
        if area > (11.0 * 17.0) or pixel_count > (300.0 * 300.0 * 11 * 17):
            ocr_required = False
            log.info(
                "Page {0} is very large; skipping due to -b".format(page))

    return ocr_required
|
2015-07-23 01:16:05 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-25 02:58:34 -07:00
|
|
|
|
@split(
    repair_pdf,
    os.path.join(work_folder, '*.page.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def split_pages(
        input_file,
        output_files,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Split the repaired PDF into one file per page with pdfseparate.

    Each page is then symlinked to either '<page>.ocr.page.pdf' or
    '<page>.skip.page.pdf' depending on whether it needs OCR, so that
    downstream tasks can select pages by suffix.
    """
    # Remove any stale outputs from an earlier run.
    for stale in output_files:
        with suppress(FileNotFoundError):
            os.unlink(stale)

    check_call([
        'pdfseparate',
        input_file,
        os.path.join(work_folder, '%06d.page.pdf')
    ])

    from glob import glob
    for filename in glob(os.path.join(work_folder, '*.page.pdf')):
        pageinfo = get_pageinfo(filename, pdfinfo, pdfinfo_lock)

        if is_ocr_required(pageinfo, log):
            alt_suffix = '.ocr.page.pdf'
        else:
            alt_suffix = '.skip.page.pdf'
        re_symlink(
            filename,
            os.path.join(
                work_folder,
                os.path.basename(filename)[0:6] + alt_suffix))
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@transform(
    input=split_pages,
    filter=suffix('.ocr.page.pdf'),
    output='.page.png',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def rasterize_with_ghostscript(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Render a single PDF page to PNG at the resolution needed for OCR.

    The Ghostscript output device is chosen from the page's image
    properties: 1-bit mono, grayscale, or 24-bit color.
    """
    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)

    # Default to 24-bit color; downgrade when every image on the page
    # is single-component (and possibly 1 bit per component).
    device = 'png16m'  # 24-bit
    page_images = pageinfo['images']
    if all(img['comp'] == 1 for img in page_images):
        if all(img['bpc'] == 1 for img in page_images):
            device = 'pngmono'
        elif not any(img['color'] == 'color' for img in page_images):
            device = 'pnggray'

    # Honor --oversample by rasterizing at the larger of the page's
    # native resolution and the requested DPI.
    oversample = options.oversample or 0
    xres = max(pageinfo['xres'], oversample)
    yres = max(pageinfo['yres'], oversample)

    rasterize_pdf(input_file, output_file, xres, yres, device, log)
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@transform(
    input=rasterize_with_ghostscript,
    filter=suffix(".page.png"),
    output=".pp-deskew.png",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def preprocess_deskew(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Deskew the page image with unpaper, or pass it through unchanged.

    When --deskew is not requested, the output is just a symlink to the
    input so the pipeline stages stay uniform.
    """
    if not options.deskew:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
    unpaper.deskew(input_file, output_file, int(pageinfo['xres']), log)
|
2015-07-24 15:19:37 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-25 00:22:56 -07:00
|
|
|
|
@transform(
    input=preprocess_deskew,
    filter=suffix(".pp-deskew.png"),
    output=".pp-clean.png",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def preprocess_clean(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Clean the page image with unpaper, or pass it through unchanged.

    When --clean is not requested, the output is just a symlink to the
    input so the pipeline stages stay uniform.
    """
    if not options.clean:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
    unpaper.clean(input_file, output_file, int(pageinfo['xres']), log)
|
2015-07-24 15:19:37 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@transform(
    input=preprocess_clean,
    filter=suffix(".pp-clean.png"),
    output=".hocr",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def ocr_tesseract_hocr(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Run tesseract in hOCR mode on a page image.

    On timeout a valid but empty hOCR file is written so downstream
    rendering still works.  Works around tesseract 3.02/3.03 output
    filename quirks and the 3.03 unescaped-filename XML bug.
    """
    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)

    args_tesseract = [
        'tesseract',
        '-l', '+'.join(options.language),
        input_file,
        output_file,
        'hocr'
    ] + options.tesseract_config
    proc = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
                 universal_newlines=True)
    try:
        stdout, stderr = proc.communicate(timeout=options.tesseract_timeout)
    except TimeoutExpired:
        proc.kill()
        stdout, stderr = proc.communicate()
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        with open(output_file, 'w', encoding="utf-8") as f:
            f.write(tesseract.HOCR_TEMPLATE.format(
                pageinfo['width_pixels'],
                pageinfo['height_pixels']))
    else:
        if stdout:
            log.info(stdout)
        if stderr:
            log.error(stderr)

        if proc.returncode != 0:
            raise CalledProcessError(proc.returncode, args_tesseract)

        if os.path.exists(output_file + '.html'):
            # Tesseract 3.02 appends suffix ".html" on its own (.hocr.html)
            shutil.move(output_file + '.html', output_file)
        elif os.path.exists(output_file + '.hocr'):
            # Tesseract 3.03 appends suffix ".hocr" on its own (.hocr.hocr)
            shutil.move(output_file + '.hocr', output_file)

        # Tesseract 3.03 inserts source filename into hocr file without
        # escaping it, creating invalid XML and breaking the parser.
        # As a workaround, rewrite the hocr file, replacing the filename
        # with a space.
        regex_nested_single_quotes = re.compile(
            r"""title='image "([^"]*)";""")
        with fileinput.input(files=(output_file,), inplace=True) as f:
            for line in f:
                line = regex_nested_single_quotes.sub(
                    r"""title='image " ";""", line)
                print(line, end='')  # fileinput.input redirects stdout
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[rasterize_with_ghostscript, preprocess_deskew, preprocess_clean],
    filter=regex(r".*/(\d{6})(?:\.page|\.pp-deskew|\.pp-clean)\.png"),
    output=os.path.join(work_folder, r'\1.image'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def select_image_for_pdf(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Pick which rendition of the page image goes into the final PDF.

    Prefers the cleaned image (with --clean-final), then the deskewed
    image (with --deskew), otherwise the raw rasterization.
    """
    if options.clean_final:
        image_suffix = '.pp-clean.png'
    elif options.deskew:
        image_suffix = '.pp-deskew.png'
    else:
        image_suffix = '.page.png'
    image = next(ii for ii in infiles if ii.endswith(image_suffix))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    if all(src['enc'] == 'jpeg' for src in pageinfo['images']):
        # If all images were JPEGs originally, produce a JPEG as output
        Image.open(image).save(output_file, format='JPEG')
    else:
        re_symlink(image, output_file)
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[select_image_for_pdf, ocr_tesseract_hocr],
    filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
    output=os.path.join(work_folder, r'\1.rendered.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def render_hocr_page(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Combine a page image and its hOCR text into one searchable PDF page.

    The OCR text is placed invisibly over the image so the page looks
    unchanged but is selectable and searchable.
    """
    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    dpi = round(max(pageinfo['xres'], pageinfo['yres']))

    hocrtransform = HocrTransform(hocr, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=image,
                         showBoundingboxes=False, invisibleText=True)
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@active_if(options.debug_rendering)
@collate(
    input=[select_image_for_pdf, ocr_tesseract_hocr],
    filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
    output=os.path.join(work_folder, r'\1.debug.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def render_hocr_debug_page(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Render a diagnostic page: visible OCR text and bounding boxes.

    Unlike render_hocr_page, the page image is omitted and the text is
    drawn visibly so OCR placement can be inspected (-g option).
    """
    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    dpi = round(max(pageinfo['xres'], pageinfo['yres']))

    hocrtransform = HocrTransform(hocr, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=None,
                         showBoundingboxes=True, invisibleText=False)
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'tesseract')
@transform(
    input=preprocess_clean,
    filter=suffix(".pp-clean.png"),
    output=".rendered.pdf",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def tesseract_ocr_and_render_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """OCR a page image and let tesseract itself render the PDF page.

    Fixes vs. original: on timeout the tesseract child process is killed
    before the exception propagates (it previously leaked), and a nonzero
    exit status now raises CalledProcessError, consistent with
    ocr_tesseract_hocr.
    """
    args_tesseract = [
        'tesseract',
        '-l', '+'.join(options.language),
        input_file,
        os.path.splitext(output_file)[0],  # Tesseract appends suffix
        'pdf'
    ] + options.tesseract_config
    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)

    try:
        stdout, stderr = p.communicate(timeout=options.tesseract_timeout)
    except TimeoutExpired:
        # Reap the child so it does not linger, then surface the timeout.
        p.kill()
        p.communicate()
        raise

    if stdout:
        log.info(stdout)
    if stderr:
        log.error(stderr)

    if p.returncode != 0:
        raise CalledProcessError(p.returncode, args_tesseract)
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@transform(
    input=repair_pdf,
    filter=suffix('.repaired.pdf'),
    output='.pdfa_def.ps',
    output_dir=work_folder,
    extras=[_log])
def generate_postscript_stub(
        input_file,
        output_file,
        log):
    """Write the PostScript pdfmark definition used to build the PDF/A.

    Metadata is taken from the input document and overridden by any of
    the --title/--author/--keywords/--subject command line options.
    """
    pdf = pypdf.PdfFileReader(input_file)

    def from_document_info(key):
        # pdf.documentInfo.get() DOES NOT work as expected
        try:
            return str(pdf.documentInfo[key])
        except KeyError:
            return ''

    pdfmark = {
        'title': from_document_info('/Title'),
        'author': from_document_info('/Author'),
        'keywords': from_document_info('/Keywords'),
        'subject': from_document_info('/Subject'),
    }
    # Command-line values win over the document's own metadata.
    for field in ('title', 'author', 'keywords', 'subject'):
        override = getattr(options, field)
        if override:
            pdfmark[field] = override

    generate_pdfa_def(output_file, pdfmark)
|
2015-07-22 22:46:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
@transform(
    input=split_pages,
    filter=suffix('.skip.page.pdf'),
    output='.done.pdf',
    output_dir=work_folder,
    extras=[_log])
def skip_page(input_file, output_file, log):
    """Pass a page marked 'skip' through untouched.

    No OCR is performed; the input page is simply symlinked to the name
    the merge step expects.
    """
    re_symlink(input_file, output_file, log)
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@merge(
    input=[render_hocr_page, render_hocr_debug_page, skip_page,
           tesseract_ocr_and_render_pdf, generate_postscript_stub],
    output=os.path.join(work_folder, 'merged.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def merge_pages(
        input_files,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Assemble all per-page PDFs (and the PostScript stub) into one PDF/A."""

    def input_file_order(s):
        '''Sort order: All rendered pages followed
        by their debug page, if any, followed by Postscript stub.
        Ghostscript documentation has the Postscript stub at the
        beginning, but it works at the end and also gets document info
        right that way.'''
        if s.endswith('.ps'):
            return 99999999
        basename = os.path.basename(s)
        page_key = int(basename[:6]) * 10
        return page_key + 1 if 'debug' in basename else page_key

    pdf_pages = sorted(input_files, key=input_file_order)
    log.info(pdf_pages)
    generate_pdfa(pdf_pages, output_file)
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-23 14:48:46 -07:00
|
|
|
|
@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def validate_pdfa(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Run JHOVE on the merged PDF and deliver it to the output path.

    The file is copied to the user's destination regardless of the verdict;
    validation problems are only reported via the log.

    Raises:
        RuntimeError: if JHOVE itself exits with a nonzero status.
    """

    args_jhove = [
        'java',
        '-jar', JHOVE_JAR,
        '-c', JHOVE_CFG,
        '-m', 'PDF-hul',
        input_file
    ]
    p_jhove = Popen(args_jhove, close_fds=True, universal_newlines=True,
                    stdout=PIPE, stderr=DEVNULL)
    stdout, _ = p_jhove.communicate()

    log.debug(stdout)
    if p_jhove.returncode != 0:
        log.error(stdout)
        raise RuntimeError(
            "Unexpected error while checking compliance to PDF/A file.")

    # Any of these markers in JHOVE's report means the PDF is not valid.
    failure_patterns = (
        r'ErrorMessage',
        r'^\s+Status.*not valid',
        r'^\s+Status.*Not well-formed',
    )
    pdf_is_valid = not any(
        re.search(pattern, stdout, re.IGNORECASE | re.MULTILINE)
        for pattern in failure_patterns)

    pdf_is_pdfa = bool(re.search(r'^\s+Profile:.*PDF/A-1', stdout,
                                 re.IGNORECASE | re.MULTILINE))

    if not pdf_is_valid:
        log.warning('Output file: The generated PDF/A file is INVALID')
    elif not pdf_is_pdfa:
        log.warning('Output file: Generated file is a VALID PDF but not PDF/A')
    else:
        log.info('Output file: The generated PDF/A file is VALID')
    shutil.copy(input_file, output_file)
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# @active_if(ocr_required and options.exact_image)
|
|
|
|
|
# @merge([render_hocr_blank_page, extract_single_page],
|
2015-07-25 01:45:26 -07:00
|
|
|
|
# os.path.join(work_folder, "%04i.merged.pdf") % pageno)
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# def merge_hocr_with_original_page(infiles, output_file):
|
|
|
|
|
# with open(infiles[0], 'rb') as hocr_input, \
|
|
|
|
|
# open(infiles[1], 'rb') as page_input, \
|
|
|
|
|
# open(output_file, 'wb') as output:
|
|
|
|
|
# hocr_reader = pypdf.PdfFileReader(hocr_input)
|
|
|
|
|
# page_reader = pypdf.PdfFileReader(page_input)
|
|
|
|
|
# writer = pypdf.PdfFileWriter()
|
|
|
|
|
|
|
|
|
|
# the_page = hocr_reader.getPage(0)
|
|
|
|
|
# the_page.mergePage(page_reader.getPage(0))
|
|
|
|
|
# writer.addPage(the_page)
|
|
|
|
|
# writer.write(output)
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 01:10:14 -07:00
|
|
|
|
def available_cpu_count():
    """Return the number of available CPUs, falling back to 1.

    Tries the standard library first, then psutil if it is installed.  If
    neither yields a usable count, warns on stderr and assumes one CPU.
    """
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        pass

    try:
        import psutil
        # psutil.cpu_count() may return None when the count cannot be
        # determined; fall through to the default in that case.
        cpus = psutil.cpu_count()
        if cpus:
            return cpus
    except (ImportError, AttributeError):
        pass

    # Note trailing space: the adjacent literals are concatenated into one
    # message, so without it the text would read "...CPU.Use -j N...".
    print(
        "Could not get CPU count. Assuming one (1) CPU. "
        "Use -j N to set manually.", file=sys.stderr)
    return 1
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
|
|
|
2015-07-26 01:52:08 -07:00
|
|
|
|
def run_pipeline():
    """Execute the ruffus pipeline with one worker per available CPU."""
    worker_count = available_cpu_count()
    cmdline.run(options, multiprocess=worker_count)
|
2015-07-26 01:52:08 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Allow running this module directly as a script in addition to the
# package entry point.
if __name__ == '__main__':
    run_pipeline()
|