2014-09-26 04:19:41 -07:00
|
|
|
|
#!/usr/bin/env python3
|
2015-07-28 04:36:58 -07:00
|
|
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
from contextlib import suppress
|
2015-07-25 01:45:26 -07:00
|
|
|
|
from tempfile import NamedTemporaryFile, mkdtemp
|
2014-09-26 04:19:41 -07:00
|
|
|
|
import sys
|
2015-07-23 04:57:31 -07:00
|
|
|
|
import os
|
2015-02-13 13:41:14 -08:00
|
|
|
|
import re
|
2015-03-10 14:28:38 -07:00
|
|
|
|
import shutil
|
2015-07-23 04:57:31 -07:00
|
|
|
|
import warnings
|
|
|
|
|
|
import multiprocessing
|
2015-07-25 01:45:26 -07:00
|
|
|
|
import atexit
|
2015-07-28 02:25:50 -07:00
|
|
|
|
import textwrap
|
2015-07-23 04:57:31 -07:00
|
|
|
|
|
|
|
|
|
|
import PyPDF2 as pypdf
|
2015-07-24 15:19:37 -07:00
|
|
|
|
from PIL import Image
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2015-12-16 17:36:11 -08:00
|
|
|
|
from functools import partial
|
|
|
|
|
|
|
2014-11-14 02:06:23 -08:00
|
|
|
|
from subprocess import Popen, check_call, PIPE, CalledProcessError, \
|
2015-08-11 02:19:46 -07:00
|
|
|
|
TimeoutExpired, check_output, STDOUT
|
2014-10-08 03:21:28 -07:00
|
|
|
|
try:
|
|
|
|
|
|
from subprocess import DEVNULL
|
|
|
|
|
|
except ImportError:
|
|
|
|
|
|
DEVNULL = open(os.devnull, 'wb')
|
|
|
|
|
|
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
2014-10-10 00:35:49 -07:00
|
|
|
|
from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \
|
2015-07-25 02:58:34 -07:00
|
|
|
|
formatter, follows, split, collate, check_if_uptodate
|
2015-08-11 02:19:46 -07:00
|
|
|
|
import ruffus.ruffus_exceptions as ruffus_exceptions
|
2014-10-08 03:21:28 -07:00
|
|
|
|
import ruffus.cmdline as cmdline
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
2015-02-20 17:20:48 -08:00
|
|
|
|
from .hocrtransform import HocrTransform
|
2015-07-23 02:39:42 -07:00
|
|
|
|
from .pageinfo import pdf_get_all_pageinfo
|
2015-07-23 04:57:31 -07:00
|
|
|
|
from .pdfa import generate_pdfa_def
|
2015-07-28 02:25:50 -07:00
|
|
|
|
from . import ghostscript
|
2015-07-23 18:38:59 -07:00
|
|
|
|
from . import tesseract
|
2015-12-17 08:19:53 -08:00
|
|
|
|
from . import qpdf
|
2015-08-11 00:17:02 -07:00
|
|
|
|
from . import ExitCode
|
2015-04-09 03:12:04 -07:00
|
|
|
|
|
2015-04-09 14:06:55 -07:00
|
|
|
|
# PyPDF2 raises PdfReadWarning for slightly malformed files; suppress these
# because the pipeline repairs input PDFs with qpdf before processing.
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)

# Directory containing this module (used to locate bundled resources).
BASEDIR = os.path.dirname(os.path.realpath(__file__))
VERSION = '3.1'


# -------------
# External dependencies

# Oldest Tesseract release this pipeline supports.
MINIMUM_TESS_VERSION = '3.02.02'
|
|
|
|
|
|
|
2015-07-28 02:25:50 -07:00
|
|
|
|
|
|
|
|
|
|
def complain(message):
    """Write *message* to stderr, re-wrapped to the default line width."""
    wrapped_lines = textwrap.wrap(message)
    sys.stderr.write(' '.join(wrapped_lines))
    sys.stderr.write('\n')
|
2015-07-28 02:25:50 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-28 01:00:29 -07:00
|
|
|
|
# Refuse to start if the installed Tesseract is older than we support.
# NOTE(review): this is a lexicographic string comparison; it works within
# the 3.x series but would mis-order a hypothetical '10.0' against
# '3.02.02' -- confirm the format of tesseract.version() before relying on
# this for later major versions.
if tesseract.version() < MINIMUM_TESS_VERSION:
    complain(
        "Please install tesseract {0} or newer "
        "(currently installed version is {1})".format(
            MINIMUM_TESS_VERSION, tesseract.version()))
    sys.exit(ExitCode.missing_dependency)
|
2015-07-23 17:06:00 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-08-16 00:54:03 -07:00
|
|
|
|
# Prefer Pillow's official feature-detection API when available; fall back
# to probing the C extension module directly on older Pillow/PIL versions
# that lack PIL.features.
try:
    import PIL.features
    check_codec = PIL.features.check_codec
except (ImportError, AttributeError):
    def check_codec(codec_name):
        # Fallback: the presence of the encoder symbol in Image.core
        # indicates the codec was compiled in.
        if codec_name == 'jpg':
            return 'jpeg_encoder' in dir(Image.core)
        elif codec_name == 'zlib':
            return 'zip_encoder' in dir(Image.core)
        raise NotImplementedError(codec_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_pil_encoder(codec_name, friendly_name):
    """Exit with an error message if Pillow lacks support for a codec.

    codec_name -- Pillow feature name to probe, e.g. 'jpg' or 'zlib'
    friendly_name -- human-readable codec name used in the error message

    Exits with ExitCode.missing_dependency when the codec is unavailable.
    """
    try:
        if check_codec(codec_name):
            return
    except Exception:
        # Any failure while probing is treated the same as a missing codec.
        pass

    complain(
        "ERROR: Your version of the Python imaging library (Pillow) was "
        "compiled without support for " + friendly_name + " encoding/decoding."
        "\n"
        "You will need to uninstall Pillow and reinstall it with PNG and JPEG "
        "support (libjpeg and zlib)."
        "\n"
        "See installation instructions for your platform here:\n"
        "    https://pillow.readthedocs.org/installation.html"
    )
    sys.exit(ExitCode.missing_dependency)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fail fast at import time if Pillow cannot handle the image formats the
# pipeline produces (JPEG output pages and PNG intermediates).
check_pil_encoder('jpg', 'JPEG')
check_pil_encoder('zlib', 'PNG')
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 17:06:00 -07:00
|
|
|
|
# -------------
# Parser

# Build the command-line parser on top of ruffus' standard pipeline
# arguments, hiding ruffus options that are not useful to ocrmypdf users.
parser = cmdline.get_argparse(
    prog="ocrmypdf",
    description="Generate searchable PDF file from an image-only PDF file.",
    version=VERSION,
    fromfile_prefix_chars='@',
    ignored_args=[
        'touch_files_only', 'recreate_database', 'checksum_file_name',
        'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format',
        'forced_tasks', 'target_tasks', 'use_threads'])
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
|
|
|
|
|
# Positional arguments: input and output PDF paths.
parser.add_argument(
    'input_file',
    help="PDF file containing the images to be OCRed")
parser.add_argument(
    'output_file',
    help="output searchable PDF file")
# May be given multiple times; "eng+deu" shorthand is split later.
parser.add_argument(
    '-l', '--language', action='append',
    help="languages of the file to be OCRed")
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-25 18:12:25 -07:00
|
|
|
|
# Options that set output PDF/A document metadata.
metadata = parser.add_argument_group(
    "Metadata options",
    "Set output PDF/A metadata (default: use input document's title)")
metadata.add_argument(
    '--title', type=str,
    help="set document title (place multiple words in quotes)")
metadata.add_argument(
    '--author', type=str,
    help="set document author")
metadata.add_argument(
    '--subject', type=str,
    help="set document subject")  # help text was truncated to "set document"
metadata.add_argument(
    '--keywords', type=str,
    help="set document keywords")
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Image preprocessing options (implemented with unpaper and oversampling).
preprocessing = parser.add_argument_group(
    "Preprocessing options",
    "Improve OCR quality and final image")
preprocessing.add_argument(
    '-d', '--deskew', action='store_true',
    help="deskew each page before performing OCR")
preprocessing.add_argument(
    '-c', '--clean', action='store_true',
    help="clean pages from scanning artifacts before performing OCR")
preprocessing.add_argument(
    '-i', '--clean-final', action='store_true',
    help="incorporate the cleaned image in the final PDF file")
preprocessing.add_argument(
    '--oversample', metavar='DPI', type=int, default=0,
    help="oversample images to at least the specified DPI, to improve OCR "
         "results slightly")
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2014-10-08 03:21:28 -07:00
|
|
|
|
# Options controlling which pages receive OCR.
parser.add_argument(
    '-f', '--force-ocr', action='store_true',
    help="rasterize any fonts or vector images on each page and apply OCR")
parser.add_argument(
    '-s', '--skip-text', action='store_true',
    help="skip OCR on any pages that already contain text, but include the"
         " page in final output")
parser.add_argument(
    '--skip-big', type=float, metavar='MPixels',
    help="skip OCR on pages larger than the specified amount of megapixels, "
         "but include skipped pages in final output")
# parser.add_argument(
#     '--exact-image', action='store_true',
#     help="Use original page from PDF without re-rendering")
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Advanced options for power users.
advanced = parser.add_argument_group(
    "Advanced",
    "Advanced options for power users")
advanced.add_argument(
    # BUG FIX: this argument previously used type=list, which makes argparse
    # call list() on the string and split a filename like "cfg" into
    # ['c', 'f', 'g']; type=str appends each config filename intact.
    '--tesseract-config', default=[], type=str, action='append',
    help="additional Tesseract configuration files")
advanced.add_argument(
    '--pdf-renderer', choices=['auto', 'tesseract', 'hocr'], default='auto',
    help='choose OCR PDF renderer')
advanced.add_argument(
    '--tesseract-timeout', default=180.0, type=float,
    help='give up on OCR after the timeout, but copy the preprocessed page '
         'into the final output')
|
2015-07-22 22:30:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# Options to aid troubleshooting.
debugging = parser.add_argument_group(
    "Debugging",
    "Arguments to help with troubleshooting and debugging")
debugging.add_argument(
    '-k', '--keep-temporary-files', action='store_true',
    help="keep temporary files (helpful for debugging)")
debugging.add_argument(
    '-g', '--debug-rendering', action='store_true',
    help="render each page twice with debug information on second page")
|
2014-10-08 03:21:28 -07:00
|
|
|
|
|
2015-07-28 04:46:21 -07:00
|
|
|
|
options = parser.parse_args()


# ----------
# Languages

# Default to English when no language was requested.
if not options.language:
    options.language = ['eng']  # Enforce English hegemony

# Support v2.x "eng+deu" language syntax
if '+' in options.language[0]:
    options.language = options.language[0].split('+')
|
|
|
|
|
|
|
2015-07-28 01:00:29 -07:00
|
|
|
|
# Refuse to run if tesseract lacks trained data for any requested language.
if not set(options.language).issubset(tesseract.languages()):
    complain(
        "The installed version of tesseract does not have language "
        "data for the following requested languages: ")
    for lang in (set(options.language) - tesseract.languages()):
        complain(lang)
    sys.exit(ExitCode.bad_args)
|
2015-07-23 18:38:59 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 00:22:56 -07:00
|
|
|
|
# ----------
# Arguments

# 'auto' currently always resolves to the hOCR renderer.
if options.pdf_renderer == 'auto':
    options.pdf_renderer = 'hocr'

# unpaper is only required when a preprocessing option was requested.
if any((options.deskew, options.clean, options.clean_final)):
    try:
        from . import unpaper
    except ImportError:
        complain(
            "Install the 'unpaper' program to use --deskew or --clean.")
        sys.exit(ExitCode.bad_args)
else:
    unpaper = None
|
|
|
|
|
|
|
2015-07-28 02:25:50 -07:00
|
|
|
|
# Validate mutually incompatible or ineffective option combinations.
if options.debug_rendering and options.pdf_renderer == 'tesseract':
    complain(
        # BUG FIX: the two string fragments previously joined without a
        # space, producing "...supported with--pdf-renderer=tesseract."
        "Ignoring --debug-rendering because it is not supported with "
        "--pdf-renderer=tesseract.")

if options.force_ocr and options.skip_text:
    complain(
        "Error: --force-ocr and --skip-text are mutually incompatible.")
    sys.exit(ExitCode.bad_args)

# NOTE(review): the message says --clean-final is assumed, but nothing here
# sets options.clean_final = True -- confirm the pipeline honors this.
if options.clean and not options.clean_final \
        and options.pdf_renderer == 'tesseract':
    complain(
        "Tesseract PDF renderer cannot render --clean pages without "
        "also performing --clean-final, so --clean-final is assumed.")
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:39:42 -07:00
|
|
|
|
# ----------
# Logging

# ruffus returns a logger plus a mutex; the mutex must guard every write
# to the logger from concurrently running pipeline tasks.
_logger, _logger_mutex = cmdline.setup_logging(__name__, options.log_file,
                                               options.verbose)
|
2015-03-24 22:46:33 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:26:09 -07:00
|
|
|
|
class WrappedLogger:
    """Thread-safe proxy around a logger.

    Every logging call acquires the supplied mutex before delegating to the
    wrapped logger, so pipeline tasks running concurrently do not interleave
    their log output.
    """

    def __init__(self, my_logger, my_mutex):
        self.logger = my_logger
        self.mutex = my_mutex

    def _locked_call(self, method_name, args, kwargs):
        # Single choke point: acquire the mutex, then forward the call.
        with self.mutex:
            getattr(self.logger, method_name)(*args, **kwargs)

    def log(self, *args, **kwargs):
        self._locked_call('log', args, kwargs)

    def debug(self, *args, **kwargs):
        self._locked_call('debug', args, kwargs)

    def info(self, *args, **kwargs):
        self._locked_call('info', args, kwargs)

    def warning(self, *args, **kwargs):
        self._locked_call('warning', args, kwargs)

    def error(self, *args, **kwargs):
        self._locked_call('error', args, kwargs)

    def critical(self, *args, **kwargs):
        self._locked_call('critical', args, kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
# Shared thread-safe logger used by all pipeline tasks.
_log = WrappedLogger(_logger, _logger_mutex)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def re_symlink(input_file, soft_link_name, log=_log):
    """
    Helper function: relinks soft symbolic link if necessary

    Creates (or re-creates) *soft_link_name* as an absolute-path symlink
    pointing at *input_file*.  Raises if the destination exists and is a
    real file (never clobbers data), or if the source does not exist.
    """
    # Guard against soft linking to oneself
    if input_file == soft_link_name:
        log.debug("Warning: No symbolic link made. You are using " +
                  "the original data directory as the working directory.")
        return

    # Soft link already exists: delete for relink?
    if os.path.lexists(soft_link_name):
        # do not delete or overwrite real (non-soft link) file
        if not os.path.islink(soft_link_name):
            raise Exception("%s exists and is not a link" % soft_link_name)
        try:
            os.unlink(soft_link_name)
        except OSError:
            # BUG FIX: was a bare 'except:' which would also swallow
            # KeyboardInterrupt/SystemExit; only filesystem errors are
            # expected from os.unlink()
            log.debug("Can't unlink %s" % (soft_link_name))

    if not os.path.exists(input_file):
        raise Exception("trying to create a broken symlink to %s" % input_file)

    log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))

    # Create symbolic link using absolute path
    os.symlink(
        os.path.abspath(input_file),
        soft_link_name
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 02:39:42 -07:00
|
|
|
|
# -------------
# The Pipeline

# Page metadata gathered by repair_pdf is shared with worker processes
# through a manager-backed list, guarded by a manager lock.
manager = multiprocessing.Manager()
_pdfinfo = manager.list()
_pdfinfo_lock = manager.Lock()

# All intermediate files live in a private temporary directory.
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@atexit.register
def cleanup_working_files(*args):
    """At exit, remove the temporary work folder unless -k was given."""
    if not options.keep_temporary_files:
        # Folder may already be gone (e.g. nothing was ever written).
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
        return
    print("Temporary working files saved at:")
    print(work_folder)
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 01:16:05 -07:00
|
|
|
|
@transform(
    input=options.input_file,
    filter=formatter('(?i)\.pdf'),
    output=work_folder + '{basename[0]}.repaired.pdf',
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """First pipeline stage: normalize the input PDF with qpdf, then cache
    per-page metadata in the shared pdfinfo list for later stages."""
    qpdf.repair(input_file, output_file, log)
    with pdfinfo_lock:
        pdfinfo.extend(pdf_get_all_pageinfo(output_file))
        log.info(pdfinfo)
|
2015-07-23 01:16:05 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
def get_pageinfo(input_file, pdfinfo, pdfinfo_lock):
    """Return a copy of the cached page-info dict for *input_file*.

    The 1-based page number is encoded in the first six characters of the
    file's basename; the copy is taken while holding *pdfinfo_lock* so the
    shared list can be read safely across worker processes.
    """
    basename = os.path.basename(input_file)
    page_index = int(basename[:6]) - 1
    with pdfinfo_lock:
        return pdfinfo[page_index].copy()
|
2015-07-23 02:39:42 -07:00
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
|
|
|
|
|
|
def is_ocr_required(pageinfo, log):
    """Decide whether a page should be OCRed.

    Pages with no images are skipped (rasterizing vector content rarely
    yields useful text).  Pages that already contain text abort the run,
    are forced, or are skipped according to --force-ocr/--skip-text.
    --skip-big excludes very large pages.  May call sys.exit() when a page
    already has text and neither override option was given.
    """
    page = pageinfo['pageno'] + 1
    ocr_required = True
    if not pageinfo['images']:
        # If the page has no images, then it contains vector content or text
        # or both. It seems quite unlikely that one would find meaningful text
        # from rasterizing vector content. So skip the page.
        log.info(
            "Page {0} has no images - skipping OCR".format(page)
        )
        ocr_required = False
    elif pageinfo['has_text']:
        s = "Page {0} already has text! – {1}"

        if not options.force_ocr and not options.skip_text:
            log.error(s.format(page,
                               "aborting (use --force-ocr to force OCR)"))
            sys.exit(ExitCode.already_done_ocr)
        elif options.force_ocr:
            log.info(s.format(page,
                              "rasterizing text and running OCR anyway"))
            ocr_required = True
        elif options.skip_text:
            log.info(s.format(page,
                              "skipping all processing on this page"))
            ocr_required = False

    if ocr_required and options.skip_big:
        pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
        if pixel_count > (options.skip_big * 1000000):
            ocr_required = False
            log.info(
                "Page {0} is very large; skipping due to -b".format(page))

    return ocr_required
|
2015-07-23 01:16:05 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 02:58:34 -07:00
|
|
|
|
@split(
    repair_pdf,
    os.path.join(work_folder, '*.page.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def split_pages(
        input_file,
        output_files,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Split the repaired PDF into one file per page, then tag each page
    as '.ocr.page.pdf' or '.skip.page.pdf' via symlinks so downstream
    stages know which pages need OCR."""
    # Remove stale outputs from a previous run before re-splitting.
    for oo in output_files:
        with suppress(FileNotFoundError):
            os.unlink(oo)

    npages = qpdf.get_npages(input_file)
    qpdf.split_pages(input_file, work_folder, npages)

    from glob import glob
    for filename in glob(os.path.join(work_folder, '*.page.pdf')):
        pageinfo = get_pageinfo(filename, pdfinfo, pdfinfo_lock)

        alt_suffix = '.ocr.page.pdf' if is_ocr_required(pageinfo, log) \
            else '.skip.page.pdf'
        re_symlink(
            filename,
            os.path.join(
                work_folder,
                os.path.basename(filename)[0:6] + alt_suffix))
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@transform(
    input=split_pages,
    filter=suffix('.ocr.page.pdf'),
    output='.page.png',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def rasterize_with_ghostscript(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Render a single PDF page to PNG with Ghostscript, choosing the
    cheapest PNG device that can still represent the page's images."""
    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)

    device = 'png16m'  # 24-bit
    if all(image['comp'] == 1 for image in pageinfo['images']):
        if all(image['bpc'] == 1 for image in pageinfo['images']):
            device = 'pngmono'
        elif all(image['bpc'] > 1 and image['color'] == 'index'
                 for image in pageinfo['images']):
            device = 'png256'
        elif all(image['bpc'] > 1 and image['color'] == 'gray'
                 for image in pageinfo['images']):
            device = 'pnggray'

    log.debug("Rendering {0} with {1}".format(
        os.path.basename(input_file), device))
    # Oversample low-resolution pages up to --oversample DPI if requested.
    xres = max(pageinfo['xres'], options.oversample or 0)
    yres = max(pageinfo['yres'], options.oversample or 0)

    ghostscript.rasterize_pdf(input_file, output_file, xres, yres, device, log)
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@transform(
    input=rasterize_with_ghostscript,
    filter=suffix(".page.png"),
    output=".pp-deskew.png",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def preprocess_deskew(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Deskew the page image with unpaper, or pass it through unchanged
    (via symlink) when --deskew was not requested."""
    if not options.deskew:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
    dpi = int(pageinfo['xres'])

    unpaper.deskew(input_file, output_file, dpi, log)
|
2015-07-24 15:19:37 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 00:22:56 -07:00
|
|
|
|
@transform(
    input=preprocess_deskew,
    filter=suffix(".pp-deskew.png"),
    output=".pp-clean.png",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def preprocess_clean(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Remove scanning artifacts with unpaper, or pass the image through
    unchanged (via symlink) when --clean was not requested."""
    if not options.clean:
        re_symlink(input_file, output_file, log)
        return

    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
    dpi = int(pageinfo['xres'])

    unpaper.clean(input_file, output_file, dpi, log)
|
2015-07-24 15:19:37 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@transform(
    input=preprocess_clean,
    filter=suffix(".pp-clean.png"),
    output=".hocr",
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def ocr_tesseract_hocr(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Run Tesseract in hOCR mode on the preprocessed page image
    (hOCR-renderer path only)."""
    tesseract.generate_hocr(
        input_file=input_file,
        output_hocr=output_file,
        language=options.language,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
        # Deferred so page info is only fetched if tesseract needs it.
        pageinfo_getter=partial(get_pageinfo, input_file, pdfinfo,
                                pdfinfo_lock),
        log=log
    )
|
2015-07-23 23:09:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[rasterize_with_ghostscript, preprocess_deskew, preprocess_clean],
    filter=regex(r".*/(\d{6})(?:\.page|\.pp-deskew|\.pp-clean)\.png"),
    output=os.path.join(work_folder, r'\1.image'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def select_image_for_pdf(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Pick which preprocessing stage's image goes into the final PDF:
    cleaned if --clean-final, else deskewed if --deskew, else the raw
    rasterization."""
    if options.clean_final:
        image_suffix = '.pp-clean.png'
    elif options.deskew:
        image_suffix = '.pp-deskew.png'
    else:
        image_suffix = '.page.png'
    image = next(ii for ii in infiles if ii.endswith(image_suffix))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    if all(image['enc'] == 'jpeg' for image in pageinfo['images']):
        # If all images were JPEGs originally, produce a JPEG as output
        Image.open(image).save(output_file, format='JPEG')
    else:
        re_symlink(image, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
@collate(
    input=[select_image_for_pdf, ocr_tesseract_hocr],
    filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
    output=os.path.join(work_folder, r'\1.rendered.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def render_hocr_page(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Combine the selected page image and its hOCR output into a PDF page
    with an invisible text layer."""
    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    dpi = round(max(pageinfo['xres'], pageinfo['yres'], options.oversample))

    hocrtransform = HocrTransform(hocr, dpi)
    hocrtransform.to_pdf(output_file, imageFileName=image,
                         showBoundingboxes=False, invisibleText=True)
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'hocr')
|
2015-07-25 14:14:02 -07:00
|
|
|
|
@active_if(options.debug_rendering)
|
|
|
|
|
|
@collate(
|
2015-07-27 04:20:49 -07:00
|
|
|
|
input=[select_image_for_pdf, ocr_tesseract_hocr],
|
2015-07-25 14:14:02 -07:00
|
|
|
|
filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"),
|
|
|
|
|
|
output=os.path.join(work_folder, r'\1.debug.pdf'),
|
|
|
|
|
|
extras=[_log, _pdfinfo, _pdfinfo_lock])
|
2015-07-27 04:20:49 -07:00
|
|
|
|
def render_hocr_debug_page(
|
2015-07-25 14:14:02 -07:00
|
|
|
|
infiles,
|
|
|
|
|
|
output_file,
|
|
|
|
|
|
log,
|
|
|
|
|
|
pdfinfo,
|
|
|
|
|
|
pdfinfo_lock):
|
|
|
|
|
|
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
|
|
|
|
|
image = next(ii for ii in infiles if ii.endswith('.image'))
|
|
|
|
|
|
|
|
|
|
|
|
pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
|
2015-07-27 17:18:02 -07:00
|
|
|
|
dpi = round(max(pageinfo['xres'], pageinfo['yres'], options.oversample))
|
2015-07-25 14:14:02 -07:00
|
|
|
|
|
|
|
|
|
|
hocrtransform = HocrTransform(hocr, dpi)
|
|
|
|
|
|
hocrtransform.to_pdf(output_file, imageFileName=None,
|
|
|
|
|
|
showBoundingboxes=True, invisibleText=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-27 04:20:49 -07:00
|
|
|
|
@active_if(options.pdf_renderer == 'tesseract')
@collate(
    input=[preprocess_clean, split_pages],
    filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.page\.pdf)"),
    output=os.path.join(work_folder, r'\1.rendered.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def tesseract_ocr_and_render_pdf(
        input_files,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """OCR the page image and have Tesseract render the output PDF itself."""
    image = next((f for f in input_files if f.endswith('.png')), '')
    page_pdf = next(f for f in input_files if f.endswith('.pdf'))

    if not image:
        # No preprocessed image was produced for this page, so there is
        # nothing to OCR; pass the original page PDF through unchanged.
        # NOTE(review): unlike skip_page, re_symlink is called here without
        # the log argument — confirm re_symlink's log parameter is optional.
        re_symlink(page_pdf, output_file)
        return

    tesseract.generate_pdf(
        input_image=image,
        skip_pdf=page_pdf,
        output_pdf=output_file,
        language=options.language,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
        log=log)
|
2015-07-27 04:20:49 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@transform(
    input=repair_pdf,
    filter=suffix('.repaired.pdf'),
    output='.pdfa_def.ps',
    output_dir=work_folder,
    extras=[_log])
def generate_postscript_stub(
        input_file,
        output_file,
        log):
    """Write the PostScript pdfmark stub that carries document metadata.

    Metadata is taken from the input PDF's document info dictionary,
    overridden by any values supplied on the command line, and handed to
    generate_pdfa_def for Ghostscript's PDF/A conversion.
    """
    reader = pypdf.PdfFileReader(input_file)

    def doc_info(key):
        # pdf.documentInfo.get() DOES NOT behave as expected for a dict-like
        # object, so subscript with precautions.  TypeError may occur if the
        # PDF is missing the optional document info section.
        try:
            return str(reader.documentInfo[key])
        except (KeyError, TypeError):
            return ''

    pdfmark = {
        'title': doc_info('/Title'),
        'author': doc_info('/Author'),
        'keywords': doc_info('/Keywords'),
        'subject': doc_info('/Subject'),
    }

    # Values given on the command line win over the input PDF's metadata.
    for field in ('title', 'author', 'keywords', 'subject'):
        override = getattr(options, field)
        if override:
            pdfmark[field] = override

    renderer_tag = '+PDF' if options.pdf_renderer == 'tesseract' else ''
    pdfmark['creator'] = '{0} {1} / Tesseract OCR{2} {3}'.format(
        parser.prog, VERSION,
        renderer_tag,
        tesseract.version())

    generate_pdfa_def(output_file, pdfmark)
|
2015-07-22 22:46:00 -07:00
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
2015-07-25 04:25:19 -07:00
|
|
|
|
@transform(
    input=split_pages,
    filter=suffix('.skip.page.pdf'),
    output='.done.pdf',
    output_dir=work_folder,
    extras=[_log])
def skip_page(
        input_file,
        output_file,
        log):
    """Mark a page that requires no OCR as done.

    The page's existing PDF is linked straight through to the output name
    so it participates in the final merge unmodified.
    """
    re_symlink(input_file, output_file, log)
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 04:57:31 -07:00
|
|
|
|
@merge(
    input=[render_hocr_page, render_hocr_debug_page, skip_page,
           tesseract_ocr_and_render_pdf, generate_postscript_stub],
    output=os.path.join(work_folder, 'merged.pdf'),
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def merge_pages(
        input_files,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Combine every per-page PDF (and the PostScript stub) into one PDF/A."""

    def input_file_order(s):
        '''Sort order: All rendered pages followed
        by their debug page, if any, followed by Postscript stub.
        Ghostscript documentation has the Postscript stub at the
        beginning, but it works at the end and also gets document info
        right that way.'''
        if s.endswith('.ps'):
            return 99999999
        name = os.path.basename(s)
        # Page number occupies the first six characters of the filename;
        # multiplying by 10 leaves room to slot the debug page after it.
        order = int(name[0:6]) * 10
        return order + 1 if 'debug' in name else order

    pdf_pages = sorted(input_files, key=input_file_order)
    log.info(pdf_pages)
    ghostscript.generate_pdfa(pdf_pages, output_file, options.jobs or 1)
|
2015-07-22 22:51:38 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-23 14:48:46 -07:00
|
|
|
|
@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def copy_final(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Copy the merged PDF from the work folder to the user's output path."""
    shutil.copy(input_file, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def validate_pdfa(
        input_file,
        log):
    """Run ``qpdf --check`` on input_file and report whether it passed.

    Returns True when qpdf raises no complaints; returns False for any
    nonzero qpdf exit status (structural damage, unrepairable files, or
    warnings), after printing/logging the details.
    """
    args_qpdf = ['qpdf', '--check', input_file]

    try:
        check_output(args_qpdf, stderr=STDOUT, universal_newlines=True)
    except CalledProcessError as e:
        # qpdf exit codes: 2 = errors (possibly unrepairable), 3 = warnings
        if e.returncode == 2:
            print("{0}: not a valid PDF, and could not repair it.".format(
                options.input_file))
            print("Details:")
            print(e.output)
        elif e.returncode == 3:
            log.info("qpdf --check returned warnings:")
            log.info(e.output)
        else:
            print(e.output)
        return False

    return True
|
2015-07-23 14:48:46 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# @active_if(ocr_required and options.exact_image)
|
|
|
|
|
|
# @merge([render_hocr_blank_page, extract_single_page],
|
2015-07-25 01:45:26 -07:00
|
|
|
|
# os.path.join(work_folder, "%04i.merged.pdf") % pageno)
|
2015-07-22 22:51:38 -07:00
|
|
|
|
# def merge_hocr_with_original_page(infiles, output_file):
|
|
|
|
|
|
# with open(infiles[0], 'rb') as hocr_input, \
|
|
|
|
|
|
# open(infiles[1], 'rb') as page_input, \
|
|
|
|
|
|
# open(output_file, 'wb') as output:
|
|
|
|
|
|
# hocr_reader = pypdf.PdfFileReader(hocr_input)
|
|
|
|
|
|
# page_reader = pypdf.PdfFileReader(page_input)
|
|
|
|
|
|
# writer = pypdf.PdfFileWriter()
|
|
|
|
|
|
|
|
|
|
|
|
# the_page = hocr_reader.getPage(0)
|
|
|
|
|
|
# the_page.mergePage(page_reader.getPage(0))
|
|
|
|
|
|
# writer.addPage(the_page)
|
|
|
|
|
|
# writer.write(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-25 01:10:14 -07:00
|
|
|
|
def available_cpu_count():
    """Return the number of available CPUs, falling back to 1.

    Tries :func:`multiprocessing.cpu_count` first, then ``psutil`` if it is
    installed; when neither can report a count, warns the user and assumes
    a single CPU.
    """
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        pass

    try:
        import psutil
        return psutil.cpu_count()
    except (ImportError, AttributeError):
        pass

    # BUG FIX: the two implicitly-concatenated string literals had no
    # separating space, producing "...Assuming one (1) CPU.Use -j N...".
    complain(
        "Could not get CPU count.  Assuming one (1) CPU. "
        "Use -j N to set manually.")
    return 1
|
2014-09-26 04:19:41 -07:00
|
|
|
|
|
|
|
|
|
|
|
2015-12-04 03:07:53 -08:00
|
|
|
|
def cleanup_ruffus_error_message(msg):
    """Condense a multi-line ruffus error message to a single clean line.

    Collapses every run of whitespace to one space, strips the parentheses
    ruffus wraps around the interesting part of the message, and trims
    surrounding whitespace.
    """
    # BUG FIX: re.MULTILINE was previously passed as the positional *count*
    # argument of re.sub (re.MULTILINE == 8), silently capping the
    # replacement at 8 occurrences.  It must be passed via flags= (and is
    # actually irrelevant to \s+, but is kept for fidelity).
    msg = re.sub(r'\s+', r' ', msg, flags=re.MULTILINE)
    msg = re.sub(r"\((.+?)\)", r'\1', msg)
    msg = msg.strip()
    return msg
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-07-26 01:52:08 -07:00
|
|
|
|
def run_pipeline():
    """Run the ruffus pipeline and translate failures into exit codes.

    Returns a value suitable for sys.exit(): an ExitCode constant on the
    recognized paths (or, if a SystemExit payload names an unknown
    ExitCode attribute, the literal string 'other_error').
    """
    # Use all available CPUs when the user did not request a specific job
    # count (jobs is falsy or left at 1).
    if not options.jobs or options.jobs == 1:
        options.jobs = available_cpu_count()
    try:
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            print(e)

        # Yuck. Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        for exc in e.args:
            # Each rethrown job error is a 5-tuple describing one failure.
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                # exc_value is the repr of the SystemExit; pull the ExitCode
                # attribute name out of text shaped like "...(ExitCode.<name>)".
                # NOTE(review): assumes the regex always matches — if it did
                # not, match.groups() would raise AttributeError; confirm.
                match = re.search(r"\.(.+?)\)", exc_value)
                exit_code_name = match.groups()[0]
                # Unknown names fall back to the string 'other_error'.
                exit_code = getattr(ExitCode, exit_code_name, 'other_error')
                return exit_code
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                print(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    print("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file

        # None of the failures matched a recognized cause.
        return ExitCode.other_error

    # Pipeline succeeded; verify the produced file really passes qpdf's check.
    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    return ExitCode.ok
|
2015-08-10 16:05:00 -07:00
|
|
|
|
|
2015-07-26 01:52:08 -07:00
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate the pipeline's ExitCode to the shell.
    sys.exit(run_pipeline())
|