| 
									
										
										
										
											2014-09-26 04:19:41 -07:00
										 |  |  |  | #!/usr/bin/env python3 | 
					
						
							| 
									
										
										
										
											2015-07-28 04:36:58 -07:00
										 |  |  |  | # © 2015 James R. Barlow: github.com/jbarlow83 | 
					
						
							| 
									
										
										
										
											2014-09-26 04:19:41 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | from contextlib import suppress | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  | from tempfile import NamedTemporaryFile, mkdtemp | 
					
						
							| 
									
										
										
										
											2014-09-26 04:19:41 -07:00
										 |  |  |  | import sys | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | import os | 
					
						
							| 
									
										
										
										
											2015-02-13 13:41:14 -08:00
										 |  |  |  | import re | 
					
						
							| 
									
										
										
										
											2015-03-10 14:28:38 -07:00
										 |  |  |  | import shutil | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | import warnings | 
					
						
							|  |  |  |  | import multiprocessing | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  | import atexit | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  | import textwrap | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | import PyPDF2 as pypdf | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  | from PIL import Image | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-11-14 02:06:23 -08:00
										 |  |  |  | from subprocess import Popen, check_call, PIPE, CalledProcessError, \ | 
					
						
							| 
									
										
										
										
											2015-08-11 02:19:46 -07:00
										 |  |  |  |     TimeoutExpired, check_output, STDOUT | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | try: | 
					
						
							|  |  |  |  |     from subprocess import DEVNULL | 
					
						
							|  |  |  |  | except ImportError: | 
					
						
							|  |  |  |  |     DEVNULL = open(os.devnull, 'wb') | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-26 04:19:41 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-10 00:35:49 -07:00
										 |  |  |  | from ruffus import transform, suffix, merge, active_if, regex, jobs_limit, \ | 
					
						
							| 
									
										
										
										
											2015-07-25 02:58:34 -07:00
										 |  |  |  |     formatter, follows, split, collate, check_if_uptodate | 
					
						
							| 
									
										
										
										
											2015-08-11 02:19:46 -07:00
										 |  |  |  | import ruffus.ruffus_exceptions as ruffus_exceptions | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | import ruffus.cmdline as cmdline | 
					
						
							| 
									
										
										
										
											2015-07-23 02:39:42 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-02-20 17:20:48 -08:00
										 |  |  |  | from .hocrtransform import HocrTransform | 
					
						
							| 
									
										
										
										
											2015-07-23 02:39:42 -07:00
										 |  |  |  | from .pageinfo import pdf_get_all_pageinfo | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | from .pdfa import generate_pdfa_def | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  | from . import ghostscript | 
					
						
							| 
									
										
										
										
											2015-07-23 18:38:59 -07:00
										 |  |  |  | from . import tesseract | 
					
						
							| 
									
										
										
										
											2015-08-11 00:17:02 -07:00
										 |  |  |  | from . import ExitCode | 
					
						
							| 
									
										
										
										
											2015-04-09 03:12:04 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-04-09 14:06:55 -07:00
										 |  |  |  | warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning) | 
					
						
							| 
									
										
										
										
											2015-04-09 03:12:04 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 14:48:46 -07:00
										 |  |  |  | BASEDIR = os.path.dirname(os.path.realpath(__file__)) | 
					
						
							| 
									
										
										
										
											2015-07-26 01:52:08 -07:00
										 |  |  |  | JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, 'jhove')) | 
					
						
							| 
									
										
										
										
											2015-07-23 14:48:46 -07:00
										 |  |  |  | JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar') | 
					
						
							|  |  |  |  | JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf') | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 17:06:00 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | # ------------- | 
					
						
							|  |  |  |  | # External dependencies | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | MINIMUM_TESS_VERSION = '3.02.02' | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | def complain(message): | 
					
						
							| 
									
										
										
										
											2015-08-11 02:19:46 -07:00
										 |  |  |  |     print(*textwrap.wrap(message), file=sys.stderr) | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 01:00:29 -07:00
										 |  |  |  | if tesseract.version() < MINIMUM_TESS_VERSION: | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |     complain( | 
					
						
							| 
									
										
										
										
											2015-07-23 17:06:00 -07:00
										 |  |  |  |         "Please install tesseract {0} or newer " | 
					
						
							|  |  |  |  |         "(currently installed version is {1})".format( | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |             MINIMUM_TESS_VERSION, tesseract.version())) | 
					
						
							| 
									
										
										
										
											2015-08-11 00:17:02 -07:00
										 |  |  |  |     sys.exit(ExitCode.missing_dependency) | 
					
						
							| 
									
										
										
										
											2015-07-23 17:06:00 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-08-16 00:54:03 -07:00
										 |  |  |  | try: | 
					
						
							|  |  |  |  |     import PIL.features | 
					
						
							|  |  |  |  |     check_codec = PIL.features.check_codec | 
					
						
							|  |  |  |  | except (ImportError, AttributeError): | 
					
						
							|  |  |  |  |     def check_codec(codec_name): | 
					
						
							|  |  |  |  |         if codec_name == 'jpg': | 
					
						
							|  |  |  |  |             return 'jpeg_encoder' in dir(Image.core) | 
					
						
							|  |  |  |  |         elif codec_name == 'zlib': | 
					
						
							|  |  |  |  |             return 'zip_encoder' in dir(Image.core) | 
					
						
							|  |  |  |  |         raise NotImplementedError(codec_name) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def check_pil_encoder(codec_name, friendly_name): | 
					
						
							|  |  |  |  |     try: | 
					
						
							|  |  |  |  |         if check_codec(codec_name): | 
					
						
							|  |  |  |  |             return | 
					
						
							|  |  |  |  |     except Exception: | 
					
						
							|  |  |  |  |         pass | 
					
						
							|  |  |  |  |     complain( | 
					
						
							|  |  |  |  |         "ERROR: Your version of the Python imaging library (Pillow) was " | 
					
						
							|  |  |  |  |         "compiled without support for " + friendly_name + " encoding/decoding." | 
					
						
							|  |  |  |  |         "\n" | 
					
						
							|  |  |  |  |         "You will need to uninstall Pillow and reinstall it with PNG and JPEG " | 
					
						
							|  |  |  |  |         "support (libjpeg and zlib)." | 
					
						
							|  |  |  |  |         "\n" | 
					
						
							|  |  |  |  |         "See installation instructions for your platform here:\n" | 
					
						
							|  |  |  |  |         "    https://pillow.readthedocs.org/installation.html" | 
					
						
							|  |  |  |  |     ) | 
					
						
							|  |  |  |  |     sys.exit(ExitCode.missing_dependency) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | check_pil_encoder('jpg', 'JPEG') | 
					
						
							|  |  |  |  | check_pil_encoder('zlib', 'PNG') | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 17:06:00 -07:00
										 |  |  |  | # ------------- | 
					
						
							|  |  |  |  | # Parser | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | parser = cmdline.get_argparse( | 
					
						
							| 
									
										
										
										
											2015-07-25 23:45:13 -07:00
										 |  |  |  |     prog="ocrmypdf", | 
					
						
							| 
									
										
										
										
											2015-07-27 15:39:54 -07:00
										 |  |  |  |     description="Generate searchable PDF file from an image-only PDF file.", | 
					
						
							| 
									
										
										
										
											2015-09-05 00:53:14 -07:00
										 |  |  |  |     version='3.0', | 
					
						
							| 
									
										
										
										
											2015-07-27 15:39:54 -07:00
										 |  |  |  |     fromfile_prefix_chars='@', | 
					
						
							|  |  |  |  |     ignored_args=[ | 
					
						
							|  |  |  |  |         'touch_files_only', 'recreate_database', 'checksum_file_name', | 
					
						
							|  |  |  |  |         'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format', | 
					
						
							|  |  |  |  |         'forced_tasks', 'target_tasks']) | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | parser.add_argument( | 
					
						
							| 
									
										
										
										
											2015-07-23 01:16:05 -07:00
										 |  |  |  |     'input_file', | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  |     help="PDF file containing the images to be OCRed") | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | parser.add_argument( | 
					
						
							| 
									
										
										
										
											2015-07-23 01:16:05 -07:00
										 |  |  |  |     'output_file', | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  |     help="output searchable PDF file") | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | parser.add_argument( | 
					
						
							| 
									
										
										
										
											2015-07-23 18:38:59 -07:00
										 |  |  |  |     '-l', '--language', action='append', | 
					
						
							| 
									
										
										
										
											2015-08-05 16:56:53 -07:00
										 |  |  |  |     help="languages of the file to be OCRed") | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 18:12:25 -07:00
										 |  |  |  | metadata = parser.add_argument_group( | 
					
						
							|  |  |  |  |     "Metadata options", | 
					
						
							|  |  |  |  |     "Set output PDF/A metadata (default: use input document's title)") | 
					
						
							|  |  |  |  | metadata.add_argument( | 
					
						
							|  |  |  |  |     '--title', type=str, | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  |     help="set document title (place multiple words in quotes)") | 
					
						
							| 
									
										
										
										
											2015-07-25 18:12:25 -07:00
										 |  |  |  | metadata.add_argument( | 
					
						
							|  |  |  |  |     '--author', type=str, | 
					
						
							|  |  |  |  |     help="set document author") | 
					
						
							|  |  |  |  | metadata.add_argument( | 
					
						
							|  |  |  |  |     '--subject', type=str, | 
					
						
							|  |  |  |  |     help="set document") | 
					
						
							|  |  |  |  | metadata.add_argument( | 
					
						
							|  |  |  |  |     '--keywords', type=str, | 
					
						
							|  |  |  |  |     help="set document keywords") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | preprocessing = parser.add_argument_group( | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  |     "Preprocessing options", | 
					
						
							|  |  |  |  |     "Improve OCR quality and final image") | 
					
						
							|  |  |  |  | preprocessing.add_argument( | 
					
						
							|  |  |  |  |     '-d', '--deskew', action='store_true', | 
					
						
							|  |  |  |  |     help="deskew each page before performing OCR") | 
					
						
							|  |  |  |  | preprocessing.add_argument( | 
					
						
							|  |  |  |  |     '-c', '--clean', action='store_true', | 
					
						
							| 
									
										
										
										
											2015-08-05 16:56:53 -07:00
										 |  |  |  |     help="clean pages from scanning artifacts before performing OCR") | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  | preprocessing.add_argument( | 
					
						
							|  |  |  |  |     '-i', '--clean-final', action='store_true', | 
					
						
							|  |  |  |  |     help="incorporate the cleaned image in the final PDF file") | 
					
						
							|  |  |  |  | preprocessing.add_argument( | 
					
						
							| 
									
										
										
										
											2015-07-27 20:42:16 -07:00
										 |  |  |  |     '--oversample', metavar='DPI', type=int, default=0, | 
					
						
							| 
									
										
										
										
											2015-08-05 16:56:53 -07:00
										 |  |  |  |     help="oversample images to at least the specified DPI, to improve OCR " | 
					
						
							|  |  |  |  |          "results slightly") | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | parser.add_argument( | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  |     '-f', '--force-ocr', action='store_true', | 
					
						
							| 
									
										
										
										
											2015-08-05 16:56:53 -07:00
										 |  |  |  |     help="rasterize any fonts or vector images on each page and apply OCR") | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | parser.add_argument( | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  |     '-s', '--skip-text', action='store_true', | 
					
						
							| 
									
										
										
										
											2015-08-05 16:56:53 -07:00
										 |  |  |  |     help="skip OCR on any pages that already contain text, but include the" | 
					
						
							|  |  |  |  |          " page in final output") | 
					
						
							| 
									
										
										
										
											2015-02-20 15:26:33 -08:00
										 |  |  |  | parser.add_argument( | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |     '--skip-big', type=float, metavar='MPixels', | 
					
						
							| 
									
										
										
										
											2015-08-05 16:56:53 -07:00
										 |  |  |  |     help="skip OCR on pages larger than the specified amount of megapixels, " | 
					
						
							|  |  |  |  |          "but include skipped pages in final output") | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  | # parser.add_argument( | 
					
						
							|  |  |  |  | #     '--exact-image', action='store_true', | 
					
						
							|  |  |  |  | #     help="Use original page from PDF without re-rendering") | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | advanced = parser.add_argument_group( | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  |     "Advanced", | 
					
						
							| 
									
										
										
										
											2015-07-25 01:46:16 -07:00
										 |  |  |  |     "Advanced options for power users") | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  | advanced.add_argument( | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |     '--tesseract-config', default=[], type=list, action='append', | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |     help="additional Tesseract configuration files") | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | advanced.add_argument( | 
					
						
							|  |  |  |  |     '--pdf-renderer', choices=['tesseract', 'hocr'], default='hocr', | 
					
						
							|  |  |  |  |     help='choose OCR PDF renderer') | 
					
						
							| 
									
										
										
										
											2015-07-27 04:23:37 -07:00
										 |  |  |  | advanced.add_argument( | 
					
						
							|  |  |  |  |     '--tesseract-timeout', default=180.0, type=float, | 
					
						
							| 
									
										
										
										
											2015-08-05 16:56:53 -07:00
										 |  |  |  |     help='give up on OCR after the timeout, but copy the preprocessed page ' | 
					
						
							|  |  |  |  |          'into the final output') | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | debugging = parser.add_argument_group( | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  |     "Debugging", | 
					
						
							|  |  |  |  |     "Arguments to help with troubleshooting and debugging") | 
					
						
							|  |  |  |  | debugging.add_argument( | 
					
						
							|  |  |  |  |     '-k', '--keep-temporary-files', action='store_true', | 
					
						
							|  |  |  |  |     help="keep temporary files (helpful for debugging)") | 
					
						
							|  |  |  |  | debugging.add_argument( | 
					
						
							| 
									
										
											  
											
												diff --git a/src/ocrmypdf.py b/src/ocrmypdf.py
index 68d1591..95afa8f 100755
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@@ -24,6 +24,7 @@ import ruffus.cmdline as cmdline
 from .hocrtransform import HocrTransform
 import warnings
+import multiprocessing
 warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
@@ -96,7 +97,7 @@ debugging.add_argument(
     '-k', '--keep-temporary-files', action='store_true',
     help="keep temporary files (helpful for debugging)")
 debugging.add_argument(
-    '-g' ,'--debug-rendering', action='store_true',
+    '-g', '--debug-rendering', action='store_true',
     help="render each page twice with debug information on second page")
@@ -106,51 +107,19 @@ if not options.temp_folder:
     options.temp_folder = 'tmp'
-_logger, _logger_mutex = cmdline.setup_logging(__name__, options.log_file,
-                                               options.verbose)
+log, log_mutex = cmdline.setup_logging(__name__, options.log_file,
+                                       options.verbose)
-class WrappedLogger:
-
-    def __init__(self, my_logger, my_mutex):
-        self.logger = my_logger
-        self.mutex = my_mutex
-
-    def log(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.log(*args, **kwargs)
-
-    def debug(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.debug(*args, **kwargs)
-
-    def info(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.info(*args, **kwargs)
-
-    def warning(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.warning(*args, **kwargs)
-
-    def error(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.error(*args, **kwargs)
-
-    def critical(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.critical(*args, **kwargs)
-
-log = WrappedLogger(_logger, _logger_mutex)
-
-
-def re_symlink(input_file, soft_link_name, log=log):
+def re_symlink(input_file, soft_link_name, log, mutex):
     """
     Helper function: relinks soft symbolic link if necessary
     """
     if input_file == soft_link_name:
-        log.debug("Warning: No symbolic link made. You are using " +
-                     "the original data directory as the working directory.")
+        with mutex:
+            log.debug("Warning: No symbolic link made. You are using " +
+                      "the original data directory as the working directory.")
         return
@@ -161,12 +130,14 @@ def re_symlink(input_file, soft_link_name, log=log):
         try:
             os.unlink(soft_link_name)
         except:
-            log.debug("Can't unlink %s" % (soft_link_name))
+            with mutex:
+                log.debug("Can't unlink %s" % (soft_link_name))
     if not os.path.exists(input_file):
         raise Exception("trying to create a broken symlink to %s" % input_file)
-    log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))
+    with mutex:
+        log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))
     os.symlink(
											
										 
											2015-07-23 02:22:12 -07:00
										 |  |  |  |     '-g', '--debug-rendering', action='store_true', | 
					
						
							| 
									
										
										
										
											2015-07-22 22:30:00 -07:00
										 |  |  |  |     help="render each page twice with debug information on second page") | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 04:46:21 -07:00
										 |  |  |  | options = parser.parse_args() | 
					
						
							| 
									
										
										
										
											2015-07-27 15:39:54 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-10-08 03:21:28 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 18:38:59 -07:00
										 |  |  |  | # ---------- | 
					
						
							|  |  |  |  | # Languages | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | if not options.language: | 
					
						
							|  |  |  |  |     options.language = ['eng']  # Enforce English hegemony | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | # Support v2.x "eng+deu" language syntax | 
					
						
							|  |  |  |  | if '+' in options.language[0]: | 
					
						
							|  |  |  |  |     options.language = options.language[0].split('+') | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 01:00:29 -07:00
										 |  |  |  | if not set(options.language).issubset(tesseract.languages()): | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |     complain( | 
					
						
							| 
									
										
										
										
											2015-07-23 18:38:59 -07:00
										 |  |  |  |         "The installed version of tesseract does not have language " | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |         "data for the following requested languages: ") | 
					
						
							| 
									
										
										
										
											2015-07-28 01:00:29 -07:00
										 |  |  |  |     for lang in (set(options.language) - tesseract.languages()): | 
					
						
							| 
									
										
										
										
											2015-08-18 23:27:50 -07:00
										 |  |  |  |         complain(lang) | 
					
						
							| 
									
										
										
										
											2015-08-11 00:17:02 -07:00
										 |  |  |  |     sys.exit(ExitCode.bad_args) | 
					
						
							| 
									
										
										
										
											2015-07-23 18:38:59 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  | # ---------- | 
					
						
							|  |  |  |  | # Arguments | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | if any((options.deskew, options.clean, options.clean_final)): | 
					
						
							|  |  |  |  |     try: | 
					
						
							|  |  |  |  |         from . import unpaper | 
					
						
							|  |  |  |  |     except ImportError: | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |         complain( | 
					
						
							|  |  |  |  |             "Install the 'unpaper' program to use --deskew or --clean.") | 
					
						
							| 
									
										
										
										
											2015-08-11 00:17:02 -07:00
										 |  |  |  |         sys.exit(ExitCode.bad_args) | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  | else: | 
					
						
							|  |  |  |  |     unpaper = None | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  | if options.debug_rendering and options.pdf_renderer == 'tesseract': | 
					
						
							|  |  |  |  |     complain( | 
					
						
							|  |  |  |  |         "Ignoring --debug-rendering because it is not supported with" | 
					
						
							|  |  |  |  |         "--pdf-renderer=tesseract.") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | if options.force_ocr and options.skip_text: | 
					
						
							|  |  |  |  |     complain( | 
					
						
							|  |  |  |  |         "Error: --force-ocr and --skip-text are mutually incompatible.") | 
					
						
							| 
									
										
										
										
											2015-08-11 00:17:02 -07:00
										 |  |  |  |     sys.exit(ExitCode.bad_args) | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | if options.clean and not options.clean_final \ | 
					
						
							|  |  |  |  |         and options.pdf_renderer == 'tesseract': | 
					
						
							|  |  |  |  |     complain( | 
					
						
							|  |  |  |  |         "Tesseract PDF renderer cannot render --clean pages without " | 
					
						
							|  |  |  |  |         "also performing --clean-final, so --clean-final is assumed.") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 02:39:42 -07:00
										 |  |  |  | # ---------- | 
					
						
							|  |  |  |  | # Logging | 
					
						
							| 
									
										
										
										
											2015-07-22 22:58:13 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 02:26:09 -07:00
										 |  |  |  | _logger, _logger_mutex = cmdline.setup_logging(__name__, options.log_file, | 
					
						
							|  |  |  |  |                                                options.verbose) | 
					
						
							| 
									
										
										
										
											2015-03-24 22:46:33 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 02:26:09 -07:00
										 |  |  |  | class WrappedLogger: | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def __init__(self, my_logger, my_mutex): | 
					
						
							|  |  |  |  |         self.logger = my_logger | 
					
						
							|  |  |  |  |         self.mutex = my_mutex | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def log(self, *args, **kwargs): | 
					
						
							|  |  |  |  |         with self.mutex: | 
					
						
							|  |  |  |  |             self.logger.log(*args, **kwargs) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def debug(self, *args, **kwargs): | 
					
						
							|  |  |  |  |         with self.mutex: | 
					
						
							|  |  |  |  |             self.logger.debug(*args, **kwargs) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def info(self, *args, **kwargs): | 
					
						
							|  |  |  |  |         with self.mutex: | 
					
						
							|  |  |  |  |             self.logger.info(*args, **kwargs) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def warning(self, *args, **kwargs): | 
					
						
							|  |  |  |  |         with self.mutex: | 
					
						
							|  |  |  |  |             self.logger.warning(*args, **kwargs) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def error(self, *args, **kwargs): | 
					
						
							|  |  |  |  |         with self.mutex: | 
					
						
							|  |  |  |  |             self.logger.error(*args, **kwargs) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def critical(self, *args, **kwargs): | 
					
						
							|  |  |  |  |         with self.mutex: | 
					
						
							|  |  |  |  |             self.logger.critical(*args, **kwargs) | 
					
						
							|  |  |  |  | 
 | 
					
						
# Mutex-guarded logger for this module; it is the default `log` argument of
# re_symlink and is passed to the pipeline tasks through their `extras`.
_log = WrappedLogger(_logger, _logger_mutex)
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def re_symlink(input_file, soft_link_name, log=_log): | 
					
						
							| 
									
										
										
										
											2015-07-22 22:46:00 -07:00
										 |  |  |  |     """
 | 
					
						
							|  |  |  |  |     Helper function: relinks soft symbolic link if necessary | 
					
						
							|  |  |  |  |     """
 | 
					
						
							|  |  |  |  |     # Guard against soft linking to oneself | 
					
						
							|  |  |  |  |     if input_file == soft_link_name: | 
					
						
							| 
									
										
										
										
											2015-07-23 02:26:09 -07:00
										 |  |  |  |         log.debug("Warning: No symbolic link made. You are using " + | 
					
						
							| 
									
										
										
										
											2015-07-24 01:27:01 -07:00
										 |  |  |  |                   "the original data directory as the working directory.") | 
					
						
							| 
									
										
										
										
											2015-07-22 22:46:00 -07:00
										 |  |  |  |         return | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     # Soft link already exists: delete for relink? | 
					
						
							|  |  |  |  |     if os.path.lexists(soft_link_name): | 
					
						
							|  |  |  |  |         # do not delete or overwrite real (non-soft link) file | 
					
						
							|  |  |  |  |         if not os.path.islink(soft_link_name): | 
					
						
							|  |  |  |  |             raise Exception("%s exists and is not a link" % soft_link_name) | 
					
						
							|  |  |  |  |         try: | 
					
						
							|  |  |  |  |             os.unlink(soft_link_name) | 
					
						
							|  |  |  |  |         except: | 
					
						
							| 
									
										
										
										
											2015-07-23 02:26:09 -07:00
										 |  |  |  |             log.debug("Can't unlink %s" % (soft_link_name)) | 
					
						
							| 
									
										
										
										
											2015-07-22 22:46:00 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     if not os.path.exists(input_file): | 
					
						
							|  |  |  |  |         raise Exception("trying to create a broken symlink to %s" % input_file) | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 02:26:09 -07:00
										 |  |  |  |     log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name)) | 
					
						
							| 
									
										
										
										
											2015-07-22 22:46:00 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     # Create symbolic link using absolute path | 
					
						
							|  |  |  |  |     os.symlink( | 
					
						
							|  |  |  |  |         os.path.abspath(input_file), | 
					
						
							|  |  |  |  |         soft_link_name | 
					
						
							|  |  |  |  |     ) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 02:39:42 -07:00
										 |  |  |  | # ------------- | 
					
						
							|  |  |  |  | # The Pipeline | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 03:49:30 -07:00
# Manager-backed objects handed to the pipeline tasks through `extras`:
# _pdfinfo is extended by repair_pdf with the result of
# pdf_get_all_pageinfo() and read back by get_pageinfo(); _pdfinfo_lock is
# held around those accesses.
manager = multiprocessing.Manager()
_pdfinfo = manager.list()
_pdfinfo_lock = manager.Lock()

# Fresh temporary directory used as the tasks' output_dir / working folder;
# removed by cleanup_working_files unless options.keep_temporary_files is set.
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | @atexit.register | 
					
						
							|  |  |  |  | def cleanup_working_files(*args): | 
					
						
							|  |  |  |  |     if options.keep_temporary_files: | 
					
						
							|  |  |  |  |         print("Temporary working files saved at:") | 
					
						
							|  |  |  |  |         print(work_folder) | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         with suppress(FileNotFoundError): | 
					
						
							|  |  |  |  |             shutil.rmtree(work_folder) | 
					
						
							| 
									
										
										
										
											2015-07-23 02:39:42 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 01:16:05 -07:00
										 |  |  |  | @transform( | 
					
						
							| 
									
										
										
										
											2015-07-23 03:09:03 -07:00
										 |  |  |  |     input=options.input_file, | 
					
						
							|  |  |  |  |     filter=suffix('.pdf'), | 
					
						
							| 
									
										
										
										
											2015-07-24 01:55:54 -07:00
										 |  |  |  |     output='.repaired.pdf', | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  |     output_dir=work_folder, | 
					
						
							| 
									
										
										
										
											2015-07-23 03:49:30 -07:00
										 |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-07-24 01:55:54 -07:00
										 |  |  |  | def repair_pdf( | 
					
						
							| 
									
										
										
										
											2015-07-23 01:16:05 -07:00
										 |  |  |  |         input_file, | 
					
						
							| 
									
										
										
										
											2015-07-23 02:39:42 -07:00
										 |  |  |  |         output_file, | 
					
						
							| 
									
										
										
										
											2015-07-23 03:49:30 -07:00
										 |  |  |  |         log, | 
					
						
							| 
									
										
										
										
											2015-07-23 03:09:03 -07:00
										 |  |  |  |         pdfinfo, | 
					
						
							| 
									
										
										
										
											2015-07-23 03:49:30 -07:00
										 |  |  |  |         pdfinfo_lock): | 
					
						
							| 
									
										
										
										
											2015-07-30 04:06:31 -07:00
										 |  |  |  |     args_qpdf = [ | 
					
						
							|  |  |  |  |         'qpdf', input_file, output_file | 
					
						
							| 
									
										
										
										
											2015-07-23 01:16:05 -07:00
										 |  |  |  |     ] | 
					
						
							| 
									
										
										
										
											2015-08-11 02:19:46 -07:00
										 |  |  |  |     try: | 
					
						
							|  |  |  |  |         out = check_output(args_qpdf, stderr=STDOUT, universal_newlines=True) | 
					
						
							|  |  |  |  |     except CalledProcessError as e: | 
					
						
							|  |  |  |  |         if e.returncode == 2: | 
					
						
							|  |  |  |  |             print("{0}: not a valid PDF, and could not repair it.".format( | 
					
						
							|  |  |  |  |                     options.input_file)) | 
					
						
							|  |  |  |  |             print("Details:") | 
					
						
							|  |  |  |  |             print(e.output) | 
					
						
							|  |  |  |  |         else: | 
					
						
							|  |  |  |  |             print(e.output) | 
					
						
							|  |  |  |  |         sys.exit(ExitCode.input_file) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     log.debug(out) | 
					
						
							| 
									
										
										
										
											2015-07-23 01:16:05 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 03:49:30 -07:00
										 |  |  |  |     with pdfinfo_lock: | 
					
						
							|  |  |  |  |         pdfinfo.extend(pdf_get_all_pageinfo(output_file)) | 
					
						
							|  |  |  |  |         log.info(pdfinfo) | 
					
						
							| 
									
										
										
										
											2015-07-23 01:16:05 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  | def get_pageinfo(input_file, pdfinfo, pdfinfo_lock): | 
					
						
							|  |  |  |  |     pageno = int(os.path.basename(input_file)[0:6]) - 1 | 
					
						
							|  |  |  |  |     with pdfinfo_lock: | 
					
						
							|  |  |  |  |         pageinfo = pdfinfo[pageno].copy() | 
					
						
							|  |  |  |  |     return pageinfo | 
					
						
							| 
									
										
										
										
											2015-07-23 02:39:42 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | def is_ocr_required(pageinfo, log): | 
					
						
							|  |  |  |  |     page = pageinfo['pageno'] + 1 | 
					
						
							|  |  |  |  |     ocr_required = True | 
					
						
							|  |  |  |  |     if not pageinfo['images']: | 
					
						
							|  |  |  |  |         # If the page has no images, then it contains vector content or text | 
					
						
							|  |  |  |  |         # or both. It seems quite unlikely that one would find meaningful text | 
					
						
							|  |  |  |  |         # from rasterizing vector content. So skip the page. | 
					
						
							|  |  |  |  |         log.info( | 
					
						
							|  |  |  |  |             "Page {0} has no images - skipping OCR".format(page) | 
					
						
							|  |  |  |  |         ) | 
					
						
							|  |  |  |  |         ocr_required = False | 
					
						
							|  |  |  |  |     elif pageinfo['has_text']: | 
					
						
							|  |  |  |  |         s = "Page {0} already has text! – {1}" | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if not options.force_ocr and not options.skip_text: | 
					
						
							|  |  |  |  |             log.error(s.format(page, | 
					
						
							|  |  |  |  |                                "aborting (use --force-ocr to force OCR)")) | 
					
						
							| 
									
										
										
										
											2015-08-11 00:17:02 -07:00
										 |  |  |  |             sys.exit(ExitCode.already_done_ocr) | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  |         elif options.force_ocr: | 
					
						
							|  |  |  |  |             log.info(s.format(page, | 
					
						
							|  |  |  |  |                               "rasterizing text and running OCR anyway")) | 
					
						
							|  |  |  |  |             ocr_required = True | 
					
						
							|  |  |  |  |         elif options.skip_text: | 
					
						
							|  |  |  |  |             log.info(s.format(page, | 
					
						
							|  |  |  |  |                               "skipping all processing on this page")) | 
					
						
							|  |  |  |  |             ocr_required = False | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if ocr_required and options.skip_big: | 
					
						
							|  |  |  |  |         pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels'] | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |         if pixel_count > (options.skip_big * 1000000): | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  |             ocr_required = False | 
					
						
							|  |  |  |  |             log.info( | 
					
						
							|  |  |  |  |                 "Page {0} is very large; skipping due to -b".format(page)) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     return ocr_required | 
					
						
							| 
									
										
										
										
											2015-07-23 01:16:05 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 02:58:34 -07:00
										 |  |  |  | @split( | 
					
						
							| 
									
										
										
										
											2015-07-24 01:55:54 -07:00
										 |  |  |  |     repair_pdf, | 
					
						
							| 
									
										
										
										
											2015-07-25 02:58:34 -07:00
										 |  |  |  |     os.path.join(work_folder, '*.page.pdf'), | 
					
						
							|  |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-07-22 22:46:00 -07:00
										 |  |  |  | def split_pages( | 
					
						
							|  |  |  |  |         input_file, | 
					
						
							| 
									
										
										
										
											2015-07-23 03:09:03 -07:00
										 |  |  |  |         output_files, | 
					
						
							| 
									
										
										
										
											2015-07-23 03:49:30 -07:00
										 |  |  |  |         log, | 
					
						
							| 
									
										
										
										
											2015-07-23 03:09:03 -07:00
										 |  |  |  |         pdfinfo, | 
					
						
							| 
									
										
										
										
											2015-07-23 03:49:30 -07:00
										 |  |  |  |         pdfinfo_lock): | 
					
						
							| 
									
										
										
										
											2015-07-22 22:46:00 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     for oo in output_files: | 
					
						
							|  |  |  |  |         with suppress(FileNotFoundError): | 
					
						
							|  |  |  |  |             os.unlink(oo) | 
					
						
							| 
									
										
										
										
											2015-07-30 04:06:31 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     pages = check_output(['qpdf', '--show-npages', input_file], | 
					
						
							|  |  |  |  |                          universal_newlines=True, close_fds=True) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     for n in range(int(pages)): | 
					
						
							|  |  |  |  |         args_qpdf = [ | 
					
						
							|  |  |  |  |             'qpdf', input_file, | 
					
						
							|  |  |  |  |             '--pages', input_file, '{0}'.format(n + 1), '--', | 
					
						
							|  |  |  |  |             os.path.join(work_folder, '{0:06d}.page.pdf'.format(n + 1)) | 
					
						
							|  |  |  |  |         ] | 
					
						
							|  |  |  |  |         check_call(args_qpdf) | 
					
						
							| 
									
										
										
										
											2015-07-22 22:46:00 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  |     from glob import glob | 
					
						
							|  |  |  |  |     for filename in glob(os.path.join(work_folder, '*.page.pdf')): | 
					
						
							|  |  |  |  |         pageinfo = get_pageinfo(filename, pdfinfo, pdfinfo_lock) | 
					
						
							| 
									
										
										
										
											2015-07-23 03:49:30 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  |         alt_suffix = '.ocr.page.pdf' if is_ocr_required(pageinfo, log) \ | 
					
						
							|  |  |  |  |                      else '.skip.page.pdf' | 
					
						
							|  |  |  |  |         re_symlink( | 
					
						
							|  |  |  |  |             filename, | 
					
						
							|  |  |  |  |             os.path.join( | 
					
						
							|  |  |  |  |                 work_folder, | 
					
						
							|  |  |  |  |                 os.path.basename(filename)[0:6] + alt_suffix)) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | @transform( | 
					
						
							|  |  |  |  |     input=split_pages, | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  |     filter=suffix('.ocr.page.pdf'), | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |     output='.page.png', | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  |     output_dir=work_folder, | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | def rasterize_with_ghostscript( | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  |         input_file, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     device = 'png16m'  # 24-bit | 
					
						
							|  |  |  |  |     if all(image['comp'] == 1 for image in pageinfo['images']): | 
					
						
							|  |  |  |  |         if all(image['bpc'] == 1 for image in pageinfo['images']): | 
					
						
							|  |  |  |  |             device = 'pngmono' | 
					
						
							| 
									
										
										
										
											2015-08-28 04:47:57 -07:00
										 |  |  |  |         elif all(image['bpc'] > 1 and image['color'] == 'index' | 
					
						
							|  |  |  |  |                  for image in pageinfo['images']): | 
					
						
							|  |  |  |  |             device = 'png256' | 
					
						
							|  |  |  |  |         elif all(image['bpc'] > 1 and image['color'] == 'gray' | 
					
						
							|  |  |  |  |                  for image in pageinfo['images']): | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |             device = 'pnggray' | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-08-28 04:47:57 -07:00
										 |  |  |  |     log.debug("Rendering {0} with {1}".format( | 
					
						
							|  |  |  |  |             os.path.basename(input_file), device)) | 
					
						
							| 
									
										
										
										
											2015-07-26 12:56:10 -07:00
										 |  |  |  |     xres = max(pageinfo['xres'], options.oversample or 0) | 
					
						
							|  |  |  |  |     yres = max(pageinfo['yres'], options.oversample or 0) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |     ghostscript.rasterize_pdf(input_file, output_file, xres, yres, device, log) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | @transform( | 
					
						
							|  |  |  |  |     input=rasterize_with_ghostscript, | 
					
						
							|  |  |  |  |     filter=suffix(".page.png"), | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     output=".pp-deskew.png", | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  | def preprocess_deskew( | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  |         input_file, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     if not options.deskew: | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  |         re_symlink(input_file, output_file, log) | 
					
						
							|  |  |  |  |         return | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock) | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     dpi = int(pageinfo['xres']) | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     unpaper.deskew(input_file, output_file, dpi, log) | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  | @transform( | 
					
						
							|  |  |  |  |     input=preprocess_deskew, | 
					
						
							|  |  |  |  |     filter=suffix(".pp-deskew.png"), | 
					
						
							|  |  |  |  |     output=".pp-clean.png", | 
					
						
							|  |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							|  |  |  |  | def preprocess_clean( | 
					
						
							|  |  |  |  |         input_file, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     if not options.clean: | 
					
						
							|  |  |  |  |         re_symlink(input_file, output_file, log) | 
					
						
							|  |  |  |  |         return | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock) | 
					
						
							|  |  |  |  |     dpi = int(pageinfo['xres']) | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     unpaper.clean(input_file, output_file, dpi, log) | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | @active_if(options.pdf_renderer == 'hocr') | 
					
						
							| 
									
										
										
										
											2015-07-24 15:19:37 -07:00
										 |  |  |  | @transform( | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     input=preprocess_clean, | 
					
						
							|  |  |  |  |     filter=suffix(".pp-clean.png"), | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |     output=".hocr", | 
					
						
							|  |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | def ocr_tesseract_hocr( | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |         input_file, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock) | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-08-18 23:27:50 -07:00
										 |  |  |  |     badxml = os.path.splitext(output_file)[0] + '.badxml' | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |     args_tesseract = [ | 
					
						
							|  |  |  |  |         'tesseract', | 
					
						
							|  |  |  |  |         '-l', '+'.join(options.language), | 
					
						
							|  |  |  |  |         input_file, | 
					
						
							| 
									
										
										
										
											2015-08-18 23:27:50 -07:00
										 |  |  |  |         badxml, | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |         'hocr' | 
					
						
							|  |  |  |  |     ] + options.tesseract_config | 
					
						
							|  |  |  |  |     p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE, | 
					
						
							|  |  |  |  |               universal_newlines=True) | 
					
						
							|  |  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2015-07-27 04:23:37 -07:00
										 |  |  |  |         stdout, stderr = p.communicate(timeout=options.tesseract_timeout) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |     except TimeoutExpired: | 
					
						
							|  |  |  |  |         p.kill() | 
					
						
							|  |  |  |  |         stdout, stderr = p.communicate() | 
					
						
							|  |  |  |  |         # Generate a HOCR file with no recognized text if tesseract times out | 
					
						
							|  |  |  |  |         # Temporary workaround to hocrTransform not being able to function if | 
					
						
							|  |  |  |  |         # it does not have a valid hOCR file. | 
					
						
							|  |  |  |  |         with open(output_file, 'w', encoding="utf-8") as f: | 
					
						
							|  |  |  |  |             f.write(tesseract.HOCR_TEMPLATE.format( | 
					
						
							|  |  |  |  |                 pageinfo['width_pixels'], | 
					
						
							|  |  |  |  |                 pageinfo['height_pixels'])) | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         if stdout: | 
					
						
							|  |  |  |  |             log.info(stdout) | 
					
						
							|  |  |  |  |         if stderr: | 
					
						
							|  |  |  |  |             log.error(stderr) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if p.returncode != 0: | 
					
						
							|  |  |  |  |             raise CalledProcessError(p.returncode, args_tesseract) | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-08-18 23:27:50 -07:00
										 |  |  |  |         if os.path.exists(badxml + '.html'): | 
					
						
							|  |  |  |  |             # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html) | 
					
						
							|  |  |  |  |             shutil.move(badxml + '.html', badxml) | 
					
						
							|  |  |  |  |         elif os.path.exists(badxml + '.hocr'): | 
					
						
							|  |  |  |  |             # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr) | 
					
						
							|  |  |  |  |             shutil.move(badxml + '.hocr', badxml) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |         # Tesseract 3.03 inserts source filename into hocr file without | 
					
						
							|  |  |  |  |         # escaping it, creating invalid XML and breaking the parser. | 
					
						
							|  |  |  |  |         # As a workaround, rewrite the hocr file, replacing the filename | 
					
						
							| 
									
										
										
										
											2015-08-18 23:27:50 -07:00
										 |  |  |  |         # with a space.  Don't know if Tesseract 3.02 does the same. | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |         regex_nested_single_quotes = re.compile( | 
					
						
							|  |  |  |  |             r"""title='image "([^"]*)";""") | 
					
						
							| 
									
										
										
										
											2015-08-18 23:27:50 -07:00
										 |  |  |  |         with open(badxml, mode='r', encoding='utf-8') as f_in, \ | 
					
						
							|  |  |  |  |                 open(output_file, mode='w', encoding='utf-8') as f_out: | 
					
						
							|  |  |  |  |             for line in f_in: | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |                 line = regex_nested_single_quotes.sub( | 
					
						
							|  |  |  |  |                     r"""title='image " ";""", line) | 
					
						
							| 
									
										
										
										
											2015-08-18 23:27:50 -07:00
										 |  |  |  |                 f_out.write(line) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | @active_if(options.pdf_renderer == 'hocr') | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | @collate( | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  |     input=[rasterize_with_ghostscript, preprocess_deskew, preprocess_clean], | 
					
						
							|  |  |  |  |     filter=regex(r".*/(\d{6})(?:\.page|\.pp-deskew|\.pp-clean)\.png"), | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  |     output=os.path.join(work_folder, r'\1.image'), | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  | def select_image_for_pdf( | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |         infiles, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     if options.clean_final: | 
					
						
							|  |  |  |  |         image_suffix = '.pp-clean.png' | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  |     elif options.deskew: | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |         image_suffix = '.pp-deskew.png' | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  |     else: | 
					
						
							|  |  |  |  |         image_suffix = '.page.png' | 
					
						
							| 
									
										
										
										
											2015-07-25 00:22:56 -07:00
										 |  |  |  |     image = next(ii for ii in infiles if ii.endswith(image_suffix)) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  |     pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock) | 
					
						
							|  |  |  |  |     if all(image['enc'] == 'jpeg' for image in pageinfo['images']): | 
					
						
							|  |  |  |  |         # If all images were JPEGs originally, produce a JPEG as output | 
					
						
							|  |  |  |  |         Image.open(image).save(output_file, format='JPEG') | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         re_symlink(image, output_file) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | @active_if(options.pdf_renderer == 'hocr') | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  | @collate( | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  |     input=[select_image_for_pdf, ocr_tesseract_hocr], | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  |     filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"), | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  |     output=os.path.join(work_folder, r'\1.rendered.pdf'), | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | def render_hocr_page( | 
					
						
							| 
									
										
										
										
											2015-07-25 00:54:00 -07:00
										 |  |  |  |         infiles, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							|  |  |  |  |     hocr = next(ii for ii in infiles if ii.endswith('.hocr')) | 
					
						
							|  |  |  |  |     image = next(ii for ii in infiles if ii.endswith('.image')) | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  |     pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock) | 
					
						
							| 
									
										
										
										
											2015-07-27 17:18:02 -07:00
										 |  |  |  |     dpi = round(max(pageinfo['xres'], pageinfo['yres'], options.oversample)) | 
					
						
							| 
									
										
										
										
											2015-07-23 23:09:29 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     hocrtransform = HocrTransform(hocr, dpi) | 
					
						
							|  |  |  |  |     hocrtransform.to_pdf(output_file, imageFileName=image, | 
					
						
							|  |  |  |  |                          showBoundingboxes=False, invisibleText=True) | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | @active_if(options.pdf_renderer == 'hocr') | 
					
						
							| 
									
										
										
										
											2015-07-25 14:14:02 -07:00
										 |  |  |  | @active_if(options.debug_rendering) | 
					
						
							|  |  |  |  | @collate( | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  |     input=[select_image_for_pdf, ocr_tesseract_hocr], | 
					
						
							| 
									
										
										
										
											2015-07-25 14:14:02 -07:00
										 |  |  |  |     filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"), | 
					
						
							|  |  |  |  |     output=os.path.join(work_folder, r'\1.debug.pdf'), | 
					
						
							|  |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | def render_hocr_debug_page( | 
					
						
							| 
									
										
										
										
											2015-07-25 14:14:02 -07:00
										 |  |  |  |         infiles, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							|  |  |  |  |     hocr = next(ii for ii in infiles if ii.endswith('.hocr')) | 
					
						
							|  |  |  |  |     image = next(ii for ii in infiles if ii.endswith('.image')) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock) | 
					
						
							| 
									
										
										
										
											2015-07-27 17:18:02 -07:00
										 |  |  |  |     dpi = round(max(pageinfo['xres'], pageinfo['yres'], options.oversample)) | 
					
						
							| 
									
										
										
										
											2015-07-25 14:14:02 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     hocrtransform = HocrTransform(hocr, dpi) | 
					
						
							|  |  |  |  |     hocrtransform.to_pdf(output_file, imageFileName=None, | 
					
						
							|  |  |  |  |                          showBoundingboxes=True, invisibleText=False) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | @active_if(options.pdf_renderer == 'tesseract') | 
					
						
							| 
									
										
										
										
											2015-07-28 01:47:30 -07:00
										 |  |  |  | @collate( | 
					
						
							|  |  |  |  |     input=[preprocess_clean, split_pages], | 
					
						
							|  |  |  |  |     filter=regex(r".*/(\d{6})(?:\.pp-clean\.png|\.page\.pdf)"), | 
					
						
							|  |  |  |  |     output=os.path.join(work_folder, r'\1.rendered.pdf'), | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							|  |  |  |  | def tesseract_ocr_and_render_pdf( | 
					
						
							| 
									
										
										
										
											2015-07-28 01:47:30 -07:00
										 |  |  |  |         input_files, | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 03:02:35 -07:00
										 |  |  |  |     input_image = next((ii for ii in input_files if ii.endswith('.png')), '') | 
					
						
							|  |  |  |  |     input_pdf = next((ii for ii in input_files if ii.endswith('.pdf'))) | 
					
						
							|  |  |  |  |     if not input_image: | 
					
						
							|  |  |  |  |         # Skipping this page | 
					
						
							|  |  |  |  |         re_symlink(input_pdf, output_file) | 
					
						
							|  |  |  |  |         return | 
					
						
							| 
									
										
										
										
											2015-07-28 01:47:30 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  |     args_tesseract = [ | 
					
						
							|  |  |  |  |         'tesseract', | 
					
						
							|  |  |  |  |         '-l', '+'.join(options.language), | 
					
						
							| 
									
										
										
										
											2015-07-28 01:47:30 -07:00
										 |  |  |  |         input_image, | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  |         os.path.splitext(output_file)[0],  # Tesseract appends suffix | 
					
						
							|  |  |  |  |         'pdf' | 
					
						
							|  |  |  |  |     ] + options.tesseract_config | 
					
						
							|  |  |  |  |     p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE, | 
					
						
							|  |  |  |  |               universal_newlines=True) | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 01:47:30 -07:00
										 |  |  |  |     try: | 
					
						
							|  |  |  |  |         stdout, stderr = p.communicate(timeout=options.tesseract_timeout) | 
					
						
							|  |  |  |  |         if stdout: | 
					
						
							|  |  |  |  |             log.info(stdout) | 
					
						
							|  |  |  |  |         if stderr: | 
					
						
							|  |  |  |  |             log.error(stderr) | 
					
						
							| 
									
										
										
										
											2015-07-31 00:06:58 -07:00
										 |  |  |  |     except TimeoutExpired: | 
					
						
							| 
									
										
										
										
											2015-07-28 01:47:30 -07:00
										 |  |  |  |         p.kill() | 
					
						
							|  |  |  |  |         log.info("Tesseract - page timed out") | 
					
						
							|  |  |  |  |         re_symlink(input_pdf, output_file) | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | @transform( | 
					
						
							| 
									
										
										
										
											2015-07-24 01:55:54 -07:00
										 |  |  |  |     input=repair_pdf, | 
					
						
							|  |  |  |  |     filter=suffix('.repaired.pdf'), | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  |     output='.pdfa_def.ps', | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  |     output_dir=work_folder, | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  |     extras=[_log]) | 
					
						
							|  |  |  |  | def generate_postscript_stub( | 
					
						
							|  |  |  |  |         input_file, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log): | 
					
						
							| 
									
										
										
										
											2015-07-25 15:31:02 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 18:05:25 -07:00
										 |  |  |  |     pdf = pypdf.PdfFileReader(input_file) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def from_document_info(key): | 
					
						
							| 
									
										
										
										
											2015-08-24 01:23:30 -07:00
										 |  |  |  |         # pdf.documentInfo.get() DOES NOT behave as expected for a dict-like | 
					
						
							|  |  |  |  |         # object, so call with precautions.  TypeError may occur if the PDF | 
					
						
							|  |  |  |  |         # is missing the optional document info section. | 
					
						
							| 
									
										
										
										
											2015-07-25 18:05:25 -07:00
										 |  |  |  |         try: | 
					
						
							|  |  |  |  |             s = pdf.documentInfo[key] | 
					
						
							|  |  |  |  |             return str(s) | 
					
						
							| 
									
										
										
										
											2015-08-24 01:23:30 -07:00
										 |  |  |  |         except (KeyError, TypeError): | 
					
						
							| 
									
										
										
										
											2015-07-25 18:05:25 -07:00
										 |  |  |  |             return '' | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     pdfmark = { | 
					
						
							|  |  |  |  |         'title': from_document_info('/Title'), | 
					
						
							|  |  |  |  |         'author': from_document_info('/Author'), | 
					
						
							|  |  |  |  |         'keywords': from_document_info('/Keywords'), | 
					
						
							|  |  |  |  |         'subject': from_document_info('/Subject'), | 
					
						
							|  |  |  |  |     } | 
					
						
							| 
									
										
										
										
											2015-07-25 18:12:25 -07:00
										 |  |  |  |     if options.title: | 
					
						
							|  |  |  |  |         pdfmark['title'] = options.title | 
					
						
							|  |  |  |  |     if options.author: | 
					
						
							|  |  |  |  |         pdfmark['author'] = options.author | 
					
						
							|  |  |  |  |     if options.keywords: | 
					
						
							|  |  |  |  |         pdfmark['keywords'] = options.keywords | 
					
						
							|  |  |  |  |     if options.subject: | 
					
						
							|  |  |  |  |         pdfmark['subject'] = options.subject | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 15:31:02 -07:00
										 |  |  |  |     generate_pdfa_def(output_file, pdfmark) | 
					
						
							| 
									
										
										
										
											2015-07-22 22:46:00 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 04:25:19 -07:00
										 |  |  |  | @transform( | 
					
						
							|  |  |  |  |     input=split_pages, | 
					
						
							|  |  |  |  |     filter=suffix('.skip.page.pdf'), | 
					
						
							|  |  |  |  |     output='.done.pdf', | 
					
						
							|  |  |  |  |     output_dir=work_folder, | 
					
						
							|  |  |  |  |     extras=[_log]) | 
					
						
							|  |  |  |  | def skip_page( | 
					
						
							|  |  |  |  |         input_file, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log): | 
					
						
							|  |  |  |  |     re_symlink(input_file, output_file, log) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  | @merge( | 
					
						
							| 
									
										
										
										
											2015-07-27 04:20:49 -07:00
										 |  |  |  |     input=[render_hocr_page, render_hocr_debug_page, skip_page, | 
					
						
							|  |  |  |  |            tesseract_ocr_and_render_pdf, generate_postscript_stub], | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  |     output=os.path.join(work_folder, 'merged.pdf'), | 
					
						
							| 
									
										
										
										
											2015-07-23 04:57:31 -07:00
										 |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							|  |  |  |  | def merge_pages( | 
					
						
							|  |  |  |  |         input_files, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 14:14:02 -07:00
										 |  |  |  |     def input_file_order(s): | 
					
						
							| 
									
										
										
										
											2015-07-25 18:05:25 -07:00
										 |  |  |  |         '''Sort order: All rendered pages followed
 | 
					
						
							|  |  |  |  |         by their debug page, if any, followed by Postscript stub. | 
					
						
							|  |  |  |  |         Ghostscript documentation has the Postscript stub at the | 
					
						
							|  |  |  |  |         beginning, but it works at the end and also gets document info | 
					
						
							|  |  |  |  |         right that way.'''
 | 
					
						
							| 
									
										
										
										
											2015-07-25 14:14:02 -07:00
										 |  |  |  |         if s.endswith('.ps'): | 
					
						
							| 
									
										
										
										
											2015-07-25 18:05:25 -07:00
										 |  |  |  |             return 99999999 | 
					
						
							| 
									
										
										
										
											2015-07-25 14:14:02 -07:00
										 |  |  |  |         key = int(os.path.basename(s)[0:6]) * 10 | 
					
						
							|  |  |  |  |         if 'debug' in os.path.basename(s): | 
					
						
							|  |  |  |  |             key += 1 | 
					
						
							|  |  |  |  |         return key | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     pdf_pages = sorted(input_files, key=input_file_order) | 
					
						
							|  |  |  |  |     log.info(pdf_pages) | 
					
						
							| 
									
										
										
										
											2015-07-30 23:20:21 -07:00
										 |  |  |  |     ghostscript.generate_pdfa(pdf_pages, output_file, options.jobs or 1) | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-23 14:48:46 -07:00
										 |  |  |  | @transform( | 
					
						
							|  |  |  |  |     input=merge_pages, | 
					
						
							|  |  |  |  |     filter=formatter(), | 
					
						
							|  |  |  |  |     output=options.output_file, | 
					
						
							|  |  |  |  |     extras=[_log, _pdfinfo, _pdfinfo_lock]) | 
					
						
							| 
									
										
										
										
											2015-08-10 16:05:00 -07:00
										 |  |  |  | def copy_final( | 
					
						
							| 
									
										
										
										
											2015-07-23 14:48:46 -07:00
										 |  |  |  |         input_file, | 
					
						
							|  |  |  |  |         output_file, | 
					
						
							|  |  |  |  |         log, | 
					
						
							|  |  |  |  |         pdfinfo, | 
					
						
							|  |  |  |  |         pdfinfo_lock): | 
					
						
							| 
									
										
										
										
											2015-08-10 16:05:00 -07:00
										 |  |  |  |     shutil.copy(input_file, output_file) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def validate_pdfa( | 
					
						
							|  |  |  |  |         input_file, | 
					
						
							|  |  |  |  |         log): | 
					
						
							| 
									
										
										
										
											2015-07-23 14:48:46 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-08-11 15:31:32 -07:00
										 |  |  |  |     args_qpdf = [ | 
					
						
							|  |  |  |  |         'qpdf', | 
					
						
							|  |  |  |  |         '--check', | 
					
						
							| 
									
										
										
										
											2015-07-23 14:48:46 -07:00
										 |  |  |  |         input_file | 
					
						
							|  |  |  |  |     ] | 
					
						
							| 
									
										
										
										
											2015-08-11 15:31:32 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     try: | 
					
						
							|  |  |  |  |         check_output(args_qpdf, stderr=STDOUT, universal_newlines=True) | 
					
						
							|  |  |  |  |     except CalledProcessError as e: | 
					
						
							|  |  |  |  |         if e.returncode == 2: | 
					
						
							|  |  |  |  |             print("{0}: not a valid PDF, and could not repair it.".format( | 
					
						
							|  |  |  |  |                     options.input_file)) | 
					
						
							|  |  |  |  |             print("Details:") | 
					
						
							|  |  |  |  |             print(e.output) | 
					
						
							|  |  |  |  |         elif e.returncode == 3: | 
					
						
							|  |  |  |  |             log.info("qpdf --check returned warnings:") | 
					
						
							|  |  |  |  |             log.info(e.output) | 
					
						
							|  |  |  |  |         else: | 
					
						
							|  |  |  |  |             print(e.output) | 
					
						
							|  |  |  |  |         return False | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     return True | 
					
						
							| 
									
										
										
										
											2015-07-23 14:48:46 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | # @active_if(ocr_required and options.exact_image) | 
					
						
							|  |  |  |  | # @merge([render_hocr_blank_page, extract_single_page], | 
					
						
							| 
									
										
										
										
											2015-07-25 01:45:26 -07:00
										 |  |  |  | #        os.path.join(work_folder, "%04i.merged.pdf") % pageno) | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | # def merge_hocr_with_original_page(infiles, output_file): | 
					
						
							|  |  |  |  | #     with open(infiles[0], 'rb') as hocr_input, \ | 
					
						
							|  |  |  |  | #             open(infiles[1], 'rb') as page_input, \ | 
					
						
							|  |  |  |  | #             open(output_file, 'wb') as output: | 
					
						
							|  |  |  |  | #         hocr_reader = pypdf.PdfFileReader(hocr_input) | 
					
						
							|  |  |  |  | #         page_reader = pypdf.PdfFileReader(page_input) | 
					
						
							|  |  |  |  | #         writer = pypdf.PdfFileWriter() | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | #         the_page = hocr_reader.getPage(0) | 
					
						
							|  |  |  |  | #         the_page.mergePage(page_reader.getPage(0)) | 
					
						
							|  |  |  |  | #         writer.addPage(the_page) | 
					
						
							|  |  |  |  | #         writer.write(output) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 01:10:14 -07:00
										 |  |  |  | def available_cpu_count(): | 
					
						
							|  |  |  |  |     try: | 
					
						
							|  |  |  |  |         return multiprocessing.cpu_count() | 
					
						
							|  |  |  |  |     except NotImplementedError: | 
					
						
							|  |  |  |  |         pass | 
					
						
							| 
									
										
										
										
											2015-07-22 22:51:38 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-25 01:10:14 -07:00
										 |  |  |  |     try: | 
					
						
							|  |  |  |  |         import psutil | 
					
						
							|  |  |  |  |         return psutil.cpu_count() | 
					
						
							|  |  |  |  |     except (ImportError, AttributeError): | 
					
						
							|  |  |  |  |         pass | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |     complain( | 
					
						
							| 
									
										
										
										
											2015-07-25 01:10:14 -07:00
										 |  |  |  |         "Could not get CPU count.  Assuming one (1) CPU." | 
					
						
							| 
									
										
										
										
											2015-07-28 02:25:50 -07:00
										 |  |  |  |         "Use -j N to set manually.") | 
					
						
							| 
									
										
										
										
											2015-07-25 01:10:14 -07:00
										 |  |  |  |     return 1 | 
					
						
							| 
									
										
										
										
											2014-09-26 04:19:41 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-26 01:52:08 -07:00
										 |  |  |  | def run_pipeline(): | 
					
						
							| 
									
										
										
										
											2015-07-30 23:20:21 -07:00
										 |  |  |  |     if not options.jobs or options.jobs == 1: | 
					
						
							|  |  |  |  |         options.jobs = available_cpu_count() | 
					
						
							| 
									
										
										
										
											2015-08-10 16:05:00 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-08-11 02:19:46 -07:00
										 |  |  |  |     try: | 
					
						
							|  |  |  |  |         cmdline.run(options) | 
					
						
							|  |  |  |  |     except ruffus_exceptions.RethrownJobError as e: | 
					
						
							|  |  |  |  |         if options.verbose: | 
					
						
							|  |  |  |  |             print(e) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         # Yuck. Hunt through the ruffus exception to find out what the | 
					
						
							|  |  |  |  |         # return code is supposed to be. | 
					
						
							|  |  |  |  |         for exc in e.args: | 
					
						
							|  |  |  |  |             task_name, job_name, exc_name, exc_value, exc_stack = exc | 
					
						
							|  |  |  |  |             if exc_name == 'builtins.SystemExit': | 
					
						
							|  |  |  |  |                 return eval( | 
					
						
							|  |  |  |  |                     exc_value, | 
					
						
							|  |  |  |  |                     {'ExitCode': ExitCode}, {'exc_value': exc_value}) | 
					
						
							| 
									
										
										
										
											2015-08-28 04:48:29 -07:00
										 |  |  |  |         return ExitCode.other_error | 
					
						
							| 
									
										
										
										
											2015-07-26 01:52:08 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-08-11 15:31:32 -07:00
										 |  |  |  |     if not validate_pdfa(options.output_file, _log): | 
					
						
							| 
									
										
										
										
											2015-08-10 16:05:00 -07:00
										 |  |  |  |         _log.warning('Output file: The generated PDF/A file is INVALID') | 
					
						
							| 
									
										
										
										
											2015-08-11 15:31:32 -07:00
										 |  |  |  |         return ExitCode.invalid_output_pdfa | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     return ExitCode.ok | 
					
						
							| 
									
										
										
										
											2015-08-10 16:05:00 -07:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-26 01:52:08 -07:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | if __name__ == '__main__': | 
					
						
							| 
									
										
										
										
											2015-08-10 16:05:00 -07:00
										 |  |  |  |     sys.exit(run_pipeline()) |