26 Commits

Author SHA1 Message Date
Jim Barlow
9f90b5cb0a Modularize unpaper; get -d and -c working again 2015-07-25 00:22:56 -07:00
Jim Barlow
5adff94545 Remove more dead/old code 2015-07-24 15:41:24 -07:00
Jim Barlow
aa2baabfa9 Implement deskew and clean using unpaper 2015-07-24 15:19:37 -07:00
Jim Barlow
6451017962 Implement oversample 2015-07-24 01:56:44 -07:00
Jim Barlow
0f857a6a34 Put .rendered.pdf files into temp folder 2015-07-24 01:56:19 -07:00
Jim Barlow
7638a88a6a Change 'clean' to 'repair' for clarity since 'clean' is what unpaper does 2015-07-24 01:55:54 -07:00
Jim Barlow
bed12d2021 Remove 'pdftoppm' renderer
Ghostscript is more reliable than Poppler's pdftoppm renderer. gs is
also a hard dependency, as the only open source tool that can produce
a PDF/A file, while Poppler could be removed.  pdftoppm has awkward
syntax with some special handling needed for different versions.  I have
found isolated rendering bugs with pdftoppm as well.

With that, I'm removing support for multiple rasterizers.

A minor advantage of pdftoppm is that its code produced JPEGs where
possible, but this can be achieved with gs.
2015-07-24 01:35:33 -07:00
Jim Barlow
587569fcb6 Tidy up 2015-07-24 01:27:01 -07:00
Jim Barlow
8c0dc9a06d Platform independent search for iccprofiles for PDF/A 2015-07-24 01:18:46 -07:00
Jim Barlow
289e4025ad First successful PDF/A produced by new pipeline 2015-07-23 23:28:32 -07:00
Jim Barlow
5476eafe4c Rasterize PDF pages and generate .hocr files 2015-07-23 23:09:29 -07:00
Jim Barlow
df32f283cd Language checking 2015-07-23 18:38:59 -07:00
Jim Barlow
68ecaac9cc Add tesseract version check 2015-07-23 17:06:00 -07:00
Jim Barlow
cffd4623ca Add PDF/A validation 2015-07-23 17:05:34 -07:00
Jim Barlow
6dc2782e80 Can now generate PDF/A files, multipage and single page 2015-07-23 04:57:31 -07:00
Jim Barlow
5df187c086 Wrap a proxy around pdfinfo block so it can be passed around processes 2015-07-23 03:49:30 -07:00
Jim Barlow
7fd172e41e Get rid of chdir, replace deprecated @split with @subdivide 2015-07-23 03:09:03 -07:00
Jim Barlow
619528a1b5 Try a method for passing along the pdfinfo struct 2015-07-23 02:39:42 -07:00
Jim Barlow
596d468c14 Reinstate WrapperLogger with more multiprocessing fixes 2015-07-23 02:26:09 -07:00
Jim Barlow
eddbf1060a diff --git a/src/ocrmypdf.py b/src/ocrmypdf.py
index 68d1591..95afa8f 100755
--- a/src/ocrmypdf.py
+++ b/src/ocrmypdf.py
@@ -24,6 +24,7 @@ import ruffus.cmdline as cmdline
 from .hocrtransform import HocrTransform

 import warnings
+import multiprocessing

 warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)

@@ -96,7 +97,7 @@ debugging.add_argument(
     '-k', '--keep-temporary-files', action='store_true',
     help="keep temporary files (helpful for debugging)")
 debugging.add_argument(
-    '-g' ,'--debug-rendering', action='store_true',
+    '-g', '--debug-rendering', action='store_true',
     help="render each page twice with debug information on second page")

@@ -106,51 +107,19 @@ if not options.temp_folder:
     options.temp_folder = 'tmp'

-_logger, _logger_mutex = cmdline.setup_logging(__name__, options.log_file,
-                                               options.verbose)
+log, log_mutex = cmdline.setup_logging(__name__, options.log_file,
+                                       options.verbose)

-class WrappedLogger:
-
-    def __init__(self, my_logger, my_mutex):
-        self.logger = my_logger
-        self.mutex = my_mutex
-
-    def log(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.log(*args, **kwargs)
-
-    def debug(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.debug(*args, **kwargs)
-
-    def info(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.info(*args, **kwargs)
-
-    def warning(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.warning(*args, **kwargs)
-
-    def error(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.error(*args, **kwargs)
-
-    def critical(self, *args, **kwargs):
-        with self.mutex:
-            self.logger.critical(*args, **kwargs)
-
-log = WrappedLogger(_logger, _logger_mutex)
-
-
-def re_symlink(input_file, soft_link_name, log=log):
+def re_symlink(input_file, soft_link_name, log, mutex):
     """
     Helper function: relinks soft symbolic link if necessary
     """
     if input_file == soft_link_name:
-        log.debug("Warning: No symbolic link made. You are using " +
-                     "the original data directory as the working directory.")
+        with mutex:
+            log.debug("Warning: No symbolic link made. You are using " +
+                      "the original data directory as the working directory.")
         return

@@ -161,12 +130,14 @@ def re_symlink(input_file, soft_link_name, log=log):
         try:
             os.unlink(soft_link_name)
         except:
-            log.debug("Can't unlink %s" % (soft_link_name))
+            with mutex:
+                log.debug("Can't unlink %s" % (soft_link_name))

     if not os.path.exists(input_file):
         raise Exception("trying to create a broken symlink to %s" % input_file)

-    log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))
+    with mutex:
+        log.debug("os.symlink(%s, %s)" % (input_file, soft_link_name))

     os.symlink(
2015-07-23 02:22:12 -07:00
Jim Barlow
33731a6864 Move pageinfo code out of the pipeline 2015-07-23 02:17:13 -07:00
Jim Barlow
0c36cd2e24 Fix errors related to use of the working directory
Mainly work around the lack of @split(...output_dir) in ruffus
2015-07-23 01:16:05 -07:00
Jim Barlow
5cef1be26d New pipeline runs, splits pages 2015-07-22 22:58:13 -07:00
Jim Barlow
e89f482c3d Fixes from early testing of new pipeline 2015-07-22 22:51:38 -07:00
Jim Barlow
fe3e40305d Learn to split PDF into pages 2015-07-22 22:46:00 -07:00
Jim Barlow
a92b5ceb6b Begin unifying main script and page script 2015-07-22 22:30:00 -07:00