mirror of
				https://github.com/ocrmypdf/OCRmyPDF.git
				synced 2025-10-31 18:00:01 +00:00 
			
		
		
		
	Modularize unpaper; get -d and -c working again
This commit is contained in:
		
							parent
							
								
									5adff94545
								
							
						
					
					
						commit
						9f90b5cb0a
					
				
							
								
								
									
										101
									
								
								src/ocrmypdf.py
									
									
									
									
									
								
							
							
						
						
									
										101
									
								
								src/ocrmypdf.py
									
									
									
									
									
								
							| @ -153,6 +153,20 @@ if not set(options.language).issubset(tesseract.LANGUAGES): | ||||
|     sys.exit(EXIT_BAD_ARGS) | ||||
| 
 | ||||
| 
 | ||||
| # ---------- | ||||
| # Arguments | ||||
| 
 | ||||
| 
 | ||||
# The unpaper wrapper is only needed when deskewing or cleaning is requested.
# Importing it probes the 'unpaper' executable at module load time, and that
# probe re-raises FileNotFoundError (not ImportError) when the program is not
# installed, so both exception types have to be treated as "unavailable".
if any((options.deskew, options.clean, options.clean_final)):
    try:
        from . import unpaper
    except (ImportError, FileNotFoundError):
        print("Install the 'unpaper' program to use the specified options",
              file=sys.stderr)
        sys.exit(EXIT_BAD_ARGS)
else:
    unpaper = None
| 
 | ||||
| # ---------- | ||||
| # Logging | ||||
| 
 | ||||
| @ -384,69 +398,50 @@ def rasterize_with_ghostscript( | ||||
| @transform( | ||||
|     input=rasterize_with_ghostscript, | ||||
|     filter=suffix(".page.png"), | ||||
|     output=".pp.png", | ||||
|     output=".pp-deskew.png", | ||||
|     extras=[_log, _pdfinfo, _pdfinfo_lock]) | ||||
| def preprocess( | ||||
| def preprocess_deskew( | ||||
|         input_file, | ||||
|         output_file, | ||||
|         log, | ||||
|         pdfinfo, | ||||
|         pdfinfo_lock): | ||||
| 
 | ||||
|     if not options.deskew and not options.clean: | ||||
|     if not options.deskew: | ||||
|         re_symlink(input_file, output_file, log) | ||||
|         return | ||||
| 
 | ||||
|     pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock) | ||||
|     dpi = int(pageinfo['xres']) | ||||
| 
 | ||||
|     # unpaper documentation: | ||||
|     # https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md | ||||
|     args_unpaper = [ | ||||
|         'unpaper', | ||||
|         '-v', | ||||
|         '--dpi', str(int(pageinfo['xres'])), | ||||
|         '--mask-scan-size', '100',  # don't blank out narrow columns | ||||
|         '--no-border-align',  # don't align visible content to borders | ||||
|         '--no-mask-center',   # don't center visible content within page | ||||
|         '--no-grayfilter',    # don't remove light gray areas | ||||
|         '--no-blackfilter',   # don't remove solid black areas | ||||
|     ] | ||||
| 
 | ||||
|     if not options.clean: | ||||
|         args_unpaper.extend([ | ||||
|             '--no-noisefilter', | ||||
|             '--no-blurfilter']) | ||||
|     if not options.deskew: | ||||
|         args_unpaper.extend([ | ||||
|             '--no-deskew']) | ||||
| 
 | ||||
|     SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'} | ||||
|     suffix = '' | ||||
| 
 | ||||
|     im = Image.open(input_file) | ||||
|     suffix = SUFFIXES[im.mode] | ||||
|     with NamedTemporaryFile(suffix=suffix) as input_pnm, \ | ||||
|             NamedTemporaryFile(suffix=suffix, mode="r+b") as output_pnm: | ||||
|         im.save(input_pnm, format='PPM') | ||||
|         im.close() | ||||
| 
 | ||||
|         os.unlink(output_pnm.name) | ||||
| 
 | ||||
|         args_unpaper.extend([input_pnm.name, output_pnm.name]) | ||||
|         p_unpaper = Popen( | ||||
|             args_unpaper, close_fds=True, | ||||
|             universal_newlines=True, stdout=PIPE, stderr=PIPE | ||||
|             ) | ||||
|         out, err = p_unpaper.communicate() | ||||
|         log.debug(out) | ||||
|         log.debug(err) | ||||
| 
 | ||||
|         Image.open(output_pnm.name).save(output_file) | ||||
|     unpaper.deskew(input_file, output_file, dpi, log) | ||||
| 
 | ||||
| 
 | ||||
| @transform( | ||||
|     input=preprocess, | ||||
|     filter=suffix(".pp.png"), | ||||
|     input=preprocess_deskew, | ||||
|     filter=suffix(".pp-deskew.png"), | ||||
|     output=".pp-clean.png", | ||||
|     extras=[_log, _pdfinfo, _pdfinfo_lock]) | ||||
| def preprocess_clean( | ||||
|         input_file, | ||||
|         output_file, | ||||
|         log, | ||||
|         pdfinfo, | ||||
|         pdfinfo_lock): | ||||
| 
 | ||||
|     if not options.clean: | ||||
|         re_symlink(input_file, output_file, log) | ||||
|         return | ||||
| 
 | ||||
|     pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock) | ||||
|     dpi = int(pageinfo['xres']) | ||||
| 
 | ||||
|     unpaper.clean(input_file, output_file, dpi, log) | ||||
| 
 | ||||
| 
 | ||||
| @transform( | ||||
|     input=preprocess_clean, | ||||
|     filter=suffix(".pp-clean.png"), | ||||
|     output=".hocr", | ||||
|     extras=[_log, _pdfinfo, _pdfinfo_lock]) | ||||
| def ocr_tesseract( | ||||
| @ -509,8 +504,8 @@ def ocr_tesseract( | ||||
| 
 | ||||
| 
 | ||||
| @collate( | ||||
|     input=[rasterize_with_ghostscript, ocr_tesseract], | ||||
|     filter=regex(r".*/(\d{6})(?:\.page\.png|\.hocr)"), | ||||
|     input=[preprocess_deskew, preprocess_clean, ocr_tesseract], | ||||
|     filter=regex(r".*/(\d{6})(?:\.pp-deskew\.png|\.pp-clean\.png|\.hocr)"), | ||||
|     output=os.path.join(options.temp_folder, r'\1.rendered.pdf'), | ||||
|     extras=[_log, _pdfinfo, _pdfinfo_lock]) | ||||
| def render_page( | ||||
| @ -520,7 +515,11 @@ def render_page( | ||||
|         pdfinfo, | ||||
|         pdfinfo_lock): | ||||
|     hocr = next(ii for ii in infiles if ii.endswith('.hocr')) | ||||
|     image = next(ii for ii in infiles if ii.endswith('.page.png')) | ||||
|     if options.clean_final: | ||||
|         image_suffix = '.pp-clean.png' | ||||
|     else: | ||||
|         image_suffix = '.pp-deskew.png' | ||||
|     image = next(ii for ii in infiles if ii.endswith(image_suffix)) | ||||
| 
 | ||||
|     pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock) | ||||
|     dpi = round(max(pageinfo['xres'], pageinfo['yres'])) | ||||
|  | ||||
							
								
								
									
										87
									
								
								src/unpaper.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										87
									
								
								src/unpaper.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,87 @@ | ||||
| #!/usr/bin/env python3 | ||||
| # unpaper documentation: | ||||
| # https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md | ||||
| 
 | ||||
| from subprocess import Popen, PIPE | ||||
| from tempfile import NamedTemporaryFile | ||||
| import sys | ||||
| import os | ||||
| 
 | ||||
| 
 | ||||
def _version():
    """Return the version string printed by ``unpaper --version``.

    Returns:
        The program's standard output with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: if the 'unpaper' executable cannot be started.
        subprocess.TimeoutExpired: if unpaper does not finish within
            5 seconds.
    """
    from subprocess import TimeoutExpired

    args_unpaper = [
        'unpaper',
        '--version'
    ]
    p_unpaper = Popen(args_unpaper, close_fds=True, universal_newlines=True,
                      stdout=PIPE, stderr=PIPE)
    try:
        version, _ = p_unpaper.communicate(timeout=5)
    except TimeoutExpired:
        # communicate() does not stop the child when the timeout expires;
        # kill it and collect its exit status so no stray process or open
        # pipe is left behind, then let the caller see the timeout.
        p_unpaper.kill()
        p_unpaper.communicate()
        raise

    return version.strip()


# Probe the executable once at import time so that a missing 'unpaper' is
# reported as soon as this module is loaded rather than on the first page.
try:
    VERSION = _version()
except FileNotFoundError:
    print("Could not find 'unpaper' executable", file=sys.stderr)
    raise
| 
 | ||||
| try: | ||||
|     from PIL import Image | ||||
| except ImportError: | ||||
|     print("Could not find Python3 imaging library", file=sys.stderr) | ||||
|     raise | ||||
| 
 | ||||
| 
 | ||||
def run(input_file, output_file, dpi, log, mode_args):
    """Run unpaper on an image and save the processed result.

    The input image is converted to a PNM file, handed to unpaper together
    with ``mode_args``, and unpaper's output is re-saved as ``output_file``.

    Args:
        input_file: path of the image to process (anything PIL can open).
        output_file: path the processed image is saved to; PIL chooses the
            format from the file name.
        dpi: resolution passed to unpaper's ``--dpi`` option.
        log: logger; unpaper's stdout and stderr are sent to ``log.debug``.
        mode_args: sequence of extra unpaper command line arguments.

    Raises:
        KeyError: if the image mode is not one of '1', 'L' or 'RGB'.
        subprocess.CalledProcessError: if unpaper exits with a non-zero
            status.
    """
    from subprocess import CalledProcessError
    from tempfile import TemporaryDirectory

    args_unpaper = [
        'unpaper',
        '-v',
        '--dpi', str(dpi)
    ] + list(mode_args)

    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    # A private scratch directory gives unpaper an output path that does not
    # exist yet, without unlinking a temp file behind its owner's back, and
    # is cleaned up reliably even when unpaper writes nothing.
    with TemporaryDirectory() as tmpdir:
        im = Image.open(input_file)
        try:
            suffix = SUFFIXES[im.mode]
            input_pnm = os.path.join(tmpdir, 'input' + suffix)
            # Saving by path closes the file, so the data is fully on disk
            # before unpaper opens it by name.
            im.save(input_pnm, format='PPM')
        finally:
            im.close()

        output_pnm = os.path.join(tmpdir, 'output' + suffix)

        args_unpaper.extend([input_pnm, output_pnm])
        p_unpaper = Popen(
            args_unpaper, close_fds=True,
            universal_newlines=True, stdout=PIPE, stderr=PIPE
            )
        out, err = p_unpaper.communicate()
        log.debug(out)
        log.debug(err)

        if p_unpaper.returncode != 0:
            # Fail with unpaper's own status instead of a confusing
            # "cannot open output file" error further down.
            raise CalledProcessError(
                p_unpaper.returncode, args_unpaper, output=out)

        result = Image.open(output_pnm)
        try:
            result.save(output_file)
        finally:
            result.close()
| 
 | ||||
| 
 | ||||
def deskew(input_file, output_file, dpi, log):
    """Deskew ``input_file`` with unpaper and write it to ``output_file``.

    The filters listed below are switched off; deskewing itself is not
    disabled, so unpaper applies it.
    """
    deskew_args = [
        '--mask-scan-size', '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',   # don't center visible content within page
        '--no-grayfilter',    # don't remove light gray areas
        '--no-blackfilter',   # don't remove solid black areas
        '--no-noisefilter',   # don't remove salt and pepper noise
        '--no-blurfilter',    # don't remove blurry objects/debris
    ]
    run(input_file, output_file, dpi, log, deskew_args)
| 
 | ||||
| 
 | ||||
def clean(input_file, output_file, dpi, log):
    """Clean ``input_file`` with unpaper and write it to ``output_file``.

    Deskewing and the filters listed below are switched off; the noise and
    blur filters are not disabled, so unpaper applies them.
    """
    clean_args = [
        '--mask-scan-size', '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',   # don't center visible content within page
        '--no-grayfilter',    # don't remove light gray areas
        '--no-blackfilter',   # don't remove solid black areas
        '--no-deskew',        # don't deskew
    ]
    run(input_file, output_file, dpi, log, clean_args)
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Jim Barlow
						Jim Barlow