Environment variables can now override default programs

This commit is contained in:
James R. Barlow 2015-12-17 09:05:10 -08:00
parent 276f421c44
commit 1731ce2a44
7 changed files with 37 additions and 16 deletions

View File

@ -6,6 +6,18 @@ Please always read this file before installing the package
Download software here: https://github.com/jbarlow83/OCRmyPDF/tags
Latest:
=====
Changes
-------
- Fixed a bug where successfully repairing errors in a PDF file still caused OCRmyPDF to abort (thanks to @shemgp)
- You can now override the location of the external programs OCRmyPDF uses by setting
environment variables of the form OCRMYPDF_PROGRAMNAME, e.g. setting OCRMYPDF_TESSERACT
overrides the system Tesseract.
v3.1:
=====

View File

@ -1,4 +1,5 @@
from enum import IntEnum
import os
class ExitCode(IntEnum):
@ -10,3 +11,8 @@ class ExitCode(IntEnum):
file_access_error = 5
already_done_ocr = 6
other_error = 15
def get_program(name):
    """Return the executable to run for the external program *name*.

    The environment variable ``OCRMYPDF_<NAME>`` (``name`` upper-cased, e.g.
    ``OCRMYPDF_TESSERACT`` for ``'tesseract'``) overrides the default.

    Args:
        name: Default program name, e.g. ``'gs'``, ``'qpdf'``.

    Returns:
        The value of the override variable if it is set to a non-empty
        string, otherwise *name* unchanged.
    """
    envvar = 'OCRMYPDF_' + name.upper()
    # An override that is set but empty (e.g. ``OCRMYPDF_GS=``) would hand
    # an empty program name to the caller; treat it as "not set".
    return os.environ.get(envvar) or name

View File

@ -4,12 +4,13 @@
from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE, check_call
from shutil import copy
from . import get_program
def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log):
with NamedTemporaryFile(delete=True) as tmp:
args_gs = [
'gs',
get_program('gs'),
'-dQUIET',
'-dBATCH',
'-dNOPAUSE',
@ -36,7 +37,7 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log):
def generate_pdfa(pdf_pages, output_file, threads=1):
with NamedTemporaryFile(delete=True) as gs_pdf:
args_gs = [
"gs",
get_program("gs"),
"-dQUIET",
"-dBATCH",
"-dNOPAUSE",

View File

@ -8,6 +8,7 @@ from string import Template
from subprocess import Popen, PIPE
import os
import codecs
from . import get_program
# This is a template written in PostScript which is needed to create PDF/A
@ -95,7 +96,8 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
def _get_postscript_icc_path():
"Parse Ghostscript's help message to find where iccprofiles are stored"
p_gs = Popen(['gs', '--help'], close_fds=True, universal_newlines=True,
p_gs = Popen([get_program('gs'), '--help'], close_fds=True,
universal_newlines=True,
stdout=PIPE, stderr=PIPE)
out, _ = p_gs.communicate()
lines = out.splitlines()

View File

@ -5,12 +5,12 @@ from subprocess import CalledProcessError, check_output, STDOUT, check_call
import sys
import os
from . import ExitCode
from . import ExitCode, get_program
def check(input_file, log):
args_qpdf = [
'qpdf',
get_program('qpdf'),
'--check',
input_file
]
@ -34,7 +34,7 @@ def check(input_file, log):
def repair(input_file, output_file, log):
args_qpdf = [
'qpdf', input_file, output_file
get_program('qpdf'), input_file, output_file
]
try:
check_output(args_qpdf, stderr=STDOUT, universal_newlines=True)
@ -60,7 +60,7 @@ def repair(input_file, output_file, log):
def get_npages(input_file):
pages = check_output(
['qpdf', '--show-npages', input_file],
[get_program('qpdf'), '--show-npages', input_file],
universal_newlines=True, close_fds=True)
return int(pages)
@ -73,7 +73,7 @@ def split_pages(input_file, work_folder, npages):
"""
for n in range(int(npages)):
args_qpdf = [
'qpdf', input_file,
get_program('qpdf'), input_file,
'--pages', input_file, '{0}'.format(n + 1), '--',
os.path.join(work_folder, '{0:06d}.page.pdf'.format(n + 1))
]

View File

@ -6,7 +6,7 @@ import os
import re
import shutil
from functools import lru_cache
from . import ExitCode
from . import ExitCode, get_program
from subprocess import Popen, PIPE, CalledProcessError, \
TimeoutExpired, check_output, STDOUT
@ -42,7 +42,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
@lru_cache(maxsize=1)
def version():
args_tess = [
'tesseract',
get_program('tesseract'),
'--version'
]
try:
@ -60,7 +60,7 @@ def version():
@lru_cache(maxsize=1)
def languages():
args_tess = [
'tesseract',
get_program('tesseract'),
'--list-langs'
]
try:
@ -82,7 +82,7 @@ def generate_hocr(input_file, output_hocr, language: list, tessconfig: list,
badxml = os.path.splitext(output_hocr)[0] + '.badxml'
args_tesseract = [
'tesseract',
get_program('tesseract'),
'-l', '+'.join(language),
input_file,
badxml,
@ -147,7 +147,7 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
'''
args_tesseract = [
'tesseract',
get_program('tesseract'),
'-l', '+'.join(language),
input_image,
os.path.splitext(output_pdf)[0], # Tesseract appends suffix

View File

@ -8,13 +8,13 @@ from tempfile import NamedTemporaryFile
import sys
import os
from functools import lru_cache
from . import ExitCode
from . import ExitCode, get_program
@lru_cache(maxsize=1)
def version():
args_unpaper = [
'unpaper',
get_program('unpaper'),
'--version'
]
p_unpaper = Popen(args_unpaper, close_fds=True, universal_newlines=True,
@ -33,7 +33,7 @@ except ImportError:
def run(input_file, output_file, dpi, log, mode_args):
args_unpaper = [
'unpaper',
get_program('unpaper'),
'-v',
'--dpi', str(dpi)
] + mode_args