mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-29 08:01:04 +00:00
Check that the locale is sane before allowing OCR to proceed
This commit is contained in:
parent
5d2c67c62b
commit
40aa82ab41
@ -28,6 +28,7 @@ from . import PROGRAM_NAME, VERSION
|
||||
|
||||
from .exceptions import *
|
||||
from . import exceptions as ocrmypdf_exceptions
|
||||
from . import _unicodefun
|
||||
|
||||
warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
|
||||
|
||||
@ -49,6 +50,8 @@ def complain(message):
|
||||
if 'IDE_PROJECT_ROOTS' in os.environ:
|
||||
os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
|
||||
|
||||
_unicodefun._verify_python3_env()
|
||||
|
||||
if tesseract.version() < MINIMUM_TESS_VERSION:
|
||||
complain(
|
||||
"Please install tesseract {0} or newer "
|
||||
|
||||
108
ocrmypdf/_unicodefun.py
Normal file
108
ocrmypdf/_unicodefun.py
Normal file
@ -0,0 +1,108 @@
|
||||
# Copyright (c) 2014, Armin Ronacher
|
||||
#
|
||||
# Copyright (c) 2017, James R Barlow
|
||||
#
|
||||
# Some rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials provided
|
||||
# with the distribution.
|
||||
#
|
||||
# * The names of the contributors may not be used to endorse or
|
||||
# promote products derived from this software without specific
|
||||
# prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import codecs
|
||||
|
||||
|
||||
def _verify_python3_env():
    """Ensure that the environment is good for unicode on Python 3.

    Looks up the preferred encoding reported by the ``locale`` module.  If it
    resolves to anything other than ASCII the function returns ``None``
    immediately.  Otherwise a ``RuntimeError`` is raised; on POSIX systems the
    message is extended with advice built from the output of ``locale -a``
    and from the ``LC_ALL`` / ``LANG`` environment variables.

    Raises:
        RuntimeError: if the preferred encoding is ASCII, or if it could not
            be determined at all.
    """
    try:
        import locale
        fs_enc = codecs.lookup(locale.getpreferredencoding()).name
    except Exception:
        # Any failure to determine the encoding is treated the same as an
        # ASCII environment, so the user still gets the explanatory error.
        fs_enc = 'ascii'
    if fs_enc != 'ascii':
        return

    utf8_suffixes = ('.utf-8', '.utf8')
    extra = ''
    if os.name == 'posix':
        import subprocess
        try:
            rv = subprocess.Popen(
                ['locale', '-a'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            ).communicate()[0]
        except OSError:
            # The `locale` utility is missing or not executable.  Carry on
            # with an empty listing so the RuntimeError below is still the
            # error the caller sees.
            rv = b''
        good_locales = set()
        has_c_utf8 = False

        # Make sure we're operating on text here.
        if isinstance(rv, bytes):
            rv = rv.decode('ascii', 'replace')

        for line in rv.splitlines():
            candidate = line.strip()
            if candidate.lower().endswith(utf8_suffixes):
                good_locales.add(candidate)
                if candidate.lower() in ('c.utf8', 'c.utf-8'):
                    has_c_utf8 = True

        extra += '\n\n'
        if not good_locales:
            extra += (
                'Additional information: on this system no suitable UTF-8\n'
                'locales were discovered.  This most likely requires resolving\n'
                'by reconfiguring the locale system.'
            )
        elif has_c_utf8:
            extra += (
                'This system supports the C.UTF-8 locale which is recommended.\n'
                'You might be able to resolve your issue by exporting the\n'
                'following environment variables:\n\n'
                '    export LC_ALL=C.UTF-8\n'
                '    export LANG=C.UTF-8'
            )
        else:
            extra += (
                'This system lists a couple of UTF-8 supporting locales that\n'
                'you can pick from.  The following suitable locales were\n'
                'discovered: %s'
            ) % ', '.join(sorted(good_locales))

        # Only the first of LC_ALL / LANG that is set is examined.  If it
        # names a UTF-8 locale, the user asked for UTF-8 and still ended up
        # with ASCII, which is worth pointing out.
        bad_locale = None
        for exported in os.environ.get('LC_ALL'), os.environ.get('LANG'):
            if exported and exported.lower().endswith(utf8_suffixes):
                bad_locale = exported
            if exported is not None:
                break
        if bad_locale is not None:
            extra += (
                '\n\nocrmypdf discovered that you exported a UTF-8 locale\n'
                'but the locale system could not pick up from it because\n'
                'it does not exist.  The exported locale is "%s" but it\n'
                'is not supported'
            ) % bad_locale

    raise RuntimeError('ocrmypdf will abort further execution because Python 3 '
                       'was configured to use ASCII as encoding for the '
                       'environment.' + extra)
|
||||
@ -1004,4 +1004,16 @@ def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
|
||||
)
|
||||
|
||||
pdfa_info = file_claims_pdfa(outpdf)
|
||||
assert pdfa_info['conformance'] == 'PDF/A-1B'
|
||||
assert pdfa_info['conformance'] == 'PDF/A-1B'
|
||||
|
||||
|
||||
def test_bad_locale():
    """Check that ocrmypdf refuses to run under an ASCII-only locale.

    Runs ocrmypdf with a copy of the current environment forced to the ``C``
    locale and expects a non-zero exit status, nothing on stdout, and the
    ASCII-encoding complaint on stderr.
    """
    env = os.environ.copy()
    # LC_ALL takes precedence over LANG (and over LC_CTYPE), so a value
    # inherited from the host would defeat the LANG override below.  Pin
    # both to make the test independent of the caller's locale settings.
    env['LC_ALL'] = 'C'
    env['LANG'] = 'C'

    # 'a' and 'b' are placeholder arguments (presumably input/output file
    # names); the locale check is expected to fail before they matter.
    p, out, err = run_ocrmypdf('a', 'b', env=env)
    assert out == '', "stdout not clean"
    assert p.returncode != 0
    assert 'configured to use ASCII as encoding' in err, "should whine"
|
||||
Loading…
x
Reference in New Issue
Block a user