Check that the locale is sane before allowing OCR to proceed

2025-12-29 08:01:04 +00:00 · 2017-11-16 17:18:02 -08:00 · 2017-11-16 17:18:02 -08:00 · 40aa82ab41
commit 40aa82ab41
parent 5d2c67c62b
3 changed files with 124 additions and 1 deletions
--- a/ocrmypdf/main.py
+++ b/ocrmypdf/main.py
@ -28,6 +28,7 @@ from . import PROGRAM_NAME, VERSION

 from .exceptions import *
 from . import exceptions as ocrmypdf_exceptions
+from . import _unicodefun

 warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)

@ -49,6 +50,8 @@ def complain(message):
 if 'IDE_PROJECT_ROOTS' in os.environ:
    os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']

+_unicodefun._verify_python3_env()
+
 if tesseract.version() < MINIMUM_TESS_VERSION:
    complain(
        "Please install tesseract {0} or newer "
--- a/ocrmypdf/_unicodefun.py
+++ b/ocrmypdf/_unicodefun.py
@ -0,0 +1,108 @@
+# Copyright (c) 2014, Armin Ronacher
+#
+# Copyright (c) 2017, James R Barlow
+#
+# Some rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above
+#       copyright notice, this list of conditions and the following
+#       disclaimer in the documentation and/or other materials provided
+#       with the distribution.
+#
+#     * The names of the contributors may not be used to endorse or
+#       promote products derived from this software without specific
+#       prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import os
+import sys
+import codecs
+
+
+def _verify_python3_env():
+    """Abort early if Python's preferred encoding is ASCII.
+
+    Looks up ``locale.getpreferredencoding()``.  If it resolves to anything
+    other than ASCII the function returns without doing anything.  If the
+    lookup fails for any reason, the encoding is treated as ASCII.
+
+    On POSIX systems the error message is extended with hints built from
+    the output of ``locale -a`` and from the ``LC_ALL`` / ``LANG``
+    environment variables.
+
+    Raises:
+        RuntimeError: if the preferred encoding is (or is assumed to be)
+            ASCII.
+    """
+    try:
+        import locale
+        fs_enc = codecs.lookup(locale.getpreferredencoding()).name
+    except Exception:
+        # Deliberately broad: any failure to determine the encoding is
+        # handled the same way as an ASCII result.
+        fs_enc = 'ascii'
+    if fs_enc != 'ascii':
+        return
+
+    extra = ''
+    if os.name == 'posix':
+        import subprocess
+        try:
+            rv = subprocess.Popen(['locale', '-a'], stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE).communicate()[0]
+        except OSError:
+            # The ``locale`` executable is missing or cannot be started.
+            # Continue with no discovered locales so that the RuntimeError
+            # below is still the error that gets reported.
+            rv = b''
+        good_locales = set()
+        has_c_utf8 = False
+
+        # Make sure we're operating on text here.
+        if isinstance(rv, bytes):
+            rv = rv.decode('ascii', 'replace')
+
+        # Collect every listed locale whose name advertises UTF-8.
+        for line in rv.splitlines():
+            candidate = line.strip()
+            if candidate.lower().endswith(('.utf-8', '.utf8')):
+                good_locales.add(candidate)
+                if candidate.lower() in ('c.utf8', 'c.utf-8'):
+                    has_c_utf8 = True
+
+        extra += '\n\n'
+        if not good_locales:
+            extra += (
+                'Additional information: on this system no suitable UTF-8\n'
+                'locales were discovered.  This most likely requires resolving\n'
+                'by reconfiguring the locale system.'
+            )
+        elif has_c_utf8:
+            extra += (
+                'This system supports the C.UTF-8 locale which is recommended.\n'
+                'You might be able to resolve your issue by exporting the\n'
+                'following environment variables:\n\n'
+                '    export LC_ALL=C.UTF-8\n'
+                '    export LANG=C.UTF-8'
+            )
+        else:
+            extra += (
+                'This system lists a couple of UTF-8 supporting locales that\n'
+                'you can pick from.  The following suitable locales were\n'
+                'discovered: %s'
+            ) % ', '.join(sorted(good_locales))
+
+        # Only the first of LC_ALL / LANG that is set is inspected.  If it
+        # names a UTF-8 locale while the encoding still came out as ASCII,
+        # report that exported value in the message.
+        bad_locale = None
+        for exported in os.environ.get('LC_ALL'), os.environ.get('LANG'):
+            if exported and exported.lower().endswith(('.utf-8', '.utf8')):
+                bad_locale = exported
+            if exported is not None:
+                break
+        if bad_locale is not None:
+            extra += (
+                '\n\nocrmypdf discovered that you exported a UTF-8 locale\n'
+                'but the locale system could not pick up from it because\n'
+                'it does not exist.  The exported locale is "%s" but it\n'
+                'is not supported'
+            ) % bad_locale
+
+    raise RuntimeError('ocrmypdf will abort further execution because Python 3 '
+                       'was configured to use ASCII as encoding for the '
+                       'environment.' + extra)
--- a/tests/test_main.py
+++ b/tests/test_main.py
@ -1004,4 +1004,16 @@ def test_pdfa_1(spoof_tesseract_cache, resources, outpdf):
    )

    pdfa_info = file_claims_pdfa(outpdf)
-    assert pdfa_info['conformance'] == 'PDF/A-1B'
+    assert pdfa_info['conformance'] == 'PDF/A-1B'
+
+
+def test_bad_locale():
+    """Run ocrmypdf under the C locale and expect the ASCII-encoding error.
+
+    Checks that stdout stays empty, the exit status is non-zero, and stderr
+    contains the ASCII-encoding complaint.
+    """
+    env = os.environ.copy()
+    # LC_ALL takes precedence over LANG, so override both; otherwise an
+    # inherited LC_ALL (e.g. en_US.UTF-8) would keep the locale UTF-8 and
+    # the check under test would never fire.
+    env['LC_ALL'] = 'C'
+    env['LANG'] = 'C'
+
+    p, out, err = run_ocrmypdf(
+        'a', 'b', env=env
+    )
+    assert out == '', "stdout not clean"
+    assert p.returncode != 0
+    assert 'configured to use ASCII as encoding' in err, "should whine"