Issue/unicode error (#608)

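This PR adds encoding detection for text files: when the user has not specified an encoding, the encoding is detected with chardet and, if the detection is not confident enough, other common encodings are tried so that encoding-related errors are avoided.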
2025-06-27 02:30:08 +00:00 · 2023-05-23 15:35:38 -05:00 · 2023-05-23 15:35:38 -05:00 · a1fed6d4c6
commit a1fed6d4c6
parent a78719666a
11 changed files with 150 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.6.9-dev1
+## 0.6.9-dev2

 ### Enhancements

@ -8,6 +8,7 @@

 ### Fixes

+* Adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
 * Adds additional MIME types for CSV

 ## 0.6.8
--- a/example-docs/fake-text-utf-16-le.txt
+++ b/example-docs/fake-text-utf-16-le.txt
--- a/example-docs/fake-text-utf-16.txt
+++ b/example-docs/fake-text-utf-16.txt
--- a/example-docs/fake-text-utf-32.txt
+++ b/example-docs/fake-text-utf-32.txt
--- a/requirements/base.txt
+++ b/requirements/base.txt
@ -18,6 +18,8 @@ certifi==2022.12.7
    #   unstructured (setup.py)
 cffi==1.15.1
    # via cryptography
+chardet==5.1.0
+    # via unstructured (setup.py)
 charset-normalizer==3.1.0
    # via
    #   pdfminer-six
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@ -35,6 +35,8 @@ cffi==1.15.1
    # via argon2-cffi-bindings
 cfgv==3.3.1
    # via pre-commit
+chardet==5.1.0
+    # via -r requirements/dev.in
 click==8.1.3
    # via pip-tools
 comm==0.1.3
--- a/setup.py
+++ b/setup.py
@ -51,6 +51,7 @@ setup(
    },
    install_requires=[
        "argilla",
+        "chardet",
        "lxml",
        "msg_parser",
        "nltk",
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@ -30,6 +30,17 @@ def test_partition_text_from_filename(filename, encoding):
    assert elements == EXPECTED_OUTPUT


+@pytest.mark.parametrize(
+    "filename",
+    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
+)
+def test_partition_text_from_filename_default_encoding(filename):
+    """Partitioning by path with no explicit encoding must match EXPECTED_OUTPUT."""
+    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    parsed = partition_text(filename=doc_path)
+    assert len(parsed) > 0
+    assert parsed == EXPECTED_OUTPUT
+
+
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
@ -51,6 +62,18 @@ def test_partition_text_from_file():
    assert elements == EXPECTED_OUTPUT


+@pytest.mark.parametrize(
+    "filename",
+    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
+)
+def test_partition_text_from_file_default_encoding(filename):
+    """A text-mode handle with no explicit encoding must match EXPECTED_OUTPUT."""
+    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    with open(doc_path) as handle:
+        parsed = partition_text(file=handle)
+    assert len(parsed) > 0
+    assert parsed == EXPECTED_OUTPUT
+
+
 def test_partition_text_from_bytes_file():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename, "rb") as f:
@ -59,6 +82,18 @@ def test_partition_text_from_bytes_file():
    assert elements == EXPECTED_OUTPUT


+@pytest.mark.parametrize(
+    "filename",
+    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
+)
+def test_partition_text_from_bytes_file_default_encoding(filename):
+    """A binary-mode handle with no explicit encoding must match EXPECTED_OUTPUT."""
+    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    with open(doc_path, "rb") as handle:
+        parsed = partition_text(file=handle)
+    assert len(parsed) > 0
+    assert parsed == EXPECTED_OUTPUT
+
+
 def test_partition_text_from_text():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename) as f:
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.9-dev1"  # pragma: no cover
+__version__ = "0.6.9-dev2"  # pragma: no cover
--- a/unstructured/file_utils/encoding.py
+++ b/unstructured/file_utils/encoding.py
@ -0,0 +1,103 @@
+from typing import IO, Optional, Tuple
+
+import chardet
+
+# Minimum chardet confidence for its detected encoding to be used; below this value
+# detect_file_encoding falls back to trying COMMON_ENCODINGS.
+ENCODE_REC_THRESHOLD = 0.5
+
+# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
+# Tried in list order by the fallback loop in detect_file_encoding; the first encoding
+# that decodes without error is used.
+# NOTE(review): iso_8859_1 maps every byte value to a character, so it presumably never
+# fails to decode and the entries after it would not be reached — confirm intended order.
+COMMON_ENCODINGS = [
+    "utf_8",
+    "iso_8859_1",
+    "ascii",
+    "big5",
+    "utf_16",
+    "utf_16_be",
+    "utf_16_le",
+    "utf_32",
+    "utf_32_be",
+    "utf_32_le",
+    "euc_jis_2004",
+    "euc_jisx0213",
+    "euc_jp",
+    "euc_kr",
+    "gb18030",
+    "shift_jis",
+    "shift_jis_2004",
+    "shift_jisx0213",
+]
+
+
+def detect_file_encoding(filename: str = "", file: Optional[IO] = None) -> Tuple[str, str]:
+    """Detect the encoding of a file and return ``(encoding, decoded_text)``.
+
+    The raw bytes are passed to ``chardet.detect``. Its guess is used when one is
+    returned with a confidence of at least ENCODE_REC_THRESHOLD; otherwise each entry
+    of COMMON_ENCODINGS is tried in order and the first that decodes cleanly wins.
+
+    Raises FileNotFoundError when neither ``filename`` nor ``file`` is given, and
+    UnicodeDecodeError when no candidate encoding can decode the data.
+    """
+    if filename:
+        with open(filename, "rb") as f:
+            binary_data = f.read()
+    elif file:
+        # Objects without a ``mode`` attribute (e.g. io.BytesIO) are read as binary.
+        if "b" in getattr(file, "mode", "rb"):
+            binary_data = file.read()
+        else:
+            # Text-mode handle: re-read the raw bytes from the path it was opened on.
+            with open(file.name, "rb") as f:
+                binary_data = f.read()
+    else:
+        raise FileNotFoundError("No filename nor file were specified")
+
+    result = chardet.detect(binary_data)
+    encoding = result["encoding"]
+    confidence = result["confidence"]
+
+    if encoding is not None and confidence >= ENCODE_REC_THRESHOLD:
+        return encoding, binary_data.decode(encoding)
+
+    # Detection failed or was not confident enough: fall back to predefined encodings.
+    for enc in COMMON_ENCODINGS:
+        try:
+            if filename:
+                with open(filename, encoding=enc) as f:
+                    return enc, f.read()
+            # Only a file object was given, so there is no path to reopen; decode the
+            # bytes that were already read from it.
+            return enc, binary_data.decode(enc)
+        except UnicodeError:  # UnicodeDecodeError is a subclass of UnicodeError
+            continue
+
+    raise UnicodeDecodeError(
+        "Unable to determine the encoding of the file or match it with any "
+        "of the specified encodings.",
+        binary_data,
+        0,
+        len(binary_data),
+        "Invalid encoding",
+    )
+
+
+def read_txt_file(
+    filename: str = "",
+    file: Optional[IO] = None,
+    encoding: Optional[str] = None,
+) -> Tuple[str, str]:
+    """Read a plain text document and return ``(encoding, text)``.
+
+    When ``encoding`` is given it is used as-is and any UnicodeError raised while
+    decoding propagates to the caller. When it is omitted, the encoding is determined
+    by ``detect_file_encoding``. ``filename`` takes precedence over ``file``.
+
+    Raises FileNotFoundError when neither ``filename`` nor ``file`` is given.
+    """
+    if not filename and not file:
+        raise FileNotFoundError("No filename was specified")
+
+    if not encoding:
+        # No encoding requested: let detection pick one and decode the content.
+        if filename:
+            return detect_file_encoding(filename)
+        return detect_file_encoding(file=file)
+
+    if filename:
+        with open(filename, encoding=encoding) as f:
+            return encoding, f.read()
+
+    file_content = file.read()
+    if isinstance(file_content, bytes):
+        # Binary handle: decode with the caller's encoding; text handles yield str.
+        return encoding, file_content.decode(encoding)
+    return encoding, file_content
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@ -11,6 +11,7 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
+from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
 from unstructured.partition.common import exactly_one
@ -31,7 +32,7 @@ def partition_text(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
-    encoding: Optional[str] = "utf-8",
+    encoding: Optional[str] = None,
    paragraph_grouper: Optional[Callable[[str], str]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
@ -60,16 +61,10 @@ def partition_text(
    exactly_one(filename=filename, file=file, text=text)

    if filename is not None:
-        with open(filename, encoding=encoding) as f:
-            try:
-                file_text = f.read()
-            except (UnicodeDecodeError, UnicodeError) as error:
-                raise error
+        encoding, file_text = read_txt_file(filename=filename, encoding=encoding)

    elif file is not None:
-        file_text = file.read()
-        if isinstance(file_text, bytes):
-            file_text = file_text.decode(encoding)
+        encoding, file_text = read_txt_file(file=file, encoding=encoding)

    elif text is not None:
        file_text = str(text)