fix: format Arabic and Hebrew annotated encodings (#823)

* add modified Arabic and Hebrew encodings

* add calls to format_encoding_str so the encoding is checked before use (see the sketch after this list)

* add formatting to detect_filetype()

* explicitly provide a default value for a null encoding parameter

* fix the format of the annotated encodings list

* add Hebrew base64 test file

* small lint fixes

* update changelog

* bump version to -dev2
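As a quick illustration of the failure mode being fixed (this sketch is not part of the commit; it only relies on the `format_encoding_str` helper shown in the diffs below): Python's codec registry has no entry for directionally annotated charset names such as `iso-8859-8-i`, so decoding with one of them raised a `LookupError` until the name is normalized.

```python
# Minimal sketch (not from the commit): why annotated charset names need normalizing.
import codecs

from unstructured.file_utils.encoding import format_encoding_str

annotated = "ISO-8859-8-I"  # Hebrew charset with a directional ("implicit") suffix

# codecs.lookup(annotated) would raise LookupError (unknown encoding)
normalized = format_encoding_str(annotated)  # -> "iso-8859-8"
codecs.lookup(normalized)  # resolves to the Hebrew codec without error
```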
Author: shreyanid (committed by GitHub)
Date: 2023-06-27 18:15:02 -07:00
Parent: 58e988e110
Commit: 433d6af1bc
7 changed files with 44 additions and 17 deletions

View File

@@ -1,4 +1,4 @@
-## 0.7.10-dev1
+## 0.7.10-dev2
 
 ### Enhancements
@@ -9,6 +9,7 @@
 ### Fixes
 
 * Fix pre tag parsing for `partition_html`
+* Fix lookup error for annotated Arabic and Hebrew encodings
 
 ## 0.7.9

View File

@@ -0,0 +1 @@
+8uPr5e8gVGVzbGEgLSDw4/j5+iDn+unu5CDy7CDk4ff55CDs+On55e0g
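For orientation only (not part of the commit): the new example doc above is stored as base64. A rough sketch of how bytes like these would be turned back into Hebrew text, assuming the underlying bytes are iso-8859-8 encoded:

```python
# Illustrative sketch: decode the base64 fixture, assuming iso-8859-8 bytes underneath.
import base64

b64_fixture = "8uPr5e8gVGVzbGEgLSDw4/j5+iDn+unu5CDy7CDk4ff55CDs+On55e0g"
raw_bytes = base64.b64decode(b64_fixture)  # the original file bytes
text = raw_bytes.decode("iso-8859-8")      # right-to-left Hebrew text
```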

View File

@@ -87,6 +87,11 @@ def test_replace_mime_encodings_works_with_different_encodings():
     assert core.replace_mime_encodings(text=text, encoding="latin-1") == "5 wâ\x80-99sâ\x80-92"
 
 
+def test_replace_mime_encodings_works_with_right_to_left_encodings():
+    text = "=EE=E0=E9=E4"
+    assert core.replace_mime_encodings(text=text, encoding="iso-8859-8") == "מאיה"
+
+
 @pytest.mark.parametrize(
     ("text", "expected"),
     [

View File

@@ -1 +1 @@
-__version__ = "0.7.10-dev1"  # pragma: no cover
+__version__ = "0.7.10-dev2"  # pragma: no cover

View File

@@ -3,6 +3,9 @@ import re
 import sys
 import unicodedata
 
+from unstructured.file_utils.encoding import (
+    format_encoding_str,
+)
 from unstructured.nlp.patterns import (
     DOUBLE_PARAGRAPH_PATTERN_RE,
     PARAGRAPH_PATTERN,
@@ -194,7 +197,8 @@ def replace_mime_encodings(text: str, encoding: str = "utf-8") -> str:
     -------
     5 w=E2=80-99s -> 5 ws
     """
-    return quopri.decodestring(text.encode(encoding)).decode(encoding)
+    formatted_encoding = format_encoding_str(encoding)
+    return quopri.decodestring(text.encode(formatted_encoding)).decode(formatted_encoding)
 
 
 def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
@@ -264,4 +268,5 @@ def bytes_string_to_string(text: str, encoding: str = "utf-8"):
     """Converts a string representation of a byte string to a regular string using the
     specified encoding."""
     text_bytes = bytes([ord(char) for char in text])
-    return text_bytes.decode(encoding)
+    formatted_encoding = format_encoding_str(encoding)
+    return text_bytes.decode(formatted_encoding)
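A short usage sketch of the patched cleaner (not part of the commit; the annotated spelling is only illustrative): because the encoding string is normalized before `str.encode`/`bytes.decode` are called, annotated and underscore spellings now resolve to the same codec.

```python
# Sketch: both spellings now reach quopri with the same normalized codec.
from unstructured.cleaners.core import replace_mime_encodings

quoted_printable = "=EE=E0=E9=E4"  # same sample used in the new test above

replace_mime_encodings(text=quoted_printable, encoding="iso-8859-8")    # -> "מאיה"
replace_mime_encodings(text=quoted_printable, encoding="ISO_8859_8-I")  # same result after normalization
```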

View File

@@ -10,6 +10,8 @@ ENCODE_REC_THRESHOLD = 0.5
 COMMON_ENCODINGS = [
     "utf_8",
     "iso_8859_1",
+    "iso_8859_6",
+    "iso_8859_8",
     "ascii",
     "big5",
     "utf_16",
@@ -37,7 +39,14 @@ def format_encoding_str(encoding: str) -> str:
         The encoding string to be formatted (e.g., `UTF-8`, `utf_8`, `ISO-8859-1`, `iso_8859_1`,
         etc).
     """
-    return encoding.lower().replace("_", "-")
+    formatted_encoding = encoding.lower().replace("_", "-")
+
+    # Special case for Arabic and Hebrew charsets with directional annotations
+    annotated_encodings = ["iso-8859-6-i", "iso-8859-6-e", "iso-8859-8-i", "iso-8859-8-e"]
+    if formatted_encoding in annotated_encodings:
+        formatted_encoding = formatted_encoding[:-2]  # remove the annotation
+
+    return formatted_encoding
 
 
 def detect_file_encoding(
@@ -82,7 +91,9 @@ def detect_file_encoding(
     else:
         file_text = byte_data.decode(encoding)
 
-    return encoding, file_text
+    formatted_encoding = format_encoding_str(encoding)
+
+    return formatted_encoding, file_text
 
 
 def read_txt_file(
@@ -93,28 +104,28 @@ def read_txt_file(
     """Extracts document metadata from a plain text document."""
     if filename:
         if encoding:
-            with open(filename, encoding=encoding) as f:
+            formatted_encoding = format_encoding_str(encoding)
+            with open(filename, encoding=formatted_encoding) as f:
                 try:
                     file_text = f.read()
                 except (UnicodeDecodeError, UnicodeError) as error:
                     raise error
         else:
-            encoding, file_text = detect_file_encoding(filename)
+            formatted_encoding, file_text = detect_file_encoding(filename)
     elif file:
         if encoding:
+            formatted_encoding = format_encoding_str(encoding)
             try:
                 file_content = file if isinstance(file, bytes) else file.read()
                 if isinstance(file_content, bytes):
-                    file_text = file_content.decode(encoding)
+                    file_text = file_content.decode(formatted_encoding)
                 else:
                     file_text = file_content
             except (UnicodeDecodeError, UnicodeError) as error:
                 raise error
         else:
-            encoding, file_text = detect_file_encoding(file=file)
+            formatted_encoding, file_text = detect_file_encoding(file=file)
     else:
         raise FileNotFoundError("No filename was specified")
-    formatted_encoding = format_encoding_str(encoding)
     return formatted_encoding, file_text
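A hedged sketch of how the updated helpers behave (not part of the commit; the file path below is hypothetical): `format_encoding_str` is the single normalization point, and `detect_file_encoding` now hands back an already-normalized name alongside the text.

```python
# Sketch of the normalization behavior added above; "example.txt" is a hypothetical path.
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str

assert format_encoding_str("ISO_8859_6-E") == "iso-8859-6"  # directional annotation stripped
assert format_encoding_str("UTF_8") == "utf-8"              # case and underscores normalized

encoding, file_text = detect_file_encoding("example.txt")   # encoding comes back normalized
```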

View File

@@ -10,7 +10,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import Element, PageBreak
-from unstructured.file_utils.encoding import detect_file_encoding
+from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
 from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
 from unstructured.partition.common import (
     _add_element_metadata,
@@ -280,13 +280,17 @@ def detect_filetype(
         return FileType.XML
 
     elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
+        if not encoding:
+            encoding = "utf-8"
+        formatted_encoding = format_encoding_str(encoding)
+
         # NOTE(crag): for older versions of the OS libmagic package, such as is currently
         # installed on the Unstructured docker image, .json files resolve to "text/plain"
         # rather than "application/json". this corrects for that case.
-        if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
+        if _is_text_file_a_json(file=file, filename=filename, encoding=formatted_encoding):
             return FileType.JSON
 
-        if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
+        if _is_text_file_a_csv(file=file, filename=filename, encoding=formatted_encoding):
             return FileType.CSV
 
         if file and _check_eml_from_buffer(file=file) is True:
@@ -384,8 +388,8 @@ def _read_file_start_for_type_check(
         with open(filename, encoding=encoding) as f:
             file_text = f.read(4096)
     except UnicodeDecodeError:
-        encoding, _ = detect_file_encoding(filename=filename)
-        with open(filename, encoding=encoding) as f:
+        formatted_encoding, _ = detect_file_encoding(filename=filename)
+        with open(filename, encoding=formatted_encoding) as f:
             file_text = f.read(4096)
 
     return file_text
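Finally, a sketch of the detect_filetype change (not part of the commit; paths are hypothetical): when no encoding is supplied for a text-like MIME type it now defaults to utf-8, and any caller-supplied value is normalized before the JSON/CSV probes reuse it.

```python
# Sketch only; the file paths are hypothetical.
from unstructured.file_utils.filetype import detect_filetype

detect_filetype(filename="notes.txt")                           # null encoding falls back to utf-8
detect_filetype(filename="notes.txt", encoding="ISO-8859-8-I")  # normalized to iso-8859-8 first
```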