From 433d6af1bc4e5a6889166cf714a2f0b8d613a863 Mon Sep 17 00:00:00 2001
From: shreyanid <42684285+shreyanid@users.noreply.github.com>
Date: Tue, 27 Jun 2023 18:15:02 -0700
Subject: [PATCH] fix: format Arabic and Hebrew annotated encodings (#823)

* add modified Arabic and Hebrew encodings

* add calls to format_encoding_str so the encoding is normalized before use

* format the encoding in detect_filetype() before it is used

* explicitly provide a default value (utf-8) for a null encoding parameter

* fix format of annotated encodings list

* add Hebrew base64 test file (ISO-8859-8-I)

* small lint fixes

* update changelog

* bump version to -dev2
---
 CHANGELOG.md                                  |  3 ++-
 example-docs/hebrew-text-base64-iso88598i.txt |  1 +
 test_unstructured/cleaners/test_core.py       |  5 ++++
 unstructured/__version__.py                   |  2 +-
 unstructured/cleaners/core.py                 |  9 +++++--
 unstructured/file_utils/encoding.py           | 27 +++++++++++++------
 unstructured/file_utils/filetype.py           | 14 ++++++----
 7 files changed, 44 insertions(+), 17 deletions(-)
 create mode 100644 example-docs/hebrew-text-base64-iso88598i.txt

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c1e8a9503..2d76d0d9c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.7.10-dev1
+## 0.7.10-dev2
 
 ### Enhancements
 
@@ -9,6 +9,7 @@
 ### Fixes
 
 * Fix pre tag parsing for `partition_html`
+* Fix lookup error for annotated Arabic and Hebrew encodings
 
 ## 0.7.9
 
diff --git a/example-docs/hebrew-text-base64-iso88598i.txt b/example-docs/hebrew-text-base64-iso88598i.txt
new file mode 100644
index 000000000..31a5d561b
--- /dev/null
+++ b/example-docs/hebrew-text-base64-iso88598i.txt
@@ -0,0 +1 @@
+8uPr5e8gVGVzbGEgLSDw4/j5+iDn+unu5CDy7CDk4ff55CDs+On55e0g
\ No newline at end of file
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index 196aa19a3..fa8e29f58 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -87,6 +87,11 @@ def test_replace_mime_encodings_works_with_different_encodings():
     assert core.replace_mime_encodings(text=text, encoding="latin-1") == "5 wâ\x80-99sâ\x80-92"
 
 
+def test_replace_mime_encodings_works_with_right_to_left_encodings():
+    text = "=EE=E0=E9=E4"
+    assert core.replace_mime_encodings(text=text, encoding="iso-8859-8") == "מאיה"
+
+
 @pytest.mark.parametrize(
     ("text", "expected"),
     [
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index a28807cc1..5a48fbf07 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.10-dev1"  # pragma: no cover
+__version__ = "0.7.10-dev2"  # pragma: no cover
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 51058b896..6880f4b45 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -3,6 +3,9 @@ import re
 import sys
 import unicodedata
 
+from unstructured.file_utils.encoding import (
+    format_encoding_str,
+)
 from unstructured.nlp.patterns import (
     DOUBLE_PARAGRAPH_PATTERN_RE,
     PARAGRAPH_PATTERN,
@@ -194,7 +197,8 @@ def replace_mime_encodings(text: str, encoding: str = "utf-8") -> str:
     -------
     5 w=E2=80-99s -> 5 w’s
     """
-    return quopri.decodestring(text.encode(encoding)).decode(encoding)
+    formatted_encoding = format_encoding_str(encoding)
+    return quopri.decodestring(text.encode(formatted_encoding)).decode(formatted_encoding)
 
 
 def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
@@ -264,4 +268,5 @@ def bytes_string_to_string(text: str, encoding: str = "utf-8"):
     """Converts a string representation of a byte string to a regular string using the
     specified encoding."""
     text_bytes = bytes([ord(char) for char in text])
-    return text_bytes.decode(encoding)
+    formatted_encoding = format_encoding_str(encoding)
+    return text_bytes.decode(formatted_encoding)
diff --git a/unstructured/file_utils/encoding.py b/unstructured/file_utils/encoding.py
index a718110e2..52f13e9d1 100644
--- a/unstructured/file_utils/encoding.py
+++ b/unstructured/file_utils/encoding.py
@@ -10,6 +10,8 @@ ENCODE_REC_THRESHOLD = 0.5
 COMMON_ENCODINGS = [
     "utf_8",
     "iso_8859_1",
+    "iso_8859_6",
+    "iso_8859_8",
     "ascii",
     "big5",
     "utf_16",
@@ -37,7 +39,14 @@ def format_encoding_str(encoding: str) -> str:
         The encoding string to be formatted (e.g., `UTF-8`, `utf_8`, `ISO-8859-1`, `iso_8859_1`,
         etc).
     """
-    return encoding.lower().replace("_", "-")
+    formatted_encoding = encoding.lower().replace("_", "-")
+
+    # Special case for Arabic and Hebrew charsets with directional annotations
+    annotated_encodings = ["iso-8859-6-i", "iso-8859-6-e", "iso-8859-8-i", "iso-8859-8-e"]
+    if formatted_encoding in annotated_encodings:
+        formatted_encoding = formatted_encoding[:-2]  # remove the annotation
+
+    return formatted_encoding
 
 
 def detect_file_encoding(
@@ -82,7 +91,9 @@ def detect_file_encoding(
     else:
         file_text = byte_data.decode(encoding)
 
-    return encoding, file_text
+    formatted_encoding = format_encoding_str(encoding)
+
+    return formatted_encoding, file_text
 
 
 def read_txt_file(
@@ -93,28 +104,28 @@ def read_txt_file(
     """Extracts document metadata from a plain text document."""
     if filename:
         if encoding:
-            with open(filename, encoding=encoding) as f:
+            formatted_encoding = format_encoding_str(encoding)
+            with open(filename, encoding=formatted_encoding) as f:
                 try:
                     file_text = f.read()
                 except (UnicodeDecodeError, UnicodeError) as error:
                     raise error
         else:
-            encoding, file_text = detect_file_encoding(filename)
+            formatted_encoding, file_text = detect_file_encoding(filename)
     elif file:
         if encoding:
+            formatted_encoding = format_encoding_str(encoding)
             try:
                 file_content = file if isinstance(file, bytes) else file.read()
                 if isinstance(file_content, bytes):
-                    file_text = file_content.decode(encoding)
+                    file_text = file_content.decode(formatted_encoding)
                 else:
                     file_text = file_content
             except (UnicodeDecodeError, UnicodeError) as error:
                 raise error
         else:
-            encoding, file_text = detect_file_encoding(file=file)
+            formatted_encoding, file_text = detect_file_encoding(file=file)
     else:
         raise FileNotFoundError("No filename was specified")
 
-    formatted_encoding = format_encoding_str(encoding)
-
     return formatted_encoding, file_text
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index 919ec2fb6..3e75b0593 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -10,7 +10,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
 
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import Element, PageBreak
-from unstructured.file_utils.encoding import detect_file_encoding
+from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
 from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
 from unstructured.partition.common import (
     _add_element_metadata,
@@ -280,13 +280,17 @@ def detect_filetype(
             return FileType.XML
 
     elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
+        if not encoding:
+            encoding = "utf-8"
+        formatted_encoding = format_encoding_str(encoding)
+
         # NOTE(crag): for older versions of the OS libmagic package, such as is currently
         # installed on the Unstructured docker image, .json files resolve to "text/plain"
         # rather than "application/json". this corrects for that case.
-        if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
+        if _is_text_file_a_json(file=file, filename=filename, encoding=formatted_encoding):
             return FileType.JSON
 
-        if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
+        if _is_text_file_a_csv(file=file, filename=filename, encoding=formatted_encoding):
             return FileType.CSV
 
         if file and _check_eml_from_buffer(file=file) is True:
@@ -384,8 +388,8 @@ def _read_file_start_for_type_check(
             with open(filename, encoding=encoding) as f:
                 file_text = f.read(4096)
         except UnicodeDecodeError:
-            encoding, _ = detect_file_encoding(filename=filename)
-            with open(filename, encoding=encoding) as f:
+            formatted_encoding, _ = detect_file_encoding(filename=filename)
+            with open(filename, encoding=formatted_encoding) as f:
                 file_text = f.read(4096)
     return file_text