From 433d6af1bc4e5a6889166cf714a2f0b8d613a863 Mon Sep 17 00:00:00 2001 From: shreyanid <42684285+shreyanid@users.noreply.github.com> Date: Tue, 27 Jun 2023 18:15:02 -0700 Subject: [PATCH] fix: format Arabic and Hebrew annotated encodings (#823) * add modified arabic and hebrew encodings * added calls to format_encoding_str so encoding is checked before use * added formatting to detect_filetype() * explicitly provided default value for null encoding parameter * fixed format of annotated encodings list * adding hebrew base64 test file * small lint fixes * update changelog * bump version to -dev2 --- CHANGELOG.md | 3 ++- example-docs/hebrew-text-base64-iso88598i.txt | 1 + test_unstructured/cleaners/test_core.py | 5 ++++ unstructured/__version__.py | 2 +- unstructured/cleaners/core.py | 9 +++++-- unstructured/file_utils/encoding.py | 27 +++++++++++++------ unstructured/file_utils/filetype.py | 14 ++++++---- 7 files changed, 44 insertions(+), 17 deletions(-) create mode 100644 example-docs/hebrew-text-base64-iso88598i.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index c1e8a9503..2d76d0d9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.7.10-dev1 +## 0.7.10-dev2 ### Enhancements @@ -9,6 +9,7 @@ ### Fixes * Fix pre tag parsing for `partition_html` +* Fix lookup error for annotated Arabic and Hebrew encodings ## 0.7.9 diff --git a/example-docs/hebrew-text-base64-iso88598i.txt b/example-docs/hebrew-text-base64-iso88598i.txt new file mode 100644 index 000000000..31a5d561b --- /dev/null +++ b/example-docs/hebrew-text-base64-iso88598i.txt @@ -0,0 +1 @@ +8uPr5e8gVGVzbGEgLSDw4/j5+iDn+unu5CDy7CDk4ff55CDs+On55e0g \ No newline at end of file diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index 196aa19a3..fa8e29f58 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -87,6 +87,11 @@ def test_replace_mime_encodings_works_with_different_encodings(): assert core.replace_mime_encodings(text=text, encoding="latin-1") == "5 wâ\x80-99sâ\x80-92" +def test_replace_mime_encodings_works_with_right_to_left_encodings(): + text = "=EE=E0=E9=E4" + assert core.replace_mime_encodings(text=text, encoding="iso-8859-8") == "מאיה" + + @pytest.mark.parametrize( ("text", "expected"), [ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a28807cc1..5a48fbf07 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.7.10-dev1" # pragma: no cover +__version__ = "0.7.10-dev2" # pragma: no cover diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 51058b896..6880f4b45 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -3,6 +3,9 @@ import re import sys import unicodedata +from unstructured.file_utils.encoding import ( + format_encoding_str, +) from unstructured.nlp.patterns import ( DOUBLE_PARAGRAPH_PATTERN_RE, PARAGRAPH_PATTERN, @@ -194,7 +197,8 @@ def replace_mime_encodings(text: str, encoding: str = "utf-8") -> str: ------- 5 w=E2=80-99s -> 5 w’s """ - return quopri.decodestring(text.encode(encoding)).decode(encoding) + formatted_encoding = format_encoding_str(encoding) + return quopri.decodestring(text.encode(formatted_encoding)).decode(formatted_encoding) def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str: @@ -264,4 +268,5 @@ def bytes_string_to_string(text: str, encoding: str = "utf-8"): """Converts a string representation of a byte string to a regular string using the specified encoding.""" text_bytes = bytes([ord(char) for char in text]) - return text_bytes.decode(encoding) + formatted_encoding = format_encoding_str(encoding) + return text_bytes.decode(formatted_encoding) diff --git a/unstructured/file_utils/encoding.py b/unstructured/file_utils/encoding.py index a718110e2..52f13e9d1 100644 --- a/unstructured/file_utils/encoding.py +++ b/unstructured/file_utils/encoding.py @@ -10,6 +10,8 @@ ENCODE_REC_THRESHOLD = 0.5 COMMON_ENCODINGS = [ "utf_8", "iso_8859_1", + "iso_8859_6", + "iso_8859_8", "ascii", "big5", "utf_16", @@ -37,7 +39,14 @@ def format_encoding_str(encoding: str) -> str: The encoding string to be formatted (e.g., `UTF-8`, `utf_8`, `ISO-8859-1`, `iso_8859_1`, etc). """ - return encoding.lower().replace("_", "-") + formatted_encoding = encoding.lower().replace("_", "-") + + # Special case for Arabic and Hebrew charsets with directional annotations + annotated_encodings = ["iso-8859-6-i", "iso-8859-6-e", "iso-8859-8-i", "iso-8859-8-e"] + if formatted_encoding in annotated_encodings: + formatted_encoding = formatted_encoding[:-2] # remove the annotation + + return formatted_encoding def detect_file_encoding( @@ -82,7 +91,9 @@ def detect_file_encoding( else: file_text = byte_data.decode(encoding) - return encoding, file_text + formatted_encoding = format_encoding_str(encoding) + + return formatted_encoding, file_text def read_txt_file( @@ -93,28 +104,28 @@ def read_txt_file( """Extracts document metadata from a plain text document.""" if filename: if encoding: - with open(filename, encoding=encoding) as f: + formatted_encoding = format_encoding_str(encoding) + with open(filename, encoding=formatted_encoding) as f: try: file_text = f.read() except (UnicodeDecodeError, UnicodeError) as error: raise error else: - encoding, file_text = detect_file_encoding(filename) + formatted_encoding, file_text = detect_file_encoding(filename) elif file: if encoding: + formatted_encoding = format_encoding_str(encoding) try: file_content = file if isinstance(file, bytes) else file.read() if isinstance(file_content, bytes): - file_text = file_content.decode(encoding) + file_text = file_content.decode(formatted_encoding) else: file_text = file_content except (UnicodeDecodeError, UnicodeError) as error: raise error else: - encoding, file_text = detect_file_encoding(file=file) + formatted_encoding, file_text = detect_file_encoding(file=file) else: raise FileNotFoundError("No filename was specified") - formatted_encoding = format_encoding_str(encoding) - return formatted_encoding, file_text diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 919ec2fb6..3e75b0593 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -10,7 +10,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional from unstructured.documents.coordinates import PixelSpace from unstructured.documents.elements import Element, PageBreak -from unstructured.file_utils.encoding import detect_file_encoding +from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN from unstructured.partition.common import ( _add_element_metadata, @@ -280,13 +280,17 @@ def detect_filetype( return FileType.XML elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"): + if not encoding: + encoding = "utf-8" + formatted_encoding = format_encoding_str(encoding) + # NOTE(crag): for older versions of the OS libmagic package, such as is currently # installed on the Unstructured docker image, .json files resolve to "text/plain" # rather than "application/json". this corrects for that case. - if _is_text_file_a_json(file=file, filename=filename, encoding=encoding): + if _is_text_file_a_json(file=file, filename=filename, encoding=formatted_encoding): return FileType.JSON - if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding): + if _is_text_file_a_csv(file=file, filename=filename, encoding=formatted_encoding): return FileType.CSV if file and _check_eml_from_buffer(file=file) is True: @@ -384,8 +388,8 @@ def _read_file_start_for_type_check( with open(filename, encoding=encoding) as f: file_text = f.read(4096) except UnicodeDecodeError: - encoding, _ = detect_file_encoding(filename=filename) - with open(filename, encoding=encoding) as f: + formatted_encoding, _ = detect_file_encoding(filename=filename) + with open(filename, encoding=formatted_encoding) as f: file_text = f.read(4096) return file_text