fix: format Arabic and Hebrew annotated encodings (#823)

* add modified Arabic and Hebrew encodings

* add calls to format_encoding_str so the encoding is checked before use (see the sketch after this list)

* add formatting to detect_filetype()

* explicitly provide a default value for a null encoding parameter

* fix the format of the annotated encodings list

* add Hebrew base64 test file

* small lint fixes

* update changelog

* bump version to -dev2
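As a quick illustration of the failure mode being fixed (this sketch is not part of the commit; it only relies on the `format_encoding_str` helper shown in the diffs below): Python's codec registry has no entry for directionally annotated charset names such as `iso-8859-8-i`, so decoding with one of them raised a `LookupError` until the name is normalized.

```python
# Minimal sketch (not from the commit): why annotated charset names need normalizing.
import codecs

from unstructured.file_utils.encoding import format_encoding_str

annotated = "ISO-8859-8-I"  # Hebrew charset with a directional ("implicit") suffix

# codecs.lookup(annotated) would raise LookupError (unknown encoding)
normalized = format_encoding_str(annotated)  # -> "iso-8859-8"
codecs.lookup(normalized)  # resolves to the Hebrew codec without error
```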
Author: shreyanid (committed by GitHub)
Date: 2023-06-27 18:15:02 -07:00
Parent: 58e988e110
Commit: 433d6af1bc
7 changed files with 44 additions and 17 deletions

View File

@@ -1,4 +1,4 @@
-## 0.7.10-dev1
+## 0.7.10-dev2
 
 ### Enhancements
@@ -9,6 +9,7 @@
 ### Fixes
 
 * Fix pre tag parsing for `partition_html`
+* Fix lookup error for annotated Arabic and Hebrew encodings
 
 ## 0.7.9

View File

@@ -0,0 +1 @@
+8uPr5e8gVGVzbGEgLSDw4/j5+iDn+unu5CDy7CDk4ff55CDs+On55e0g
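For orientation only (not part of the commit): the new example doc above is stored as base64. A rough sketch of how bytes like these would be turned back into Hebrew text, assuming the underlying bytes are iso-8859-8 encoded:

```python
# Illustrative sketch: decode the base64 fixture, assuming iso-8859-8 bytes underneath.
import base64

b64_fixture = "8uPr5e8gVGVzbGEgLSDw4/j5+iDn+unu5CDy7CDk4ff55CDs+On55e0g"
raw_bytes = base64.b64decode(b64_fixture)  # the original file bytes
text = raw_bytes.decode("iso-8859-8")      # right-to-left Hebrew text
```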

View File

@@ -87,6 +87,11 @@ def test_replace_mime_encodings_works_with_different_encodings():
     assert core.replace_mime_encodings(text=text, encoding="latin-1") == "5 wâ\x80-99sâ\x80-92"
 
 
+def test_replace_mime_encodings_works_with_right_to_left_encodings():
+    text = "=EE=E0=E9=E4"
+    assert core.replace_mime_encodings(text=text, encoding="iso-8859-8") == "מאיה"
+
+
 @pytest.mark.parametrize(
     ("text", "expected"),
     [

View File

@@ -1 +1 @@
-__version__ = "0.7.10-dev1"  # pragma: no cover
+__version__ = "0.7.10-dev2"  # pragma: no cover

View File

@@ -3,6 +3,9 @@ import re
 import sys
 import unicodedata
 
+from unstructured.file_utils.encoding import (
+    format_encoding_str,
+)
 from unstructured.nlp.patterns import (
     DOUBLE_PARAGRAPH_PATTERN_RE,
     PARAGRAPH_PATTERN,
@@ -194,7 +197,8 @@ def replace_mime_encodings(text: str, encoding: str = "utf-8") -> str:
     -------
     5 w=E2=80-99s -> 5 ws
     """
-    return quopri.decodestring(text.encode(encoding)).decode(encoding)
+    formatted_encoding = format_encoding_str(encoding)
+    return quopri.decodestring(text.encode(formatted_encoding)).decode(formatted_encoding)
 
 
 def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
@@ -264,4 +268,5 @@ def bytes_string_to_string(text: str, encoding: str = "utf-8"):
     """Converts a string representation of a byte string to a regular string using the
     specified encoding."""
     text_bytes = bytes([ord(char) for char in text])
-    return text_bytes.decode(encoding)
+    formatted_encoding = format_encoding_str(encoding)
+    return text_bytes.decode(formatted_encoding)
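A short usage sketch of the patched cleaner (not part of the commit; the annotated spelling is only illustrative): because the encoding string is normalized before `str.encode`/`bytes.decode` are called, annotated and underscore spellings now resolve to the same codec.

```python
# Sketch: both spellings now reach quopri with the same normalized codec.
from unstructured.cleaners.core import replace_mime_encodings

quoted_printable = "=EE=E0=E9=E4"  # same sample used in the new test above

replace_mime_encodings(text=quoted_printable, encoding="iso-8859-8")    # -> "מאיה"
replace_mime_encodings(text=quoted_printable, encoding="ISO_8859_8-I")  # same result after normalization
```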

View File

@@ -10,6 +10,8 @@ ENCODE_REC_THRESHOLD = 0.5
 COMMON_ENCODINGS = [
     "utf_8",
     "iso_8859_1",
+    "iso_8859_6",
+    "iso_8859_8",
     "ascii",
     "big5",
     "utf_16",
@@ -37,7 +39,14 @@ def format_encoding_str(encoding: str) -> str:
         The encoding string to be formatted (e.g., `UTF-8`, `utf_8`, `ISO-8859-1`, `iso_8859_1`,
         etc).
     """
-    return encoding.lower().replace("_", "-")
+    formatted_encoding = encoding.lower().replace("_", "-")
+
+    # Special case for Arabic and Hebrew charsets with directional annotations
+    annotated_encodings = ["iso-8859-6-i", "iso-8859-6-e", "iso-8859-8-i", "iso-8859-8-e"]
+    if formatted_encoding in annotated_encodings:
+        formatted_encoding = formatted_encoding[:-2]  # remove the annotation
+
+    return formatted_encoding
 
 
 def detect_file_encoding(
@@ -82,7 +91,9 @@ def detect_file_encoding(
     else:
         file_text = byte_data.decode(encoding)
 
-    return encoding, file_text
+    formatted_encoding = format_encoding_str(encoding)
+
+    return formatted_encoding, file_text
 
 
 def read_txt_file(
@@ -93,28 +104,28 @@ def read_txt_file(
     """Extracts document metadata from a plain text document."""
     if filename:
         if encoding:
-            with open(filename, encoding=encoding) as f:
+            formatted_encoding = format_encoding_str(encoding)
+            with open(filename, encoding=formatted_encoding) as f:
                 try:
                     file_text = f.read()
                 except (UnicodeDecodeError, UnicodeError) as error:
                     raise error
         else:
-            encoding, file_text = detect_file_encoding(filename)
+            formatted_encoding, file_text = detect_file_encoding(filename)
     elif file:
         if encoding:
+            formatted_encoding = format_encoding_str(encoding)
             try:
                 file_content = file if isinstance(file, bytes) else file.read()
                 if isinstance(file_content, bytes):
-                    file_text = file_content.decode(encoding)
+                    file_text = file_content.decode(formatted_encoding)
                 else:
                     file_text = file_content
             except (UnicodeDecodeError, UnicodeError) as error:
                 raise error
         else:
-            encoding, file_text = detect_file_encoding(file=file)
+            formatted_encoding, file_text = detect_file_encoding(file=file)
     else:
         raise FileNotFoundError("No filename was specified")
-    formatted_encoding = format_encoding_str(encoding)
     return formatted_encoding, file_text
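A hedged sketch of how the updated helpers behave (not part of the commit; the file path below is hypothetical): `format_encoding_str` is the single normalization point, and `detect_file_encoding` now hands back an already-normalized name alongside the text.

```python
# Sketch of the normalization behavior added above; "example.txt" is a hypothetical path.
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str

assert format_encoding_str("ISO_8859_6-E") == "iso-8859-6"  # directional annotation stripped
assert format_encoding_str("UTF_8") == "utf-8"              # case and underscores normalized

encoding, file_text = detect_file_encoding("example.txt")   # encoding comes back normalized
```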

View File

@@ -10,7 +10,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import Element, PageBreak
-from unstructured.file_utils.encoding import detect_file_encoding
+from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
 from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
 from unstructured.partition.common import (
     _add_element_metadata,
@@ -280,13 +280,17 @@ def detect_filetype(
         return FileType.XML
 
     elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
+        if not encoding:
+            encoding = "utf-8"
+        formatted_encoding = format_encoding_str(encoding)
+
         # NOTE(crag): for older versions of the OS libmagic package, such as is currently
         # installed on the Unstructured docker image, .json files resolve to "text/plain"
         # rather than "application/json". this corrects for that case.
-        if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
+        if _is_text_file_a_json(file=file, filename=filename, encoding=formatted_encoding):
             return FileType.JSON
 
-        if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
+        if _is_text_file_a_csv(file=file, filename=filename, encoding=formatted_encoding):
             return FileType.CSV
 
         if file and _check_eml_from_buffer(file=file) is True:
@@ -384,8 +388,8 @@ def _read_file_start_for_type_check(
         with open(filename, encoding=encoding) as f:
             file_text = f.read(4096)
     except UnicodeDecodeError:
-        encoding, _ = detect_file_encoding(filename=filename)
-        with open(filename, encoding=encoding) as f:
+        formatted_encoding, _ = detect_file_encoding(filename=filename)
+        with open(filename, encoding=formatted_encoding) as f:
             file_text = f.read(4096)
 
     return file_text
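Finally, a sketch of the detect_filetype change (not part of the commit; paths are hypothetical): when no encoding is supplied for a text-like MIME type it now defaults to utf-8, and any caller-supplied value is normalized before the JSON/CSV probes reuse it.

```python
# Sketch only; the file paths are hypothetical.
from unstructured.file_utils.filetype import detect_filetype

detect_filetype(filename="notes.txt")                           # null encoding falls back to utf-8
detect_filetype(filename="notes.txt", encoding="ISO-8859-8-I")  # normalized to iso-8859-8 first
```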