mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
fix: format Arabic and Hebrew annotated encodings (#823)
* add modified Arabic and Hebrew encodings
* add calls to format_encoding_str so the encoding is checked before use
* add encoding formatting to detect_filetype()
* explicitly provide a default value for a null encoding parameter
* fix format of annotated encodings list
* add Hebrew base64 test file
* small lint fixes
* update changelog
* bump version to 0.7.10-dev2
This commit is contained in:
parent
58e988e110
commit
433d6af1bc
@ -1,4 +1,4 @@
|
||||
## 0.7.10-dev1
|
||||
## 0.7.10-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
### Fixes
|
||||
|
||||
* Fix pre tag parsing for `partition_html`
|
||||
* Fix lookup error for annotated Arabic and Hebrew encodings
|
||||
|
||||
## 0.7.9
|
||||
|
||||
|
||||
1
example-docs/hebrew-text-base64-iso88598i.txt
Normal file
1
example-docs/hebrew-text-base64-iso88598i.txt
Normal file
@ -0,0 +1 @@
|
||||
8uPr5e8gVGVzbGEgLSDw4/j5+iDn+unu5CDy7CDk4ff55CDs+On55e0g
|
||||
@ -87,6 +87,11 @@ def test_replace_mime_encodings_works_with_different_encodings():
|
||||
assert core.replace_mime_encodings(text=text, encoding="latin-1") == "5 wâ\x80-99sâ\x80-92"
|
||||
|
||||
|
||||
def test_replace_mime_encodings_works_with_right_to_left_encodings():
|
||||
text = "=EE=E0=E9=E4"
|
||||
assert core.replace_mime_encodings(text=text, encoding="iso-8859-8") == "מאיה"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("text", "expected"),
|
||||
[
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.7.10-dev1" # pragma: no cover
|
||||
__version__ = "0.7.10-dev2" # pragma: no cover
|
||||
|
||||
@ -3,6 +3,9 @@ import re
|
||||
import sys
|
||||
import unicodedata
|
||||
|
||||
from unstructured.file_utils.encoding import (
|
||||
format_encoding_str,
|
||||
)
|
||||
from unstructured.nlp.patterns import (
|
||||
DOUBLE_PARAGRAPH_PATTERN_RE,
|
||||
PARAGRAPH_PATTERN,
|
||||
@ -194,7 +197,8 @@ def replace_mime_encodings(text: str, encoding: str = "utf-8") -> str:
|
||||
-------
|
||||
5 w=E2=80-99s -> 5 w’s
|
||||
"""
|
||||
return quopri.decodestring(text.encode(encoding)).decode(encoding)
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
return quopri.decodestring(text.encode(formatted_encoding)).decode(formatted_encoding)
|
||||
|
||||
|
||||
def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
|
||||
@ -264,4 +268,5 @@ def bytes_string_to_string(text: str, encoding: str = "utf-8"):
|
||||
"""Converts a string representation of a byte string to a regular string using the
|
||||
specified encoding."""
|
||||
text_bytes = bytes([ord(char) for char in text])
|
||||
return text_bytes.decode(encoding)
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
return text_bytes.decode(formatted_encoding)
|
||||
|
||||
@ -10,6 +10,8 @@ ENCODE_REC_THRESHOLD = 0.5
|
||||
COMMON_ENCODINGS = [
|
||||
"utf_8",
|
||||
"iso_8859_1",
|
||||
"iso_8859_6",
|
||||
"iso_8859_8",
|
||||
"ascii",
|
||||
"big5",
|
||||
"utf_16",
|
||||
@ -37,7 +39,14 @@ def format_encoding_str(encoding: str) -> str:
|
||||
The encoding string to be formatted (e.g., `UTF-8`, `utf_8`, `ISO-8859-1`, `iso_8859_1`,
|
||||
etc).
|
||||
"""
|
||||
return encoding.lower().replace("_", "-")
|
||||
formatted_encoding = encoding.lower().replace("_", "-")
|
||||
|
||||
# Special case: Arabic and Hebrew charset names may carry a trailing directional annotation ("-i" or "-e"); strip it to get the base charset name
|
||||
annotated_encodings = ["iso-8859-6-i", "iso-8859-6-e", "iso-8859-8-i", "iso-8859-8-e"]
|
||||
if formatted_encoding in annotated_encodings:
|
||||
formatted_encoding = formatted_encoding[:-2] # remove the annotation
|
||||
|
||||
return formatted_encoding
|
||||
|
||||
|
||||
def detect_file_encoding(
|
||||
@ -82,7 +91,9 @@ def detect_file_encoding(
|
||||
else:
|
||||
file_text = byte_data.decode(encoding)
|
||||
|
||||
return encoding, file_text
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
|
||||
return formatted_encoding, file_text
|
||||
|
||||
|
||||
def read_txt_file(
|
||||
@ -93,28 +104,28 @@ def read_txt_file(
|
||||
"""Extracts document metadata from a plain text document."""
|
||||
if filename:
|
||||
if encoding:
|
||||
with open(filename, encoding=encoding) as f:
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
with open(filename, encoding=formatted_encoding) as f:
|
||||
try:
|
||||
file_text = f.read()
|
||||
except (UnicodeDecodeError, UnicodeError) as error:
|
||||
raise error
|
||||
else:
|
||||
encoding, file_text = detect_file_encoding(filename)
|
||||
formatted_encoding, file_text = detect_file_encoding(filename)
|
||||
elif file:
|
||||
if encoding:
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
try:
|
||||
file_content = file if isinstance(file, bytes) else file.read()
|
||||
if isinstance(file_content, bytes):
|
||||
file_text = file_content.decode(encoding)
|
||||
file_text = file_content.decode(formatted_encoding)
|
||||
else:
|
||||
file_text = file_content
|
||||
except (UnicodeDecodeError, UnicodeError) as error:
|
||||
raise error
|
||||
else:
|
||||
encoding, file_text = detect_file_encoding(file=file)
|
||||
formatted_encoding, file_text = detect_file_encoding(file=file)
|
||||
else:
|
||||
raise FileNotFoundError("No filename was specified")
|
||||
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
|
||||
return formatted_encoding, file_text
|
||||
|
||||
@ -10,7 +10,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
|
||||
|
||||
from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import Element, PageBreak
|
||||
from unstructured.file_utils.encoding import detect_file_encoding
|
||||
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
|
||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
||||
from unstructured.partition.common import (
|
||||
_add_element_metadata,
|
||||
@ -280,13 +280,17 @@ def detect_filetype(
|
||||
return FileType.XML
|
||||
|
||||
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
|
||||
if not encoding:
|
||||
encoding = "utf-8"
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
|
||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||
# rather than "application/json". this corrects for that case.
|
||||
if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
|
||||
if _is_text_file_a_json(file=file, filename=filename, encoding=formatted_encoding):
|
||||
return FileType.JSON
|
||||
|
||||
if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
|
||||
if _is_text_file_a_csv(file=file, filename=filename, encoding=formatted_encoding):
|
||||
return FileType.CSV
|
||||
|
||||
if file and _check_eml_from_buffer(file=file) is True:
|
||||
@ -384,8 +388,8 @@ def _read_file_start_for_type_check(
|
||||
with open(filename, encoding=encoding) as f:
|
||||
file_text = f.read(4096)
|
||||
except UnicodeDecodeError:
|
||||
encoding, _ = detect_file_encoding(filename=filename)
|
||||
with open(filename, encoding=encoding) as f:
|
||||
formatted_encoding, _ = detect_file_encoding(filename=filename)
|
||||
with open(filename, encoding=formatted_encoding) as f:
|
||||
file_text = f.read(4096)
|
||||
return file_text
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user