Issue/unicode error (#608)

This PR adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
This commit is contained in:
Christine Straub 2023-05-23 15:35:38 -05:00 committed by GitHub
parent a78719666a
commit a1fed6d4c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 150 additions and 11 deletions

View File

@ -1,4 +1,4 @@
## 0.6.9-dev1
## 0.6.9-dev2
### Enhancements
@ -8,6 +8,7 @@
### Fixes
* Adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
* Adds additional MIME types for CSV
## 0.6.8

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -18,6 +18,8 @@ certifi==2022.12.7
# unstructured (setup.py)
cffi==1.15.1
# via cryptography
chardet==5.1.0
# via unstructured (setup.py)
charset-normalizer==3.1.0
# via
# pdfminer-six

View File

@ -35,6 +35,8 @@ cffi==1.15.1
# via argon2-cffi-bindings
cfgv==3.3.1
# via pre-commit
chardet==5.1.0
# via -r requirements/dev.in
click==8.1.3
# via pip-tools
comm==0.1.3

View File

@ -51,6 +51,7 @@ setup(
},
install_requires=[
"argilla",
"chardet",
"lxml",
"msg_parser",
"nltk",

View File

@ -30,6 +30,17 @@ def test_partition_text_from_filename(filename, encoding):
assert elements == EXPECTED_OUTPUT
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_filename_default_encoding(filename):
    """Partition non-UTF-8 sample documents by path without passing an encoding."""
    document_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    parsed = partition_text(filename=document_path)

    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
@pytest.mark.parametrize(
("filename", "encoding", "error"),
[
@ -51,6 +62,18 @@ def test_partition_text_from_file():
assert elements == EXPECTED_OUTPUT
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_file_default_encoding(filename):
    """Partition non-UTF-8 sample documents from a text-mode handle, no encoding given."""
    document_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # The handle is opened in text mode on purpose: this exercises the text-handle path.
    with open(document_path) as text_handle:
        parsed = partition_text(file=text_handle)

    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
def test_partition_text_from_bytes_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "rb") as f:
@ -59,6 +82,18 @@ def test_partition_text_from_bytes_file():
assert elements == EXPECTED_OUTPUT
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_bytes_file_default_encoding(filename):
    """Partition non-UTF-8 sample documents from a binary handle, no encoding given."""
    document_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(document_path, "rb") as binary_handle:
        parsed = partition_text(file=binary_handle)

    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
def test_partition_text_from_text():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename) as f:

View File

@ -1 +1 @@
__version__ = "0.6.9-dev1" # pragma: no cover
__version__ = "0.6.9-dev2" # pragma: no cover

View File

@ -0,0 +1,103 @@
from typing import IO, Optional, Tuple
import chardet
# Minimum chardet confidence for its guess to be used; below this value the
# encodings listed in COMMON_ENCODINGS are tried instead.
ENCODE_REC_THRESHOLD = 0.5

# Fallback encodings, tried in the order listed.
# Popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
# NOTE(review): "iso_8859_1" maps every byte value to a character, so decoding with
# it never fails; in a first-successful-decode loop the entries after it are not
# reached. Confirm whether this ordering is intentional before relying on them.
COMMON_ENCODINGS = [
    # Unicode (8-bit), Western and plain ASCII
    "utf_8", "iso_8859_1", "ascii",
    # Traditional Chinese
    "big5",
    # Unicode (16-bit and 32-bit)
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_32", "utf_32_be", "utf_32_le",
    # Japanese / Korean EUC families
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    # Simplified Chinese
    "gb18030",
    # Japanese Shift JIS families
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
]
def detect_file_encoding(filename: str = "", file: Optional[IO] = None) -> Tuple[str, str]:
    """Detect the encoding of a text file and return it with the decoded text.

    The raw bytes are read once, from ``filename`` if given, otherwise from ``file``.
    ``chardet`` is asked for a guess first. If it has no guess, if its confidence is
    below ``ENCODE_REC_THRESHOLD``, or if its guess cannot decode the data, each entry
    of ``COMMON_ENCODINGS`` is tried in order and the first one that decodes wins.

    All candidates decode the same in-memory bytes, so no newline translation is
    applied on any path.

    Args:
        filename: Path of the file to read. Takes precedence over ``file``.
        file: An open file object. A text-mode handle is re-read in binary mode
            through its ``name``; any other handle is read directly and is expected
            to yield bytes.

    Returns:
        A ``(encoding, file_text)`` tuple.

    Raises:
        FileNotFoundError: If neither ``filename`` nor ``file`` is provided.
        UnicodeDecodeError: If no candidate encoding can decode the data.
    """
    if filename:
        with open(filename, "rb") as f:
            binary_data = f.read()
    elif file is not None:
        # In-memory streams may have no ``mode`` and some file types expose a
        # non-string one, so only a string mode without "b" counts as text mode.
        mode = getattr(file, "mode", None)
        if isinstance(mode, str) and "b" not in mode:
            with open(file.name, "rb") as f:
                binary_data = f.read()
        else:
            binary_data = file.read()
    else:
        raise FileNotFoundError("No filename nor file were specified")

    result = chardet.detect(binary_data)
    detected = result["encoding"]
    confidence = result["confidence"]

    # A sufficiently confident guess is tried first, followed by the predefined list.
    candidates = []
    if detected is not None and confidence >= ENCODE_REC_THRESHOLD:
        candidates.append(detected)
    candidates.extend(COMMON_ENCODINGS)

    for enc in candidates:
        try:
            # Decode the bytes already in memory: ``filename`` is empty when only a
            # file object was supplied, so the file cannot be re-opened by path.
            return enc, binary_data.decode(enc)
        except (UnicodeError, LookupError):
            # LookupError covers a detected codec name that Python does not provide.
            continue

    # NOTE: the first positional argument of UnicodeDecodeError is nominally the codec
    # name; the message is passed there to keep the existing error text unchanged.
    raise UnicodeDecodeError(
        "Unable to determine the encoding of the file or match it with any "
        "of the specified encodings.",
        binary_data,
        0,
        len(binary_data),
        "Invalid encoding",
    )
def read_txt_file(
    filename: str = "",
    file: Optional[IO] = None,
    encoding: Optional[str] = None,
) -> Tuple[str, str]:
    """Read a plain text document and return its encoding and decoded text.

    When ``encoding`` is given it is used as-is and any decoding error propagates
    to the caller. When it is not given, the encoding is determined by
    ``detect_file_encoding``.

    Args:
        filename: Path of the file to read. Takes precedence over ``file``.
        file: An open file object. If reading it yields bytes they are decoded with
            ``encoding``; if it yields ``str`` the text is returned unchanged.
        encoding: Encoding to decode with, or ``None`` to detect it.

    Returns:
        A ``(encoding, file_text)`` tuple.

    Raises:
        FileNotFoundError: If neither ``filename`` nor ``file`` is provided.
        UnicodeDecodeError: If the data cannot be decoded with ``encoding``.
    """
    if filename:
        if encoding:
            with open(filename, encoding=encoding) as f:
                file_text = f.read()
        else:
            encoding, file_text = detect_file_encoding(filename)
    elif file is not None:
        if encoding:
            file_content = file.read()
            if isinstance(file_content, bytes):
                file_text = file_content.decode(encoding)
            else:
                # Text-mode handles already return decoded text.
                file_text = file_content
        else:
            encoding, file_text = detect_file_encoding(file=file)
    else:
        raise FileNotFoundError("No filename was specified")

    return encoding, file_text

View File

@ -11,6 +11,7 @@ from unstructured.documents.elements import (
Text,
Title,
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition.common import exactly_one
@ -31,7 +32,7 @@ def partition_text(
filename: Optional[str] = None,
file: Optional[IO] = None,
text: Optional[str] = None,
encoding: Optional[str] = "utf-8",
encoding: Optional[str] = None,
paragraph_grouper: Optional[Callable[[str], str]] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
@ -60,16 +61,10 @@ def partition_text(
exactly_one(filename=filename, file=file, text=text)
if filename is not None:
with open(filename, encoding=encoding) as f:
try:
file_text = f.read()
except (UnicodeDecodeError, UnicodeError) as error:
raise error
encoding, file_text = read_txt_file(filename=filename, encoding=encoding)
elif file is not None:
file_text = file.read()
if isinstance(file_text, bytes):
file_text = file_text.decode(encoding)
encoding, file_text = read_txt_file(file=file, encoding=encoding)
elif text is not None:
file_text = str(text)