mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Issue/unicode error (#608)
This PR adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
This commit is contained in:
parent
a78719666a
commit
a1fed6d4c6
@ -1,4 +1,4 @@
|
||||
## 0.6.9-dev1
|
||||
## 0.6.9-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
|
||||
* Adds additional MIME types for CSV
|
||||
|
||||
## 0.6.8
|
||||
|
BIN
example-docs/fake-text-utf-16-le.txt
Normal file
BIN
example-docs/fake-text-utf-16-le.txt
Normal file
Binary file not shown.
BIN
example-docs/fake-text-utf-16.txt
Normal file
BIN
example-docs/fake-text-utf-16.txt
Normal file
Binary file not shown.
BIN
example-docs/fake-text-utf-32.txt
Normal file
BIN
example-docs/fake-text-utf-32.txt
Normal file
Binary file not shown.
@ -18,6 +18,8 @@ certifi==2022.12.7
|
||||
# unstructured (setup.py)
|
||||
cffi==1.15.1
|
||||
# via cryptography
|
||||
chardet==5.1.0
|
||||
# via unstructured (setup.py)
|
||||
charset-normalizer==3.1.0
|
||||
# via
|
||||
# pdfminer-six
|
||||
|
@ -35,6 +35,8 @@ cffi==1.15.1
|
||||
# via argon2-cffi-bindings
|
||||
cfgv==3.3.1
|
||||
# via pre-commit
|
||||
chardet==5.1.0
|
||||
# via -r requirements/dev.in
|
||||
click==8.1.3
|
||||
# via pip-tools
|
||||
comm==0.1.3
|
||||
|
1
setup.py
1
setup.py
@ -51,6 +51,7 @@ setup(
|
||||
},
|
||||
install_requires=[
|
||||
"argilla",
|
||||
"chardet",
|
||||
"lxml",
|
||||
"msg_parser",
|
||||
"nltk",
|
||||
|
@ -30,6 +30,17 @@ def test_partition_text_from_filename(filename, encoding):
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_filename_default_encoding(filename):
    """Partition each fake-text-utf-* example document by path, passing no encoding."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    partitioned = partition_text(filename=example_path)

    # The result must be non-empty and match the reference elements exactly.
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("filename", "encoding", "error"),
|
||||
[
|
||||
@ -51,6 +62,18 @@ def test_partition_text_from_file():
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_file_default_encoding(filename):
    """Partition each fake-text-utf-* example document from a text-mode handle, passing no encoding."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # The handle is opened in text mode on purpose; no encoding is given to partition_text.
    with open(example_path) as text_handle:
        partitioned = partition_text(file=text_handle)

    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_text_from_bytes_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename, "rb") as f:
|
||||
@ -59,6 +82,18 @@ def test_partition_text_from_bytes_file():
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_bytes_file_default_encoding(filename):
    """Partition each fake-text-utf-* example document from a binary handle, passing no encoding."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # Binary mode: partition_text receives raw bytes and no encoding hint.
    with open(example_path, "rb") as binary_handle:
        partitioned = partition_text(file=binary_handle)

    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_text_from_text():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename) as f:
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.6.9-dev1" # pragma: no cover
|
||||
__version__ = "0.6.9-dev2" # pragma: no cover
|
||||
|
103
unstructured/file_utils/encoding.py
Normal file
103
unstructured/file_utils/encoding.py
Normal file
@ -0,0 +1,103 @@
|
||||
from typing import IO, Optional, Tuple
|
||||
|
||||
import chardet
|
||||
|
||||
# Minimum chardet confidence for a detected encoding to be accepted; below this
# value detect_file_encoding falls back to trying COMMON_ENCODINGS.
ENCODE_REC_THRESHOLD = 0.5

# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
# Tried in list order; the first encoding that decodes without error is used.
# NOTE(review): iso_8859_1 maps every possible byte value to a character, so
# decoding with it should never fail -- in a first-success loop the entries
# listed after it would then never be reached. Confirm this ordering is intended.
COMMON_ENCODINGS = [
    "utf_8",
    "iso_8859_1",
    "ascii",
    "big5",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_32",
    "utf_32_be",
    "utf_32_le",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
]
|
||||
|
||||
|
||||
def detect_file_encoding(filename: str = "", file: Optional[IO] = None) -> Tuple[str, str]:
    """Detect the encoding of a text document and return it with the decoded text.

    The raw bytes of the document are passed to ``chardet.detect``. If chardet
    reports no encoding, or a confidence below ``ENCODE_REC_THRESHOLD``, every
    entry of ``COMMON_ENCODINGS`` is tried in order and the first one that
    decodes the document without error is used instead.

    Args:
        filename: Path of the document. Takes precedence over ``file``.
        file: Open file object, used only when ``filename`` is empty. A
            text-mode handle is not read directly; the file it names
            (``file.name``) is re-opened in binary mode. Any other stream is
            read as-is and is expected to yield bytes.

    Returns:
        A ``(encoding, text)`` tuple: the encoding that was used and the
        decoded document text.

    Raises:
        FileNotFoundError: If neither ``filename`` nor ``file`` is given.
        UnicodeDecodeError: If detection fails and no common encoding can
            decode the document, or if the encoding reported by chardet
            cannot decode it.
    """
    if filename:
        with open(filename, "rb") as f:
            binary_data = f.read()
    elif file:
        # Streams such as io.BytesIO have no ``mode`` attribute; only a handle
        # that explicitly declares a text mode needs to be re-opened as binary.
        mode = getattr(file, "mode", None)
        if isinstance(mode, str) and "b" not in mode:
            with open(file.name, "rb") as f:
                binary_data = f.read()
        else:
            binary_data = file.read()
    else:
        raise FileNotFoundError("No filename nor file were specified")

    result = chardet.detect(binary_data)
    encoding = result["encoding"]
    confidence = result["confidence"]

    if encoding is None or confidence < ENCODE_REC_THRESHOLD:
        # Encoding detection failed, fallback to predefined encodings
        for enc in COMMON_ENCODINGS:
            try:
                if filename:
                    with open(filename, encoding=enc) as f:
                        file_text = f.read()
                else:
                    # No path to re-open when only ``file`` was supplied:
                    # decode the bytes that were already read.
                    file_text = binary_data.decode(enc)
            except UnicodeError:
                # Covers UnicodeDecodeError too; try the next candidate.
                continue
            encoding = enc
            break
        else:
            raise UnicodeDecodeError(
                "Unable to determine the encoding of the file or match it with any "
                "of the specified encodings.",
                binary_data,
                0,
                len(binary_data),
                "Invalid encoding",
            )
    else:
        file_text = binary_data.decode(encoding)

    return encoding, file_text
|
||||
|
||||
|
||||
def read_txt_file(
    filename: str = "",
    file: Optional[IO] = None,
    encoding: Optional[str] = None,
) -> Tuple[str, str]:
    """Read a plain text document and return its encoding and decoded text.

    When ``encoding`` is given it is used as-is and any decoding error
    propagates to the caller. When it is not given, the encoding is determined
    by ``detect_file_encoding``.

    Args:
        filename: Path of the document. Takes precedence over ``file``.
        file: Open file object, used only when ``filename`` is empty. It may
            yield either bytes (decoded with ``encoding``) or text (returned
            unchanged).
        encoding: Encoding to decode with, or ``None`` to auto-detect.

    Returns:
        A ``(encoding, text)`` tuple: the encoding that was used and the
        document text.

    Raises:
        FileNotFoundError: If neither ``filename`` nor ``file`` is given.
        UnicodeDecodeError: If the document cannot be decoded with the
            requested encoding.
    """
    if filename:
        if encoding:
            with open(filename, encoding=encoding) as f:
                file_text = f.read()
        else:
            encoding, file_text = detect_file_encoding(filename)
    elif file:
        if encoding:
            file_content = file.read()
            # Binary handles return bytes and need decoding; text handles
            # already return str.
            if isinstance(file_content, bytes):
                file_text = file_content.decode(encoding)
            else:
                file_text = file_content
        else:
            encoding, file_text = detect_file_encoding(file=file)
    else:
        raise FileNotFoundError("No filename was specified")

    return encoding, file_text
|
@ -11,6 +11,7 @@ from unstructured.documents.elements import (
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||
from unstructured.partition.common import exactly_one
|
||||
@ -31,7 +32,7 @@ def partition_text(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
text: Optional[str] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
encoding: Optional[str] = None,
|
||||
paragraph_grouper: Optional[Callable[[str], str]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
include_metadata: bool = True,
|
||||
@ -60,16 +61,10 @@ def partition_text(
|
||||
exactly_one(filename=filename, file=file, text=text)
|
||||
|
||||
if filename is not None:
|
||||
with open(filename, encoding=encoding) as f:
|
||||
try:
|
||||
file_text = f.read()
|
||||
except (UnicodeDecodeError, UnicodeError) as error:
|
||||
raise error
|
||||
encoding, file_text = read_txt_file(filename=filename, encoding=encoding)
|
||||
|
||||
elif file is not None:
|
||||
file_text = file.read()
|
||||
if isinstance(file_text, bytes):
|
||||
file_text = file_text.decode(encoding)
|
||||
encoding, file_text = read_txt_file(file=file, encoding=encoding)
|
||||
|
||||
elif text is not None:
|
||||
file_text = str(text)
|
||||
|
Loading…
x
Reference in New Issue
Block a user