mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 13:44:05 +00:00
Adding optional encoding arg, and text_partition tests (#339)
This commit is contained in:
parent
213077e2ab
commit
64efcc0e50
3
.gitignore
vendored
3
.gitignore
vendored
@ -37,6 +37,9 @@ MANIFEST
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Pycharm
|
||||
.idea/
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
|
||||
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.5.3-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
* Add optional `encoding` argument to the `partition_(text/email/html)` functions.
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.5.3-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
BIN
example-docs/fake-text-utf-16-be.txt
Normal file
BIN
example-docs/fake-text-utf-16-be.txt
Normal file
Binary file not shown.
@ -18,13 +18,30 @@ EXPECTED_OUTPUT = [
|
||||
]
|
||||
|
||||
|
||||
def test_partition_text_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
elements = partition_text(filename=filename)
|
||||
@pytest.mark.parametrize(
|
||||
("filename", "encoding"),
|
||||
[("fake-text.txt", "utf-8"), ("fake-text.txt", None), ("fake-text-utf-16-be.txt", "utf-16-be")],
|
||||
)
|
||||
def test_partition_text_from_filename(filename, encoding):
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
||||
elements = partition_text(filename=filename, encoding=encoding)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("filename", "encoding", "error"),
|
||||
[
|
||||
("fake-text.txt", "utf-16", UnicodeDecodeError),
|
||||
("fake-text-utf-16-be.txt", "utf-16", UnicodeError),
|
||||
],
|
||||
)
|
||||
def test_partition_text_from_filename_raises_econding_error(filename, encoding, error):
|
||||
with pytest.raises(error):
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
||||
partition_text(filename=filename, encoding=encoding)
|
||||
|
||||
|
||||
def test_partition_text_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename) as f:
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.5.3-dev0" # pragma: no cover
|
||||
__version__ = "0.5.3-dev1" # pragma: no cover
|
||||
|
||||
@ -83,7 +83,13 @@ class XMLDocument(Document):
|
||||
return doc
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, filename, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
|
||||
with open(filename, "r+", encoding="utf8") as f:
|
||||
def from_file(
|
||||
cls,
|
||||
filename,
|
||||
parser: VALID_PARSERS = None,
|
||||
stylesheet: Optional[str] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
):
|
||||
with open(filename, "r+", encoding=encoding) as f:
|
||||
content = f.read()
|
||||
return cls.from_string(content, parser=parser, stylesheet=stylesheet)
|
||||
|
||||
@ -139,6 +139,7 @@ def partition_email(
|
||||
file: Optional[IO] = None,
|
||||
text: Optional[str] = None,
|
||||
content_source: str = "text/html",
|
||||
encoding: Optional[str] = None,
|
||||
include_headers: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Partitions an .eml document into its constituent elements.
|
||||
@ -153,7 +154,12 @@ def partition_email(
|
||||
content_source
|
||||
default: "text/html"
|
||||
other: "text/plain"
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
"""
|
||||
if not encoding:
|
||||
encoding = "utf-8"
|
||||
|
||||
if content_source not in VALID_CONTENT_SOURCES:
|
||||
raise ValueError(
|
||||
f"{content_source} is not a valid value for content_source. "
|
||||
@ -170,7 +176,7 @@ def partition_email(
|
||||
elif file is not None and not filename and not text:
|
||||
file_content = file.read()
|
||||
if isinstance(file_content, bytes):
|
||||
file_text = file_content.decode("utf-8")
|
||||
file_text = file_content.decode(encoding)
|
||||
else:
|
||||
file_text = file_content
|
||||
|
||||
|
||||
@ -13,6 +13,7 @@ def partition_html(
|
||||
file: Optional[IO] = None,
|
||||
text: Optional[str] = None,
|
||||
url: Optional[str] = None,
|
||||
encoding: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
include_metadata: bool = True,
|
||||
parser: VALID_PARSERS = None,
|
||||
@ -29,6 +30,8 @@ def partition_html(
|
||||
The string representation of the HTML document.
|
||||
url
|
||||
The URL of a webpage to parse. Only for URLs that return an HTML document.
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
include_page_breaks
|
||||
If True, includes page breaks at the end of each page in the document.
|
||||
include_metadata
|
||||
@ -40,13 +43,16 @@ def partition_html(
|
||||
if not any([filename, file, text, url]):
|
||||
raise ValueError("One of filename, file, or text must be specified.")
|
||||
|
||||
if not encoding:
|
||||
encoding = "utf-8"
|
||||
|
||||
if filename is not None and not file and not text and not url:
|
||||
document = HTMLDocument.from_file(filename, parser=parser)
|
||||
document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
|
||||
|
||||
elif file is not None and not filename and not text and not url:
|
||||
file_content = file.read()
|
||||
if isinstance(file_content, bytes):
|
||||
file_text = file_content.decode("utf-8")
|
||||
file_text = file_content.decode(encoding)
|
||||
else:
|
||||
file_text = file_content
|
||||
|
||||
|
||||
@ -28,6 +28,7 @@ def partition_text(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
text: Optional[str] = None,
|
||||
encoding: Optional[str] = "utf-8",
|
||||
) -> List[Element]:
|
||||
"""Partitions a .txt document into its constituent elements.
|
||||
Parameters
|
||||
@ -38,14 +39,19 @@ def partition_text(
|
||||
A file-like object using "r" mode --> open(filename, "r").
|
||||
text
|
||||
The string representation of the .txt document.
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
"""
|
||||
|
||||
if not any([filename, file, text]):
|
||||
raise ValueError("One of filename, file, or text must be specified.")
|
||||
|
||||
if filename is not None and not file and not text:
|
||||
with open(filename, encoding="utf8") as f:
|
||||
file_text = f.read()
|
||||
with open(filename, encoding=encoding) as f:
|
||||
try:
|
||||
file_text = f.read()
|
||||
except (UnicodeDecodeError, UnicodeError) as error:
|
||||
raise error
|
||||
|
||||
elif file is not None and not filename and not text:
|
||||
file_text = file.read()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user