Adding optional encoding arg, and text_partition tests (#339)

This commit is contained in:
Amanda Cameron 2023-03-06 15:07:33 -08:00 committed by GitHub
parent 213077e2ab
commit 64efcc0e50
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 65 additions and 11 deletions

3
.gitignore vendored
View File

@ -37,6 +37,9 @@ MANIFEST
pip-log.txt
pip-delete-this-directory.txt
# Pycharm
.idea/
# Unit test / coverage reports
htmlcov/
.tox/

View File

@ -1,3 +1,13 @@
## 0.5.3-dev1
### Enhancements
### Features
* Add optional `encoding` argument to the `partition_(text/email/html)` functions.
### Fixes
## 0.5.3-dev0
### Enhancements

Binary file not shown.

View File

@ -18,13 +18,30 @@ EXPECTED_OUTPUT = [
]
def test_partition_text_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
elements = partition_text(filename=filename)
@pytest.mark.parametrize(
    ("filename", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_from_filename(filename, encoding):
    """Partitioning each example file with the given encoding yields EXPECTED_OUTPUT."""
    # Example documents live two directories above this test module.
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition_text(filename=path, encoding=encoding)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("fake-text.txt", "utf-16", UnicodeDecodeError),
        ("fake-text-utf-16-be.txt", "utf-16", UnicodeError),
    ],
)
def test_partition_text_from_filename_raises_econding_error(filename, encoding, error):
    """Reading an example file with a mismatched encoding raises the expected error."""
    # Build the path outside the `raises` context so that only the
    # partition_text call itself can satisfy the expected exception.
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with pytest.raises(error):
        partition_text(filename=path, encoding=encoding)
def test_partition_text_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename) as f:

View File

@ -1 +1 @@
__version__ = "0.5.3-dev0" # pragma: no cover
__version__ = "0.5.3-dev1" # pragma: no cover

View File

@ -83,7 +83,13 @@ class XMLDocument(Document):
return doc
@classmethod
def from_file(cls, filename, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
with open(filename, "r+", encoding="utf8") as f:
def from_file(
    cls,
    filename,
    parser: VALID_PARSERS = None,
    stylesheet: Optional[str] = None,
    encoding: Optional[str] = "utf-8",
):
    """Read a file as text and build a document from its contents.

    Parameters
    ----------
    filename
        Path of the file to read.
    parser
        Forwarded unchanged to ``cls.from_string``.
    stylesheet
        Forwarded unchanged to ``cls.from_string``.
    encoding
        Encoding passed to ``open``. Defaults to utf-8. If None is passed,
        ``open`` falls back to the platform's default encoding.

    Returns
    -------
    Whatever ``cls.from_string`` returns for the file's text.
    """
    # The file is only read, so open it read-only: "r+" also requests write
    # access and fails on files the process may read but not modify.
    with open(filename, "r", encoding=encoding) as f:
        content = f.read()
    return cls.from_string(content, parser=parser, stylesheet=stylesheet)

View File

@ -139,6 +139,7 @@ def partition_email(
file: Optional[IO] = None,
text: Optional[str] = None,
content_source: str = "text/html",
encoding: Optional[str] = None,
include_headers: bool = False,
) -> List[Element]:
"""Partitions an .eml document into its constituent elements.
@ -153,7 +154,12 @@ def partition_email(
content_source
default: "text/html"
other: "text/plain"
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
"""
if not encoding:
encoding = "utf-8"
if content_source not in VALID_CONTENT_SOURCES:
raise ValueError(
f"{content_source} is not a valid value for content_source. "
@ -170,7 +176,7 @@ def partition_email(
elif file is not None and not filename and not text:
file_content = file.read()
if isinstance(file_content, bytes):
file_text = file_content.decode("utf-8")
file_text = file_content.decode(encoding)
else:
file_text = file_content

View File

@ -13,6 +13,7 @@ def partition_html(
file: Optional[IO] = None,
text: Optional[str] = None,
url: Optional[str] = None,
encoding: Optional[str] = None,
include_page_breaks: bool = False,
include_metadata: bool = True,
parser: VALID_PARSERS = None,
@ -29,6 +30,8 @@ def partition_html(
The string representation of the HTML document.
url
The URL of a webpage to parse. Only for URLs that return an HTML document.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
include_page_breaks
If True, includes page breaks at the end of each page in the document.
include_metadata
@ -40,13 +43,16 @@ def partition_html(
if not any([filename, file, text, url]):
raise ValueError("One of filename, file, or text must be specified.")
if not encoding:
encoding = "utf-8"
if filename is not None and not file and not text and not url:
document = HTMLDocument.from_file(filename, parser=parser)
document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
elif file is not None and not filename and not text and not url:
file_content = file.read()
if isinstance(file_content, bytes):
file_text = file_content.decode("utf-8")
file_text = file_content.decode(encoding)
else:
file_text = file_content

View File

@ -28,6 +28,7 @@ def partition_text(
filename: Optional[str] = None,
file: Optional[IO] = None,
text: Optional[str] = None,
encoding: Optional[str] = "utf-8",
) -> List[Element]:
"""Partitions a .txt document into its constituent elements.
Parameters
@ -38,14 +39,19 @@ def partition_text(
A file-like object using "r" mode --> open(filename, "r").
text
The string representation of the .txt document.
encoding
The encoding used to open the file given by filename. Defaults to utf-8. If None is
passed explicitly, open() falls back to the platform's default encoding.
"""
if not any([filename, file, text]):
raise ValueError("One of filename, file, or text must be specified.")
if filename is not None and not file and not text:
with open(filename, encoding="utf8") as f:
file_text = f.read()
with open(filename, encoding=encoding) as f:
try:
file_text = f.read()
except (UnicodeDecodeError, UnicodeError) as error:
raise error
elif file is not None and not filename and not text:
file_text = file.read()