diff --git a/.gitignore b/.gitignore index 7fe908cca..eebafcd20 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,9 @@ MANIFEST pip-log.txt pip-delete-this-directory.txt +# Pycharm +.idea/ + # Unit test / coverage reports htmlcov/ .tox/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 500f98a86..660d09959 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.5.3-dev1 + +### Enhancements + +### Features + +* Add optional `encoding` argument to the `partition_(text/email/html)` functions. + +### Fixes + ## 0.5.3-dev0 ### Enhancements diff --git a/example-docs/fake-text-utf-16-be.txt b/example-docs/fake-text-utf-16-be.txt new file mode 100644 index 000000000..8d12b76d1 Binary files /dev/null and b/example-docs/fake-text-utf-16-be.txt differ diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py index b411a17fc..8529a3616 100644 --- a/test_unstructured/partition/test_text.py +++ b/test_unstructured/partition/test_text.py @@ -18,13 +18,30 @@ EXPECTED_OUTPUT = [ ] -def test_partition_text_from_filename(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") - elements = partition_text(filename=filename) +@pytest.mark.parametrize( + ("filename", "encoding"), + [("fake-text.txt", "utf-8"), ("fake-text.txt", None), ("fake-text-utf-16-be.txt", "utf-16-be")], +) +def test_partition_text_from_filename(filename, encoding): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + elements = partition_text(filename=filename, encoding=encoding) assert len(elements) > 0 assert elements == EXPECTED_OUTPUT +@pytest.mark.parametrize( + ("filename", "encoding", "error"), + [ + ("fake-text.txt", "utf-16", UnicodeDecodeError), + ("fake-text-utf-16-be.txt", "utf-16", UnicodeError), + ], +) +def test_partition_text_from_filename_raises_econding_error(filename, encoding, error): + with pytest.raises(error): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) + 
partition_text(filename=filename, encoding=encoding) + + def test_partition_text_from_file(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") with open(filename) as f: diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 83e11ce2f..bb2ef509d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.3-dev0" # pragma: no cover +__version__ = "0.5.3-dev1" # pragma: no cover diff --git a/unstructured/documents/xml.py b/unstructured/documents/xml.py index 571d1a23c..f610cd673 100644 --- a/unstructured/documents/xml.py +++ b/unstructured/documents/xml.py @@ -83,7 +83,13 @@ class XMLDocument(Document): return doc @classmethod - def from_file(cls, filename, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None): - with open(filename, "r+", encoding="utf8") as f: + def from_file( + cls, + filename, + parser: VALID_PARSERS = None, + stylesheet: Optional[str] = None, + encoding: Optional[str] = "utf-8", + ): + with open(filename, "r+", encoding=encoding) as f: content = f.read() return cls.from_string(content, parser=parser, stylesheet=stylesheet) diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index c9cca2813..ee87a2690 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -139,6 +139,7 @@ def partition_email( file: Optional[IO] = None, text: Optional[str] = None, content_source: str = "text/html", + encoding: Optional[str] = None, include_headers: bool = False, ) -> List[Element]: """Partitions an .eml documents into its constituent elements. @@ -153,7 +154,12 @@ def partition_email( content_source default: "text/html" other: "text/plain" + encoding + The encoding method used to decode the text input. If None, utf-8 will be used. """ + if not encoding: + encoding = "utf-8" + if content_source not in VALID_CONTENT_SOURCES: raise ValueError( f"{content_source} is not a valid value for content_source. 
" @@ -170,7 +176,7 @@ def partition_email( elif file is not None and not filename and not text: file_content = file.read() if isinstance(file_content, bytes): - file_text = file_content.decode("utf-8") + file_text = file_content.decode(encoding) else: file_text = file_content diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index 506657443..54bedc3f8 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -13,6 +13,7 @@ def partition_html( file: Optional[IO] = None, text: Optional[str] = None, url: Optional[str] = None, + encoding: Optional[str] = None, include_page_breaks: bool = False, include_metadata: bool = True, parser: VALID_PARSERS = None, @@ -29,6 +30,8 @@ def partition_html( The string representation of the HTML document. url The URL of a webpage to parse. Only for URLs that return an HTML document. + encoding + The encoding method used to decode the text input. If None, utf-8 will be used. include_page_breaks If True, includes page breaks at the end of each page in the document. 
include_metadata @@ -40,13 +43,16 @@ def partition_html( if not any([filename, file, text, url]): raise ValueError("One of filename, file, or text must be specified.") + if not encoding: + encoding = "utf-8" + if filename is not None and not file and not text and not url: - document = HTMLDocument.from_file(filename, parser=parser) + document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding) elif file is not None and not filename and not text and not url: file_content = file.read() if isinstance(file_content, bytes): - file_text = file_content.decode("utf-8") + file_text = file_content.decode(encoding) else: file_text = file_content diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index 53df77752..e19a04b95 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -28,6 +28,7 @@ def partition_text( filename: Optional[str] = None, file: Optional[IO] = None, text: Optional[str] = None, + encoding: Optional[str] = "utf-8", ) -> List[Element]: """Partitions an .txt documents into its constituent elements. Parameters @@ -38,14 +39,19 @@ def partition_text( A file-like object using "r" mode --> open(filename, "r"). text The string representation of the .txt document. + encoding + The encoding used to read `filename`. Defaults to utf-8; None uses the platform default. """ if not any([filename, file, text]): raise ValueError("One of filename, file, or text must be specified.") if filename is not None and not file and not text: - with open(filename, encoding="utf8") as f: - file_text = f.read() + with open(filename, encoding=encoding) as f: + try: + file_text = f.read() + except (UnicodeDecodeError, UnicodeError) as error: + raise error elif file is not None and not filename and not text: file_text = file.read()