Adding optional encoding arg, and text_partition tests (#339)

2025-12-24 13:44:05 +00:00 · 2023-03-06 15:07:33 -08:00 · 2023-03-06 15:07:33 -08:00 · 64efcc0e50
commit 64efcc0e50
parent 213077e2ab
9 changed files with 65 additions and 11 deletions
--- a/.gitignore
+++ b/.gitignore
@ -37,6 +37,9 @@ MANIFEST
 pip-log.txt
 pip-delete-this-directory.txt

+# Pycharm
+.idea/
+
 # Unit test / coverage reports
 htmlcov/
 .tox/
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,13 @@
+## 0.5.3-dev1
+
+### Enhancements
+
+### Features
+
+* Add optional `encoding` argument to the `partition_(text/email/html)` functions.
+
+### Fixes
+
 ## 0.5.3-dev0

 ### Enhancements
--- a/example-docs/fake-text-utf-16-be.txt
+++ b/example-docs/fake-text-utf-16-be.txt
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@ -18,13 +18,30 @@ EXPECTED_OUTPUT = [
 ]


-def test_partition_text_from_filename():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
-    elements = partition_text(filename=filename)
+@pytest.mark.parametrize(
+    ("filename", "encoding"),
+    [("fake-text.txt", "utf-8"), ("fake-text.txt", None), ("fake-text-utf-16-be.txt", "utf-16-be")],
+)
+def test_partition_text_from_filename(filename, encoding):
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    elements = partition_text(filename=filename, encoding=encoding)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT


+@pytest.mark.parametrize(
+    ("filename", "encoding", "error"),
+    [
+        ("fake-text.txt", "utf-16", UnicodeDecodeError),
+        ("fake-text-utf-16-be.txt", "utf-16", UnicodeError),
+    ],
+)
+def test_partition_text_from_filename_raises_econding_error(filename, encoding, error):
+    with pytest.raises(error):
+        filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+        partition_text(filename=filename, encoding=encoding)
+
+
 def test_partition_text_from_file():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename) as f:
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.5.3-dev0"  # pragma: no cover
+__version__ = "0.5.3-dev1"  # pragma: no cover
--- a/unstructured/documents/xml.py
+++ b/unstructured/documents/xml.py
@ -83,7 +83,13 @@ class XMLDocument(Document):
        return doc

    @classmethod
-    def from_file(cls, filename, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
-        with open(filename, "r+", encoding="utf8") as f:
+    def from_file(
+        cls,
+        filename,
+        parser: VALID_PARSERS = None,
+        stylesheet: Optional[str] = None,
+        encoding: Optional[str] = "utf-8",
+    ):
+        with open(filename, "r+", encoding=encoding) as f:
            content = f.read()
        return cls.from_string(content, parser=parser, stylesheet=stylesheet)
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@ -139,6 +139,7 @@ def partition_email(
    file: Optional[IO] = None,
    text: Optional[str] = None,
    content_source: str = "text/html",
+    encoding: Optional[str] = None,
    include_headers: bool = False,
 ) -> List[Element]:
    """Partitions an .eml document into its constituent elements.
@ -153,7 +154,12 @@ def partition_email(
    content_source
        default: "text/html"
        other: "text/plain"
+    encoding
+        The encoding method used to decode the text input. If None, utf-8 will be used.
    """
+    if not encoding:
+        encoding = "utf-8"
+
    if content_source not in VALID_CONTENT_SOURCES:
        raise ValueError(
            f"{content_source} is not a valid value for content_source. "
@ -170,7 +176,7 @@ def partition_email(
    elif file is not None and not filename and not text:
        file_content = file.read()
        if isinstance(file_content, bytes):
-            file_text = file_content.decode("utf-8")
+            file_text = file_content.decode(encoding)
        else:
            file_text = file_content

--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@ -13,6 +13,7 @@ def partition_html(
    file: Optional[IO] = None,
    text: Optional[str] = None,
    url: Optional[str] = None,
+    encoding: Optional[str] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    parser: VALID_PARSERS = None,
@ -29,6 +30,8 @@ def partition_html(
        The string representation of the HTML document.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
+    encoding
+        The encoding method used to decode the text input. If None, utf-8 will be used.
    include_page_breaks
        If True, includes page breaks at the end of each page in the document.
    include_metadata
@ -40,13 +43,16 @@ def partition_html(
    if not any([filename, file, text, url]):
        raise ValueError("One of filename, file, or text must be specified.")

+    if not encoding:
+        encoding = "utf-8"
+
    if filename is not None and not file and not text and not url:
-        document = HTMLDocument.from_file(filename, parser=parser)
+        document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)

    elif file is not None and not filename and not text and not url:
        file_content = file.read()
        if isinstance(file_content, bytes):
-            file_text = file_content.decode("utf-8")
+            file_text = file_content.decode(encoding)
        else:
            file_text = file_content

--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@ -28,6 +28,7 @@ def partition_text(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
+    encoding: Optional[str] = "utf-8",
 ) -> List[Element]:
    """Partitions a .txt document into its constituent elements.
    Parameters
@ -38,14 +39,19 @@ def partition_text(
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the .txt document.
+    encoding
+        The encoding used to read `filename`. If None, open() uses the platform default encoding.
    """

    if not any([filename, file, text]):
        raise ValueError("One of filename, file, or text must be specified.")

    if filename is not None and not file and not text:
-        with open(filename, encoding="utf8") as f:
-            file_text = f.read()
+        with open(filename, encoding=encoding) as f:
+            try:
+                file_text = f.read()
+            except (UnicodeDecodeError, UnicodeError) as error:
+                raise error

    elif file is not None and not filename and not text:
        file_text = file.read()