feat: Support encoding parameter in partition_csv (#3564)

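Adds an `encoding` parameter to `partition_csv`. The value is passed directly to `pd.read_csv` and is also used to decode the sample read for delimiter sniffing (UTF-8 is used when it is `None`). See the added test and the `stanley-cups-utf-16.csv` example file.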
2025-12-24 13:44:05 +00:00 · 2024-08-28 10:19:58 -04:00 · 2024-08-28 10:19:58 -04:00 · f440eb476c
commit f440eb476c
parent f21c853ade
7 changed files with 26 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,9 +1,11 @@
-## 0.15.9-dev0
+## 0.15.9-dev1

 ### Enhancements

 ### Features

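+* **Add support for encoding parameter in partition_csv** `partition_csv` now accepts an optional `encoding` argument that is passed through to `pd.read_csv`, so CSV files in encodings other than UTF-8 (e.g. UTF-16) can be partitioned.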
+
 ### Fixes

 * **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with TemporaryFileDirectory to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
--- a/example-docs/stanley-cups-utf-16.csv
+++ b/example-docs/stanley-cups-utf-16.csv
--- a/test_unstructured/partition/test_csv.py
+++ b/test_unstructured/partition/test_csv.py
@ -74,6 +74,12 @@ def test_partition_csv_from_filename_with_metadata_filename():
    assert elements[0].metadata.filename == "test"


+def test_partition_csv_with_encoding():
+    elements = partition_csv(example_doc_path("stanley-cups-utf-16.csv"), encoding="utf-16")
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+
+
@pytest.mark.parametrize(
    ("filename", "expected_text", "expected_table"),
    [
@ -279,6 +285,7 @@ class Describe_CsvPartitioningContext:
        ctx = _CsvPartitioningContext.load(
            file_path=example_doc_path("stanley-cups.csv"),
            file=None,
+            encoding=None,
            metadata_file_path=None,
            metadata_last_modified=None,
            include_header=True,
@ -292,6 +299,7 @@ class Describe_CsvPartitioningContext:
            _CsvPartitioningContext.load(
                file_path=None,
                file=None,
+                encoding=None,
                metadata_file_path=None,
                metadata_last_modified=None,
                include_header=True,
--- a/typings/pandas/io/parsers/readers.pyi
+++ b/typings/pandas/io/parsers/readers.pyi
@ -7,6 +7,7 @@ from pandas.core.frame import DataFrame
 def read_csv(
    filepath_or_buffer: str | IO[bytes],
    *,
+    encoding: str | None = ...,
    sep: str | None = ...,
    header: int | None | Literal["infer"] = ...,
 ) -> DataFrame: ...
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.15.9-dev0"  # pragma: no cover
+__version__ = "0.15.9-dev1"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -207,6 +207,7 @@ def partition(
        elements = partition_csv(
            filename=filename,
            file=file,
+            encoding=encoding,
            infer_table_structure=infer_table_structure,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@ -29,6 +29,7 @@ DETECTION_ORIGIN: str = "csv"
 def partition_csv(
    filename: str | None = None,
    file: IO[bytes] | None = None,
+    encoding: str | None = None,
    metadata_filename: str | None = None,
    metadata_last_modified: str | None = None,
    include_header: bool = False,
@ -47,6 +48,8 @@ def partition_csv(
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
+    encoding
+        The character encoding used to decode the CSV input, e.g. "utf-16". If None, utf-8 will be used.
    metadata_filename
        The filename to use for the metadata.
    metadata_last_modified
@ -73,6 +76,7 @@ def partition_csv(
    ctx = _CsvPartitioningContext(
        file_path=filename,
        file=file,
+        encoding=encoding,
        metadata_file_path=metadata_filename,
        metadata_last_modified=metadata_last_modified,
        include_header=include_header,
@ -81,7 +85,7 @@ def partition_csv(
    )

    with ctx.open() as file:
-        dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
+        dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)

    html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
    text = soupparser_fromstring(html_text).text_content()
@ -110,6 +114,7 @@ class _CsvPartitioningContext:
        self,
        file_path: str | None = None,
        file: IO[bytes] | None = None,
+        encoding: str | None = None,
        metadata_file_path: str | None = None,
        metadata_last_modified: str | None = None,
        include_header: bool = False,
@ -118,6 +123,7 @@ class _CsvPartitioningContext:
    ):
        self._file_path = file_path
        self._file = file
+        self._encoding = encoding
        self._metadata_file_path = metadata_file_path
        self._metadata_last_modified = metadata_last_modified
        self._include_header = include_header
@ -129,6 +135,7 @@ class _CsvPartitioningContext:
        cls,
        file_path: str | None,
        file: IO[bytes] | None,
+        encoding: str | None,
        metadata_file_path: str | None,
        metadata_last_modified: str | None,
        include_header: bool,
@ -138,6 +145,7 @@ class _CsvPartitioningContext:
        return cls(
            file_path=file_path,
            file=file,
+            encoding=encoding,
            metadata_file_path=metadata_file_path,
            metadata_last_modified=metadata_last_modified,
            include_header=include_header,
@ -156,7 +164,9 @@ class _CsvPartitioningContext:

        with self.open() as file:
            # -- read whole lines, sniffer can be confused by a trailing partial line --
-            data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes))
+            data = "\n".join(
+                ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
+            )

        try:
            return sniffer.sniff(data, delimiters=",;").delimiter