fix(csv): partition_csv() raises on long lines (#2998)

**Summary** The CSV delimiter-sniffer requires whole lines to properly detect the delimiter character. Limiting the number of bytes read produced a trailing partial line when lines were very long. Still limit the number of bytes read, but read only whole lines. Fixes #2643.
2025-06-27 02:30:08 +00:00 · 2024-05-10 14:19:31 -07:00 · 2024-05-10 14:19:31 -07:00 · e4c895923d
commit e4c895923d
parent 8eee14d589
5 changed files with 33 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.13.8-dev3
+## 0.13.8-dev4

 ### Enhancements

@@ -14,6 +14,7 @@
 * **Re-apply: skip accuracy calculation feature** Overwritten by mistake
 * **Fix type hint for paragraph_grouper param** `paragraph_grouper` can be set to `False`, but the type hint did not reflect this previously.
 * **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
+* **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.

 ## 0.13.7

--- a/example-docs/csv-with-long-lines.csv
+++ b/example-docs/csv-with-long-lines.csv
--- a/test_unstructured/partition/csv/test_csv.py
+++ b/test_unstructured/partition/csv/test_csv.py
@@ -15,7 +15,7 @@ from test_unstructured.unit_utils import assert_round_trips_through_JSON, exampl
 from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
-from unstructured.partition.csv import partition_csv
+from unstructured.partition.csv import get_delimiter, partition_csv
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA

 EXPECTED_FILETYPE = "text/csv"
@@ -270,3 +270,8 @@ def test_partition_csv_header():
        == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
    )
    assert "<thead>" in elements[0].metadata.text_as_html
+
+
+def test_partition_csv_detects_the_right_csv_delimiter():
+    # -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on this file --
+    assert get_delimiter("example-docs/csv-with-long-lines.csv") == ","
--- a/unstructured/version.py
+++ b/unstructured/version.py
@@ -1 +1 @@
-__version__ = "0.13.8-dev3"  # pragma: no cover
+__version__ = "0.13.8-dev4"  # pragma: no cover
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -111,19 +111,23 @@ def partition_csv(
    return list(elements)


-def get_delimiter(file_path=None, file=None):
-    """
-    Use the standard csv sniffer to determine the delimiter.
-    Read just a small portion in case the file is large.
+def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
+    """Use the standard csv sniffer to determine the delimiter.
+
+    Reads just a small portion in case the file is large.
    """
    sniffer = csv.Sniffer()
+    num_bytes = 65536

-    num_bytes = 8192
+    # -- read whole lines, sniffer can be confused by a trailing partial line --
    if file:
-        data = file.read(num_bytes).decode("utf-8")
+        lines = file.readlines(num_bytes)
        file.seek(0)
-    else:
+        data = "\n".join(ln.decode("utf-8") for ln in lines)
+    elif file_path is not None:
        with open(file_path) as f:
-            data = f.read(num_bytes)
+            data = "\n".join(f.readlines(num_bytes))
+    else:
+        raise ValueError("either `file_path` or `file` argument must be provided")

-    return sniffer.sniff(data, delimiters=[",", ";"]).delimiter
+    return sniffer.sniff(data, delimiters=",;").delimiter