fix(csv): partition_csv() raises on long lines (#2998)

**Summary**
The CSV delimiter-sniffer requires whole lines to properly detect the
delimiter character. Limiting bytes read produced partial lines when
lines were very long. Limit bytes but read whole lines.

Fixes #2643.
This commit is contained in:
Steve Canny 2024-05-10 14:19:31 -07:00 committed by GitHub
parent 8eee14d589
commit e4c895923d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 33 additions and 12 deletions

View File

@ -1,4 +1,4 @@
## 0.13.8-dev3
## 0.13.8-dev4
### Enhancements
@ -14,6 +14,7 @@
* **Re-apply: skip accuracy calculation feature** Overwritten by mistake
* **Fix type hint for paragraph_grouper param** `paragraph_grouper` can be set to `False`, but the type hint did not reflect this previously.
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
* **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.
## 0.13.7

File diff suppressed because one or more lines are too long

View File

@ -15,7 +15,7 @@ from test_unstructured.unit_utils import assert_round_trips_through_JSON, exampl
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.csv import partition_csv
from unstructured.partition.csv import get_delimiter, partition_csv
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
EXPECTED_FILETYPE = "text/csv"
@ -270,3 +270,8 @@ def test_partition_csv_header():
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert "<thead>" in elements[0].metadata.text_as_html
def test_partition_csv_detects_the_right_csv_delimiter():
    """`get_delimiter()` reports "," for the long-lines example CSV file."""
    # -- Regression check for issue #2643: sniffing this file formerly raised
    # -- `_csv.Error: Could not determine delimiter`.
    detected = get_delimiter("example-docs/csv-with-long-lines.csv")
    assert detected == ","

View File

@ -1 +1 @@
__version__ = "0.13.8-dev3" # pragma: no cover
__version__ = "0.13.8-dev4" # pragma: no cover

View File

@ -111,19 +111,23 @@ def partition_csv(
return list(elements)
def get_delimiter(file_path=None, file=None):
"""
Use the standard csv sniffer to determine the delimiter.
Read just a small portion in case the file is large.
def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
"""Use the standard csv sniffer to determine the delimiter.
Reads just a small portion in case the file is large.
"""
sniffer = csv.Sniffer()
num_bytes = 65536
num_bytes = 8192
# -- read whole lines, sniffer can be confused by a trailing partial line --
if file:
data = file.read(num_bytes).decode("utf-8")
lines = file.readlines(num_bytes)
file.seek(0)
else:
data = "\n".join(ln.decode("utf-8") for ln in lines)
elif file_path is not None:
with open(file_path) as f:
data = f.read(num_bytes)
data = "\n".join(f.readlines(num_bytes))
else:
raise ValueError("either `file_path` or `file` argument must be provided")
return sniffer.sniff(data, delimiters=[",", ";"]).delimiter
return sniffer.sniff(data, delimiters=",;").delimiter