mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix(csv): partition_csv() raises on long lines (#2998)
**Summary** The CSV delimiter-sniffer requires whole lines to properly detect the delimiter character. Limiting bytes read produced partial lines when lines were very long. Limit bytes but read whole lines. Fixes #2643.
This commit is contained in:
parent
8eee14d589
commit
e4c895923d
@ -1,4 +1,4 @@
|
||||
## 0.13.8-dev3
|
||||
## 0.13.8-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
* **Re-apply: skip accuracy calculation feature** Overwritten by mistake
|
||||
* **Fix type hint for paragraph_grouper param** `paragraph_grouper` can be set to `False`, but the type hint did not reflect this previously.
|
||||
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
|
||||
* **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.
|
||||
|
||||
## 0.13.7
|
||||
|
||||
|
11
example-docs/csv-with-long-lines.csv
Normal file
11
example-docs/csv-with-long-lines.csv
Normal file
File diff suppressed because one or more lines are too long
@ -15,7 +15,7 @@ from test_unstructured.unit_utils import assert_round_trips_through_JSON, exampl
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import Table
|
||||
from unstructured.partition.csv import partition_csv
|
||||
from unstructured.partition.csv import get_delimiter, partition_csv
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
|
||||
EXPECTED_FILETYPE = "text/csv"
|
||||
@ -270,3 +270,8 @@ def test_partition_csv_header():
|
||||
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
|
||||
)
|
||||
assert "<thead>" in elements[0].metadata.text_as_html
|
||||
|
||||
|
||||
def test_partition_csv_detects_the_right_csv_delimiter():
|
||||
# -- Issue #2643: previously raised `_csv.Error: Could not determine delimiter` on this file --
|
||||
assert get_delimiter("example-docs/csv-with-long-lines.csv") == ","
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.13.8-dev3" # pragma: no cover
|
||||
__version__ = "0.13.8-dev4" # pragma: no cover
|
||||
|
@ -111,19 +111,23 @@ def partition_csv(
|
||||
return list(elements)
|
||||
|
||||
|
||||
def get_delimiter(file_path=None, file=None):
|
||||
"""
|
||||
Use the standard csv sniffer to determine the delimiter.
|
||||
Read just a small portion in case the file is large.
|
||||
def get_delimiter(file_path: str | None = None, file: IO[bytes] | None = None):
|
||||
"""Use the standard csv sniffer to determine the delimiter.
|
||||
|
||||
Reads just a small portion in case the file is large.
|
||||
"""
|
||||
sniffer = csv.Sniffer()
|
||||
num_bytes = 65536
|
||||
|
||||
num_bytes = 8192
|
||||
# -- read whole lines, sniffer can be confused by a trailing partial line --
|
||||
if file:
|
||||
data = file.read(num_bytes).decode("utf-8")
|
||||
lines = file.readlines(num_bytes)
|
||||
file.seek(0)
|
||||
else:
|
||||
data = "\n".join(ln.decode("utf-8") for ln in lines)
|
||||
elif file_path is not None:
|
||||
with open(file_path) as f:
|
||||
data = f.read(num_bytes)
|
||||
data = "\n".join(f.readlines(num_bytes))
|
||||
else:
|
||||
raise ValueError("either `file_path` or `file` argument must be provided")
|
||||
|
||||
return sniffer.sniff(data, delimiters=[",", ";"]).delimiter
|
||||
return sniffer.sniff(data, delimiters=",;").delimiter
|
||||
|
Loading…
x
Reference in New Issue
Block a user