add '|' as a delimiter in csv files (#4059)

This PR fixes the error “Failure to process CSV: Expected 2 fields in
line 2, saw 4”, which was raised when '|' is used as the delimiter in a CSV file.
This commit is contained in:
jiajun-unstructured 2025-07-18 10:56:24 -07:00 committed by GitHub
parent a040483a7e
commit d24dec5e04
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 34 additions and 2 deletions

View File

@ -1,3 +1,12 @@
## 0.18.11-dev0
### Enhancements
### Features
### Fixes
- **Recognize '|' as a delimiter** The CSV parser now recognizes '|' as a delimiter, in addition to ',' and ';'.
## 0.18.10
### Enhancements

View File

@ -0,0 +1,4 @@
col1|col2|col3
a|b|c
d|e|f
g|h|i
1 col1 col2 col3
2 a b c
3 d e f
4 g h i

View File

@ -37,6 +37,16 @@ EXPECTED_TABLE_XLSX = (
"</table>"
)
EXPECTED_TABLE_WITH_LINE_DELIMITER = (
"<table>"
"<tr><td>col1</td><td>col2</td><td>col3</td></tr>"
"<tr><td>a</td><td>b</td><td>c</td></tr>"
"<tr><td>d</td><td>e</td><td>f</td></tr>"
"<tr><td>g</td><td>h</td><td>i</td></tr>"
"</table>"
)
EXPECTED_TITLE = "Stanley Cups"
EXPECTED_TEXT = (
@ -54,6 +64,8 @@ EXPECTED_TEXT_SEMICOLON_DELIMITER = (
"Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
)
EXPECTED_TEXT_WITH_LINE_DELIMITER = "col1 col2 col3 a b c d e f g h i"
EXPECTED_XLS_TABLE = (
"<table><tr>"
"<td>MC</td>"

View File

@ -11,9 +11,11 @@ from test_unstructured.partition.test_constants import (
EXPECTED_TABLE,
EXPECTED_TABLE_SEMICOLON_DELIMITER,
EXPECTED_TABLE_WITH_EMOJI,
EXPECTED_TABLE_WITH_LINE_DELIMITER,
EXPECTED_TEXT,
EXPECTED_TEXT_SEMICOLON_DELIMITER,
EXPECTED_TEXT_WITH_EMOJI,
EXPECTED_TEXT_WITH_LINE_DELIMITER,
EXPECTED_TEXT_XLSX,
)
from test_unstructured.unit_utils import (
@ -42,6 +44,11 @@ EXPECTED_FILETYPE = "text/csv"
EXPECTED_TEXT_SEMICOLON_DELIMITER,
EXPECTED_TABLE_SEMICOLON_DELIMITER,
),
(
"csv-with-line-delimiter.csv",
EXPECTED_TEXT_WITH_LINE_DELIMITER,
EXPECTED_TABLE_WITH_LINE_DELIMITER,
),
],
)
def test_partition_csv_from_filename(filename: str, expected_text: str, expected_table: str):

View File

@ -1 +1 @@
__version__ = "0.18.10" # pragma: no cover
__version__ = "0.18.11-dev0" # pragma: no cover

View File

@ -127,7 +127,7 @@ class _CsvPartitioningContext:
)
try:
return sniffer.sniff(data, delimiters=",;").delimiter
return sniffer.sniff(data, delimiters=",;|").delimiter
except csv.Error:
# -- sniffing will fail on single-column csv as no default can be assumed --
return None