mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-28 07:33:36 +00:00
add '|' as a delimiter in csv files (#4059)
This PR fixes the error “Failure to process CSV: Expected 2 fields in line 2, saw 4” that occurs when '|' is used as the delimiter in a CSV file.
This commit is contained in:
parent
a040483a7e
commit
d24dec5e04
@ -1,3 +1,12 @@
|
||||
## 0.18.11-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
- **Recognize '|' as a delimiter** The CSV parser now recognizes '|' as a delimiter in addition to ',' and ';'.
|
||||
|
||||
## 0.18.10
|
||||
|
||||
### Enhancements
|
||||
|
||||
4
example-docs/csv-with-line-delimiter.csv
Normal file
4
example-docs/csv-with-line-delimiter.csv
Normal file
@ -0,0 +1,4 @@
|
||||
col1|col2|col3
|
||||
a|b|c
|
||||
d|e|f
|
||||
g|h|i
|
||||
|
@ -37,6 +37,16 @@ EXPECTED_TABLE_XLSX = (
|
||||
"</table>"
|
||||
)
|
||||
|
||||
EXPECTED_TABLE_WITH_LINE_DELIMITER = (
|
||||
"<table>"
|
||||
"<tr><td>col1</td><td>col2</td><td>col3</td></tr>"
|
||||
"<tr><td>a</td><td>b</td><td>c</td></tr>"
|
||||
"<tr><td>d</td><td>e</td><td>f</td></tr>"
|
||||
"<tr><td>g</td><td>h</td><td>i</td></tr>"
|
||||
"</table>"
|
||||
)
|
||||
|
||||
|
||||
EXPECTED_TITLE = "Stanley Cups"
|
||||
|
||||
EXPECTED_TEXT = (
|
||||
@ -54,6 +64,8 @@ EXPECTED_TEXT_SEMICOLON_DELIMITER = (
|
||||
"Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
|
||||
)
|
||||
|
||||
EXPECTED_TEXT_WITH_LINE_DELIMITER = "col1 col2 col3 a b c d e f g h i"
|
||||
|
||||
EXPECTED_XLS_TABLE = (
|
||||
"<table><tr>"
|
||||
"<td>MC</td>"
|
||||
|
||||
@ -11,9 +11,11 @@ from test_unstructured.partition.test_constants import (
|
||||
EXPECTED_TABLE,
|
||||
EXPECTED_TABLE_SEMICOLON_DELIMITER,
|
||||
EXPECTED_TABLE_WITH_EMOJI,
|
||||
EXPECTED_TABLE_WITH_LINE_DELIMITER,
|
||||
EXPECTED_TEXT,
|
||||
EXPECTED_TEXT_SEMICOLON_DELIMITER,
|
||||
EXPECTED_TEXT_WITH_EMOJI,
|
||||
EXPECTED_TEXT_WITH_LINE_DELIMITER,
|
||||
EXPECTED_TEXT_XLSX,
|
||||
)
|
||||
from test_unstructured.unit_utils import (
|
||||
@ -42,6 +44,11 @@ EXPECTED_FILETYPE = "text/csv"
|
||||
EXPECTED_TEXT_SEMICOLON_DELIMITER,
|
||||
EXPECTED_TABLE_SEMICOLON_DELIMITER,
|
||||
),
|
||||
(
|
||||
"csv-with-line-delimiter.csv",
|
||||
EXPECTED_TEXT_WITH_LINE_DELIMITER,
|
||||
EXPECTED_TABLE_WITH_LINE_DELIMITER,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_partition_csv_from_filename(filename: str, expected_text: str, expected_table: str):
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.18.10" # pragma: no cover
|
||||
__version__ = "0.18.11-dev0" # pragma: no cover
|
||||
|
||||
@ -127,7 +127,7 @@ class _CsvPartitioningContext:
|
||||
)
|
||||
|
||||
try:
|
||||
return sniffer.sniff(data, delimiters=",;").delimiter
|
||||
return sniffer.sniff(data, delimiters=",;|").delimiter
|
||||
except csv.Error:
|
||||
# -- sniffing will fail on single-column csv as no default can be assumed --
|
||||
return None
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user