mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-04 07:27:34 +00:00

Closes #2218. When a CSV file contains commas in its content but uses a different delimiter, pandas may raise an error while parsing it. We can sniff the CSV to detect the correct delimiter and pass it to pandas. To verify, try partitioning the file attached to the linked bug.
140 lines
2.6 KiB
Python
140 lines
2.6 KiB
Python
# Expected HTML markup for the "Stanley Cups" table: a title row (title cell
# followed by two empty cells), a header row, and three team rows.
# NOTE(review): per-line indentation of the markup is not visible here;
# presumably callers normalize whitespace before comparing - confirm.
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""

# Expected HTML markup for the spreadsheet variant of the table: a header row
# and three team rows, with no separate title row.
# NOTE(review): per-line indentation of the markup is not visible here;
# presumably callers normalize whitespace before comparing - confirm.
EXPECTED_TABLE_XLSX = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""

# Title text of the fixture table (the content of its first cell).
EXPECTED_TITLE = "Stanley Cups"

# Cell text of the table, title row included, joined with single spaces.
EXPECTED_TEXT = (
    "Stanley Cups Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
)

# Cell text of the spreadsheet variant (no title row), joined with single spaces.
EXPECTED_TEXT_XLSX = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"

# Cell text of the emoji variant of the table: the regular rows plus one extra
# row whose first cell holds emoji characters and a literal "\U+1F3FB" sequence
# (the doubled backslash keeps it as plain text rather than an escape).
EXPECTED_TEXT_WITH_EMOJI = (
    "Stanley Cups "
    "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15"
)

# Cell text of the semicolon-delimited fixture, joined with single spaces.
# Several values contain commas (e.g. "143,1"), which is the case that needs
# the delimiter to be detected rather than assumed to be a comma.
EXPECTED_TEXT_SEMICOLON_DELIMITER = (
    "Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
)

# Expected HTML markup for the semicolon-delimited fixture: a header row and
# three data rows. Every row ends with an empty fifth cell, and some values
# contain commas (e.g. "143,1").
# NOTE(review): per-line indentation of the markup is not visible here;
# presumably callers normalize whitespace before comparing - confirm.
EXPECTED_TABLE_SEMICOLON_DELIMITER = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Year</td>
<td>Month</td>
<td>Revenue</td>
<td>Costs</td>
<td></td>
</tr>
<tr>
<td>2022</td>
<td>1</td>
<td>123</td>
<td>-123</td>
<td></td>
</tr>
<tr>
<td>2023</td>
<td>2</td>
<td>143,1</td>
<td>-814,38</td>
<td></td>
</tr>
<tr>
<td>2024</td>
<td>3</td>
<td>215,32</td>
<td>-11,08</td>
<td></td>
</tr>
</tbody>
</table>"""

# Expected HTML markup for the emoji variant of the table: the title row,
# header row and three team rows, plus a final row whose first cell holds
# emoji characters and a literal "\U+1F3FB" sequence (the doubled backslash
# keeps it as plain text rather than an escape).
# NOTE(review): per-line indentation of the markup is not visible here;
# presumably callers normalize whitespace before comparing - confirm.
EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
<tr>
<td>👨\\U+1F3FB🔧</td>
<td>TOR</td>
<td>15</td>
</tr>
</tbody>
</table>"""