mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-29 17:04:37 +00:00
The current code assumes that the first line of a csv or tsv file is a
header line. Most csv and tsv files don't have a header line, and even
for those that do, dropping this line may not be the desired behavior.
Here is a snippet of code that demonstrates the current behavior and the
proposed fix:
```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""
f = "./test.csv"
with open(f, 'w') as ff:
    ff.write(c1)
print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
print("\n\nOriginal Loses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```
---------
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
103 lines
1.9 KiB
Python
103 lines
1.9 KiB
Python
# Expected HTML rendering of the "Stanley Cups" table when every row of the
# input is kept as data: the title row, the column-name row and the three team
# rows all appear inside <tbody>, and empty cells render as empty <td></td>.
# NOTE(review): the 2-space nesting below follows the layout produced by
# pandas `DataFrame.to_html`; leading whitespace was not recoverable from the
# garbled source, so confirm against the real output.
EXPECTED_TABLE = """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Stanley Cups</td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>Team</td>
      <td>Location</td>
      <td>Stanley Cups</td>
    </tr>
    <tr>
      <td>Blues</td>
      <td>STL</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Flyers</td>
      <td>PHI</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Maple Leafs</td>
      <td>TOR</td>
      <td>13</td>
    </tr>
  </tbody>
</table>"""

# Expected HTML rendering of the same table without the leading
# "Stanley Cups" title row: the column-name row followed by the three team
# rows (presumably the .xlsx fixture, going by the constant's name).
# NOTE(review): the 2-space nesting below follows the layout produced by
# pandas `DataFrame.to_html`; leading whitespace was not recoverable from the
# garbled source, so confirm against the real output.
EXPECTED_TABLE_XLSX = """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Team</td>
      <td>Location</td>
      <td>Stanley Cups</td>
    </tr>
    <tr>
      <td>Blues</td>
      <td>STL</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Flyers</td>
      <td>PHI</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Maple Leafs</td>
      <td>TOR</td>
      <td>13</td>
    </tr>
  </tbody>
</table>"""

# Title text of the sample table (the content of its first cell).
EXPECTED_TITLE = "Stanley Cups"

# Plain-text form of the full table: every cell, title row included, joined
# by single spaces in row-major order.
EXPECTED_TEXT = (
    "Stanley Cups Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
)

# Plain-text form of the table without the leading "Stanley Cups" title row.
EXPECTED_TEXT_XLSX = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"

EXPECTED_TEXT_WITH_EMOJI = (
|
|
"Stanley Cups "
|
|
"Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15"
|
|
)
|
|
|
|
# Expected HTML rendering of the full table (title row included) with one
# extra trailing row whose first cell contains emoji plus a literal
# backslash-"U+1F3FB" sequence. The doubled backslash is deliberate: it yields
# a literal backslash, not a unicode escape.
# NOTE(review): the 2-space nesting below follows the layout produced by
# pandas `DataFrame.to_html`; leading whitespace was not recoverable from the
# garbled source, so confirm against the real output.
EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Stanley Cups</td>
      <td></td>
      <td></td>
    </tr>
    <tr>
      <td>Team</td>
      <td>Location</td>
      <td>Stanley Cups</td>
    </tr>
    <tr>
      <td>Blues</td>
      <td>STL</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Flyers</td>
      <td>PHI</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Maple Leafs</td>
      <td>TOR</td>
      <td>13</td>
    </tr>
    <tr>
      <td>👨\\U+1F3FB🔧</td>
      <td>TOR</td>
      <td>15</td>
    </tr>
  </tbody>
</table>"""