mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-14 04:25:54 +00:00

The current code assumes the first line of a csv or tsv file is a header line. Most csv and tsv files don't have a header line, and even for those that do, dropping this line may not be the desired behavior.

Here is a snippet of code that demonstrates the current behavior and the proposed fix:

```
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

c1 = """
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
"""

f = "./test.csv"
with open(f, 'w') as ff:
    ff.write(c1)

print("Suggested Improvement Keep First Line")
table = pd.read_csv(f, header=None)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)

print("\n\nOriginal Loses First Line")
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
print(text)
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Yao You <yao@unstructured.io>
103 lines
1.9 KiB
Python
103 lines
1.9 KiB
Python
# Expected HTML table for the fixture whose first row is the title row
# ("Stanley Cups" followed by two empty cells). The title row and the
# "Team / Location / Stanley Cups" row are both kept as ordinary data rows;
# there is no <thead>.
# NOTE(review): the visible source shows no leading whitespace on the HTML
# lines; if this value is compared against generated HTML, confirm whether
# indentation was lost when the file was extracted.
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""

# Expected HTML table for the XLSX variant of the fixture: same team rows as
# EXPECTED_TABLE but without the leading "Stanley Cups" title row.
# NOTE(review): the visible source shows no leading whitespace on the HTML
# lines; confirm whether indentation was lost when the file was extracted.
EXPECTED_TABLE_XLSX = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""

# Title text of the fixture; identical to the first cell of the first row
# in EXPECTED_TABLE.
EXPECTED_TITLE = "Stanley Cups"

# Plain-text form of EXPECTED_TABLE: every non-empty cell, row by row,
# joined by single spaces (title row included).
EXPECTED_TEXT = (
    "Stanley Cups Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
)

# Plain-text form of EXPECTED_TABLE_XLSX: the cell values joined by single
# spaces, starting at the "Team Location Stanley Cups" row (no title row).
EXPECTED_TEXT_XLSX = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"

# Plain-text form of EXPECTED_TABLE_WITH_EMOJI: the same text as the base
# fixture plus one extra row whose first cell contains emoji characters and
# a literal backslash followed by "U+1F3FB" (the backslash is escaped, so it
# is NOT a Python unicode escape).
EXPECTED_TEXT_WITH_EMOJI = (
    "Stanley Cups "
    "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15"
)

# Expected HTML table for the emoji variant of the fixture: the same rows as
# the base table (title row included) plus a final row whose first cell holds
# emoji characters and a literal backslash followed by "U+1F3FB".
# NOTE(review): the visible source shows no leading whitespace on the HTML
# lines; confirm whether indentation was lost when the file was extracted.
EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
<tr>
<td>👨\\U+1F3FB🔧</td>
<td>TOR</td>
<td>15</td>
</tr>
</tbody>
</table>"""