mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-04 07:27:34 +00:00

* feat: add functionality to check if a string contains any emoji characters
* feat: add functionality to switch `html` text parser based on whether the `html` text contains emoji
* chore: add `beautifulsoup4` and `emoji` packages to `requirements/base.in` for general use
* chore: update changelog & version
* chore: update changelog & version
* chore: update dependencies
* test: update `EXPECTED_XLS_TEXT_LEN` for `test_auto_partition_xls_from_filename`
* chore: update changelog & version
62 lines
1.1 KiB
Python
62 lines
1.1 KiB
Python
# Expected HTML markup for the sample "Stanley Cups" table: a single <tbody>
# holding four rows of three <td> cells each (the Team / Location /
# Stanley Cups row followed by the Blues, Flyers and Maple Leafs rows).
# NOTE(review): leading indentation inside the markup is not visible in the
# copy this was restored from -- confirm against the producer's real output.
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""

# Plain-text form of the sample table: every cell's text in row order,
# joined by single spaces.
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"

# Plain-text form of the emoji variant of the sample table: the base table's
# cell text followed by the cells of the extra emoji row.
# The doubled backslash is deliberate: the text contains the literal
# characters "\U+1F3FB" between the two emoji, not an escape sequence.
EXPECTED_TEXT_WITH_EMOJI = (
    "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15"
)

EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
|
|
<tbody>
|
|
<tr>
|
|
<td>Team</td>
|
|
<td>Location</td>
|
|
<td>Stanley Cups</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Blues</td>
|
|
<td>STL</td>
|
|
<td>1</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Flyers</td>
|
|
<td>PHI</td>
|
|
<td>2</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Maple Leafs</td>
|
|
<td>TOR</td>
|
|
<td>13</td>
|
|
</tr>
|
|
<tr>
|
|
<td>👨\\U+1F3FB🔧</td>
|
|
<td>TOR</td>
|
|
<td>15</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>"""
|