mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 12:19:36 +00:00
fix: etree parser error (#1077)
* feat: add functionality to check if a string contains any emoji characters
* feat: add functionality to switch `html` text parser based on whether the `html` text contains emoji
* chore: add `beautifulsoup4` and `emoji` packages to `requirements/base.in` for general use
* chore: update changelog & version
* chore: update changelog & version
* chore: update dependencies
* test: update `EXPECTED_XLS_TEXT_LEN` for `test_auto_partition_xls_from_filename`
* chore: update changelog & version
This commit is contained in:
parent
b31c62fa84
commit
d26ab1deac
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,3 +1,14 @@
|
|||||||
|
## 0.9.3-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
|
||||||
|
* Add functionality to check if a string contains any emoji characters
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.9.2
|
## 0.9.2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
6
example-docs/stanley-cups-with-emoji.csv
Normal file
6
example-docs/stanley-cups-with-emoji.csv
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
Stanley Cups,,
|
||||||
|
Team,Location,Stanley Cups
|
||||||
|
Blues,STL,1
|
||||||
|
Flyers,PHI,2
|
||||||
|
Maple Leafs,TOR,13
|
||||||
|
👨\U+1F3FB🔧,TOR,15
|
|
@ -6,3 +6,5 @@ lxml
|
|||||||
nltk
|
nltk
|
||||||
tabulate
|
tabulate
|
||||||
requests
|
requests
|
||||||
|
beautifulsoup4
|
||||||
|
emoji
|
||||||
|
@ -4,6 +4,8 @@
|
|||||||
#
|
#
|
||||||
# pip-compile requirements/base.in
|
# pip-compile requirements/base.in
|
||||||
#
|
#
|
||||||
|
beautifulsoup4==4.12.2
|
||||||
|
# via -r requirements/base.in
|
||||||
certifi==2023.7.22
|
certifi==2023.7.22
|
||||||
# via
|
# via
|
||||||
# -c requirements/constraints.in
|
# -c requirements/constraints.in
|
||||||
@ -14,6 +16,8 @@ charset-normalizer==3.2.0
|
|||||||
# via requests
|
# via requests
|
||||||
click==8.1.6
|
click==8.1.6
|
||||||
# via nltk
|
# via nltk
|
||||||
|
emoji==2.7.0
|
||||||
|
# via -r requirements/base.in
|
||||||
filetype==1.2.0
|
filetype==1.2.0
|
||||||
# via -r requirements/base.in
|
# via -r requirements/base.in
|
||||||
idna==3.4
|
idna==3.4
|
||||||
@ -30,6 +34,8 @@ regex==2023.8.8
|
|||||||
# via nltk
|
# via nltk
|
||||||
requests==2.31.0
|
requests==2.31.0
|
||||||
# via -r requirements/base.in
|
# via -r requirements/base.in
|
||||||
|
soupsieve==2.4.1
|
||||||
|
# via beautifulsoup4
|
||||||
tabulate==0.9.0
|
tabulate==0.9.0
|
||||||
# via -r requirements/base.in
|
# via -r requirements/base.in
|
||||||
tqdm==4.66.0
|
tqdm==4.66.0
|
||||||
|
@ -4,4 +4,3 @@
|
|||||||
openpyxl
|
openpyxl
|
||||||
pandas
|
pandas
|
||||||
xlrd
|
xlrd
|
||||||
beautifulsoup4
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import pytest
|
||||||
from unstructured_inference.inference.layout import LayoutElement
|
from unstructured_inference.inference.layout import LayoutElement
|
||||||
|
|
||||||
from unstructured.documents.coordinates import PixelSpace
|
from unstructured.documents.coordinates import PixelSpace
|
||||||
@ -10,6 +11,7 @@ from unstructured.documents.elements import (
|
|||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.partition import common
|
from unstructured.partition import common
|
||||||
|
from unstructured.partition.common import contains_emoji
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_layout_element_dict():
|
def test_normalize_layout_element_dict():
|
||||||
@ -230,3 +232,14 @@ def test_convert_ms_office_table_to_text_works_with_empty_tables():
|
|||||||
table = MockDocxEmptyTable()
|
table = MockDocxEmptyTable()
|
||||||
assert common.convert_ms_office_table_to_text(table, as_html=True) == ""
|
assert common.convert_ms_office_table_to_text(table, as_html=True) == ""
|
||||||
assert common.convert_ms_office_table_to_text(table, as_html=False) == ""
|
assert common.convert_ms_office_table_to_text(table, as_html=False) == ""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("text", "expected"),
|
||||||
|
[
|
||||||
|
("<table><tbody><tr><td>👨\\U+1F3FB🔧</td></tr></tbody></table>", True),
|
||||||
|
("<table><tbody><tr><td>Hello!</td></tr></tbody></table>", False),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_contains_emoji(text, expected):
|
||||||
|
assert contains_emoji(text) is expected
|
||||||
|
@ -25,3 +25,37 @@ EXPECTED_TABLE = """<table border="1" class="dataframe">
|
|||||||
|
|
||||||
|
|
||||||
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
||||||
|
|
||||||
|
EXPECTED_TEXT_WITH_EMOJI = (
|
||||||
|
"Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15"
|
||||||
|
)
|
||||||
|
|
||||||
|
EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>Team</td>
|
||||||
|
<td>Location</td>
|
||||||
|
<td>Stanley Cups</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Blues</td>
|
||||||
|
<td>STL</td>
|
||||||
|
<td>1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Flyers</td>
|
||||||
|
<td>PHI</td>
|
||||||
|
<td>2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Maple Leafs</td>
|
||||||
|
<td>TOR</td>
|
||||||
|
<td>13</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>👨\\U+1F3FB🔧</td>
|
||||||
|
<td>TOR</td>
|
||||||
|
<td>15</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>"""
|
||||||
|
@ -1,6 +1,13 @@
|
|||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
|
|
||||||
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
|
import pytest
|
||||||
|
|
||||||
|
from test_unstructured.partition.test_constants import (
|
||||||
|
EXPECTED_TABLE,
|
||||||
|
EXPECTED_TABLE_WITH_EMOJI,
|
||||||
|
EXPECTED_TEXT,
|
||||||
|
EXPECTED_TEXT_WITH_EMOJI,
|
||||||
|
)
|
||||||
from unstructured.cleaners.core import clean_extra_whitespace
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
from unstructured.documents.elements import Table
|
from unstructured.documents.elements import Table
|
||||||
from unstructured.partition.csv import partition_csv
|
from unstructured.partition.csv import partition_csv
|
||||||
@ -8,13 +15,21 @@ from unstructured.partition.csv import partition_csv
|
|||||||
EXPECTED_FILETYPE = "text/csv"
|
EXPECTED_FILETYPE = "text/csv"
|
||||||
|
|
||||||
|
|
||||||
def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
|
@pytest.mark.parametrize(
|
||||||
elements = partition_csv(filename=filename)
|
("filename", "expected_text", "expected_table"),
|
||||||
|
[
|
||||||
|
("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE),
|
||||||
|
("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_csv_from_filename(filename, expected_text, expected_table):
|
||||||
|
f_path = f"example-docs/{filename}"
|
||||||
|
elements = partition_csv(filename=f_path)
|
||||||
|
|
||||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
assert clean_extra_whitespace(elements[0].text) == expected_text
|
||||||
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
assert elements[0].metadata.text_as_html == expected_table
|
||||||
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
||||||
assert elements[0].metadata.filename == "stanley-cups.csv"
|
assert elements[0].metadata.filename == filename
|
||||||
|
|
||||||
|
|
||||||
def test_partition_csv_from_filename_with_metadata_filename(
|
def test_partition_csv_from_filename_with_metadata_filename(
|
||||||
@ -26,13 +41,21 @@ def test_partition_csv_from_filename_with_metadata_filename(
|
|||||||
assert elements[0].metadata.filename == "test"
|
assert elements[0].metadata.filename == "test"
|
||||||
|
|
||||||
|
|
||||||
def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
|
@pytest.mark.parametrize(
|
||||||
with open(filename, "rb") as f:
|
("filename", "expected_text", "expected_table"),
|
||||||
|
[
|
||||||
|
("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE),
|
||||||
|
("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_csv_from_file(filename, expected_text, expected_table):
|
||||||
|
f_path = f"example-docs/{filename}"
|
||||||
|
with open(f_path, "rb") as f:
|
||||||
elements = partition_csv(file=f)
|
elements = partition_csv(file=f)
|
||||||
|
|
||||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
assert clean_extra_whitespace(elements[0].text) == expected_text
|
||||||
assert isinstance(elements[0], Table)
|
assert isinstance(elements[0], Table)
|
||||||
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
assert elements[0].metadata.text_as_html == expected_table
|
||||||
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
||||||
assert elements[0].metadata.filename is None
|
assert elements[0].metadata.filename is None
|
||||||
|
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.9.2" # pragma: no cover
|
__version__ = "0.9.3-dev0" # pragma: no cover
|
||||||
|
@ -7,6 +7,7 @@ from io import BufferedReader, BytesIO, TextIOWrapper
|
|||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
|
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import emoji
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
from unstructured.documents.coordinates import CoordinateSystem
|
from unstructured.documents.coordinates import CoordinateSystem
|
||||||
@ -333,3 +334,17 @@ def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = Tr
|
|||||||
else:
|
else:
|
||||||
table_text = ""
|
table_text = ""
|
||||||
return table_text
|
return table_text
|
||||||
|
|
||||||
|
|
||||||
|
def contains_emoji(s: str) -> bool:
|
||||||
|
"""
|
||||||
|
Check if the input string contains any emoji characters.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- s (str): The input string to check.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- bool: True if the string contains any emoji, False otherwise.
|
||||||
|
"""
|
||||||
|
|
||||||
|
return bool(emoji.emoji_count(s))
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
from typing import IO, BinaryIO, List, Optional, Union, cast
|
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||||
|
|
||||||
import lxml.html
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from lxml.html import document_fromstring
|
||||||
|
from lxml.html.soupparser import fromstring as soupparser_fromstring
|
||||||
|
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
Element,
|
Element,
|
||||||
@ -12,6 +13,7 @@ from unstructured.documents.elements import (
|
|||||||
)
|
)
|
||||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common import (
|
||||||
|
contains_emoji,
|
||||||
exactly_one,
|
exactly_one,
|
||||||
get_last_modified_date,
|
get_last_modified_date,
|
||||||
get_last_modified_date_from_file,
|
get_last_modified_date_from_file,
|
||||||
@ -58,7 +60,8 @@ def partition_csv(
|
|||||||
table = pd.read_csv(f)
|
table = pd.read_csv(f)
|
||||||
|
|
||||||
html_text = table.to_html(index=False, header=False, na_rep="")
|
html_text = table.to_html(index=False, header=False, na_rep="")
|
||||||
text = lxml.html.document_fromstring(html_text).text_content()
|
html_string_parser = soupparser_fromstring if contains_emoji(html_text) else document_fromstring
|
||||||
|
text = html_string_parser(html_text).text_content()
|
||||||
|
|
||||||
if include_metadata:
|
if include_metadata:
|
||||||
metadata = ElementMetadata(
|
metadata = ElementMetadata(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user