fix: etree parser error (#1077)

* feat: add functionality to check if a string contains any emoji characters
* feat: add functionality to switch `html` text parser based on whether the `html` text contains emoji
* chore: add `beautifulsoup4` and `emoji` packages to `requirements/base.in` for general use
* chore: update changelog & version
* chore: update changelog & version
* chore: update dependencies
* test: update `EXPECTED_XLS_TEXT_LEN` for `test_auto_partition_xls_from_filename`
* chore: update changelog & version
2025-12-19 19:23:46 +00:00 · 2023-08-10 16:28:57 -07:00 · 2023-08-10 16:28:57 -07:00 · d26ab1deac
commit d26ab1deac
parent b31c62fa84
11 changed files with 126 additions and 14 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,14 @@
+## 0.9.3-dev0
+
+### Enhancements
+
+* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
+* Add functionality to check if a string contains any emoji characters
+
+### Features
+
+### Fixes
+
 ## 0.9.2

 ### Enhancements
--- a/example-docs/stanley-cups-with-emoji.csv
+++ b/example-docs/stanley-cups-with-emoji.csv
@ -0,0 +1,6 @@
+Stanley Cups,,
+Team,Location,Stanley Cups
+Blues,STL,1
+Flyers,PHI,2
+Maple Leafs,TOR,13
+👨\U+1F3FB🔧,TOR,15
--- a/requirements/base.in
+++ b/requirements/base.in
@ -6,3 +6,5 @@ lxml
 nltk
 tabulate
 requests
+beautifulsoup4
+emoji
--- a/requirements/base.txt
+++ b/requirements/base.txt
@ -4,6 +4,8 @@
 #
 #    pip-compile requirements/base.in
 #
+beautifulsoup4==4.12.2
+    # via -r requirements/base.in
 certifi==2023.7.22
    # via
    #   -c requirements/constraints.in
@ -14,6 +16,8 @@ charset-normalizer==3.2.0
    # via requests
 click==8.1.6
    # via nltk
+emoji==2.7.0
+    # via -r requirements/base.in
 filetype==1.2.0
    # via -r requirements/base.in
 idna==3.4
@ -30,6 +34,8 @@ regex==2023.8.8
    # via nltk
 requests==2.31.0
    # via -r requirements/base.in
+soupsieve==2.4.1
+    # via beautifulsoup4
 tabulate==0.9.0
    # via -r requirements/base.in
 tqdm==4.66.0
--- a/requirements/extra-xlsx.in
+++ b/requirements/extra-xlsx.in
@ -4,4 +4,3 @@
 openpyxl
 pandas
 xlrd
-beautifulsoup4
--- a/test_unstructured/partition/test_common.py
+++ b/test_unstructured/partition/test_common.py
@ -1,3 +1,4 @@
+import pytest
 from unstructured_inference.inference.layout import LayoutElement

 from unstructured.documents.coordinates import PixelSpace
@ -10,6 +11,7 @@ from unstructured.documents.elements import (
    Title,
 )
 from unstructured.partition import common
+from unstructured.partition.common import contains_emoji


 def test_normalize_layout_element_dict():
@ -230,3 +232,14 @@ def test_convert_ms_office_table_to_text_works_with_empty_tables():
    table = MockDocxEmptyTable()
    assert common.convert_ms_office_table_to_text(table, as_html=True) == ""
    assert common.convert_ms_office_table_to_text(table, as_html=False) == ""
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("<table><tbody><tr><td>👨\\U+1F3FB🔧</td></tr></tbody></table>", True),
+        ("<table><tbody><tr><td>Hello!</td></tr></tbody></table>", False),
+    ],
+)
+def test_contains_emoji(text, expected):
+    assert contains_emoji(text) is expected
--- a/test_unstructured/partition/test_constants.py
+++ b/test_unstructured/partition/test_constants.py
@ -25,3 +25,37 @@ EXPECTED_TABLE = """<table border="1" class="dataframe">


 EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+
+EXPECTED_TEXT_WITH_EMOJI = (
+    "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15"
+)
+
+EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+    <tr>
+      <td>👨\\U+1F3FB🔧</td>
+      <td>TOR</td>
+      <td>15</td>
+    </tr>
+  </tbody>
+</table>"""
--- a/test_unstructured/partition/test_csv.py
+++ b/test_unstructured/partition/test_csv.py
@ -1,6 +1,13 @@
 from tempfile import SpooledTemporaryFile

-from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
+import pytest
+
+from test_unstructured.partition.test_constants import (
+    EXPECTED_TABLE,
+    EXPECTED_TABLE_WITH_EMOJI,
+    EXPECTED_TEXT,
+    EXPECTED_TEXT_WITH_EMOJI,
+)
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
 from unstructured.partition.csv import partition_csv
@ -8,13 +15,21 @@ from unstructured.partition.csv import partition_csv
 EXPECTED_FILETYPE = "text/csv"


-def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
-    elements = partition_csv(filename=filename)
+@pytest.mark.parametrize(
+    ("filename", "expected_text", "expected_table"),
+    [
+        ("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE),
+        ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
+    ],
+)
+def test_partition_csv_from_filename(filename, expected_text, expected_table):
+    f_path = f"example-docs/{filename}"
+    elements = partition_csv(filename=f_path)

-    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
-    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+    assert clean_extra_whitespace(elements[0].text) == expected_text
+    assert elements[0].metadata.text_as_html == expected_table
    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
-    assert elements[0].metadata.filename == "stanley-cups.csv"
+    assert elements[0].metadata.filename == filename


 def test_partition_csv_from_filename_with_metadata_filename(
@ -26,13 +41,21 @@ def test_partition_csv_from_filename_with_metadata_filename(
    assert elements[0].metadata.filename == "test"


-def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
-    with open(filename, "rb") as f:
+@pytest.mark.parametrize(
+    ("filename", "expected_text", "expected_table"),
+    [
+        ("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE),
+        ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
+    ],
+)
+def test_partition_csv_from_file(filename, expected_text, expected_table):
+    f_path = f"example-docs/{filename}"
+    with open(f_path, "rb") as f:
        elements = partition_csv(file=f)

-    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+    assert clean_extra_whitespace(elements[0].text) == expected_text
    assert isinstance(elements[0], Table)
-    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+    assert elements[0].metadata.text_as_html == expected_table
    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
    assert elements[0].metadata.filename is None

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.9.2"  # pragma: no cover
+__version__ = "0.9.3-dev0"  # pragma: no cover
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@ -7,6 +7,7 @@ from io import BufferedReader, BytesIO, TextIOWrapper
 from tempfile import SpooledTemporaryFile
 from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union

+import emoji
 from tabulate import tabulate

 from unstructured.documents.coordinates import CoordinateSystem
@ -333,3 +334,17 @@ def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = Tr
    else:
        table_text = ""
    return table_text
+
+
+def contains_emoji(s: str) -> bool:
+    """
+    Check if the input string contains any emoji characters.
+
+    Parameters:
+    - s (str): The input string to check.
+
+    Returns:
+    - bool: True if the string contains any emoji, False otherwise.
+    """
+
+    return bool(emoji.emoji_count(s))
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@ -1,8 +1,9 @@
 from tempfile import SpooledTemporaryFile
 from typing import IO, BinaryIO, List, Optional, Union, cast

-import lxml.html
 import pandas as pd
+from lxml.html import document_fromstring
+from lxml.html.soupparser import fromstring as soupparser_fromstring

 from unstructured.documents.elements import (
    Element,
@ -12,6 +13,7 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
+    contains_emoji,
    exactly_one,
    get_last_modified_date,
    get_last_modified_date_from_file,
@ -58,7 +60,8 @@ def partition_csv(
        table = pd.read_csv(f)

    html_text = table.to_html(index=False, header=False, na_rep="")
-    text = lxml.html.document_fromstring(html_text).text_content()
+    html_string_parser = soupparser_fromstring if contains_emoji(html_text) else document_fromstring
+    text = html_string_parser(html_text).text_content()

    if include_metadata:
        metadata = ElementMetadata(