diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4b4efaa13..4e22f4945 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## 0.9.3-dev0
+
+### Enhancements
+
+* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
+* Add functionality to check if a string contains any emoji characters
+
+### Features
+
+### Fixes
+
## 0.9.2
### Enhancements
diff --git a/example-docs/stanley-cups-with-emoji.csv b/example-docs/stanley-cups-with-emoji.csv
new file mode 100644
index 000000000..48adf033d
--- /dev/null
+++ b/example-docs/stanley-cups-with-emoji.csv
@@ -0,0 +1,6 @@
+Stanley Cups,,
+Team,Location,Stanley Cups
+Blues,STL,1
+Flyers,PHI,2
+Maple Leafs,TOR,13
+👨\U+1F3FB🔧,TOR,15
\ No newline at end of file
diff --git a/requirements/base.in b/requirements/base.in
index acc32d982..0df6c2abd 100644
--- a/requirements/base.in
+++ b/requirements/base.in
@@ -6,3 +6,5 @@ lxml
nltk
tabulate
requests
+beautifulsoup4
+emoji
diff --git a/requirements/base.txt b/requirements/base.txt
index b5bec8c12..a5ce90141 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -4,6 +4,8 @@
#
# pip-compile requirements/base.in
#
+beautifulsoup4==4.12.2
+ # via -r requirements/base.in
certifi==2023.7.22
# via
# -c requirements/constraints.in
@@ -14,6 +16,8 @@ charset-normalizer==3.2.0
# via requests
click==8.1.6
# via nltk
+emoji==2.7.0
+ # via -r requirements/base.in
filetype==1.2.0
# via -r requirements/base.in
idna==3.4
@@ -30,6 +34,8 @@ regex==2023.8.8
# via nltk
requests==2.31.0
# via -r requirements/base.in
+soupsieve==2.4.1
+ # via beautifulsoup4
tabulate==0.9.0
# via -r requirements/base.in
tqdm==4.66.0
diff --git a/requirements/extra-xlsx.in b/requirements/extra-xlsx.in
index 24226dd52..5e296abf8 100644
--- a/requirements/extra-xlsx.in
+++ b/requirements/extra-xlsx.in
@@ -4,4 +4,3 @@
openpyxl
pandas
xlrd
-beautifulsoup4
diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py
index 3ef097f4e..e1acc9355 100644
--- a/test_unstructured/partition/test_common.py
+++ b/test_unstructured/partition/test_common.py
@@ -1,3 +1,4 @@
+import pytest
from unstructured_inference.inference.layout import LayoutElement
from unstructured.documents.coordinates import PixelSpace
@@ -10,6 +11,7 @@ from unstructured.documents.elements import (
Title,
)
from unstructured.partition import common
+from unstructured.partition.common import contains_emoji
def test_normalize_layout_element_dict():
@@ -230,3 +232,14 @@ def test_convert_ms_office_table_to_text_works_with_empty_tables():
table = MockDocxEmptyTable()
assert common.convert_ms_office_table_to_text(table, as_html=True) == ""
assert common.convert_ms_office_table_to_text(table, as_html=False) == ""
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        # Emoji embedded in table markup, the kind of text partition_csv inspects.
+        ("<table><tbody><tr><td>👨\\U+1F3FB🔧</td></tr></tbody></table>", True),
+        # Table markup holding plain text only.
+        ("<table><tbody><tr><td>Hello!</td></tr></tbody></table>", False),
+        # Empty input contains no emoji.
+        ("", False),
+    ],
+)
+def test_contains_emoji(text, expected):
+    """Check that contains_emoji returns True only when the text holds an emoji."""
+    # `is` rather than `==`: the helper must return a real bool, not a count.
+    assert contains_emoji(text) is expected
diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py
index 492559dde..f9f04baad 100644
--- a/test_unstructured/partition/test_constants.py
+++ b/test_unstructured/partition/test_constants.py
@@ -25,3 +25,37 @@ EXPECTED_TABLE = """
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+
+EXPECTED_TEXT_WITH_EMOJI = (
+ "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15"
+)
+
+# HTML expected for example-docs/stanley-cups-with-emoji.csv. The CSV tests
+# compare it for exact equality with metadata.text_as_html, so it follows the
+# layout of pandas DataFrame.to_html(index=False, header=False): two-space
+# indentation, no <thead>, and no trailing newline.
+EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+    <tr>
+      <td>👨\\U+1F3FB🔧</td>
+      <td>TOR</td>
+      <td>15</td>
+    </tr>
+  </tbody>
+</table>"""
diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py
index d25857fef..ad26c0980 100644
--- a/test_unstructured/partition/test_csv.py
+++ b/test_unstructured/partition/test_csv.py
@@ -1,6 +1,13 @@
from tempfile import SpooledTemporaryFile
-from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
+import pytest
+
+from test_unstructured.partition.test_constants import (
+ EXPECTED_TABLE,
+ EXPECTED_TABLE_WITH_EMOJI,
+ EXPECTED_TEXT,
+ EXPECTED_TEXT_WITH_EMOJI,
+)
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.csv import partition_csv
@@ -8,13 +15,21 @@ from unstructured.partition.csv import partition_csv
EXPECTED_FILETYPE = "text/csv"
-def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
- elements = partition_csv(filename=filename)
+# Run the same assertions against the plain fixture and the emoji fixture.
+@pytest.mark.parametrize(
+ ("filename", "expected_text", "expected_table"),
+ [
+ ("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE),
+ ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
+ ],
+)
+def test_partition_csv_from_filename(filename, expected_text, expected_table):
+ # Keep the bare file name separate from the path: the last assertion
+ # compares metadata.filename with the name without its directory.
+ f_path = f"example-docs/{filename}"
+ elements = partition_csv(filename=f_path)
- assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
- assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+ assert clean_extra_whitespace(elements[0].text) == expected_text
+ assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
- assert elements[0].metadata.filename == "stanley-cups.csv"
+ assert elements[0].metadata.filename == filename
def test_partition_csv_from_filename_with_metadata_filename(
@@ -26,13 +41,21 @@ def test_partition_csv_from_filename_with_metadata_filename(
assert elements[0].metadata.filename == "test"
-def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
- with open(filename, "rb") as f:
+# Run the same assertions against the plain fixture and the emoji fixture.
+@pytest.mark.parametrize(
+ ("filename", "expected_text", "expected_table"),
+ [
+ ("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE),
+ ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
+ ],
+)
+def test_partition_csv_from_file(filename, expected_text, expected_table):
+ f_path = f"example-docs/{filename}"
+ # The partitioner is handed the open binary file object, not the path.
+ with open(f_path, "rb") as f:
elements = partition_csv(file=f)
- assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+ assert clean_extra_whitespace(elements[0].text) == expected_text
assert isinstance(elements[0], Table)
- assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+ assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
+ # Only a file object was supplied, so no filename is expected in the metadata.
assert elements[0].metadata.filename is None
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index da9be9fa8..5d3ca0b37 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.9.2" # pragma: no cover
+__version__ = "0.9.3-dev0" # pragma: no cover
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
index 1b2186cc4..bbb8046c9 100644
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@@ -7,6 +7,7 @@ from io import BufferedReader, BytesIO, TextIOWrapper
from tempfile import SpooledTemporaryFile
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
+import emoji
from tabulate import tabulate
from unstructured.documents.coordinates import CoordinateSystem
@@ -333,3 +334,17 @@ def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = Tr
else:
table_text = ""
return table_text
+
+
+def contains_emoji(s: str) -> bool:
+    """
+    Report whether the given string holds at least one emoji.
+
+    Detection is delegated to ``emoji.emoji_count``: the string is treated as
+    containing emoji exactly when that count is non-zero.
+
+    Parameters:
+    - s (str): The text to inspect.
+
+    Returns:
+    - bool: True when one or more emoji are found, False otherwise.
+    """
+
+    emoji_total = emoji.emoji_count(s)
+    return emoji_total > 0
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index 78cecbb16..8934b31c6 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -1,8 +1,9 @@
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
-import lxml.html
import pandas as pd
+from lxml.html import document_fromstring
+from lxml.html.soupparser import fromstring as soupparser_fromstring
from unstructured.documents.elements import (
Element,
@@ -12,6 +13,7 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import (
+ contains_emoji,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
@@ -58,7 +60,8 @@ def partition_csv(
table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="")
- text = lxml.html.document_fromstring(html_text).text_content()
+ html_string_parser = soupparser_fromstring if contains_emoji(html_text) else document_fromstring
+ text = html_string_parser(html_text).text_content()
if include_metadata:
metadata = ElementMetadata(