From fc2699ff0683da93e2f6d2b2f0e6bcc05d5b63b3 Mon Sep 17 00:00:00 2001
From: Christine Straub <christinemstraub@gmail.com>
Date: Sun, 13 Aug 2023 18:22:36 -0700
Subject: [PATCH] Fix/1057 etree parser error tsv (#1106)

* feat: always parse `html text` with `soupparser_fromstring`, which gracefully handles emoji
* chore: update changelog & version
---
 CHANGELOG.md                             |  5 +--
 example-docs/stanley-cups-with-emoji.tsv |  6 ++++
 test_unstructured/partition/test_tsv.py  | 43 ++++++++++++++++++------
 unstructured/__version__.py              |  2 +-
 unstructured/partition/tsv.py            |  4 +--
 5 files changed, 45 insertions(+), 15 deletions(-)
 create mode 100644 example-docs/stanley-cups-with-emoji.tsv

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e21287ebc..4e7ca116f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.9.3-dev1
+## 0.9.3-dev2
 
 ### Enhancements
 
+* Update `partition_tsv` to always use `soupparser_fromstring` to parse `html text`
 * Add `metadata.section` to capture epub table of contents data
 * Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
   for element IDs instead of a SHA-256 hash.
-* Update `partition_xlsx` to always use `soupparser_fromstring` to parse `html text` 
+* Update `partition_xlsx` to always use `soupparser_fromstring` to parse `html text`
 * Add functionality to switch `html` text parser based on whether the `html` text contains emoji
 * Add functionality to check if a string contains any emoji characters
 
diff --git a/example-docs/stanley-cups-with-emoji.tsv b/example-docs/stanley-cups-with-emoji.tsv
new file mode 100644
index 000000000..890e5768d
--- /dev/null
+++ b/example-docs/stanley-cups-with-emoji.tsv
@@ -0,0 +1,6 @@
+Stanley Cups		
+Team	Location	Stanley Cups
+Blues	STL	1
+Flyers	PHI	2
+Maple Leafs	TOR	13
+👨\U+1F3FB🔧	TOR	15
diff --git a/test_unstructured/partition/test_tsv.py b/test_unstructured/partition/test_tsv.py
index 3ebee20f9..052956cd4 100644
--- a/test_unstructured/partition/test_tsv.py
+++ b/test_unstructured/partition/test_tsv.py
@@ -1,4 +1,11 @@
-from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
+import pytest
+
+from test_unstructured.partition.test_constants import (
+    EXPECTED_TABLE,
+    EXPECTED_TABLE_WITH_EMOJI,
+    EXPECTED_TEXT,
+    EXPECTED_TEXT_WITH_EMOJI,
+)
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
 from unstructured.partition.tsv import partition_tsv
@@ -6,14 +13,22 @@ from unstructured.partition.tsv import partition_tsv
 EXPECTED_FILETYPE = "text/tsv"
 
 
-def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
-    elements = partition_tsv(filename=filename)
+@pytest.mark.parametrize(
+    ("filename", "expected_text", "expected_table"),
+    [
+        ("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
+        ("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
+    ],
+)
+def test_partition_tsv_from_filename(filename, expected_text, expected_table):
+    f_path = f"example-docs/{filename}"
+    elements = partition_tsv(filename=f_path)
 
-    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
-    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+    assert clean_extra_whitespace(elements[0].text) == expected_text
+    assert elements[0].metadata.text_as_html == expected_table
     assert elements[0].metadata.filetype == EXPECTED_FILETYPE
     for element in elements:
-        assert element.metadata.filename == "stanley-cups.tsv"
+        assert element.metadata.filename == filename
 
 
 def test_partition_tsv_from_filename_with_metadata_filename(
@@ -26,13 +41,21 @@ def test_partition_tsv_from_filename_with_metadata_filename(
         assert element.metadata.filename == "test"
 
 
-def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"):
-    with open(filename, "rb") as f:
+@pytest.mark.parametrize(
+    ("filename", "expected_text", "expected_table"),
+    [
+        ("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
+        ("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
+    ],
+)
+def test_partition_tsv_from_file(filename, expected_text, expected_table):
+    f_path = f"example-docs/{filename}"
+    with open(f_path, "rb") as f:
         elements = partition_tsv(file=f)
 
-    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+    assert clean_extra_whitespace(elements[0].text) == expected_text
     assert isinstance(elements[0], Table)
-    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+    assert elements[0].metadata.text_as_html == expected_table
     assert elements[0].metadata.filetype == EXPECTED_FILETYPE
     for element in elements:
         assert element.metadata.filename is None
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 52f0e0a38..46965abb9 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.9.3-dev1"  # pragma: no cover
+__version__ = "0.9.3-dev2"  # pragma: no cover
diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
index c962acaf5..0fd2a892a 100644
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@@ -1,8 +1,8 @@
 from tempfile import SpooledTemporaryFile
 from typing import IO, BinaryIO, List, Optional, Union, cast
 
-import lxml.html
 import pandas as pd
+from lxml.html.soupparser import fromstring as soupparser_fromstring
 
 from unstructured.documents.elements import (
     Element,
@@ -55,7 +55,7 @@ def partition_tsv(
         last_modification_date = get_last_modified_date_from_file(file)
 
     html_text = table.to_html(index=False, header=False, na_rep="")
-    text = lxml.html.document_fromstring(html_text).text_content()
+    text = soupparser_fromstring(html_text).text_content()
 
     if include_metadata:
         metadata = ElementMetadata(