From fc2699ff0683da93e2f6d2b2f0e6bcc05d5b63b3 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Sun, 13 Aug 2023 18:22:36 -0700 Subject: [PATCH] Fix/1057 etree parser error tsv (#1106) * feat: always use `soupparser_fromstring` to parse `html text` which gracefully handles emoji * chore: update changelog & version --- CHANGELOG.md | 5 +-- example-docs/stanley-cups-with-emoji.tsv | 6 ++++ test_unstructured/partition/test_tsv.py | 43 ++++++++++++++++++------ unstructured/__version__.py | 2 +- unstructured/partition/tsv.py | 4 +-- 5 files changed, 45 insertions(+), 15 deletions(-) create mode 100644 example-docs/stanley-cups-with-emoji.tsv diff --git a/CHANGELOG.md b/CHANGELOG.md index e21287ebc..4e7ca116f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,12 @@ -## 0.9.3-dev1 +## 0.9.3-dev2 ### Enhancements +* Update `partition_tsv` to always use `soupparser_fromstring` to parse `html text` * Add `metadata.section` to capture epub table of contents data * Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID for element IDs instead of a SHA-256 hash. -* Update `partition_xlsx` to always use `soupparser_fromstring` to parse `html text` +* Update `partition_xlsx` to always use `soupparser_fromstring` to parse `html text` * Add functionality to switch `html` text parser based on whether the `html` text contains emoji * Add functionality to check if a string contains any emoji characters diff --git a/example-docs/stanley-cups-with-emoji.tsv b/example-docs/stanley-cups-with-emoji.tsv new file mode 100644 index 000000000..890e5768d --- /dev/null +++ b/example-docs/stanley-cups-with-emoji.tsv @@ -0,0 +1,6 @@ +Stanley Cups +Team Location Stanley Cups +Blues STL 1 +Flyers PHI 2 +Maple Leafs TOR 13 +👨\U+1F3FB🔧 TOR 15 diff --git a/test_unstructured/partition/test_tsv.py b/test_unstructured/partition/test_tsv.py index 3ebee20f9..052956cd4 100644 --- a/test_unstructured/partition/test_tsv.py +++ b/test_unstructured/partition/test_tsv.py @@ -1,4 +1,11 @@ -from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT +import pytest + +from test_unstructured.partition.test_constants import ( + EXPECTED_TABLE, + EXPECTED_TABLE_WITH_EMOJI, + EXPECTED_TEXT, + EXPECTED_TEXT_WITH_EMOJI, +) from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Table from unstructured.partition.tsv import partition_tsv @@ -6,14 +13,22 @@ from unstructured.partition.tsv import partition_tsv EXPECTED_FILETYPE = "text/tsv" -def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"): - elements = partition_tsv(filename=filename) +@pytest.mark.parametrize( + ("filename", "expected_text", "expected_table"), + [ + ("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE), + ("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI), + ], +) +def test_partition_tsv_from_filename(filename, expected_text, expected_table): + f_path = f"example-docs/{filename}" + elements = partition_tsv(filename=f_path) - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_TABLE + assert clean_extra_whitespace(elements[0].text) == expected_text + assert elements[0].metadata.text_as_html == expected_table assert elements[0].metadata.filetype == EXPECTED_FILETYPE for element in elements: - assert element.metadata.filename == "stanley-cups.tsv" + assert element.metadata.filename == filename def test_partition_tsv_from_filename_with_metadata_filename( @@ -26,13 +41,21 @@ def test_partition_tsv_from_filename_with_metadata_filename( assert element.metadata.filename == "test" -def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"): - with open(filename, "rb") as f: +@pytest.mark.parametrize( + ("filename", "expected_text", "expected_table"), + [ + ("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE), + ("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI), + ], +) +def test_partition_tsv_from_file(filename, expected_text, expected_table): + f_path = f"example-docs/{filename}" + with open(f_path, "rb") as f: elements = partition_tsv(file=f) - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert clean_extra_whitespace(elements[0].text) == expected_text assert isinstance(elements[0], Table) - assert elements[0].metadata.text_as_html == EXPECTED_TABLE + assert elements[0].metadata.text_as_html == expected_table assert elements[0].metadata.filetype == EXPECTED_FILETYPE for element in elements: assert element.metadata.filename is None diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 52f0e0a38..46965abb9 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.9.3-dev1" # pragma: no cover +__version__ = "0.9.3-dev2" # pragma: no cover diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index c962acaf5..0fd2a892a 100644 --- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -1,8 +1,8 @@ from tempfile import SpooledTemporaryFile from typing import IO, BinaryIO, List, Optional, Union, cast -import lxml.html import pandas as pd +from lxml.html.soupparser import fromstring as soupparser_fromstring from unstructured.documents.elements import ( Element, @@ -55,7 +55,7 @@ def partition_tsv( last_modification_date = get_last_modified_date_from_file(file) html_text = table.to_html(index=False, header=False, na_rep="") - text = lxml.html.document_fromstring(html_text).text_content() + text = soupparser_fromstring(html_text).text_content() if include_metadata: metadata = ElementMetadata(