From d26ab1deac8f1fc70a03b9ea71b2bcc3aff96cce Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 10 Aug 2023 16:28:57 -0700 Subject: [PATCH] fix: etree parser error (#1077) * feat: add functionality to check if a string contains any emoji characters * feat: add functionality to switch `html` text parser based on whether the `html` text contains emoji * chore: add `beautifulsoup4` and `emoji` packages to `requirements/base.in` for general use * chore: update changelog & version * chore: update changelog & version * chore: update dependencies * test: update `EXPECTED_XLS_TEXT_LEN` for `test_auto_partition_xls_from_filename` * chore: update changelog & version --- CHANGELOG.md | 11 +++++ example-docs/stanley-cups-with-emoji.csv | 6 +++ requirements/base.in | 2 + requirements/base.txt | 6 +++ requirements/extra-xlsx.in | 1 - test_unstructured/partition/test_common.py | 13 ++++++ test_unstructured/partition/test_constants.py | 34 +++++++++++++++ test_unstructured/partition/test_csv.py | 43 ++++++++++++++----- unstructured/__version__.py | 2 +- unstructured/partition/common.py | 15 +++++++ unstructured/partition/csv.py | 7 ++- 11 files changed, 126 insertions(+), 14 deletions(-) create mode 100644 example-docs/stanley-cups-with-emoji.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b4efaa13..4e22f4945 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## 0.9.3-dev0 + +### Enhancements + +* Add functionality to switch `html` text parser based on whether the `html` text contains emoji +* Add functionality to check if a string contains any emoji characters + +### Features + +### Fixes + ## 0.9.2 ### Enhancements diff --git a/example-docs/stanley-cups-with-emoji.csv b/example-docs/stanley-cups-with-emoji.csv new file mode 100644 index 000000000..48adf033d --- /dev/null +++ b/example-docs/stanley-cups-with-emoji.csv @@ -0,0 +1,6 @@ +Stanley Cups,, +Team,Location,Stanley Cups +Blues,STL,1 +Flyers,PHI,2 +Maple Leafs,TOR,13 +👨\U+1F3FB🔧,TOR,15 \ No newline 
at end of file diff --git a/requirements/base.in b/requirements/base.in index acc32d982..0df6c2abd 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -6,3 +6,5 @@ lxml nltk tabulate requests +beautifulsoup4 +emoji diff --git a/requirements/base.txt b/requirements/base.txt index b5bec8c12..a5ce90141 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,6 +4,8 @@ # # pip-compile requirements/base.in # +beautifulsoup4==4.12.2 + # via -r requirements/base.in certifi==2023.7.22 # via # -c requirements/constraints.in @@ -14,6 +16,8 @@ charset-normalizer==3.2.0 # via requests click==8.1.6 # via nltk +emoji==2.7.0 + # via -r requirements/base.in filetype==1.2.0 # via -r requirements/base.in idna==3.4 @@ -30,6 +34,8 @@ regex==2023.8.8 # via nltk requests==2.31.0 # via -r requirements/base.in +soupsieve==2.4.1 + # via beautifulsoup4 tabulate==0.9.0 # via -r requirements/base.in tqdm==4.66.0 diff --git a/requirements/extra-xlsx.in b/requirements/extra-xlsx.in index 24226dd52..5e296abf8 100644 --- a/requirements/extra-xlsx.in +++ b/requirements/extra-xlsx.in @@ -4,4 +4,3 @@ openpyxl pandas xlrd -beautifulsoup4 diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index 3ef097f4e..e1acc9355 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -1,3 +1,4 @@ +import pytest from unstructured_inference.inference.layout import LayoutElement from unstructured.documents.coordinates import PixelSpace @@ -10,6 +11,7 @@ from unstructured.documents.elements import ( Title, ) from unstructured.partition import common +from unstructured.partition.common import contains_emoji def test_normalize_layout_element_dict(): @@ -230,3 +232,14 @@ def test_convert_ms_office_table_to_text_works_with_empty_tables(): table = MockDocxEmptyTable() assert common.convert_ms_office_table_to_text(table, as_html=True) == "" assert common.convert_ms_office_table_to_text(table, 
as_html=False) == "" + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("
👨\\U+1F3FB🔧
", True), + ("
Hello!
", False), + ], +) +def test_contains_emoji(text, expected): + assert contains_emoji(text) is expected diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py index 492559dde..f9f04baad 100644 --- a/test_unstructured/partition/test_constants.py +++ b/test_unstructured/partition/test_constants.py @@ -25,3 +25,37 @@ EXPECTED_TABLE = """ EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" + +EXPECTED_TEXT_WITH_EMOJI = ( + "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15" +) + +EXPECTED_TABLE_WITH_EMOJI = """
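<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+    <tr>
+      <td>👨\\U+1F3FB🔧</td>
+      <td>TOR</td>
+      <td>15</td>
+    </tr>
+  </tbody>
+</table>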
""" diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py index d25857fef..ad26c0980 100644 --- a/test_unstructured/partition/test_csv.py +++ b/test_unstructured/partition/test_csv.py @@ -1,6 +1,13 @@ from tempfile import SpooledTemporaryFile -from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT +import pytest + +from test_unstructured.partition.test_constants import ( + EXPECTED_TABLE, + EXPECTED_TABLE_WITH_EMOJI, + EXPECTED_TEXT, + EXPECTED_TEXT_WITH_EMOJI, +) from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Table from unstructured.partition.csv import partition_csv @@ -8,13 +15,21 @@ from unstructured.partition.csv import partition_csv EXPECTED_FILETYPE = "text/csv" -def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"): - elements = partition_csv(filename=filename) +@pytest.mark.parametrize( + ("filename", "expected_text", "expected_table"), + [ + ("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE), + ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI), + ], +) +def test_partition_csv_from_filename(filename, expected_text, expected_table): + f_path = f"example-docs/{filename}" + elements = partition_csv(filename=f_path) - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_TABLE + assert clean_extra_whitespace(elements[0].text) == expected_text + assert elements[0].metadata.text_as_html == expected_table assert elements[0].metadata.filetype == EXPECTED_FILETYPE - assert elements[0].metadata.filename == "stanley-cups.csv" + assert elements[0].metadata.filename == filename def test_partition_csv_from_filename_with_metadata_filename( @@ -26,13 +41,21 @@ def test_partition_csv_from_filename_with_metadata_filename( assert elements[0].metadata.filename == "test" -def 
test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"): - with open(filename, "rb") as f: +@pytest.mark.parametrize( + ("filename", "expected_text", "expected_table"), + [ + ("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE), + ("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI), + ], +) +def test_partition_csv_from_file(filename, expected_text, expected_table): + f_path = f"example-docs/{filename}" + with open(f_path, "rb") as f: elements = partition_csv(file=f) - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert clean_extra_whitespace(elements[0].text) == expected_text assert isinstance(elements[0], Table) - assert elements[0].metadata.text_as_html == EXPECTED_TABLE + assert elements[0].metadata.text_as_html == expected_table assert elements[0].metadata.filetype == EXPECTED_FILETYPE assert elements[0].metadata.filename is None diff --git a/unstructured/__version__.py b/unstructured/__version__.py index da9be9fa8..5d3ca0b37 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.9.2" # pragma: no cover +__version__ = "0.9.3-dev0" # pragma: no cover diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 1b2186cc4..bbb8046c9 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -7,6 +7,7 @@ from io import BufferedReader, BytesIO, TextIOWrapper from tempfile import SpooledTemporaryFile from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union +import emoji from tabulate import tabulate from unstructured.documents.coordinates import CoordinateSystem @@ -333,3 +334,17 @@ def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = Tr else: table_text = "" return table_text + + +def contains_emoji(s: str) -> bool: + """ + Check if the input string contains any emoji characters. + + Parameters: + - s (str): The input string to check. 
+ + Returns: + - bool: True if the string contains any emoji, False otherwise. + """ + + return bool(emoji.emoji_count(s)) diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 78cecbb16..8934b31c6 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -1,8 +1,9 @@ from tempfile import SpooledTemporaryFile from typing import IO, BinaryIO, List, Optional, Union, cast -import lxml.html import pandas as pd +from lxml.html import document_fromstring +from lxml.html.soupparser import fromstring as soupparser_fromstring from unstructured.documents.elements import ( Element, @@ -12,6 +13,7 @@ from unstructured.documents.elements import ( ) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import ( + contains_emoji, exactly_one, get_last_modified_date, get_last_modified_date_from_file, @@ -58,7 +60,8 @@ def partition_csv( table = pd.read_csv(f) html_text = table.to_html(index=False, header=False, na_rep="") - text = lxml.html.document_fromstring(html_text).text_content() + html_string_parser = soupparser_fromstring if contains_emoji(html_text) else document_fromstring + text = html_string_parser(html_text).text_content() if include_metadata: metadata = ElementMetadata(