fix: etree parser error (#1077)

* feat: add functionality to check if a string contains any emoji characters

* feat: add functionality to switch `html` text parser based on whether the `html` text contains emoji

* chore: add `beautifulsoup4` and `emoji` packages to `requirements/base.in` for general use

* chore: update changelog & version

* chore: update changelog & version

* chore: update dependencies

* test: update `EXPECTED_XLS_TEXT_LEN` for `test_auto_partition_xls_from_filename`

* chore: update changelog & version
This commit is contained in:
Christine Straub 2023-08-10 16:28:57 -07:00 committed by GitHub
parent b31c62fa84
commit d26ab1deac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 126 additions and 14 deletions

View File

@ -1,3 +1,14 @@
## 0.9.3-dev0
### Enhancements
* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
* Add functionality to check if a string contains any emoji characters
### Features
### Fixes
## 0.9.2 ## 0.9.2
### Enhancements ### Enhancements

View File

@ -0,0 +1,6 @@
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
👨\U+1F3FB🔧,TOR,15
1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13
6 👨\U+1F3FB🔧 TOR 15

View File

@ -6,3 +6,5 @@ lxml
nltk nltk
tabulate tabulate
requests requests
beautifulsoup4
emoji

View File

@ -4,6 +4,8 @@
# #
# pip-compile requirements/base.in # pip-compile requirements/base.in
# #
beautifulsoup4==4.12.2
# via -r requirements/base.in
certifi==2023.7.22 certifi==2023.7.22
# via # via
# -c requirements/constraints.in # -c requirements/constraints.in
@ -14,6 +16,8 @@ charset-normalizer==3.2.0
# via requests # via requests
click==8.1.6 click==8.1.6
# via nltk # via nltk
emoji==2.7.0
# via -r requirements/base.in
filetype==1.2.0 filetype==1.2.0
# via -r requirements/base.in # via -r requirements/base.in
idna==3.4 idna==3.4
@ -30,6 +34,8 @@ regex==2023.8.8
# via nltk # via nltk
requests==2.31.0 requests==2.31.0
# via -r requirements/base.in # via -r requirements/base.in
soupsieve==2.4.1
# via beautifulsoup4
tabulate==0.9.0 tabulate==0.9.0
# via -r requirements/base.in # via -r requirements/base.in
tqdm==4.66.0 tqdm==4.66.0

View File

@ -4,4 +4,3 @@
openpyxl openpyxl
pandas pandas
xlrd xlrd
beautifulsoup4

View File

@ -1,3 +1,4 @@
import pytest
from unstructured_inference.inference.layout import LayoutElement from unstructured_inference.inference.layout import LayoutElement
from unstructured.documents.coordinates import PixelSpace from unstructured.documents.coordinates import PixelSpace
@ -10,6 +11,7 @@ from unstructured.documents.elements import (
Title, Title,
) )
from unstructured.partition import common from unstructured.partition import common
from unstructured.partition.common import contains_emoji
def test_normalize_layout_element_dict(): def test_normalize_layout_element_dict():
@ -230,3 +232,14 @@ def test_convert_ms_office_table_to_text_works_with_empty_tables():
table = MockDocxEmptyTable() table = MockDocxEmptyTable()
assert common.convert_ms_office_table_to_text(table, as_html=True) == "" assert common.convert_ms_office_table_to_text(table, as_html=True) == ""
assert common.convert_ms_office_table_to_text(table, as_html=False) == "" assert common.convert_ms_office_table_to_text(table, as_html=False) == ""
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # HTML cell mixing real emoji with a literal "\U+1F3FB" text sequence.
        ("<table><tbody><tr><td>👨\\U+1F3FB🔧</td></tr></tbody></table>", True),
        # Plain ASCII HTML: no emoji present.
        ("<table><tbody><tr><td>Hello!</td></tr></tbody></table>", False),
        # The literal escape-like text alone is ordinary characters, not an emoji.
        ("<table><tbody><tr><td>\\U+1F3FB</td></tr></tbody></table>", False),
        # Boundary case: an empty string contains nothing to count.
        ("", False),
    ],
)
def test_contains_emoji(text, expected):
    """Check that `contains_emoji` reports emoji presence as a strict bool.

    The identity comparison (`is`) is deliberate: it fails if the helper ever
    returns a truthy/falsy non-bool (e.g. a raw count) instead of True/False.
    """
    assert contains_emoji(text) is expected

View File

@ -25,3 +25,37 @@ EXPECTED_TABLE = """<table border="1" class="dataframe">
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
# Expected whitespace-cleaned text for example-docs/stanley-cups-with-emoji.csv.
# The final row's team cell mixes emoji with the literal characters "\U+1F3FB"
# (a backslash followed by text, not a Unicode escape).
EXPECTED_TEXT_WITH_EMOJI = (
"Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13 👨\\U+1F3FB🔧 TOR 15"
)
# Expected `metadata.text_as_html` for the same emoji fixture; the CSV tests
# compare against it with `==`, so the string must match exactly.
EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
<tr>
<td>👨\\U+1F3FB🔧</td>
<td>TOR</td>
<td>15</td>
</tr>
</tbody>
</table>"""

View File

@ -1,6 +1,13 @@
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT import pytest
from test_unstructured.partition.test_constants import (
EXPECTED_TABLE,
EXPECTED_TABLE_WITH_EMOJI,
EXPECTED_TEXT,
EXPECTED_TEXT_WITH_EMOJI,
)
from unstructured.cleaners.core import clean_extra_whitespace from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table from unstructured.documents.elements import Table
from unstructured.partition.csv import partition_csv from unstructured.partition.csv import partition_csv
@ -8,13 +15,21 @@ from unstructured.partition.csv import partition_csv
EXPECTED_FILETYPE = "text/csv" EXPECTED_FILETYPE = "text/csv"
def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"): @pytest.mark.parametrize(
elements = partition_csv(filename=filename) ("filename", "expected_text", "expected_table"),
[
("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE),
("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
],
)
def test_partition_csv_from_filename(filename, expected_text, expected_table):
f_path = f"example-docs/{filename}"
elements = partition_csv(filename=f_path)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert clean_extra_whitespace(elements[0].text) == expected_text
assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE assert elements[0].metadata.filetype == EXPECTED_FILETYPE
assert elements[0].metadata.filename == "stanley-cups.csv" assert elements[0].metadata.filename == filename
def test_partition_csv_from_filename_with_metadata_filename( def test_partition_csv_from_filename_with_metadata_filename(
@ -26,13 +41,21 @@ def test_partition_csv_from_filename_with_metadata_filename(
assert elements[0].metadata.filename == "test" assert elements[0].metadata.filename == "test"
def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"): @pytest.mark.parametrize(
with open(filename, "rb") as f: ("filename", "expected_text", "expected_table"),
[
("stanley-cups.csv", EXPECTED_TEXT, EXPECTED_TABLE),
("stanley-cups-with-emoji.csv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
],
)
def test_partition_csv_from_file(filename, expected_text, expected_table):
f_path = f"example-docs/{filename}"
with open(f_path, "rb") as f:
elements = partition_csv(file=f) elements = partition_csv(file=f)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert clean_extra_whitespace(elements[0].text) == expected_text
assert isinstance(elements[0], Table) assert isinstance(elements[0], Table)
assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE assert elements[0].metadata.filetype == EXPECTED_FILETYPE
assert elements[0].metadata.filename is None assert elements[0].metadata.filename is None

View File

@ -1 +1 @@
__version__ = "0.9.2" # pragma: no cover __version__ = "0.9.3-dev0" # pragma: no cover

View File

@ -7,6 +7,7 @@ from io import BufferedReader, BytesIO, TextIOWrapper
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union from typing import IO, TYPE_CHECKING, Any, BinaryIO, Dict, List, Optional, Tuple, Union
import emoji
from tabulate import tabulate from tabulate import tabulate
from unstructured.documents.coordinates import CoordinateSystem from unstructured.documents.coordinates import CoordinateSystem
@ -333,3 +334,17 @@ def convert_ms_office_table_to_text(table: "docxtable.Table", as_html: bool = Tr
else: else:
table_text = "" table_text = ""
return table_text return table_text
def contains_emoji(s: str) -> bool:
    """
    Report whether at least one emoji occurs in the given string.

    Parameters:
    - s (str): Text to scan for emoji.

    Returns:
    - bool: True when the emoji count for `s` is non-zero, False otherwise.
    """
    emoji_total = emoji.emoji_count(s)
    return emoji_total > 0

View File

@ -1,8 +1,9 @@
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd import pandas as pd
from lxml.html import document_fromstring
from lxml.html.soupparser import fromstring as soupparser_fromstring
from unstructured.documents.elements import ( from unstructured.documents.elements import (
Element, Element,
@ -12,6 +13,7 @@ from unstructured.documents.elements import (
) )
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import ( from unstructured.partition.common import (
contains_emoji,
exactly_one, exactly_one,
get_last_modified_date, get_last_modified_date,
get_last_modified_date_from_file, get_last_modified_date_from_file,
@ -58,7 +60,8 @@ def partition_csv(
table = pd.read_csv(f) table = pd.read_csv(f)
html_text = table.to_html(index=False, header=False, na_rep="") html_text = table.to_html(index=False, header=False, na_rep="")
text = lxml.html.document_fromstring(html_text).text_content() html_string_parser = soupparser_fromstring if contains_emoji(html_text) else document_fromstring
text = html_string_parser(html_text).text_content()
if include_metadata: if include_metadata:
metadata = ElementMetadata( metadata = ElementMetadata(