Fix/1057 etree parser error tsv (#1106)

* feat: always use `soupparser_fromstring` to parse `html text`, since it handles emoji gracefully
* chore: update changelog & version
This commit is contained in:
Christine Straub 2023-08-13 18:22:36 -07:00 committed by GitHub
parent b4b8ac4d8a
commit fc2699ff06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 45 additions and 15 deletions

View File

@ -1,11 +1,12 @@
## 0.9.3-dev1
## 0.9.3-dev2
### Enhancements
* Update `partition_tsv` to always use `soupparser_fromstring` to parse `html text`
* Add `metadata.section` to capture epub table of contents data
* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
for element IDs instead of a SHA-256 hash.
* Update `partition_xlsx` to always use `soupparser_fromstring` to parse `html text`
* Update `partition_xlsx` to always use `soupparser_fromstring` to parse `html text`
* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
* Add functionality to check if a string contains any emoji characters

View File

@ -0,0 +1,6 @@
Stanley Cups
Team Location Stanley Cups
Blues STL 1
Flyers PHI 2
Maple Leafs TOR 13
👨\U+1F3FB🔧 TOR 15
1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13
6 👨\U+1F3FB🔧 TOR 15

View File

@ -1,4 +1,11 @@
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
import pytest
from test_unstructured.partition.test_constants import (
EXPECTED_TABLE,
EXPECTED_TABLE_WITH_EMOJI,
EXPECTED_TEXT,
EXPECTED_TEXT_WITH_EMOJI,
)
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.tsv import partition_tsv
@ -6,14 +13,22 @@ from unstructured.partition.tsv import partition_tsv
EXPECTED_FILETYPE = "text/tsv"
def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
elements = partition_tsv(filename=filename)
@pytest.mark.parametrize(
("filename", "expected_text", "expected_table"),
[
("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
],
)
def test_partition_tsv_from_filename(filename, expected_text, expected_table):
f_path = f"example-docs/{filename}"
elements = partition_tsv(filename=f_path)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert clean_extra_whitespace(elements[0].text) == expected_text
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements:
assert element.metadata.filename == "stanley-cups.tsv"
assert element.metadata.filename == filename
def test_partition_tsv_from_filename_with_metadata_filename(
@ -26,13 +41,21 @@ def test_partition_tsv_from_filename_with_metadata_filename(
assert element.metadata.filename == "test"
def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"):
with open(filename, "rb") as f:
@pytest.mark.parametrize(
("filename", "expected_text", "expected_table"),
[
("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
],
)
def test_partition_tsv_from_file(filename, expected_text, expected_table):
f_path = f"example-docs/{filename}"
with open(f_path, "rb") as f:
elements = partition_tsv(file=f)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert clean_extra_whitespace(elements[0].text) == expected_text
assert isinstance(elements[0], Table)
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements:
assert element.metadata.filename is None

View File

@ -1 +1 @@
__version__ = "0.9.3-dev1" # pragma: no cover
__version__ = "0.9.3-dev2" # pragma: no cover

View File

@ -1,8 +1,8 @@
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
from unstructured.documents.elements import (
Element,
@ -55,7 +55,7 @@ def partition_tsv(
last_modification_date = get_last_modified_date_from_file(file)
html_text = table.to_html(index=False, header=False, na_rep="")
text = lxml.html.document_fromstring(html_text).text_content()
text = soupparser_fromstring(html_text).text_content()
if include_metadata:
metadata = ElementMetadata(