Fix/1057 etree parser error tsv (#1106)

* feat: always parse `html_text` with `soupparser_fromstring`, which gracefully handles emoji
* chore: update changelog & version
This commit is contained in:
Christine Straub 2023-08-13 18:22:36 -07:00 committed by GitHub
parent b4b8ac4d8a
commit fc2699ff06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 45 additions and 15 deletions

View File

@ -1,7 +1,8 @@
## 0.9.3-dev1 ## 0.9.3-dev2
### Enhancements ### Enhancements
* Update `partition_tsv` to always use `soupparser_fromstring` to parse `html_text`, which gracefully handles emoji
* Add `metadata.section` to capture epub table of contents data * Add `metadata.section` to capture epub table of contents data
* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID * Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
for element IDs instead of a SHA-256 hash. for element IDs instead of a SHA-256 hash.

View File

@ -0,0 +1,6 @@
Stanley Cups
Team Location Stanley Cups
Blues STL 1
Flyers PHI 2
Maple Leafs TOR 13
👨\U+1F3FB🔧 TOR 15
1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13
6 👨\U+1F3FB🔧 TOR 15

View File

@ -1,4 +1,11 @@
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT import pytest
from test_unstructured.partition.test_constants import (
EXPECTED_TABLE,
EXPECTED_TABLE_WITH_EMOJI,
EXPECTED_TEXT,
EXPECTED_TEXT_WITH_EMOJI,
)
from unstructured.cleaners.core import clean_extra_whitespace from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table from unstructured.documents.elements import Table
from unstructured.partition.tsv import partition_tsv from unstructured.partition.tsv import partition_tsv
@ -6,14 +13,22 @@ from unstructured.partition.tsv import partition_tsv
EXPECTED_FILETYPE = "text/tsv" EXPECTED_FILETYPE = "text/tsv"
def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"): @pytest.mark.parametrize(
elements = partition_tsv(filename=filename) ("filename", "expected_text", "expected_table"),
[
("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
],
)
def test_partition_tsv_from_filename(filename, expected_text, expected_table):
f_path = f"example-docs/{filename}"
elements = partition_tsv(filename=f_path)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert clean_extra_whitespace(elements[0].text) == expected_text
assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements: for element in elements:
assert element.metadata.filename == "stanley-cups.tsv" assert element.metadata.filename == filename
def test_partition_tsv_from_filename_with_metadata_filename( def test_partition_tsv_from_filename_with_metadata_filename(
@ -26,13 +41,21 @@ def test_partition_tsv_from_filename_with_metadata_filename(
assert element.metadata.filename == "test" assert element.metadata.filename == "test"
def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"): @pytest.mark.parametrize(
with open(filename, "rb") as f: ("filename", "expected_text", "expected_table"),
[
("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
],
)
def test_partition_tsv_from_file(filename, expected_text, expected_table):
f_path = f"example-docs/{filename}"
with open(f_path, "rb") as f:
elements = partition_tsv(file=f) elements = partition_tsv(file=f)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert clean_extra_whitespace(elements[0].text) == expected_text
assert isinstance(elements[0], Table) assert isinstance(elements[0], Table)
assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements: for element in elements:
assert element.metadata.filename is None assert element.metadata.filename is None

View File

@ -1 +1 @@
__version__ = "0.9.3-dev1" # pragma: no cover __version__ = "0.9.3-dev2" # pragma: no cover

View File

@ -1,8 +1,8 @@
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring
from unstructured.documents.elements import ( from unstructured.documents.elements import (
Element, Element,
@ -55,7 +55,7 @@ def partition_tsv(
last_modification_date = get_last_modified_date_from_file(file) last_modification_date = get_last_modified_date_from_file(file)
html_text = table.to_html(index=False, header=False, na_rep="") html_text = table.to_html(index=False, header=False, na_rep="")
text = lxml.html.document_fromstring(html_text).text_content() text = soupparser_fromstring(html_text).text_content()
if include_metadata: if include_metadata:
metadata = ElementMetadata( metadata = ElementMetadata(