mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Fix/1057 etree parser error tsv (#1106)
* feat: always use `soupparser_fromstring` to parse `html text`, which gracefully handles emoji
* chore: update changelog & version
This commit is contained in:
parent
b4b8ac4d8a
commit
fc2699ff06
@ -1,7 +1,8 @@
|
|||||||
## 0.9.3-dev1
|
## 0.9.3-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* Update `partition_tsv` to always use `soupparser_fromstring` to parse `html text`
|
||||||
* Add `metadata.section` to capture epub table of contents data
|
* Add `metadata.section` to capture epub table of contents data
|
||||||
* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
|
* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
|
||||||
for element IDs instead of a SHA-256 hash.
|
for element IDs instead of a SHA-256 hash.
|
||||||
|
6
example-docs/stanley-cups-with-emoji.tsv
Normal file
6
example-docs/stanley-cups-with-emoji.tsv
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
Stanley Cups
|
||||||
|
Team Location Stanley Cups
|
||||||
|
Blues STL 1
|
||||||
|
Flyers PHI 2
|
||||||
|
Maple Leafs TOR 13
|
||||||
|
👨\U+1F3FB🔧 TOR 15
|
|
@ -1,4 +1,11 @@
|
|||||||
from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
|
import pytest
|
||||||
|
|
||||||
|
from test_unstructured.partition.test_constants import (
|
||||||
|
EXPECTED_TABLE,
|
||||||
|
EXPECTED_TABLE_WITH_EMOJI,
|
||||||
|
EXPECTED_TEXT,
|
||||||
|
EXPECTED_TEXT_WITH_EMOJI,
|
||||||
|
)
|
||||||
from unstructured.cleaners.core import clean_extra_whitespace
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
from unstructured.documents.elements import Table
|
from unstructured.documents.elements import Table
|
||||||
from unstructured.partition.tsv import partition_tsv
|
from unstructured.partition.tsv import partition_tsv
|
||||||
@ -6,14 +13,22 @@ from unstructured.partition.tsv import partition_tsv
|
|||||||
EXPECTED_FILETYPE = "text/tsv"
|
EXPECTED_FILETYPE = "text/tsv"
|
||||||
|
|
||||||
|
|
||||||
def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
|
@pytest.mark.parametrize(
|
||||||
elements = partition_tsv(filename=filename)
|
("filename", "expected_text", "expected_table"),
|
||||||
|
[
|
||||||
|
("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
|
||||||
|
("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_tsv_from_filename(filename, expected_text, expected_table):
|
||||||
|
f_path = f"example-docs/{filename}"
|
||||||
|
elements = partition_tsv(filename=f_path)
|
||||||
|
|
||||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
assert clean_extra_whitespace(elements[0].text) == expected_text
|
||||||
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
assert elements[0].metadata.text_as_html == expected_table
|
||||||
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
||||||
for element in elements:
|
for element in elements:
|
||||||
assert element.metadata.filename == "stanley-cups.tsv"
|
assert element.metadata.filename == filename
|
||||||
|
|
||||||
|
|
||||||
def test_partition_tsv_from_filename_with_metadata_filename(
|
def test_partition_tsv_from_filename_with_metadata_filename(
|
||||||
@ -26,13 +41,21 @@ def test_partition_tsv_from_filename_with_metadata_filename(
|
|||||||
assert element.metadata.filename == "test"
|
assert element.metadata.filename == "test"
|
||||||
|
|
||||||
|
|
||||||
def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"):
|
@pytest.mark.parametrize(
|
||||||
with open(filename, "rb") as f:
|
("filename", "expected_text", "expected_table"),
|
||||||
|
[
|
||||||
|
("stanley-cups.tsv", EXPECTED_TEXT, EXPECTED_TABLE),
|
||||||
|
("stanley-cups-with-emoji.tsv", EXPECTED_TEXT_WITH_EMOJI, EXPECTED_TABLE_WITH_EMOJI),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_partition_tsv_from_file(filename, expected_text, expected_table):
|
||||||
|
f_path = f"example-docs/{filename}"
|
||||||
|
with open(f_path, "rb") as f:
|
||||||
elements = partition_tsv(file=f)
|
elements = partition_tsv(file=f)
|
||||||
|
|
||||||
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
|
assert clean_extra_whitespace(elements[0].text) == expected_text
|
||||||
assert isinstance(elements[0], Table)
|
assert isinstance(elements[0], Table)
|
||||||
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
|
assert elements[0].metadata.text_as_html == expected_table
|
||||||
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
||||||
for element in elements:
|
for element in elements:
|
||||||
assert element.metadata.filename is None
|
assert element.metadata.filename is None
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.9.3-dev1" # pragma: no cover
|
__version__ = "0.9.3-dev2" # pragma: no cover
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
from typing import IO, BinaryIO, List, Optional, Union, cast
|
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||||
|
|
||||||
import lxml.html
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from lxml.html.soupparser import fromstring as soupparser_fromstring
|
||||||
|
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
Element,
|
Element,
|
||||||
@ -55,7 +55,7 @@ def partition_tsv(
|
|||||||
last_modification_date = get_last_modified_date_from_file(file)
|
last_modification_date = get_last_modified_date_from_file(file)
|
||||||
|
|
||||||
html_text = table.to_html(index=False, header=False, na_rep="")
|
html_text = table.to_html(index=False, header=False, na_rep="")
|
||||||
text = lxml.html.document_fromstring(html_text).text_content()
|
text = soupparser_fromstring(html_text).text_content()
|
||||||
|
|
||||||
if include_metadata:
|
if include_metadata:
|
||||||
metadata = ElementMetadata(
|
metadata = ElementMetadata(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user