mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 21:57:58 +00:00
feat: cleaning brick for normalizing bytes string output (#481)
* add cleaning brick for emojis * changelog and version * docs for bytes_string_to_string * different test for bytes_string_to_string
This commit is contained in:
parent
9c1c6a13f6
commit
137b4b9a2e
@ -1,4 +1,4 @@
|
|||||||
## 0.5.13-dev4
|
## 0.5.13-dev5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -6,6 +6,8 @@
|
|||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
* `bytes_string_to_string` cleaning brick for bytes string output.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
* unstructured-documents encode xml string if document_tree is `None` in `_read_xml`.
|
* unstructured-documents encode xml string if document_tree is `None` in `_read_xml`.
|
||||||
|
@ -801,6 +801,37 @@ Examples:
|
|||||||
# Returns "Look at me, I'm flying!"
|
# Returns "Look at me, I'm flying!"
|
||||||
extract_text_after(text, r"SPEAKER \d{1}:")
|
extract_text_after(text, r"SPEAKER \d{1}:")
|
||||||
|
|
||||||
|
|
||||||
|
``bytes_string_to_string``
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
Converts an output string that looks like a byte string to a string using the specified encoding. This
|
||||||
|
happens sometimes in ``partition_html`` when there is a character like an emoji that isn't expected
|
||||||
|
by the HTML parser. In that case, the encoded bytes get processed.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.core import bytes_string_to_string
|
||||||
|
|
||||||
|
text = "Hello ð\x9f\x98\x80"
|
||||||
|
# The output should be "Hello 😀"
|
||||||
|
bytes_string_to_string(text, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.core import bytes_string_to_string
|
||||||
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
|
text = """\n<html charset="utf-8"><p>Hello 😀</p></html>"""
|
||||||
|
elements = partition_html(text=text)
|
||||||
|
elements[0].apply(bytes_string_to_string)
|
||||||
|
# The output should be "Hello 😀"
|
||||||
|
elements[0].text
|
||||||
|
|
||||||
|
|
||||||
``extract_email_address``
|
``extract_email_address``
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
@ -240,3 +240,8 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc
|
|||||||
)
|
)
|
||||||
== expected
|
== expected
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bytes_string_to_string():
    """UTF-8 bytes spelled out as individual characters decode to the real text."""
    byte_like = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb"
    decoded = core.bytes_string_to_string(byte_like, "utf-8")
    assert decoded == "每日新闻"
|
||||||
|
@ -6,7 +6,7 @@ import pytest
|
|||||||
import requests
|
import requests
|
||||||
from requests.models import Response
|
from requests.models import Response
|
||||||
|
|
||||||
from unstructured.documents.elements import PageBreak
|
from unstructured.documents.elements import PageBreak, Title
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
@ -155,3 +155,9 @@ def test_partition_html_processes_chinese_chracters():
|
|||||||
html_text = "<html><div><p>每日新闻</p></div></html>"
|
html_text = "<html><div><p>每日新闻</p></div></html>"
|
||||||
elements = partition_html(text=html_text)
|
elements = partition_html(text=html_text)
|
||||||
assert elements[0].text == "每日新闻"
|
assert elements[0].text == "每日新闻"
|
||||||
|
|
||||||
|
|
||||||
|
def test_emoji_appears_with_emoji_utf8_code():
    """An emoji inside an HTML paragraph survives partitioning unchanged."""
    html_text = """\n<html charset="utf-8"><p>Hello 😀</p></html>"""
    first_element = partition_html(text=html_text)[0]
    assert first_element == Title("Hello 😀")
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.5.13-dev4" # pragma: no cover
|
__version__ = "0.5.13-dev5" # pragma: no cover
|
||||||
|
@ -238,3 +238,10 @@ def clean(
|
|||||||
cleaned_text = clean_extra_whitespace(cleaned_text) if extra_whitespace else cleaned_text
|
cleaned_text = clean_extra_whitespace(cleaned_text) if extra_whitespace else cleaned_text
|
||||||
cleaned_text = clean_bullets(cleaned_text) if bullets else cleaned_text
|
cleaned_text = clean_bullets(cleaned_text) if bullets else cleaned_text
|
||||||
return cleaned_text.strip()
|
return cleaned_text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def bytes_string_to_string(text: str, encoding: str = "utf-8") -> str:
    """Decode a string whose characters are really raw byte values.

    Each character of ``text`` is treated as a single byte equal to its code
    point, and the resulting byte sequence is decoded with ``encoding``.

    Args:
        text: String in which every character has a code point below 256.
        encoding: Name of the codec used to decode the recovered bytes.

    Returns:
        The string obtained by decoding the recovered bytes.

    Raises:
        UnicodeEncodeError: If ``text`` contains a character whose code point
            is above 255. This is a ``ValueError`` subclass.
        UnicodeDecodeError: If the recovered bytes are not valid ``encoding``.
        LookupError: If ``encoding`` is not a known codec name.
    """
    # Latin-1 maps code points 0-255 one-to-one onto byte values 0-255, so
    # encoding with it recovers exactly the bytes the characters stand for.
    text_bytes = text.encode("latin-1")
    return text_bytes.decode(encoding)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user