diff --git a/CHANGELOG.md b/CHANGELOG.md index cfafcb371..63ed7124a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.5.13-dev4 +## 0.5.13-dev5 ### Enhancements @@ -6,6 +6,8 @@ ### Features +* `bytes_string_to_string` cleaning brick for bytes string output. + ### Fixes * unstructured-documents encode xml string if document_tree is `None` in `_read_xml`. diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index a0d0a12c2..7b959208e 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -801,6 +801,37 @@ Examples: # Returns "Look at me, I'm flying!" extract_text_after(text, r"SPEAKER \d{1}:") + +``bytes_string_to_string`` +--------------------------- + +Converts an output string that looks like a byte string to a string using the specified encoding. This +happens sometimes in ``partition_html`` when there is a character like an emoji that isn't expected +by the HTML parser. In that case, the encoded bytes get processed. + +Examples: + +.. code:: python + + from unstructured.cleaners.core import bytes_string_to_string + + text = "Hello ð\x9f\x98\x80" + # The output should be "Hello 😀" + bytes_string_to_string(text, encoding="utf-8") + + +.. code:: python + + from unstructured.cleaners.core import bytes_string_to_string + from unstructured.partition.html import partition_html + + text = """\n

Hello 😀

""" + elements = partition_html(text=text) + elements[0].apply(bytes_string_to_string) + # The output should be "Hello 😀" + elements[0].text + + ``extract_email_address`` -------------------------- diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index 8c28428b1..196aa19a3 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -240,3 +240,8 @@ def test_clean(text, extra_whitespace, dashes, bullets, lowercase, trailing_punc ) == expected ) + + +def test_bytes_string_to_string(): + text = "\xe6\xaf\x8f\xe6\x97\xa5\xe6\x96\xb0\xe9\x97\xbb" + assert core.bytes_string_to_string(text, "utf-8") == "每日新闻" diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 4fa6e3ace..51fab4ef2 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -6,7 +6,7 @@ import pytest import requests from requests.models import Response -from unstructured.documents.elements import PageBreak +from unstructured.documents.elements import PageBreak, Title from unstructured.partition.html import partition_html DIRECTORY = pathlib.Path(__file__).parent.resolve() @@ -155,3 +155,9 @@ def test_partition_html_processes_chinese_chracters(): html_text = "

每日新闻

" elements = partition_html(text=html_text) assert elements[0].text == "每日新闻" + + +def test_emoji_appears_with_emoji_utf8_code(): + html_text = """\n

Hello 😀

""" + elements = partition_html(text=html_text) + assert elements[0] == Title("Hello 😀") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 20c9e2db5..cf303ac8f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.13-dev4" # pragma: no cover +__version__ = "0.5.13-dev5" # pragma: no cover diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index e9cb8a1d5..d034df082 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -238,3 +238,10 @@ def clean( cleaned_text = clean_extra_whitespace(cleaned_text) if extra_whitespace else cleaned_text cleaned_text = clean_bullets(cleaned_text) if bullets else cleaned_text return cleaned_text.strip() + + +def bytes_string_to_string(text: str, encoding: str = "utf-8"): + """Converts a string representation of a byte string to a regular string using the + specified encoding.""" + text_bytes = bytes([ord(char) for char in text]) + return text_bytes.decode(encoding)