fix: update md to read umlauts in non-utf-8 files (#4037)

This PR updates `partition_md` to read files with non-UTF-8
encodings without failing.

Closes issue
https://github.com/Unstructured-IO/unstructured-api/issues/489
This commit is contained in:
Klaijan 2025-07-01 12:38:30 -04:00 committed by GitHub
parent 66640f26fe
commit 56e739b34c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 18 additions and 4 deletions

View File

@ -7,9 +7,9 @@
### Fixes
- **Fixes empty HTML content** Previously, when the HTML content was empty, the partitioner would raise a TypeError: Invalid input object: NoneType. Now it will return an empty list of elements.
- **Failproof docx malformed or merged tables** This fix prevents docx files with complex or vertical merges or malformed tables from failing at `tc_at_grid_offset` and raising `ValueError: no tc element at grid_offset=X`.
- **partition_md can read special characters in non-utf-8 files** Previously, `partition_md` always read the file as utf-8. Now it uses `read_txt_file`, which reads the file with its detected encoding.
- xml code not getting escaped in a code block in a markdown file when in partition
## 0.18.1
### Enhancements

View File

@ -0,0 +1 @@
Umlauts: äöüß

View File

@ -0,0 +1,5 @@
## können
können
äöüß

View File

@ -255,6 +255,14 @@ def test_partition_md_parse_table():
assert elements[0].category == ElementType.TABLE
@pytest.mark.parametrize("filename", ["umlauts-utf8.md", "umlauts-non-utf8.md"])
def test_partition_md_with_umlauts(filename: str):
    """Check that both umlaut fixtures partition into elements ending in "äöüß".

    Runs once per fixture name: the name is resolved to a path with
    `example_doc_path`, passed to `partition_md`, and the last element's
    text is checked for the umlaut suffix.
    """
    doc_path = example_doc_path(filename)

    elements = partition_md(filename=doc_path)

    # At least one element must come back before the last one is inspected.
    assert len(elements) > 0
    last_text = elements[-1].text
    assert last_text.endswith("äöüß")
def test_partition_md_xml_processing_instruction():
xml_content = """```
<?xml version="1.0"?>

View File

@ -7,6 +7,7 @@ import markdown
import requests
from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
@ -79,11 +80,10 @@ def partition_md(
last_modified = get_last_modified_date(filename) if filename else None
if filename is not None:
with open(filename, encoding="utf8") as f:
text = optional_decode(f.read())
_, text = read_txt_file(filename=filename)
elif file is not None:
text = optional_decode(file.read())
_, text = read_txt_file(file=file)
elif url is not None:
response = requests.get(url)