mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-06 04:11:08 +00:00
fix: update md to read umlauts in non-utf-8 files (#4037)
This PR updates `partition_md` to read files with non-UTF-8 encodings without failing. Closes issue https://github.com/Unstructured-IO/unstructured-api/issues/489
This commit is contained in:
parent
66640f26fe
commit
56e739b34c
@ -7,9 +7,9 @@
|
||||
### Fixes
|
||||
- **Fixes empty HTML content** Previously, when the HTML content was empty, the partitioner would raise a TypeError: Invalid input object: NoneType. Now it will return an empty list of elements.
|
||||
- **Failproof docx malformed or merged tables** This fix prevents docx files with complex or vertical merges or malformed tables from failing at `tc_at_grid_offset` and raising `ValueError: no tc element at grid_offset=X`.
|
||||
- **partition_md can read special characters in non-utf-8 files** Previously, `partition_md` always read the file as utf-8. Now it uses `read_txt_file`, which reads the file with its detected encoding.
|
||||
- xml code not getting escaped in a code block in a markdown file when in partition
|
||||
|
||||
|
||||
## 0.18.1
|
||||
|
||||
### Enhancements
|
||||
|
||||
1
example-docs/umlauts-non-utf8.md
Normal file
1
example-docs/umlauts-non-utf8.md
Normal file
@ -0,0 +1 @@
|
||||
Umlauts: äöüß
|
||||
5
example-docs/umlauts-utf8.md
Normal file
5
example-docs/umlauts-utf8.md
Normal file
@ -0,0 +1,5 @@
|
||||
## können
|
||||
|
||||
können
|
||||
|
||||
äöüß
|
||||
@ -255,6 +255,14 @@ def test_partition_md_parse_table():
|
||||
assert elements[0].category == ElementType.TABLE
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", ["umlauts-utf8.md", "umlauts-non-utf8.md"])
|
||||
def test_partition_md_with_umlauts(filename: str):
|
||||
filename = example_doc_path(filename)
|
||||
elements = partition_md(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[-1].text.endswith("äöüß")
|
||||
|
||||
|
||||
def test_partition_md_xml_processing_instruction():
|
||||
xml_content = """```
|
||||
<?xml version="1.0"?>
|
||||
|
||||
@ -7,6 +7,7 @@ import markdown
|
||||
import requests
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import exactly_one
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
@ -79,11 +80,10 @@ def partition_md(
|
||||
last_modified = get_last_modified_date(filename) if filename else None
|
||||
|
||||
if filename is not None:
|
||||
with open(filename, encoding="utf8") as f:
|
||||
text = optional_decode(f.read())
|
||||
_, text = read_txt_file(filename=filename)
|
||||
|
||||
elif file is not None:
|
||||
text = optional_decode(file.read())
|
||||
_, text = read_txt_file(file=file)
|
||||
|
||||
elif url is not None:
|
||||
response = requests.get(url)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user