mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 23:24:27 +00:00
fix: xml processing not escaped (#4034)
A processing instruction such as `<?xml version="1.0"?>` is not escaped when markdown is converted to HTML if it appears in a code block like this in the markdown file: ```` <?xml version="1.0"?> <sparql xmlns="http://www.w3.org/2005/sparql-results#"> <head></head> <boolean>true</boolean> </sparql> ```` This causes the parser to throw an error like: > AttributeError: 'lxml.etree._ProcessingInstruction' object has no attribute 'is_phrasing'. This PR pre-processes the original md file and adds indentation to `<?xml version="1.0"?>` to force the XML code to be escaped when it is converted to HTML. https://github.com/Unstructured-IO/unstructured/issues/3935
This commit is contained in:
parent
dab79b0c83
commit
66640f26fe
@ -1,4 +1,4 @@
|
||||
## 0.18.2-dev1
|
||||
## 0.18.2-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
### Fixes
|
||||
- **Fixes empty HTML content** Previously, when the HTML content was empty, the partitioner would raise a TypeError: Invalid input object: NoneType. Now it will return an empty list of elements.
|
||||
- **Failproof docx malformed or merged tables** This fix prevents docx file with complex or vertical merges or malformed tables from failing at `tc_at_grid_offset` and raised `ValueError: no tc element at grid_offset=X`.
|
||||
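- **Fix unescaped XML processing instructions in markdown code blocks** Previously, a processing instruction such as `<?xml version="1.0"?>` inside a code block in a markdown file was not escaped during partitioning, which raised `AttributeError: 'lxml.etree._ProcessingInstruction' object has no attribute 'is_phrasing'`. Now the code block is pre-processed so that the instruction is escaped.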
|
||||
|
||||
|
||||
## 0.18.1
|
||||
|
||||
@ -253,3 +253,38 @@ def test_partition_md_parse_table():
|
||||
elements = partition_md(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].category == ElementType.TABLE
|
||||
|
||||
|
||||
def test_partition_md_xml_processing_instruction():
    """A code block that opens with an XML declaration partitions into exactly one element."""
    fenced_lines = [
        "```",
        '<?xml version="1.0"?>',
        '<sparql xmlns="http://www.w3.org/2005/sparql-results#">',
        "<head></head>",
        "<boolean>true</boolean>",
        "</sparql>",
        "```",
    ]
    markdown_text = "\n".join(fenced_lines)

    partitioned = partition_md(text=markdown_text)

    assert len(partitioned) == 1
|
||||
|
||||
|
||||
def test_partition_md_xml_processing_instruction_with_indents():
    """An XML declaration that is already slightly indented still yields exactly one element.

    The processing-instruction line carries two leading spaces, which falls inside the 0-3
    space range that the markdown pre-processing regex accepts as "existing indentation".
    Without that indentation this test would be a duplicate of
    ``test_partition_md_xml_processing_instruction``.
    """
    xml_content = "\n".join(
        [
            "```",
            # Two leading spaces: indented, but by fewer than four columns.
            '  <?xml version="1.0"?>',
            '<sparql xmlns="http://www.w3.org/2005/sparql-results#">',
            "<head></head>",
            "<boolean>true</boolean>",
            "</sparql>",
            "```",
        ]
    )

    elements = partition_md(text=xml_content)

    assert len(elements) == 1
|
||||
|
||||
|
||||
def test_partition_md_non_xml_processing_instruction():
    """A code block that opens with a non-XML instruction (PHP) partitions into one element."""
    fenced_lines = [
        "```",
        '<?php echo "hello"; ?>',
        "```",
    ]
    markdown_text = "\n".join(fenced_lines)

    partitioned = partition_md(text=markdown_text)

    assert len(partitioned) == 1
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.18.2-dev1" # pragma: no cover
|
||||
__version__ = "0.18.2-dev2" # pragma: no cover
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import IO, Any
|
||||
import re
|
||||
from typing import IO, Any, Match
|
||||
|
||||
import markdown
|
||||
import requests
|
||||
@ -21,6 +22,30 @@ def optional_decode(contents: str | bytes) -> str:
|
||||
DETECTION_ORIGIN: str = "md"
|
||||
|
||||
|
||||
def _preprocess_markdown_code_blocks(text: str) -> str:
|
||||
"""Pre-process code blocks so that processing instructions can be properly escaped.
|
||||
|
||||
The markdown library can fail to properly escape processing instructions like <?xml>, <?php>,
|
||||
etc. in code blocks. This function adds minimal indentation to the processing instruction line
|
||||
to force markdown to treat it as text content rather than XML.
|
||||
"""
|
||||
# Breakdown of the regex:
|
||||
# ```\s*\n - Opening triple backticks + optional whitespace + newline
|
||||
# ([ \t]{0,3})? - Capture group 1: optional 0-3 spaces/tabs (existing indentation)
|
||||
# (<\?[a-zA-Z][^>]*\?>.*?) - Capture group 2: processing instruction + any following content
|
||||
# \n?``` - Optional newline + closing triple backticks
|
||||
code_block_pattern = r"```\s*\n([ \t]{0,3})?(<\?[a-zA-Z][^>]*\?>.*?)\n?```"
|
||||
|
||||
def indent_processing_instruction(match: Match[str]) -> str:
|
||||
content = match.group(2)
|
||||
# Ensure processing instruction has at least 4-space indentation
|
||||
if content.lstrip().startswith("<?"):
|
||||
content = " " + content.lstrip()
|
||||
return f"```\n{content}\n```"
|
||||
|
||||
return re.sub(code_block_pattern, indent_processing_instruction, text, flags=re.DOTALL)
|
||||
|
||||
|
||||
def partition_md(
|
||||
filename: str | None = None,
|
||||
file: IO[bytes] | None = None,
|
||||
@ -73,7 +98,8 @@ def partition_md(
|
||||
|
||||
text = response.text
|
||||
|
||||
html = markdown.markdown(text, extensions=["tables"])
|
||||
processed_text = _preprocess_markdown_code_blocks(text)
|
||||
html = markdown.markdown(processed_text, extensions=["tables"])
|
||||
|
||||
return partition_html(
|
||||
text=html,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user