fix: xml processing not escaped (#4034)

A processing instruction such as `<?xml version="1.0"?>` is not escaped when the markdown is
converted to HTML if it appears inside a code block like this one:
````
<?xml version="1.0"?>
<sparql xmlns="http://www.w3.org/2005/sparql-results#">
  <head></head>
  <boolean>true</boolean>
</sparql>
````
which causes the parser to throw an error like:

> AttributeError: 'lxml.etree._ProcessingInstruction' object has no
attribute 'is_phrasing'.

This PR pre-processes the original markdown text and adds indentation to
the `<?xml version="1.0"?>` line, forcing the XML code to be escaped when
it is converted to HTML.

https://github.com/Unstructured-IO/unstructured/issues/3935
This commit is contained in:
jiajun-unstructured 2025-06-30 13:15:38 -07:00 committed by GitHub
parent dab79b0c83
commit 66640f26fe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 66 additions and 4 deletions

View File

@ -1,4 +1,4 @@
## 0.18.2-dev1
## 0.18.2-dev2
### Enhancements
@ -7,6 +7,7 @@
### Fixes
- **Fixes empty HTML content** Previously, when the HTML content was empty, the partitioner would raise a TypeError: Invalid input object: NoneType. Now it will return an empty list of elements.
- **Failproof docx malformed or merged tables** This fix prevents docx file with complex or vertical merges or malformed tables from failing at `tc_at_grid_offset` and raised `ValueError: no tc element at grid_offset=X`.
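- **Fixes unescaped XML processing instructions in markdown code blocks** Previously, a processing instruction such as `<?xml version="1.0"?>` inside a code block of a markdown file was not escaped when converted to HTML, and partitioning raised `AttributeError: 'lxml.etree._ProcessingInstruction' object has no attribute 'is_phrasing'`. Now the processing-instruction line is indented before conversion so that it is escaped.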
## 0.18.1

View File

@ -253,3 +253,38 @@ def test_partition_md_parse_table():
elements = partition_md(filename=filename)
assert len(elements) > 0
assert elements[0].category == ElementType.TABLE
def test_partition_md_xml_processing_instruction():
    """A code block that opens with an `<?xml ...?>` line partitions into one element."""
    markdown_source = """```
<?xml version="1.0"?>
<sparql xmlns="http://www.w3.org/2005/sparql-results#">
<head></head>
<boolean>true</boolean>
</sparql>
```"""

    partitioned = partition_md(text=markdown_source)

    # The whole fenced block is expected to come back as a single element.
    assert len(partitioned) == 1
def test_partition_md_xml_processing_instruction_with_indents():
    """A code block whose `<?xml ...?>` line is already slightly indented yields one element.

    The processing-instruction line carries two leading spaces so that this test covers the
    "existing 0-3 spaces/tabs of indentation" branch of the markdown pre-processing regex,
    rather than duplicating the unindented case.
    """
    xml_content = """```
  <?xml version="1.0"?>
<sparql xmlns="http://www.w3.org/2005/sparql-results#">
<head></head>
<boolean>true</boolean>
</sparql>
```"""
    elements = partition_md(text=xml_content)
    assert len(elements) == 1
def test_partition_md_non_xml_processing_instruction():
    """A code block holding a non-XML processing instruction (`<?php ... ?>`) gives one element."""
    markdown_source = """```
<?php echo "hello"; ?>
```"""

    partitioned = partition_md(text=markdown_source)

    # The whole fenced block is expected to come back as a single element.
    assert len(partitioned) == 1

View File

@ -1 +1 @@
__version__ = "0.18.2-dev1" # pragma: no cover
__version__ = "0.18.2-dev2" # pragma: no cover

View File

@ -1,6 +1,7 @@
from __future__ import annotations
from typing import IO, Any
import re
from typing import IO, Any, Match
import markdown
import requests
@ -21,6 +22,30 @@ def optional_decode(contents: str | bytes) -> str:
DETECTION_ORIGIN: str = "md"
def _preprocess_markdown_code_blocks(text: str) -> str:
"""Pre-process code blocks so that processing instructions can be properly escaped.
The markdown library can fail to properly escape processing instructions like <?xml>, <?php>,
etc. in code blocks. This function adds minimal indentation to the processing instruction line
to force markdown to treat it as text content rather than XML.
"""
# Breakdown of the regex:
# ```\s*\n - Opening triple backticks + optional whitespace + newline
# ([ \t]{0,3})? - Capture group 1: optional 0-3 spaces/tabs (existing indentation)
# (<\?[a-zA-Z][^>]*\?>.*?) - Capture group 2: processing instruction + any following content
# \n?``` - Optional newline + closing triple backticks
code_block_pattern = r"```\s*\n([ \t]{0,3})?(<\?[a-zA-Z][^>]*\?>.*?)\n?```"
def indent_processing_instruction(match: Match[str]) -> str:
content = match.group(2)
# Ensure processing instruction has at least 4-space indentation
if content.lstrip().startswith("<?"):
content = " " + content.lstrip()
return f"```\n{content}\n```"
return re.sub(code_block_pattern, indent_processing_instruction, text, flags=re.DOTALL)
def partition_md(
filename: str | None = None,
file: IO[bytes] | None = None,
@ -73,7 +98,8 @@ def partition_md(
text = response.text
html = markdown.markdown(text, extensions=["tables"])
processed_text = _preprocess_markdown_code_blocks(text)
html = markdown.markdown(processed_text, extensions=["tables"])
return partition_html(
text=html,