fix: XML processing instruction not escaped (#4034)

`<?xml version="1.0"?>` is not escaped when Markdown is converted to HTML if it appears in a code block like the following in the Markdown file: ```` <?xml version="1.0"?> <sparql xmlns="http://www.w3.org/2005/sparql-results#"> <head></head> <boolean>true</boolean> </sparql> ```` This causes the parser to raise an error such as: > AttributeError: 'lxml.etree._ProcessingInstruction' object has no attribute 'is_phrasing'. This PR preprocesses the original Markdown text and adds indentation to `<?xml version="1.0"?>` so that the XML code is escaped when it is converted to HTML. Issue: https://github.com/Unstructured-IO/unstructured/issues/3935
2025-12-27 23:24:27 +00:00 · 2025-06-30 13:15:38 -07:00 · 2025-06-30 13:15:38 -07:00 · 66640f26fe
commit 66640f26fe
parent dab79b0c83
4 changed files with 66 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.18.2-dev1
+## 0.18.2-dev2

 ### Enhancements

@ -7,6 +7,7 @@
 ### Fixes
 - **Fixes empty HTML content** Previously, when the HTML content was empty, the partitioner would raise a TypeError: Invalid input object: NoneType. Now it will return an empty list of elements.
 - **Failproof docx malformed or merged tables** This fix prevents docx file with complex or vertical merges or malformed tables from failing at `tc_at_grid_offset` and raised `ValueError: no tc element at grid_offset=X`.
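+- **Fixes XML processing instructions not being escaped in Markdown code blocks** Previously, a processing instruction such as `<?xml version="1.0"?>` inside a Markdown code block was not escaped when the Markdown was converted to HTML, so `partition_md` failed with `AttributeError: 'lxml.etree._ProcessingInstruction' object has no attribute 'is_phrasing'`.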


 ## 0.18.1
--- a/test_unstructured/partition/test_md.py
+++ b/test_unstructured/partition/test_md.py
@ -253,3 +253,38 @@ def test_partition_md_parse_table():
    elements = partition_md(filename=filename)
    assert len(elements) > 0
    assert elements[0].category == ElementType.TABLE
+
+
+def test_partition_md_xml_processing_instruction():
+    xml_content = """```
+<?xml version="1.0"?>
+<sparql xmlns="http://www.w3.org/2005/sparql-results#">
+  <head></head>
+  <boolean>true</boolean>
+</sparql>
+```"""
+
+    elements = partition_md(text=xml_content)
+    assert len(elements) == 1
+
+
+def test_partition_md_xml_processing_instruction_with_indents():
+    xml_content = """```
+  <?xml version="1.0"?>
+<sparql xmlns="http://www.w3.org/2005/sparql-results#">
+  <head></head>
+  <boolean>true</boolean>
+</sparql>
+```"""
+
+    elements = partition_md(text=xml_content)
+    assert len(elements) == 1
+
+
+def test_partition_md_non_xml_processing_instruction():
+    php_content = """```
+    <?php echo "hello"; ?>
+    ```"""
+
+    elements = partition_md(text=php_content)
+    assert len(elements) == 1
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.18.2-dev1"  # pragma: no cover
+__version__ = "0.18.2-dev2"  # pragma: no cover
--- a/unstructured/partition/md.py
+++ b/unstructured/partition/md.py
@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import IO, Any
+import re
+from typing import IO, Any, Match

 import markdown
 import requests
@ -21,6 +22,30 @@ def optional_decode(contents: str | bytes) -> str:
 DETECTION_ORIGIN: str = "md"


+def _preprocess_markdown_code_blocks(text: str) -> str:
+    """Pre-process code blocks so that processing instructions can be properly escaped.
+
+    The markdown library can fail to properly escape processing instructions like <?xml>, <?php>,
+    etc. in code blocks. This function adds minimal indentation to the processing instruction line
+    to force markdown to treat it as text content rather than XML.
+    """
+    # Breakdown of the regex:
+    # ```\s*\n           - Opening triple backticks + optional whitespace + newline
+    # ([ \t]{0,3})?      - Capture group 1: optional 0-3 spaces/tabs (existing indentation)
+    # (<\?[a-zA-Z][^>]*\?>.*?) - Capture group 2: processing instruction + any following content
+    # \n?```             - Optional newline + closing triple backticks
+    code_block_pattern = r"```\s*\n([ \t]{0,3})?(<\?[a-zA-Z][^>]*\?>.*?)\n?```"
+
+    def indent_processing_instruction(match: Match[str]) -> str:
+        content = match.group(2)
+        # Ensure processing instruction has at least 4-space indentation
+        if content.lstrip().startswith("<?"):
+            content = "    " + content.lstrip()
+        return f"```\n{content}\n```"
+
+    return re.sub(code_block_pattern, indent_processing_instruction, text, flags=re.DOTALL)
+
+
 def partition_md(
    filename: str | None = None,
    file: IO[bytes] | None = None,
@ -73,7 +98,8 @@ def partition_md(

        text = response.text

-    html = markdown.markdown(text, extensions=["tables"])
+    processed_text = _preprocess_markdown_code_blocks(text)
+    html = markdown.markdown(processed_text, extensions=["tables"])

    return partition_html(
        text=html,