fix: skip comment blocks in DOCXToDocument (#8764)

* fix bug #8759
* Apply suggestions from code review
* release note

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
2026-01-07 20:46:31 +00:00 · 2025-01-24 19:06:09 +08:00 · 2025-01-24 19:06:09 +08:00 · c989d9c483
commit c989d9c483
parent 223373eced
2 changed files with 7 additions and 0 deletions
--- a/haystack/components/converters/docx.py
+++ b/haystack/components/converters/docx.py
@@ -23,6 +23,7 @@ with LazyImport("Run 'pip install python-docx'") as docx_import:
    from docx.document import Document as DocxDocument
    from docx.table import Table
    from docx.text.paragraph import Paragraph
+    from lxml.etree import _Comment


@dataclass
@@ -210,6 +211,8 @@ class DOCXToDocument:
        """
        elements = []
        for element in document.element.body:
+            if isinstance(element, _Comment):
+                continue
            if element.tag.endswith("p"):
                paragraph = Paragraph(element, document)
                if paragraph.contains_page_break:
--- a/releasenotes/notes/docx-skip-comment-blocks-d3a555d0324788c7.yaml
+++ b/releasenotes/notes/docx-skip-comment-blocks-d3a555d0324788c7.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    The DOCXToDocument component now skips XML comment blocks in DOCX files; these comments previously caused errors.