From c989d9c483a4fc74147e17065779fd6103f4a084 Mon Sep 17 00:00:00 2001 From: Night-Quiet Date: Fri, 24 Jan 2025 19:06:09 +0800 Subject: [PATCH] fix: skip comment blocks in `DOCXToDocument` (#8764) * fix bug #8759 * Apply suggestions from code review * release note --------- Co-authored-by: Stefano Fiorucci --- haystack/components/converters/docx.py | 3 +++ .../notes/docx-skip-comment-blocks-d3a555d0324788c7.yaml | 4 ++++ 2 files changed, 7 insertions(+) create mode 100644 releasenotes/notes/docx-skip-comment-blocks-d3a555d0324788c7.yaml diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py index b9d59bd56..8f9a58004 100644 --- a/haystack/components/converters/docx.py +++ b/haystack/components/converters/docx.py @@ -23,6 +23,7 @@ with LazyImport("Run 'pip install python-docx'") as docx_import: from docx.document import Document as DocxDocument from docx.table import Table from docx.text.paragraph import Paragraph + from lxml.etree import _Comment @dataclass @@ -210,6 +211,8 @@ class DOCXToDocument: """ elements = [] for element in document.element.body: + if isinstance(element, _Comment): + continue if element.tag.endswith("p"): paragraph = Paragraph(element, document) if paragraph.contains_page_break: diff --git a/releasenotes/notes/docx-skip-comment-blocks-d3a555d0324788c7.yaml b/releasenotes/notes/docx-skip-comment-blocks-d3a555d0324788c7.yaml new file mode 100644 index 000000000..e213aa694 --- /dev/null +++ b/releasenotes/notes/docx-skip-comment-blocks-d3a555d0324788c7.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + The DOCXToDocument component now skips comment blocks in DOCX files that previously caused errors.