mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-24 21:48:52 +00:00
fix: skip comment blocks in DOCXToDocument (#8764)
* fix bug #8759
* Apply suggestions from code review
* release note

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
This commit is contained in:
parent
223373eced
commit
c989d9c483
@ -23,6 +23,7 @@ with LazyImport("Run 'pip install python-docx'") as docx_import:
|
||||
from docx.document import Document as DocxDocument
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
from lxml.etree import _Comment
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -210,6 +211,8 @@ class DOCXToDocument:
|
||||
"""
|
||||
elements = []
|
||||
for element in document.element.body:
|
||||
if isinstance(element, _Comment):
|
||||
continue
|
||||
if element.tag.endswith("p"):
|
||||
paragraph = Paragraph(element, document)
|
||||
if paragraph.contains_page_break:
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
The DOCXToDocument component now skips comment blocks in DOCX files, which previously caused errors.
|
||||
Loading…
x
Reference in New Issue
Block a user