fix(builder): fix doc reader (#487)

* fix builder init

* add pro commit

* rename graphalgoclient to graphclient

* fix doc reader

* fix doc reader

* fix doc reader
This commit is contained in:
Xinhong Zhang 2025-04-28 16:28:24 +08:00 committed by GitHub
parent 407c73cc9e
commit 910c2d9df3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 15 additions and 7 deletions

View File

@@ -591,8 +591,12 @@ class DocxReader(ReaderABC):
"""Remove empty nodes and return whether this node should be kept"""
# First clean children
node.children = [child for child in node.children if clean_node(child)]
# Keep node if it has content or non-empty children
return bool(node.content.strip() or node.children)
# Keep node if it has content, non-empty children, or a meaningful title
return bool(
node.content.strip()
or node.children
or (node.title and node.title != "root")
)
root.children = [child for child in root.children if clean_node(child)]
@@ -810,10 +814,14 @@ class DocxReader(ReaderABC):
if __name__ == "__main__":
reader = ReaderABC.from_config({"type": "docx_reader"})
chunks, subgraph = reader.invoke("/Users/zhangxinhong.zxh/Downloads/测试样例文件.docx")
dir_path = os.path.dirname(__file__)
file_path = os.path.join(
dir_path, "../../../../tests/unit/builder/data/default.docx"
)
chunks = reader.invoke(file_path, write_ckpt=False)
print("Extracted chunks:")
for chunk in chunks:
print(f"\nChunk: {chunk.name}")
print(f"Content: {chunk.content}")
if chunk.parent_content:
print(f"Parent Content: {chunk.parent_content}")
print(f"\nChunk: {chunk.data.name}")
print(f"Content: {chunk.data.content}")
if chunk.data.parent_content:
print(f"Parent Content: {chunk.data.parent_content}")

Binary file not shown.