mirror of
https://github.com/OpenSPG/KAG.git
synced 2025-06-27 03:20:08 +00:00
fix(builder): fix doc reader (#487)
* fix builder init * add pro commit * rename graphalgoclient to graphclient * fix doc reader * fix doc reader * fix doc reader
This commit is contained in:
parent
407c73cc9e
commit
910c2d9df3
@ -591,8 +591,12 @@ class DocxReader(ReaderABC):
|
|||||||
"""Remove empty nodes and return whether this node should be kept"""
|
"""Remove empty nodes and return whether this node should be kept"""
|
||||||
# First clean children
|
# First clean children
|
||||||
node.children = [child for child in node.children if clean_node(child)]
|
node.children = [child for child in node.children if clean_node(child)]
|
||||||
# Keep node if it has content or non-empty children
|
# Keep node if it has content, non-empty children, or a meaningful title
|
||||||
return bool(node.content.strip() or node.children)
|
return bool(
|
||||||
|
node.content.strip()
|
||||||
|
or node.children
|
||||||
|
or (node.title and node.title != "root")
|
||||||
|
)
|
||||||
|
|
||||||
root.children = [child for child in root.children if clean_node(child)]
|
root.children = [child for child in root.children if clean_node(child)]
|
||||||
|
|
||||||
@ -810,10 +814,14 @@ class DocxReader(ReaderABC):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
reader = ReaderABC.from_config({"type": "docx_reader"})
|
reader = ReaderABC.from_config({"type": "docx_reader"})
|
||||||
chunks, subgraph = reader.invoke("/Users/zhangxinhong.zxh/Downloads/测试样例文件.docx")
|
dir_path = os.path.dirname(__file__)
|
||||||
|
file_path = os.path.join(
|
||||||
|
dir_path, "../../../../tests/unit/builder/data/default.docx"
|
||||||
|
)
|
||||||
|
chunks = reader.invoke(file_path, write_ckpt=False)
|
||||||
print("Extracted chunks:")
|
print("Extracted chunks:")
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
print(f"\nChunk: {chunk.name}")
|
print(f"\nChunk: {chunk.data.name}")
|
||||||
print(f"Content: {chunk.content}")
|
print(f"Content: {chunk.data.content}")
|
||||||
if chunk.parent_content:
|
if chunk.data.parent_content:
|
||||||
print(f"Parent Content: {chunk.parent_content}")
|
print(f"Parent Content: {chunk.data.parent_content}")
|
||||||
|
BIN
tests/unit/builder/data/default.docx
Normal file
BIN
tests/unit/builder/data/default.docx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user