Merge pull request #811 from da-luggas/main

Fixed broken ainsert_custom_kg()
This commit is contained in:
Yannick Stephan 2025-02-19 15:20:37 +01:00 committed by GitHub
commit 6f95ad92bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 38 additions and 3 deletions

View File

@ -461,14 +461,22 @@ custom_kg = {
{
"content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
"source_id": "Source1",
"chunk_order_index": 0,
},
{
"content": "One outstanding feature of ProductX is its advanced AI capabilities.",
"source_id": "Source1",
"chunk_order_index": 1,
},
{
"content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
"source_id": "Source2",
"chunk_order_index": 0,
},
{
"content": "None",
"source_id": "UNKNOWN",
"chunk_order_index": 0,
},
],
}

View File

@ -87,18 +87,27 @@ custom_kg = {
{
"content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
"source_id": "Source1",
"chunk_order_index": 0,
},
{
"content": "One outstanding feature of ProductX is its advanced AI capabilities.",
"source_id": "Source1",
"chunk_order_index": 1,
},
{
"content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
"source_id": "Source2",
"chunk_order_index": 0,
},
{
"content": "EventY, held in CityC, attracts technology enthusiasts and companies from around the globe.",
"source_id": "Source3",
"chunk_order_index": 0,
},
{
"content": "None",
"source_id": "UNKNOWN",
"chunk_order_index": 0,
},
],
}

View File

@ -37,6 +37,7 @@ from .utils import (
limit_async_func_call,
logger,
set_logger,
encode_string_by_tiktoken,
)
from .types import KnowledgeGraph
@ -926,11 +927,28 @@ class LightRAG:
all_chunks_data: dict[str, dict[str, str]] = {}
chunk_to_source_map: dict[str, str] = {}
for chunk_data in custom_kg.get("chunks", {}):
chunk_content = chunk_data["content"]
chunk_content = chunk_data["content"].strip()
source_id = chunk_data["source_id"]
chunk_id = compute_mdhash_id(chunk_content.strip(), prefix="chunk-")
tokens = len(
encode_string_by_tiktoken(
chunk_content, model_name=self.tiktoken_model_name
)
)
chunk_order_index = (
0
if "chunk_order_index" not in chunk_data.keys()
else chunk_data["chunk_order_index"]
)
chunk_id = compute_mdhash_id(chunk_content, prefix="chunk-")
chunk_entry = {"content": chunk_content.strip(), "source_id": source_id}
chunk_entry = {
"content": chunk_content,
"source_id": source_id,
"tokens": tokens,
"chunk_order_index": chunk_order_index,
"full_doc_id": source_id,
"status": DocStatus.PROCESSED,
}
all_chunks_data[chunk_id] = chunk_entry
chunk_to_source_map[source_id] = chunk_id
update_storage = True