mirror of
				https://github.com/HKUDS/LightRAG.git
				synced 2025-11-04 11:49:29 +00:00 
			
		
		
		
	Merge branch 'HKUDS:main' into main
This commit is contained in:
		
						commit
						a88908009c
					
				
							
								
								
									
										46
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										46
									
								
								README.md
									
									
									
									
									
								
							@ -26,7 +26,8 @@ This repository hosts the code of LightRAG. The structure of this code is based
 | 
			
		||||
</div>
 | 
			
		||||
 | 
			
		||||
## 🎉 News
 | 
			
		||||
- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author!
 | 
			
		||||
- [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise.
 | 
			
		||||
- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author.
 | 
			
		||||
- [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py).
 | 
			
		||||
- [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity).
 | 
			
		||||
- [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge.
 | 
			
		||||
@ -327,6 +328,49 @@ with open("./newText.txt") as f:
 | 
			
		||||
    rag.insert(f.read())
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Insert Custom KG
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
rag = LightRAG(
 | 
			
		||||
     working_dir=WORKING_DIR,
 | 
			
		||||
     llm_model_func=llm_model_func,
 | 
			
		||||
     embedding_func=EmbeddingFunc(
 | 
			
		||||
          embedding_dim=embedding_dimension,
 | 
			
		||||
          max_token_size=8192,
 | 
			
		||||
          func=embedding_func,
 | 
			
		||||
     ),
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
custom_kg = {
 | 
			
		||||
    "entities": [
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "CompanyA",
 | 
			
		||||
            "entity_type": "Organization",
 | 
			
		||||
            "description": "A major technology company",
 | 
			
		||||
            "source_id": "Source1"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "ProductX",
 | 
			
		||||
            "entity_type": "Product",
 | 
			
		||||
            "description": "A popular product developed by CompanyA",
 | 
			
		||||
            "source_id": "Source1"
 | 
			
		||||
        }
 | 
			
		||||
    ],
 | 
			
		||||
    "relationships": [
 | 
			
		||||
        {
 | 
			
		||||
            "src_id": "CompanyA",
 | 
			
		||||
            "tgt_id": "ProductX",
 | 
			
		||||
            "description": "CompanyA develops ProductX",
 | 
			
		||||
            "keywords": "develop, produce",
 | 
			
		||||
            "weight": 1.0,
 | 
			
		||||
            "source_id": "Source1"
 | 
			
		||||
        }
 | 
			
		||||
    ]
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
rag.insert_custom_kg(custom_kg)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Delete Entity
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										108
									
								
								examples/insert_custom_kg.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								examples/insert_custom_kg.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,108 @@
 | 
			
		||||
import os
 | 
			
		||||
from lightrag import LightRAG, QueryParam
 | 
			
		||||
from lightrag.llm import gpt_4o_mini_complete
 | 
			
		||||
#########
 | 
			
		||||
# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
 | 
			
		||||
# import nest_asyncio
 | 
			
		||||
# nest_asyncio.apply()
 | 
			
		||||
#########
 | 
			
		||||
 | 
			
		||||
WORKING_DIR = "./custom_kg"
 | 
			
		||||
 | 
			
		||||
if not os.path.exists(WORKING_DIR):
 | 
			
		||||
    os.mkdir(WORKING_DIR)
 | 
			
		||||
 | 
			
		||||
rag = LightRAG(
 | 
			
		||||
    working_dir=WORKING_DIR,
 | 
			
		||||
    llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
 | 
			
		||||
    # llm_model_func=gpt_4o_complete  # Optionally, use a stronger model
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
custom_kg = {
 | 
			
		||||
    "entities": [
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "CompanyA",
 | 
			
		||||
            "entity_type": "Organization",
 | 
			
		||||
            "description": "A major technology company",
 | 
			
		||||
            "source_id": "Source1"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "ProductX",
 | 
			
		||||
            "entity_type": "Product",
 | 
			
		||||
            "description": "A popular product developed by CompanyA",
 | 
			
		||||
            "source_id": "Source1"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "PersonA",
 | 
			
		||||
            "entity_type": "Person",
 | 
			
		||||
            "description": "A renowned researcher in AI",
 | 
			
		||||
            "source_id": "Source2"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "UniversityB",
 | 
			
		||||
            "entity_type": "Organization",
 | 
			
		||||
            "description": "A leading university specializing in technology and sciences",
 | 
			
		||||
            "source_id": "Source2"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "CityC",
 | 
			
		||||
            "entity_type": "Location",
 | 
			
		||||
            "description": "A large metropolitan city known for its culture and economy",
 | 
			
		||||
            "source_id": "Source3"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "EventY",
 | 
			
		||||
            "entity_type": "Event",
 | 
			
		||||
            "description": "An annual technology conference held in CityC",
 | 
			
		||||
            "source_id": "Source3"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "CompanyD",
 | 
			
		||||
            "entity_type": "Organization",
 | 
			
		||||
            "description": "A financial services company specializing in insurance",
 | 
			
		||||
            "source_id": "Source4"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "entity_name": "ServiceZ",
 | 
			
		||||
            "entity_type": "Service",
 | 
			
		||||
            "description": "An insurance product offered by CompanyD",
 | 
			
		||||
            "source_id": "Source4"
 | 
			
		||||
        }
 | 
			
		||||
    ],
 | 
			
		||||
    "relationships": [
 | 
			
		||||
        {
 | 
			
		||||
            "src_id": "CompanyA",
 | 
			
		||||
            "tgt_id": "ProductX",
 | 
			
		||||
            "description": "CompanyA develops ProductX",
 | 
			
		||||
            "keywords": "develop, produce",
 | 
			
		||||
            "weight": 1.0,
 | 
			
		||||
            "source_id": "Source1"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "src_id": "PersonA",
 | 
			
		||||
            "tgt_id": "UniversityB",
 | 
			
		||||
            "description": "PersonA works at UniversityB",
 | 
			
		||||
            "keywords": "employment, affiliation",
 | 
			
		||||
            "weight": 0.9,
 | 
			
		||||
            "source_id": "Source2"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "src_id": "CityC",
 | 
			
		||||
            "tgt_id": "EventY",
 | 
			
		||||
            "description": "EventY is hosted in CityC",
 | 
			
		||||
            "keywords": "host, location",
 | 
			
		||||
            "weight": 0.8,
 | 
			
		||||
            "source_id": "Source3"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "src_id": "CompanyD",
 | 
			
		||||
            "tgt_id": "ServiceZ",
 | 
			
		||||
            "description": "CompanyD provides ServiceZ",
 | 
			
		||||
            "keywords": "provide, offer",
 | 
			
		||||
            "weight": 1.0,
 | 
			
		||||
            "source_id": "Source4"
 | 
			
		||||
        }
 | 
			
		||||
    ]
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
rag.insert_custom_kg(custom_kg)
 | 
			
		||||
@ -1,5 +1,5 @@
 | 
			
		||||
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
 | 
			
		||||
 | 
			
		||||
__version__ = "1.0.1"
 | 
			
		||||
__version__ = "1.0.2"
 | 
			
		||||
__author__ = "Zirui Guo"
 | 
			
		||||
__url__ = "https://github.com/HKUDS/LightRAG"
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,6 @@
 | 
			
		||||
import asyncio
 | 
			
		||||
import os
 | 
			
		||||
from tqdm.asyncio import tqdm as tqdm_async
 | 
			
		||||
from dataclasses import asdict, dataclass, field
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from functools import partial
 | 
			
		||||
@ -242,7 +243,9 @@ class LightRAG:
 | 
			
		||||
            logger.info(f"[New Docs] inserting {len(new_docs)} docs")
 | 
			
		||||
 | 
			
		||||
            inserting_chunks = {}
 | 
			
		||||
            for doc_key, doc in new_docs.items():
 | 
			
		||||
            for doc_key, doc in tqdm_async(
 | 
			
		||||
                new_docs.items(), desc="Chunking documents", unit="doc"
 | 
			
		||||
            ):
 | 
			
		||||
                chunks = {
 | 
			
		||||
                    compute_mdhash_id(dp["content"], prefix="chunk-"): {
 | 
			
		||||
                        **dp,
 | 
			
		||||
@ -304,6 +307,108 @@ class LightRAG:
 | 
			
		||||
            tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback())
 | 
			
		||||
        await asyncio.gather(*tasks)
 | 
			
		||||
 | 
			
		||||
    def insert_custom_kg(self, custom_kg: dict):
 | 
			
		||||
        loop = always_get_an_event_loop()
 | 
			
		||||
        return loop.run_until_complete(self.ainsert_custom_kg(custom_kg))
 | 
			
		||||
 | 
			
		||||
    async def ainsert_custom_kg(self, custom_kg: dict):
 | 
			
		||||
        update_storage = False
 | 
			
		||||
        try:
 | 
			
		||||
            # Insert entities into knowledge graph
 | 
			
		||||
            all_entities_data = []
 | 
			
		||||
            for entity_data in custom_kg.get("entities", []):
 | 
			
		||||
                entity_name = f'"{entity_data["entity_name"].upper()}"'
 | 
			
		||||
                entity_type = entity_data.get("entity_type", "UNKNOWN")
 | 
			
		||||
                description = entity_data.get("description", "No description provided")
 | 
			
		||||
                source_id = entity_data["source_id"]
 | 
			
		||||
 | 
			
		||||
                # Prepare node data
 | 
			
		||||
                node_data = {
 | 
			
		||||
                    "entity_type": entity_type,
 | 
			
		||||
                    "description": description,
 | 
			
		||||
                    "source_id": source_id,
 | 
			
		||||
                }
 | 
			
		||||
                # Insert node data into the knowledge graph
 | 
			
		||||
                await self.chunk_entity_relation_graph.upsert_node(
 | 
			
		||||
                    entity_name, node_data=node_data
 | 
			
		||||
                )
 | 
			
		||||
                node_data["entity_name"] = entity_name
 | 
			
		||||
                all_entities_data.append(node_data)
 | 
			
		||||
                update_storage = True
 | 
			
		||||
 | 
			
		||||
            # Insert relationships into knowledge graph
 | 
			
		||||
            all_relationships_data = []
 | 
			
		||||
            for relationship_data in custom_kg.get("relationships", []):
 | 
			
		||||
                src_id = f'"{relationship_data["src_id"].upper()}"'
 | 
			
		||||
                tgt_id = f'"{relationship_data["tgt_id"].upper()}"'
 | 
			
		||||
                description = relationship_data["description"]
 | 
			
		||||
                keywords = relationship_data["keywords"]
 | 
			
		||||
                weight = relationship_data.get("weight", 1.0)
 | 
			
		||||
                source_id = relationship_data["source_id"]
 | 
			
		||||
 | 
			
		||||
                # Check if nodes exist in the knowledge graph
 | 
			
		||||
                for need_insert_id in [src_id, tgt_id]:
 | 
			
		||||
                    if not (
 | 
			
		||||
                        await self.chunk_entity_relation_graph.has_node(need_insert_id)
 | 
			
		||||
                    ):
 | 
			
		||||
                        await self.chunk_entity_relation_graph.upsert_node(
 | 
			
		||||
                            need_insert_id,
 | 
			
		||||
                            node_data={
 | 
			
		||||
                                "source_id": source_id,
 | 
			
		||||
                                "description": "UNKNOWN",
 | 
			
		||||
                                "entity_type": "UNKNOWN",
 | 
			
		||||
                            },
 | 
			
		||||
                        )
 | 
			
		||||
 | 
			
		||||
                # Insert edge into the knowledge graph
 | 
			
		||||
                await self.chunk_entity_relation_graph.upsert_edge(
 | 
			
		||||
                    src_id,
 | 
			
		||||
                    tgt_id,
 | 
			
		||||
                    edge_data={
 | 
			
		||||
                        "weight": weight,
 | 
			
		||||
                        "description": description,
 | 
			
		||||
                        "keywords": keywords,
 | 
			
		||||
                        "source_id": source_id,
 | 
			
		||||
                    },
 | 
			
		||||
                )
 | 
			
		||||
                edge_data = {
 | 
			
		||||
                    "src_id": src_id,
 | 
			
		||||
                    "tgt_id": tgt_id,
 | 
			
		||||
                    "description": description,
 | 
			
		||||
                    "keywords": keywords,
 | 
			
		||||
                }
 | 
			
		||||
                all_relationships_data.append(edge_data)
 | 
			
		||||
                update_storage = True
 | 
			
		||||
 | 
			
		||||
            # Insert entities into vector storage if needed
 | 
			
		||||
            if self.entities_vdb is not None:
 | 
			
		||||
                data_for_vdb = {
 | 
			
		||||
                    compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
 | 
			
		||||
                        "content": dp["entity_name"] + dp["description"],
 | 
			
		||||
                        "entity_name": dp["entity_name"],
 | 
			
		||||
                    }
 | 
			
		||||
                    for dp in all_entities_data
 | 
			
		||||
                }
 | 
			
		||||
                await self.entities_vdb.upsert(data_for_vdb)
 | 
			
		||||
 | 
			
		||||
            # Insert relationships into vector storage if needed
 | 
			
		||||
            if self.relationships_vdb is not None:
 | 
			
		||||
                data_for_vdb = {
 | 
			
		||||
                    compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): {
 | 
			
		||||
                        "src_id": dp["src_id"],
 | 
			
		||||
                        "tgt_id": dp["tgt_id"],
 | 
			
		||||
                        "content": dp["keywords"]
 | 
			
		||||
                        + dp["src_id"]
 | 
			
		||||
                        + dp["tgt_id"]
 | 
			
		||||
                        + dp["description"],
 | 
			
		||||
                    }
 | 
			
		||||
                    for dp in all_relationships_data
 | 
			
		||||
                }
 | 
			
		||||
                await self.relationships_vdb.upsert(data_for_vdb)
 | 
			
		||||
        finally:
 | 
			
		||||
            if update_storage:
 | 
			
		||||
                await self._insert_done()
 | 
			
		||||
 | 
			
		||||
    def query(self, query: str, param: QueryParam = QueryParam()):
 | 
			
		||||
        loop = always_get_an_event_loop()
 | 
			
		||||
        return loop.run_until_complete(self.aquery(query, param))
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,7 @@
 | 
			
		||||
import asyncio
 | 
			
		||||
import json
 | 
			
		||||
import re
 | 
			
		||||
from tqdm.asyncio import tqdm as tqdm_async
 | 
			
		||||
from typing import Union
 | 
			
		||||
from collections import Counter, defaultdict
 | 
			
		||||
import warnings
 | 
			
		||||
@ -342,11 +343,15 @@ async def extract_entities(
 | 
			
		||||
        )
 | 
			
		||||
        return dict(maybe_nodes), dict(maybe_edges)
 | 
			
		||||
 | 
			
		||||
    # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings
 | 
			
		||||
    results = await asyncio.gather(
 | 
			
		||||
        *[_process_single_content(c) for c in ordered_chunks]
 | 
			
		||||
    )
 | 
			
		||||
    print()  # clear the progress bar
 | 
			
		||||
    results = []
 | 
			
		||||
    for result in tqdm_async(
 | 
			
		||||
        asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]),
 | 
			
		||||
        total=len(ordered_chunks),
 | 
			
		||||
        desc="Extracting entities from chunks",
 | 
			
		||||
        unit="chunk",
 | 
			
		||||
    ):
 | 
			
		||||
        results.append(await result)
 | 
			
		||||
 | 
			
		||||
    maybe_nodes = defaultdict(list)
 | 
			
		||||
    maybe_edges = defaultdict(list)
 | 
			
		||||
    for m_nodes, m_edges in results:
 | 
			
		||||
@ -354,18 +359,38 @@ async def extract_entities(
 | 
			
		||||
            maybe_nodes[k].extend(v)
 | 
			
		||||
        for k, v in m_edges.items():
 | 
			
		||||
            maybe_edges[tuple(sorted(k))].extend(v)
 | 
			
		||||
    all_entities_data = await asyncio.gather(
 | 
			
		||||
        *[
 | 
			
		||||
            _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
 | 
			
		||||
            for k, v in maybe_nodes.items()
 | 
			
		||||
        ]
 | 
			
		||||
    )
 | 
			
		||||
    all_relationships_data = await asyncio.gather(
 | 
			
		||||
        *[
 | 
			
		||||
            _merge_edges_then_upsert(k[0], k[1], v, knowledge_graph_inst, global_config)
 | 
			
		||||
            for k, v in maybe_edges.items()
 | 
			
		||||
        ]
 | 
			
		||||
    )
 | 
			
		||||
    logger.info("Inserting entities into storage...")
 | 
			
		||||
    all_entities_data = []
 | 
			
		||||
    for result in tqdm_async(
 | 
			
		||||
        asyncio.as_completed(
 | 
			
		||||
            [
 | 
			
		||||
                _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
 | 
			
		||||
                for k, v in maybe_nodes.items()
 | 
			
		||||
            ]
 | 
			
		||||
        ),
 | 
			
		||||
        total=len(maybe_nodes),
 | 
			
		||||
        desc="Inserting entities",
 | 
			
		||||
        unit="entity",
 | 
			
		||||
    ):
 | 
			
		||||
        all_entities_data.append(await result)
 | 
			
		||||
 | 
			
		||||
    logger.info("Inserting relationships into storage...")
 | 
			
		||||
    all_relationships_data = []
 | 
			
		||||
    for result in tqdm_async(
 | 
			
		||||
        asyncio.as_completed(
 | 
			
		||||
            [
 | 
			
		||||
                _merge_edges_then_upsert(
 | 
			
		||||
                    k[0], k[1], v, knowledge_graph_inst, global_config
 | 
			
		||||
                )
 | 
			
		||||
                for k, v in maybe_edges.items()
 | 
			
		||||
            ]
 | 
			
		||||
        ),
 | 
			
		||||
        total=len(maybe_edges),
 | 
			
		||||
        desc="Inserting relationships",
 | 
			
		||||
        unit="relationship",
 | 
			
		||||
    ):
 | 
			
		||||
        all_relationships_data.append(await result)
 | 
			
		||||
 | 
			
		||||
    if not len(all_entities_data):
 | 
			
		||||
        logger.warning("Didn't extract any entities, maybe your LLM is not working")
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,7 @@
 | 
			
		||||
import asyncio
 | 
			
		||||
import html
 | 
			
		||||
import os
 | 
			
		||||
from tqdm.asyncio import tqdm as tqdm_async
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from typing import Any, Union, cast
 | 
			
		||||
import networkx as nx
 | 
			
		||||
@ -95,9 +96,16 @@ class NanoVectorDBStorage(BaseVectorStorage):
 | 
			
		||||
            contents[i : i + self._max_batch_size]
 | 
			
		||||
            for i in range(0, len(contents), self._max_batch_size)
 | 
			
		||||
        ]
 | 
			
		||||
        embeddings_list = await asyncio.gather(
 | 
			
		||||
            *[self.embedding_func(batch) for batch in batches]
 | 
			
		||||
        )
 | 
			
		||||
        embedding_tasks = [self.embedding_func(batch) for batch in batches]
 | 
			
		||||
        embeddings_list = []
 | 
			
		||||
        for f in tqdm_async(
 | 
			
		||||
            asyncio.as_completed(embedding_tasks),
 | 
			
		||||
            total=len(embedding_tasks),
 | 
			
		||||
            desc="Generating embeddings",
 | 
			
		||||
            unit="batch",
 | 
			
		||||
        ):
 | 
			
		||||
            embeddings = await f
 | 
			
		||||
            embeddings_list.append(embeddings)
 | 
			
		||||
        embeddings = np.concatenate(embeddings_list)
 | 
			
		||||
        for i, d in enumerate(list_data):
 | 
			
		||||
            d["__vector__"] = embeddings[i]
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user