from __future__ import annotations

from abc import ABC, abstractmethod
from enum import Enum
import os
from dotenv import load_dotenv
from dataclasses import dataclass, field
from typing import (
    Any,
    Literal,
    TypedDict,
    TypeVar,
    Callable,
)

from .utils import EmbeddingFunc
from .types import KnowledgeGraph
from .constants import GRAPH_FIELD_SEP

# Use the .env file inside the current folder.
# This allows a different .env file for each LightRAG instance.
# OS environment variables take precedence over the .env file.
load_dotenv(dotenv_path=".env", override=False)


class TextChunkSchema(TypedDict):
    tokens: int
    content: str
    full_doc_id: str
    chunk_order_index: int
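
# Illustrative sketch (not part of the module): a dict conforming to
# TextChunkSchema as it might be produced during chunking; the values
# below are hypothetical.
#
#   chunk: TextChunkSchema = {
#       "tokens": 120,
#       "content": "LightRAG combines graph and vector retrieval...",
#       "full_doc_id": "doc-abc123",
#       "chunk_order_index": 0,
#   }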

T = TypeVar("T")


@dataclass
class QueryParam:
    """Configuration parameters for query execution in LightRAG."""

    mode: Literal["local", "global", "hybrid", "naive", "mix", "bypass"] = "global"
    """Specifies the retrieval mode:
    - "local": Focuses on context-dependent information.
    - "global": Utilizes global knowledge.
    - "hybrid": Combines local and global retrieval methods.
    - "naive": Performs a basic search without advanced techniques.
    - "mix": Integrates knowledge graph and vector retrieval.
    - "bypass": Skips retrieval and sends the query directly to the LLM.
    """

    only_need_context: bool = False
    """If True, only returns the retrieved context without generating a response."""

    only_need_prompt: bool = False
    """If True, only returns the generated prompt without producing a response."""

    response_type: str = "Multiple Paragraphs"
    """Defines the response format. Examples: 'Multiple Paragraphs', 'Single Paragraph', 'Bullet Points'."""

    stream: bool = False
    """If True, enables streaming output for real-time responses."""

    top_k: int = int(os.getenv("TOP_K", "60"))
    """Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode."""

    max_token_for_text_unit: int = int(os.getenv("MAX_TOKEN_TEXT_CHUNK", "4000"))
    """Maximum number of tokens allowed for each retrieved text chunk."""

    max_token_for_global_context: int = int(
        os.getenv("MAX_TOKEN_RELATION_DESC", "4000")
    )
    """Maximum number of tokens allocated for relationship descriptions in global retrieval."""

    max_token_for_local_context: int = int(os.getenv("MAX_TOKEN_ENTITY_DESC", "4000"))
    """Maximum number of tokens allocated for entity descriptions in local retrieval."""

    hl_keywords: list[str] = field(default_factory=list)
    """List of high-level keywords to prioritize in retrieval."""

    ll_keywords: list[str] = field(default_factory=list)
    """List of low-level keywords to refine retrieval focus."""

    conversation_history: list[dict[str, str]] = field(default_factory=list)
    """Stores past conversation history to maintain context.
    Format: [{"role": "user/assistant", "content": "message"}].
    """

    history_turns: int = 3
    """Number of complete conversation turns (user-assistant pairs) to consider in the response context."""

    ids: list[str] | None = None
    """List of ids to filter the results."""

    model_func: Callable[..., object] | None = None
    """Optional override for the LLM model function to use for this specific query.
    If provided, this will be used instead of the global model function.
    This allows using different models for different query modes.
    """

    user_prompt: str | None = None
    """User-provided prompt for the query.
    If provided, this will be used instead of the default value from the prompt template.
    """


@dataclass
class StorageNameSpace(ABC):
    namespace: str
    global_config: dict[str, Any]

    async def initialize(self):
        """Initialize the storage"""
        pass

    async def finalize(self):
        """Finalize the storage"""
        pass

    @abstractmethod
    async def index_done_callback(self) -> None:
        """Commit the storage operations after indexing"""

    @abstractmethod
    async def drop(self) -> dict[str, str]:
        """Drop all data from storage and clean up resources.

        This abstract method defines the contract for dropping all data from a storage implementation.
        Each storage type must implement this method to:
        1. Clear all data from memory and/or external storage
        2. Remove any associated storage files if applicable
        3. Reset the storage to its initial state
        4. Handle cleanup of any resources
        5. Notify other processes if necessary
        6. Persist the change to disk immediately

        Returns:
            dict[str, str]: Operation status and message with the following format:
                {
                    "status": str,  # "success" or "error"
                    "message": str  # "data dropped" on success, error details on failure
                }

        Implementation specific:
        - On success: return {"status": "success", "message": "data dropped"}
        - On failure: return {"status": "error", "message": "<error details>"}
        - If not supported: return {"status": "error", "message": "unsupported"}
        """


@dataclass
class BaseVectorStorage(StorageNameSpace, ABC):
    embedding_func: EmbeddingFunc
    cosine_better_than_threshold: float = field(default=0.2)
    meta_fields: set[str] = field(default_factory=set)

    @abstractmethod
    async def query(
        self, query: str, top_k: int, ids: list[str] | None = None
    ) -> list[dict[str, Any]]:
        """Query the vector storage and retrieve top_k results."""

    @abstractmethod
    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
        """Insert or update vectors in the storage.

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption
        """

    @abstractmethod
    async def delete_entity(self, entity_name: str) -> None:
        """Delete a single entity by its name.

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption
        """

    @abstractmethod
    async def delete_entity_relation(self, entity_name: str) -> None:
        """Delete relations for a given entity.

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption
        """

    @abstractmethod
    async def get_by_id(self, id: str) -> dict[str, Any] | None:
        """Get vector data by its ID

        Args:
            id: The unique identifier of the vector

        Returns:
            The vector data if found, or None if not found
        """
        pass

    @abstractmethod
    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
        """Get multiple vector data by their IDs

        Args:
            ids: List of unique identifiers

        Returns:
            List of vector data objects that were found
        """
        pass

    @abstractmethod
    async def delete(self, ids: list[str]):
        """Delete vectors with specified IDs

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption

        Args:
            ids: List of vector IDs to be deleted
        """


@dataclass
class BaseKVStorage(StorageNameSpace, ABC):
    embedding_func: EmbeddingFunc

    @abstractmethod
    async def get_by_id(self, id: str) -> dict[str, Any] | None:
        """Get value by id"""

    @abstractmethod
    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
        """Get values by ids"""

    @abstractmethod
    async def filter_keys(self, keys: set[str]) -> set[str]:
        """Return the subset of keys that do not exist in storage"""

    @abstractmethod
    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
        """Upsert data

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Update flags to notify other processes that data persistence is needed
        """

    @abstractmethod
    async def delete(self, ids: list[str]) -> None:
        """Delete specific records from storage by their IDs

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Update flags to notify other processes that data persistence is needed

        Args:
            ids (list[str]): List of document IDs to be deleted from storage

        Returns:
            None
        """

    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
        """Delete specific records from storage by cache mode

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Update flags to notify other processes that data persistence is needed

        Args:
            modes (list[str]): List of cache modes to be dropped from storage

        Returns:
            True: if the cache was dropped successfully
            False: if the cache drop failed or the cache mode is not supported
        """

    # async def drop_cache_by_chunk_ids(self, chunk_ids: list[str] | None = None) -> bool:
    #     """Delete specific cache records from storage by chunk IDs
    #
    #     Important notes for in-memory storage:
    #     1. Changes will be persisted to disk during the next index_done_callback
    #     2. Update flags to notify other processes that data persistence is needed
    #
    #     Args:
    #         chunk_ids (list[str]): List of chunk IDs to be dropped from storage
    #
    #     Returns:
    #         True: if the cache was dropped successfully
    #         False: if the cache drop failed, or the operation is not supported
    #     """


@dataclass
class BaseGraphStorage(StorageNameSpace, ABC):
    embedding_func: EmbeddingFunc

    @abstractmethod
    async def has_node(self, node_id: str) -> bool:
        """Check if a node exists in the graph.

        Args:
            node_id: The ID of the node to check

        Returns:
            True if the node exists, False otherwise
        """

    @abstractmethod
    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
        """Check if an edge exists between two nodes.

        Args:
            source_node_id: The ID of the source node
            target_node_id: The ID of the target node

        Returns:
            True if the edge exists, False otherwise
        """

    @abstractmethod
    async def node_degree(self, node_id: str) -> int:
        """Get the degree (number of connected edges) of a node.

        Args:
            node_id: The ID of the node

        Returns:
            The number of edges connected to the node
        """

    @abstractmethod
    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
        """Get the total degree of an edge (sum of degrees of its source and target nodes).

        Args:
            src_id: The ID of the source node
            tgt_id: The ID of the target node

        Returns:
            The sum of the degrees of the source and target nodes
        """

    @abstractmethod
    async def get_node(self, node_id: str) -> dict[str, str] | None:
        """Get node by its ID, returning only node properties.

        Args:
            node_id: The ID of the node to retrieve

        Returns:
            A dictionary of node properties if found, None otherwise
        """

    @abstractmethod
    async def get_edge(
        self, source_node_id: str, target_node_id: str
    ) -> dict[str, str] | None:
        """Get edge properties between two nodes.

        Args:
            source_node_id: The ID of the source node
            target_node_id: The ID of the target node

        Returns:
            A dictionary of edge properties if found, None otherwise
        """

    @abstractmethod
    async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
        """Get all edges connected to a node.

        Args:
            source_node_id: The ID of the node to get edges for

        Returns:
            A list of (source_id, target_id) tuples representing edges,
            or None if the node doesn't exist
        """

    async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, dict]:
        """Get nodes as a batch (backends may use bulk queries such as UNWIND).

        Default implementation fetches nodes one by one.
        Override this method for better performance in storage backends
        that support batch operations.
        """
        result = {}
        for node_id in node_ids:
            node = await self.get_node(node_id)
            if node is not None:
                result[node_id] = node
        return result

    async def node_degrees_batch(self, node_ids: list[str]) -> dict[str, int]:
        """Get node degrees as a batch (backends may use bulk queries such as UNWIND).

        Default implementation fetches node degrees one by one.
        Override this method for better performance in storage backends
        that support batch operations.
        """
        result = {}
        for node_id in node_ids:
            degree = await self.node_degree(node_id)
            result[node_id] = degree
        return result

    async def edge_degrees_batch(
        self, edge_pairs: list[tuple[str, str]]
    ) -> dict[tuple[str, str], int]:
        """Get edge degrees as a batch (backends may build on node_degrees_batch).

        Default implementation calculates edge degrees one by one.
        Override this method for better performance in storage backends
        that support batch operations.
        """
        result = {}
        for src_id, tgt_id in edge_pairs:
            degree = await self.edge_degree(src_id, tgt_id)
            result[(src_id, tgt_id)] = degree
        return result

    async def get_edges_batch(
        self, pairs: list[dict[str, str]]
    ) -> dict[tuple[str, str], dict]:
        """Get edges as a batch (backends may use bulk queries such as UNWIND).

        Each pair is a dict with "src" and "tgt" keys.
        Default implementation fetches edges one by one.
        Override this method for better performance in storage backends
        that support batch operations.
        """
        result = {}
        for pair in pairs:
            src_id = pair["src"]
            tgt_id = pair["tgt"]
            edge = await self.get_edge(src_id, tgt_id)
            if edge is not None:
                result[(src_id, tgt_id)] = edge
        return result

    async def get_nodes_edges_batch(
        self, node_ids: list[str]
    ) -> dict[str, list[tuple[str, str]]]:
        """Get node edges as a batch (backends may use bulk queries such as UNWIND).

        Default implementation fetches node edges one by one.
        Override this method for better performance in storage backends
        that support batch operations.
        """
        result = {}
        for node_id in node_ids:
            edges = await self.get_node_edges(node_id)
            result[node_id] = edges if edges is not None else []
        return result
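
    # Illustrative sketch (not part of the API): a backend without native bulk
    # queries can still reduce latency by running the per-item calls
    # concurrently rather than sequentially, e.g.:
    #
    #   import asyncio
    #
    #   async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, dict]:
    #       nodes = await asyncio.gather(*(self.get_node(nid) for nid in node_ids))
    #       return {nid: n for nid, n in zip(node_ids, nodes) if n is not None}
    #
    # Backends with true bulk primitives (e.g. Cypher UNWIND) should instead
    # override with a single round-trip query.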

    @abstractmethod
    async def get_nodes_by_chunk_ids(self, chunk_ids: list[str]) -> list[dict]:
        """Get all nodes that are associated with the given chunk_ids.

        Args:
            chunk_ids (list[str]): A list of chunk IDs to find associated nodes for.

        Returns:
            list[dict]: A list of nodes, where each node is a dictionary of its properties.
                An empty list if no matching nodes are found.
        """
        # Default implementation iterates through all nodes, which is inefficient.
        # This method should be overridden by subclasses for better performance.
        all_nodes = []
        all_labels = await self.get_all_labels()
        for label in all_labels:
            node = await self.get_node(label)
            if node and "source_id" in node:
                source_ids = set(node["source_id"].split(GRAPH_FIELD_SEP))
                if not source_ids.isdisjoint(chunk_ids):
                    all_nodes.append(node)
        return all_nodes

    @abstractmethod
    async def get_edges_by_chunk_ids(self, chunk_ids: list[str]) -> list[dict]:
        """Get all edges that are associated with the given chunk_ids.

        Args:
            chunk_ids (list[str]): A list of chunk IDs to find associated edges for.

        Returns:
            list[dict]: A list of edges, where each edge is a dictionary of its properties.
                An empty list if no matching edges are found.
        """
        # Default implementation iterates through all nodes and their edges, which is inefficient.
        # This method should be overridden by subclasses for better performance.
        all_edges = []
        all_labels = await self.get_all_labels()
        processed_edges = set()

        for label in all_labels:
            edges = await self.get_node_edges(label)
            if edges:
                for src_id, tgt_id in edges:
                    # Avoid processing the same edge twice in an undirected graph
                    edge_tuple = tuple(sorted((src_id, tgt_id)))
                    if edge_tuple in processed_edges:
                        continue
                    processed_edges.add(edge_tuple)

                    edge = await self.get_edge(src_id, tgt_id)
                    if edge and "source_id" in edge:
                        source_ids = set(edge["source_id"].split(GRAPH_FIELD_SEP))
                        if not source_ids.isdisjoint(chunk_ids):
                            # Add source and target to the edge dict for easier processing later
                            edge_with_nodes = edge.copy()
                            edge_with_nodes["source"] = src_id
                            edge_with_nodes["target"] = tgt_id
                            all_edges.append(edge_with_nodes)
        return all_edges

    @abstractmethod
    async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None:
        """Insert a new node or update an existing node in the graph.

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption

        Args:
            node_id: The ID of the node to insert or update
            node_data: A dictionary of node properties
        """

    @abstractmethod
    async def upsert_edge(
        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
    ) -> None:
        """Insert a new edge or update an existing edge in the graph.

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption

        Args:
            source_node_id: The ID of the source node
            target_node_id: The ID of the target node
            edge_data: A dictionary of edge properties
        """

    @abstractmethod
    async def delete_node(self, node_id: str) -> None:
        """Delete a node from the graph.

        Important notes for in-memory storage:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption

        Args:
            node_id: The ID of the node to delete
        """

    @abstractmethod
    async def remove_nodes(self, nodes: list[str]):
        """Delete multiple nodes

        Important notes:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption

        Args:
            nodes: List of node IDs to be deleted
        """

    @abstractmethod
    async def remove_edges(self, edges: list[tuple[str, str]]):
        """Delete multiple edges

        Important notes:
        1. Changes will be persisted to disk during the next index_done_callback
        2. Only one process should update the storage at a time before index_done_callback;
           the KG-storage-log should be used to avoid data corruption

        Args:
            edges: List of edges to be deleted, each edge is a (source, target) tuple
        """

    @abstractmethod
    async def get_all_labels(self) -> list[str]:
        """Get all labels in the graph.

        Returns:
            A list of all node labels in the graph, sorted alphabetically
        """

    @abstractmethod
    async def get_knowledge_graph(
        self, node_label: str, max_depth: int = 3, max_nodes: int = 1000
    ) -> KnowledgeGraph:
        """
        Retrieve a connected subgraph of nodes whose labels include the specified `node_label`.

        Args:
            node_label: Label of the starting node; "*" means all nodes
            max_depth: Maximum depth of the subgraph. Defaults to 3
            max_nodes: Maximum number of nodes to return. Defaults to 1000 (BFS if possible)

        Returns:
            KnowledgeGraph object containing nodes and edges, with an is_truncated flag
            indicating whether the graph was truncated due to the max_nodes limit
        """


class DocStatus(str, Enum):
    """Document processing status"""

    PENDING = "pending"
    PROCESSING = "processing"
    PROCESSED = "processed"
    FAILED = "failed"


@dataclass
class DocProcessingStatus:
    """Document processing status data structure"""

    content: str
    """Original content of the document"""
    content_summary: str
    """First 100 chars of document content, used for preview"""
    content_length: int
    """Total length of document"""
    file_path: str
    """File path of the document"""
    status: DocStatus
    """Current processing status"""
    created_at: str
    """ISO format timestamp when document was created"""
    updated_at: str
    """ISO format timestamp when document was last updated"""
    chunks_count: int | None = None
    """Number of chunks after splitting, used for processing"""
    error: str | None = None
    """Error message if failed"""
    metadata: dict[str, Any] = field(default_factory=dict)
    """Additional metadata"""


@dataclass
class DocStatusStorage(BaseKVStorage, ABC):
    """Base class for document status storage"""

    @abstractmethod
    async def get_status_counts(self) -> dict[str, int]:
        """Get counts of documents in each status"""

    @abstractmethod
    async def get_docs_by_status(
        self, status: DocStatus
    ) -> dict[str, DocProcessingStatus]:
        """Get all documents with a specific status"""

    async def drop_cache_by_modes(self, modes: list[str] | None = None) -> bool:
        """Drop cache is not supported for Doc Status storage"""
        return False


class StoragesStatus(str, Enum):
    """Storages status"""

    NOT_CREATED = "not_created"
    CREATED = "created"
    INITIALIZED = "initialized"
    FINALIZED = "finalized"


@dataclass
class DeletionResult:
    """Represents the result of a deletion operation."""

    status: Literal["success", "not_found", "fail"]
    doc_id: str
    message: str
    status_code: int = 200
    file_path: str | None = None