from __future__ import annotations

from abc import ABC, abstractmethod
from enum import Enum
import os
from dotenv import load_dotenv
from dataclasses import dataclass, field
from typing import (
    Any,
    Literal,
    TypedDict,
    TypeVar,
)

import numpy as np

from .utils import EmbeddingFunc
from .types import KnowledgeGraph

load_dotenv()


class TextChunkSchema(TypedDict):
    tokens: int
    content: str
    full_doc_id: str
    chunk_order_index: int


T = TypeVar("T")
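

# Illustrative sketch (not part of the public API): how a TextChunkSchema record
# might look once a document has been split into chunks. The values below are
# assumptions chosen only to show the expected shape and types.
def _example_text_chunk() -> TextChunkSchema:
    return {
        "tokens": 512,                 # token count of this chunk
        "content": "Lorem ipsum ...",  # raw chunk text
        "full_doc_id": "doc-123",      # id of the source document
        "chunk_order_index": 0,        # position of the chunk within the document
    }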


@dataclass
class QueryParam:
    """Configuration parameters for query execution in LightRAG."""

    mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global"
    """Specifies the retrieval mode:
    - "local": Focuses on context-dependent information.
    - "global": Utilizes global knowledge.
    - "hybrid": Combines local and global retrieval methods.
    - "naive": Performs a basic search without advanced techniques.
    - "mix": Integrates knowledge graph and vector retrieval.
    """

    only_need_context: bool = False
    """If True, only returns the retrieved context without generating a response."""

    only_need_prompt: bool = False
    """If True, only returns the generated prompt without producing a response."""

    response_type: str = "Multiple Paragraphs"
    """Defines the response format. Examples: 'Multiple Paragraphs', 'Single Paragraph', 'Bullet Points'."""

    stream: bool = False
    """If True, enables streaming output for real-time responses."""

    top_k: int = int(os.getenv("TOP_K", "60"))
    """Number of top items to retrieve. Represents entities in 'local' mode and relationships in 'global' mode."""

    max_token_for_text_unit: int = int(os.getenv("MAX_TOKEN_TEXT_CHUNK", "4000"))
    """Maximum number of tokens allowed for each retrieved text chunk."""

    max_token_for_global_context: int = int(
        os.getenv("MAX_TOKEN_RELATION_DESC", "4000")
    )
    """Maximum number of tokens allocated for relationship descriptions in global retrieval."""

    max_token_for_local_context: int = int(os.getenv("MAX_TOKEN_ENTITY_DESC", "4000"))
    """Maximum number of tokens allocated for entity descriptions in local retrieval."""

    hl_keywords: list[str] = field(default_factory=list)
    """List of high-level keywords to prioritize in retrieval."""

    ll_keywords: list[str] = field(default_factory=list)
    """List of low-level keywords to refine retrieval focus."""

    conversation_history: list[dict[str, str]] = field(default_factory=list)
    """Stores past conversation history to maintain context.
    Format: [{"role": "user/assistant", "content": "message"}].
    """

    history_turns: int = 3
    """Number of complete conversation turns (user-assistant pairs) to consider in the response context."""
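

# Illustrative sketch (assumption, not part of the library API): constructing a
# QueryParam for a hybrid query with a short conversation history, using the
# documented {"role": ..., "content": ...} format. All values are examples only.
def _example_query_param() -> QueryParam:
    return QueryParam(
        mode="hybrid",  # combine local and global retrieval
        top_k=20,       # override the TOP_K default for this query
        conversation_history=[
            {"role": "user", "content": "What is LightRAG?"},
            {"role": "assistant", "content": "A graph-augmented RAG framework."},
        ],
        history_turns=1,  # only the most recent user-assistant pair is considered
    )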


@dataclass
class StorageNameSpace(ABC):
    namespace: str
    global_config: dict[str, Any]

    @abstractmethod
    async def index_done_callback(self) -> None:
        """Commit the storage operations after indexing"""


@dataclass
class BaseVectorStorage(StorageNameSpace, ABC):
    embedding_func: EmbeddingFunc
    cosine_better_than_threshold: float = field(default=0.2)
    meta_fields: set[str] = field(default_factory=set)

    @abstractmethod
    async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
        """Query the vector storage and retrieve top_k results."""

    @abstractmethod
    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
        """Insert or update vectors in the storage."""

    @abstractmethod
    async def delete_entity(self, entity_name: str) -> None:
        """Delete a single entity by its name."""

    @abstractmethod
    async def delete_entity_relation(self, entity_name: str) -> None:
        """Delete relations for a given entity."""
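

# Minimal in-memory sketch of a BaseVectorStorage implementation (an assumption
# for illustration only, not a storage backend shipped with the library). It
# assumes `embedding_func` can be awaited as `await self.embedding_func([text])`
# and returns a 2-D numpy array, and that each upserted payload carries a
# "content" field to embed; both are assumptions of this sketch. Cosine
# similarity is compared against `cosine_better_than_threshold` before a hit
# is returned.
@dataclass
class _InMemoryVectorStorageSketch(BaseVectorStorage):
    _vectors: dict[str, np.ndarray] = field(default_factory=dict)
    _payloads: dict[str, dict[str, Any]] = field(default_factory=dict)

    async def index_done_callback(self) -> None:
        pass  # nothing to persist for an in-memory store

    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
        for doc_id, payload in data.items():
            embedding = await self.embedding_func([payload["content"]])
            self._vectors[doc_id] = np.asarray(embedding[0], dtype=np.float32)
            self._payloads[doc_id] = payload

    async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
        if not self._vectors:
            return []
        query_vec = np.asarray((await self.embedding_func([query]))[0], dtype=np.float32)
        scored: list[dict[str, Any]] = []
        for doc_id, vec in self._vectors.items():
            score = float(
                np.dot(query_vec, vec)
                / (np.linalg.norm(query_vec) * np.linalg.norm(vec) + 1e-8)
            )
            if score > self.cosine_better_than_threshold:
                scored.append({"id": doc_id, "distance": score, **self._payloads[doc_id]})
        scored.sort(key=lambda item: item["distance"], reverse=True)
        return scored[:top_k]

    async def delete_entity(self, entity_name: str) -> None:
        self._vectors.pop(entity_name, None)
        self._payloads.pop(entity_name, None)

    async def delete_entity_relation(self, entity_name: str) -> None:
        # This sketch keeps no separate relation index, so this is a no-op.
        pass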


@dataclass
class BaseKVStorage(StorageNameSpace, ABC):
    embedding_func: EmbeddingFunc

    @abstractmethod
    async def get_by_id(self, id: str) -> dict[str, Any] | None:
        """Get a value by its id."""

    @abstractmethod
    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
        """Get values by their ids."""

    @abstractmethod
    async def filter_keys(self, keys: set[str]) -> set[str]:
        """Return the subset of keys that do not yet exist in the storage."""

    @abstractmethod
    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
        """Insert or update data."""
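

# Minimal dict-backed sketch of a BaseKVStorage implementation (illustrative
# assumption, not a bundled backend). It mainly shows the contract of
# `filter_keys`: given a set of candidate keys, return only those that are not
# yet stored, so callers can skip re-indexing documents they already know.
@dataclass
class _InMemoryKVStorageSketch(BaseKVStorage):
    _data: dict[str, dict[str, Any]] = field(default_factory=dict)

    async def index_done_callback(self) -> None:
        pass  # nothing to flush for an in-memory store

    async def get_by_id(self, id: str) -> dict[str, Any] | None:
        return self._data.get(id)

    async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
        return [self._data[doc_id] for doc_id in ids if doc_id in self._data]

    async def filter_keys(self, keys: set[str]) -> set[str]:
        return {key for key in keys if key not in self._data}

    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
        self._data.update(data)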


@dataclass
class BaseGraphStorage(StorageNameSpace, ABC):
    embedding_func: EmbeddingFunc

    @abstractmethod
    async def has_node(self, node_id: str) -> bool:
        """Check if a node exists in the graph."""

    @abstractmethod
    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
        """Check if an edge exists in the graph."""

    @abstractmethod
    async def node_degree(self, node_id: str) -> int:
        """Get the degree of a node."""

    @abstractmethod
    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
        """Get the degree of an edge."""

    @abstractmethod
    async def get_node(self, node_id: str) -> dict[str, str] | None:
        """Get a node by its id."""

    @abstractmethod
    async def get_edge(
        self, source_node_id: str, target_node_id: str
    ) -> dict[str, str] | None:
        """Get an edge by its source and target node ids."""

    @abstractmethod
    async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
        """Get all edges connected to a node."""

    @abstractmethod
    async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None:
        """Upsert a node into the graph."""

    @abstractmethod
    async def upsert_edge(
        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
    ) -> None:
        """Upsert an edge into the graph."""

    @abstractmethod
    async def delete_node(self, node_id: str) -> None:
        """Delete a node from the graph."""

    @abstractmethod
    async def embed_nodes(
        self, algorithm: str
    ) -> tuple[np.ndarray[Any, Any], list[str]]:
        """Embed nodes using the given algorithm."""

    @abstractmethod
    async def get_all_labels(self) -> list[str]:
        """Get all labels in the graph."""

    @abstractmethod
    async def get_knowledge_graph(
        self, node_label: str, max_depth: int = 5
    ) -> KnowledgeGraph:
        """Retrieve a subgraph of the knowledge graph starting from a given node."""
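

# Illustrative usage sketch (assumption): how calling code might populate a
# BaseGraphStorage implementation and then pull a bounded subgraph back out.
# The node and edge attribute keys used here are examples, not a required schema.
async def _example_build_graph(graph: BaseGraphStorage) -> KnowledgeGraph:
    await graph.upsert_node("Alice", {"entity_type": "person", "description": "Example entity"})
    await graph.upsert_node("Acme", {"entity_type": "organization", "description": "Example entity"})
    await graph.upsert_edge("Alice", "Acme", {"description": "works at", "weight": "1.0"})
    if await graph.has_edge("Alice", "Acme"):
        await graph.index_done_callback()  # let the backend commit its indexing work
    # Retrieve the neighbourhood around "Alice" up to a small depth.
    return await graph.get_knowledge_graph(node_label="Alice", max_depth=2)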


class DocStatus(str, Enum):
    """Document processing status"""

    PENDING = "pending"
    PROCESSING = "processing"
    PROCESSED = "processed"
    FAILED = "failed"


@dataclass
class DocProcessingStatus:
    """Document processing status data structure"""

    content: str
    """Original content of the document"""
    content_summary: str
    """First 100 chars of document content, used for preview"""
    content_length: int
    """Total length of the document"""
    status: DocStatus
    """Current processing status"""
    created_at: str
    """ISO format timestamp when the document was created"""
    updated_at: str
    """ISO format timestamp when the document was last updated"""
    chunks_count: int | None = None
    """Number of chunks after splitting, used for processing"""
    error: str | None = None
    """Error message if processing failed"""
    metadata: dict[str, Any] = field(default_factory=dict)
    """Additional metadata"""
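

# Illustrative sketch (assumption): building a DocProcessingStatus record for a
# freshly ingested document. Timestamps use ISO format as documented; the
# content handling below is an example, not prescribed by the library.
def _example_doc_status(content: str) -> DocProcessingStatus:
    from datetime import datetime, timezone  # local import to keep the sketch self-contained

    now = datetime.now(timezone.utc).isoformat()
    return DocProcessingStatus(
        content=content,
        content_summary=content[:100],  # preview is the first 100 characters
        content_length=len(content),
        status=DocStatus.PENDING,       # newly enqueued, not yet processed
        created_at=now,
        updated_at=now,
    )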


@dataclass
class DocStatusStorage(BaseKVStorage, ABC):
    """Base class for document status storage"""

    @abstractmethod
    async def get_status_counts(self) -> dict[str, int]:
        """Get counts of documents in each status"""

    @abstractmethod
    async def get_docs_by_status(
        self, status: DocStatus
    ) -> dict[str, DocProcessingStatus]:
        """Get all documents with a specific status"""
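

# Illustrative usage sketch (assumption): how a caller might inspect a
# DocStatusStorage implementation to decide what still needs processing.
async def _example_report_pending(doc_status_store: DocStatusStorage) -> list[str]:
    counts = await doc_status_store.get_status_counts()
    print(f"documents per status: {counts}")
    pending = await doc_status_store.get_docs_by_status(DocStatus.PENDING)
    # Return the ids of documents that are still waiting to be processed.
    return list(pending.keys())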