# LightRAG/lightrag/kg/qdrant_impl.py
import asyncio
import os
from typing import Any, final
from dataclasses import dataclass
import numpy as np
import hashlib
import uuid
from ..utils import logger
from ..base import BaseVectorStorage
import configparser

config = configparser.ConfigParser()
config.read("config.ini", "utf-8")

import pipmaster as pm

if not pm.is_installed("qdrant-client"):
    pm.install("qdrant-client")

from qdrant_client import QdrantClient, models


def compute_mdhash_id_for_qdrant(
    content: str, prefix: str = "", style: str = "simple"
) -> str:
    """
    Generate a deterministic UUID from the given content, in one of several formats.

    :param content: The content used to generate the UUID.
    :param prefix: Optional prefix hashed together with the content (e.g. "ent-").
    :param style: The format of the UUID; one of "simple", "hyphenated", or "urn".
    :return: A UUID string that meets the requirements of Qdrant point IDs.
    """
    if not content:
        raise ValueError("Content must not be empty.")

    # Derive the UUID from a SHA-256 hash of the prefixed content, so the same
    # input always maps to the same point ID
    hashed_content = hashlib.sha256((prefix + content).encode("utf-8")).digest()
    generated_uuid = uuid.UUID(bytes=hashed_content[:16], version=4)

    # Return the UUID in the requested format
    if style == "simple":
        return generated_uuid.hex
    elif style == "hyphenated":
        return str(generated_uuid)
    elif style == "urn":
        return f"urn:uuid:{generated_uuid}"
    else:
        raise ValueError("Invalid style. Choose from 'simple', 'hyphenated', or 'urn'.")
@final
@dataclass
class QdrantVectorDBStorage(BaseVectorStorage):
    @staticmethod
    def create_collection_if_not_exist(
        client: QdrantClient, collection_name: str, **kwargs
    ):
        # Idempotent: only create the collection when it does not exist yet
        if client.collection_exists(collection_name):
            return
        client.create_collection(collection_name, **kwargs)

    def __post_init__(self):
        kwargs = self.global_config.get("vector_db_storage_cls_kwargs", {})
        cosine_threshold = kwargs.get("cosine_better_than_threshold")
        if cosine_threshold is None:
            raise ValueError(
                "cosine_better_than_threshold must be specified in vector_db_storage_cls_kwargs"
            )
        self.cosine_better_than_threshold = cosine_threshold

        # Environment variables take precedence over config.ini settings
        self._client = QdrantClient(
            url=os.environ.get(
                "QDRANT_URL", config.get("qdrant", "uri", fallback=None)
            ),
            api_key=os.environ.get(
                "QDRANT_API_KEY", config.get("qdrant", "apikey", fallback=None)
            ),
        )
        self._max_batch_size = self.global_config["embedding_batch_num"]
        QdrantVectorDBStorage.create_collection_if_not_exist(
            self._client,
            self.namespace,
            vectors_config=models.VectorParams(
                size=self.embedding_func.embedding_dim, distance=models.Distance.COSINE
            ),
        )
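
    # A minimal connection sketch (assumed values for a local deployment):
    # QDRANT_URL / QDRANT_API_KEY may be set in the environment, or the same
    # settings can live in config.ini:
    #
    #   [qdrant]
    #   uri = http://localhost:6333
    #   apikey = <your-api-key>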

    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
        logger.info(f"Inserting {len(data)} records to {self.namespace}")
        if not data:
            return

        # Keep only the configured meta fields in each point's payload
        list_data = [
            {
                "id": k,
                **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
            }
            for k, v in data.items()
        ]
        contents = [v["content"] for v in data.values()]
        # Embed contents concurrently in batches of at most _max_batch_size
        batches = [
            contents[i : i + self._max_batch_size]
            for i in range(0, len(contents), self._max_batch_size)
        ]
        embedding_tasks = [self.embedding_func(batch) for batch in batches]
        embeddings_list = await asyncio.gather(*embedding_tasks)
        embeddings = np.concatenate(embeddings_list)

        list_points = []
        for i, d in enumerate(list_data):
            list_points.append(
                models.PointStruct(
                    id=compute_mdhash_id_for_qdrant(d["id"]),
                    vector=embeddings[i],
                    payload=d,
                )
            )
        # wait=True blocks until the upsert is applied; the operation result
        # is not returned, matching the declared None return type
        self._client.upsert(
            collection_name=self.namespace, points=list_points, wait=True
        )
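
    # A hedged usage sketch (field names are illustrative; only keys listed in
    # meta_fields survive into the payload):
    #
    #   await storage.upsert(
    #       {"doc-123": {"content": "some chunk text", "full_doc_id": "doc-1"}}
    #   )
    #
    # The dict key becomes the payload "id"; the Qdrant point ID is its
    # deterministic UUID hash.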

    async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
        embedding = await self.embedding_func([query])
        results = self._client.search(
            collection_name=self.namespace,
            query_vector=embedding[0],
            limit=top_k,
            with_payload=True,
            # Drop hits whose cosine similarity falls below the threshold
            score_threshold=self.cosine_better_than_threshold,
        )

        logger.debug(f"query result: {results}")
        return [{**dp.payload, "id": dp.id, "distance": dp.score} for dp in results]
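
    # A minimal query sketch (illustrative): each hit is the stored payload
    # plus the Qdrant point ID and the cosine similarity score.
    #
    #   hits = await storage.query("what is LightRAG?", top_k=5)
    #   # -> [{"id": "<uuid>", "distance": 0.83, ...payload fields...}, ...]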

    async def index_done_callback(self) -> None:
        # Qdrant handles persistence automatically
        pass

    async def delete(self, ids: list[str]) -> None:
        """Delete vectors with specified IDs

        Args:
            ids: List of original vector IDs to be deleted
        """
        try:
            # Convert original IDs to Qdrant-compatible UUIDs
            qdrant_ids = [compute_mdhash_id_for_qdrant(id) for id in ids]
            # Delete points from the collection
            self._client.delete(
                collection_name=self.namespace,
                points_selector=models.PointIdsList(
                    points=qdrant_ids,
                ),
                wait=True,
            )
            logger.debug(
                f"Successfully deleted {len(ids)} vectors from {self.namespace}"
            )
        except Exception as e:
            logger.error(f"Error while deleting vectors from {self.namespace}: {e}")

    async def delete_entity(self, entity_name: str) -> None:
        """Delete an entity by name

        Args:
            entity_name: Name of the entity to delete
        """
        try:
            # Generate the entity ID (entities are stored with an "ent-" prefix)
            entity_id = compute_mdhash_id_for_qdrant(entity_name, prefix="ent-")
            logger.debug(
                f"Attempting to delete entity {entity_name} with ID {entity_id}"
            )

            # Delete the entity point from the collection
            self._client.delete(
                collection_name=self.namespace,
                points_selector=models.PointIdsList(
                    points=[entity_id],
                ),
                wait=True,
            )
            logger.debug(f"Successfully deleted entity {entity_name}")
        except Exception as e:
            logger.error(f"Error deleting entity {entity_name}: {e}")

    async def delete_entity_relation(self, entity_name: str) -> None:
        """Delete all relations associated with an entity

        Args:
            entity_name: Name of the entity whose relations should be deleted
        """
        try:
            # Find relations where the entity is either source or target
            results = self._client.scroll(
                collection_name=self.namespace,
                scroll_filter=models.Filter(
                    should=[
                        models.FieldCondition(
                            key="src_id", match=models.MatchValue(value=entity_name)
                        ),
                        models.FieldCondition(
                            key="tgt_id", match=models.MatchValue(value=entity_name)
                        ),
                    ]
                ),
                with_payload=True,
                limit=1000,  # Adjust as needed for your use case
            )

            # Extract points that need to be deleted
            relation_points = results[0]
            ids_to_delete = [point.id for point in relation_points]

            if ids_to_delete:
                # Delete the relations
                self._client.delete(
                    collection_name=self.namespace,
                    points_selector=models.PointIdsList(
                        points=ids_to_delete,
                    ),
                    wait=True,
                )
                logger.debug(
                    f"Deleted {len(ids_to_delete)} relations for {entity_name}"
                )
            else:
                logger.debug(f"No relations found for entity {entity_name}")
        except Exception as e:
            logger.error(f"Error deleting relations for {entity_name}: {e}")

    async def search_by_prefix(self, prefix: str) -> list[dict[str, Any]]:
        """Search for records whose original IDs start with a specific prefix.

        Args:
            prefix: The prefix to search for in record IDs

        Returns:
            List of records with matching ID prefixes
        """
        try:
            # The original string ID lives in the payload (Qdrant point IDs are
            # UUIDs), so scroll with a full-text match on the "id" field.
            # MatchText matches whole tokens and needs a text payload index on
            # "id"; the startswith check below filters out partial-token hits.
            results = self._client.scroll(
                collection_name=self.namespace,
                scroll_filter=models.Filter(
                    must=[
                        models.FieldCondition(
                            key="id", match=models.MatchText(text=prefix)
                        )
                    ]
                ),
                with_payload=True,
                with_vectors=False,
                limit=1000,  # Adjust as needed for your use case
            )

            # Keep only points whose stored ID actually starts with the prefix
            matching_records = [
                point
                for point in results[0]
                if str(point.payload.get("id", "")).startswith(prefix)
            ]

            # Format the results to match the expected return format
            formatted_results = [
                {**point.payload, "id": point.id} for point in matching_records
            ]
            logger.debug(
                f"Found {len(formatted_results)} records with prefix '{prefix}'"
            )
            return formatted_results
        except Exception as e:
            logger.error(f"Error searching for prefix '{prefix}': {e}")
            return []
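
    # A minimal usage sketch (the prefix value is illustrative):
    #
    #   records = await storage.search_by_prefix("chunk-")
    #   # -> [{"id": ..., ...payload fields...}, ...]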