# llama-hub/loader_hub/chroma/base.py

"""Chroma Reader."""

from typing import Any

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class ChromaReader(BaseReader):
    """Chroma reader.

    Retrieve documents from existing persisted Chroma collections.

    Args:
        collection_name: Name of the persisted collection.
        persist_directory: Directory where the collection is persisted.

    Raises:
        ImportError: If the `chromadb` package is not installed.
        ValueError: If `collection_name` or `persist_directory` is None.

    """

    def __init__(
        self,
        collection_name: str,
        persist_directory: str,
    ) -> None:
        """Initialize with parameters."""
        import_err_msg = (
            "`chromadb` package not found, please run `pip install chromadb`"
        )
        # Import lazily so the module can be imported without chromadb
        # installed; surface an actionable message if it is missing.
        try:
            import chromadb
            from chromadb.config import Settings
        except ImportError as err:
            raise ImportError(import_err_msg) from err

        if (collection_name is None) or (persist_directory is None):
            raise ValueError("Please provide a collection name and persist directory.")

        self._client = chromadb.Client(
            Settings(
                chroma_db_impl="duckdb+parquet", persist_directory=persist_directory
            )
        )
        self._collection = self._client.get_collection(collection_name)

    def load_data(
        self,
        query_vector: Any,
        limit: int = 10,
    ) -> Any:
        """Load data from Chroma.

        Every hit returned by the collection query is converted into a
        ``Document``; hits are emitted query by query, in the order the
        collection returned them.

        Args:
            query_vector (Any): Query embedding(s), forwarded unchanged as
                ``query_embeddings`` to the collection's ``query`` method.
            limit (int): Number of results to return (per query embedding).

        Returns:
            List[Document]: A list of documents.
        """
        results = self._collection.query(query_embeddings=query_vector, n_results=limit)

        # The result columns are nested lists: the outer level has one entry
        # per query embedding, the inner level one entry per hit.
        ids_by_query = results["ids"]
        texts_by_query = results["documents"]
        # Embeddings are optional in the query result; tolerate their absence
        # instead of failing while iterating over None.
        embeddings_by_query = results.get("embeddings")

        documents = []
        for query_index, hit_ids in enumerate(ids_by_query):
            hit_texts = texts_by_query[query_index]
            if embeddings_by_query is None:
                hit_embeddings = [None] * len(hit_ids)
            else:
                hit_embeddings = embeddings_by_query[query_index]

            # Emit every hit, not just the first one, so `limit` is honoured.
            for doc_id, text, embedding in zip(hit_ids, hit_texts, hit_embeddings):
                documents.append(
                    Document(
                        doc_id=doc_id,
                        text=text,
                        embedding=embedding,
                    )
                )

        return documents