mirror of https://github.com/run-llama/llama-hub.git
synced 2025-08-17 21:22:02 +00:00
116 lines · 3.4 KiB · Python
"""DeepLake reader."""
|
||
|
from typing import List, Optional, Union
|
||
|
|
||
|
import numpy as np
|
||
|
from llama_index.readers.base import BaseReader
|
||
|
from llama_index.readers.schema.base import Document
|
||
|
|
||
|
distance_metric_map = {
    "l2": lambda a, b: np.linalg.norm(a - b, axis=1, ord=2),
    "l1": lambda a, b: np.linalg.norm(a - b, axis=1, ord=1),
    "max": lambda a, b: np.linalg.norm(a - b, axis=1, ord=np.inf),
    "cos": lambda a, b: np.dot(a, b.T)
    / (np.linalg.norm(a) * np.linalg.norm(b, axis=1)),
    "dot": lambda a, b: np.dot(a, b.T),
}


def vector_search(
    query_vector: Union[List, np.ndarray],
    data_vectors: np.ndarray,
    distance_metric: str = "l2",
    limit: Optional[int] = 4,
) -> List:
    """Naive search for nearest neighbors.

    Args:
        query_vector (Union[List, np.ndarray]): Query vector of shape (dim,).
        data_vectors (np.ndarray): Candidate vectors of shape (n, dim).
        distance_metric (str): 'l2' for Euclidean, 'l1' for Manhattan,
            'max' for l-infinity distance, 'cos' for cosine similarity,
            'dot' for dot product.
        limit (Optional[int]): Number of nearest neighbors to return.

    Returns:
        List: Indices of the nearest neighbors, closest first.
    """
    if isinstance(query_vector, list):
        query_vector = np.array(query_vector)
    query_vector = query_vector.reshape(1, -1)

    # The similarity metrics ('cos', 'dot') produce a (1, n) array while the
    # norm-based metrics produce (n,). Flatten so that argsort and slicing
    # operate on the n data points, not on the length-1 row axis.
    distances = np.asarray(
        distance_metric_map[distance_metric](query_vector, data_vectors)
    ).reshape(-1)

    nearest_indices = np.argsort(distances)

    # 'cos' and 'dot' measure similarity (higher is closer), so take the
    # largest scores; the remaining metrics are distances (lower is closer).
    if distance_metric in ("cos", "dot"):
        nearest_indices = nearest_indices[::-1]

    return nearest_indices[:limit].tolist()
|
||
|
|
||
|
|
||
|
class DeepLakeReader(BaseReader):
    """DeepLake reader.

    Retrieve documents from existing DeepLake datasets.

    Args:
        token (Optional[str]): Optional DeepLake token used to authenticate
            when loading datasets.
    """

    def __init__(
        self,
        token: Optional[str] = None,
    ):
        """Initialize the DeepLake reader.

        Raises:
            ImportError: If the `deeplake` package is not installed.
        """
        import_err_msg = (
            "`deeplake` package not found, please run `pip install deeplake`"
        )
        # Fail fast at construction time if the optional dependency is absent.
        try:
            import deeplake  # noqa: F401
        except ImportError:
            raise ImportError(import_err_msg)
        self.token = token

    def load_data(
        self,
        query_vector: List[float],
        dataset_path: str,
        limit: int = 4,
        distance_metric: str = "l2",
    ) -> List[Document]:
        """Load data from DeepLake.

        Args:
            query_vector (List[float]): Query vector.
            dataset_path (str): Path of the DeepLake dataset to load.
            limit (int): Number of results to return.
            distance_metric (str): Metric understood by ``vector_search``
                ('l2', 'l1', 'max', 'cos' or 'dot').

        Returns:
            List[Document]: A list of documents.

        Raises:
            TensorDoesNotExistError: If the dataset has no usable
                `embedding` tensor.
        """
        import deeplake
        from deeplake.util.exceptions import TensorDoesNotExistError

        dataset = deeplake.load(dataset_path, token=self.token)

        try:
            embeddings = dataset.embedding.numpy(fetch_chunks=True)
        except Exception as exc:
            # Chain the original exception so the real cause of the failure
            # is not hidden by this broad `except`.
            raise TensorDoesNotExistError("embedding") from exc

        indices = vector_search(
            query_vector, embeddings, distance_metric=distance_metric, limit=limit
        )

        documents = []
        for idx in indices:
            document = Document(
                doc_id=dataset[idx].ids.numpy().tolist()[0],
                text=str(dataset[idx].text.numpy().tolist()[0]),
            )
            documents.append(document)

        return documents
|