mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-29 10:56:40 +00:00
153 lines
6.4 KiB
Python
153 lines
6.4 KiB
Python
from copy import deepcopy
|
|
import math
|
|
|
|
import pytest
|
|
import numpy as np
|
|
|
|
from haystack.schema import Document
|
|
|
|
from ..conftest import document_store
|
|
|
|
|
|
# Shared fixture data: six raw document dicts, each carrying a freshly drawn random
# 768-dimensional embedding. The "03" entry of each year is cast to float64 while the
# others are float32, so both dtypes are present in the fixture.
DOCUMENTS = [
    {
        "meta": {"name": f"name_{position}", "year": year, "month": month},
        "content": f"text_{position}",
        "embedding": np.random.rand(768).astype(dtype),
    }
    for position, (year, month, dtype) in enumerate(
        [
            ("2020", "01", np.float32),
            ("2020", "02", np.float32),
            ("2020", "03", np.float64),
            ("2021", "01", np.float32),
            ("2021", "02", np.float32),
            ("2021", "03", np.float64),
        ],
        start=1,
    )
]
|
|
|
|
|
|
@pytest.mark.parametrize("name", ["faiss", "weaviate", "opensearch_faiss", "elasticsearch", "memory"])
def test_cosine_similarity(name, tmp_path):
    """
    Query a store filled with the DOCUMENTS fixture and verify, for every hit, that the
    returned embedding matches the written one (normalized for some stores) and that the
    reported score agrees with a cosine similarity computed by hand.
    """
    documents = [Document.from_dict(raw) for raw in DOCUMENTS]
    with document_store(name, documents, tmp_path) as ds:
        # the documents are in the store now; query it to see if vectors were normalized or not
        query = np.random.rand(768).astype(np.float32)
        hits = ds.query_by_embedding(
            query_emb=query,
            top_k=len(documents),
            return_embedding=True,
            scale_score=False,
        )

        # searching with cosine similarity must return every document
        assert len(query_results := hits) == len(documents)

        written_by_content = {raw["content"]: raw["embedding"] for raw in DOCUMENTS}
        query_norm = np.linalg.norm(query)

        for hit in query_results:
            stored = hit.embedding
            written = written_by_content[hit.content]

            # embeddings of document stores which only support dot product out of the box
            # must be normalized; all other stores are expected to keep the vector as written
            expected = (
                written / np.linalg.norm(written)
                if name in ["faiss", "weaviate", "opensearch_faiss"]
                else written
            )

            # check if the stored embedding was normalized or not
            # (per the original note, the high tolerance was necessary for Milvus 2)
            np.testing.assert_allclose(expected, stored, rtol=0.2, atol=5e-07)

            # the reported score must be plausible for cosine similarity
            cosine_score = np.dot(stored, query) / (np.linalg.norm(stored) * query_norm)
            assert cosine_score == pytest.approx(hit.score, rel=0.01)
|
|
|
|
|
|
@pytest.mark.parametrize("name", ["faiss", "weaviate", "opensearch_faiss", "elasticsearch", "memory"])
def test_update_embeddings_cosine_similarity(name, tmp_path):
    """
    Write the DOCUMENTS fixture without embeddings, let a mock retriever supply random
    embeddings through ``update_embeddings``, then verify stored vectors and scores the
    same way as for documents written with embeddings.
    """
    # work on a copy of the fixture, drop the embeddings and convert to Document
    stripped = deepcopy(DOCUMENTS)
    for raw in stripped:
        del raw["embedding"]
    documents = [Document.from_dict(raw) for raw in stripped]

    with document_store(name, documents, tmp_path) as ds:
        # filled in by the mock retriever below: document content -> embedding it produced
        original_embeddings = {}

        # now check if vectors are normalized when updating embeddings
        class MockRetriever:
            def embed_documents(self, docs):
                # one fresh random vector per document, remembered for the later comparison
                vectors = []
                for document in docs:
                    vector = np.random.rand(768).astype(np.float32)
                    original_embeddings[document.content] = vector
                    vectors.append(vector)
                return np.stack(vectors)

        ds.update_embeddings(retriever=MockRetriever())

        query = np.random.rand(768).astype(np.float32)
        query_results = ds.query_by_embedding(
            query_emb=query,
            top_k=len(DOCUMENTS),
            return_embedding=True,
            scale_score=False,
        )

        # searching with cosine similarity must return every document
        assert len(query_results) == len(DOCUMENTS)

        query_norm = np.linalg.norm(query)

        for hit in query_results:
            stored = hit.embedding
            produced = original_embeddings[hit.content]

            # embeddings of document stores which only support dot product out of the box
            # must be normalized; all other stores are expected to keep the vector as produced
            expected = (
                produced / np.linalg.norm(produced)
                if name in ["faiss", "weaviate", "opensearch_faiss"]
                else produced
            )

            # check if the stored embedding was normalized or not
            # (per the original note, the high tolerance was necessary for Milvus 2)
            np.testing.assert_allclose(expected, stored, rtol=0.2, atol=5e-07)

            # the reported score must be plausible for cosine similarity
            cosine_score = np.dot(stored, query) / (np.linalg.norm(stored) * query_norm)
            assert cosine_score == pytest.approx(hit.score, rel=0.01)
|
|
|
|
|
|
@pytest.mark.parametrize("name", ["faiss", "weaviate", "memory", "elasticsearch", "opensearch_faiss"])
def test_cosine_sanity_check(name, tmp_path):
    """
    Store a single 3-dimensional vector, query with a second one and compare the returned
    score against a precomputed cosine similarity, both scaled and unscaled.
    """
    stored_vec = np.array([0.1, 0.2, 0.3], dtype="float32")
    query_vec = np.array([0.4, 0.5, 0.6], dtype="float32")

    # Cosine similarity of the two vectors above, calculated using
    # sklearn.metrics.pairwise.cosine_similarity
    known_cosine = 0.9746317
    # The scaled score is normalized to yield a value between 0 and 1.
    known_scaled_cosine = (known_cosine + 1) / 2

    docs = [Document.from_dict({"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": stored_vec})]
    with document_store(name, docs, tmp_path, embedding_dim=3) as ds:
        # first the scaled score, then the raw one
        for scale_score, expected_score in ((True, known_scaled_cosine), (False, known_cosine)):
            query_results = ds.query_by_embedding(
                query_emb=query_vec, top_k=1, return_embedding=True, scale_score=scale_score
            )

            # check if the store returns the same cosine similarity.
            # Manual testing with faiss yielded 0.9746318
            top_score = query_results[0].score
            assert math.isclose(top_score, expected_score, abs_tol=0.0002)
|