haystack/e2e/document_stores/test_cosine_similarity.py
Massimiliano Pippi 4974bf7ab3
chore: remove deprecated MilvusDocumentStore (#4951)
* remove deprecated MilvusDocumentStore

* remove leftovers

* fix pylint
2023-05-19 16:37:38 +02:00

153 lines
6.4 KiB
Python

from copy import deepcopy
import math
import pytest
import numpy as np
from haystack.schema import Document
from ..conftest import document_store
# Fixture documents shared by the tests in this module. Every entry carries a
# freshly drawn random 768-dimensional embedding; the third entry of each year
# is stored as float64 while the others are float32.
DOCUMENTS = [
    {
        "meta": {"name": doc_name, "year": year, "month": month},
        "content": content,
        "embedding": np.random.rand(768).astype(dtype),
    }
    for doc_name, year, month, content, dtype in (
        ("name_1", "2020", "01", "text_1", np.float32),
        ("name_2", "2020", "02", "text_2", np.float32),
        ("name_3", "2020", "03", "text_3", np.float64),
        ("name_4", "2021", "01", "text_4", np.float32),
        ("name_5", "2021", "02", "text_5", np.float32),
        ("name_6", "2021", "03", "text_6", np.float64),
    )
]
@pytest.mark.parametrize("name", ["faiss", "weaviate", "opensearch_faiss", "elasticsearch", "memory"])
def test_cosine_similarity(name, tmp_path):
    """
    Query a store holding the DOCUMENTS fixtures with a random embedding and verify the results.

    For every returned document the test checks that:
    - the returned embedding equals the original one, L2-normalized for "faiss", "weaviate"
      and "opensearch_faiss" and unchanged for the other stores;
    - the returned (unscaled) score matches the cosine similarity between the returned
      embedding and the query.

    :param name: identifier of the document store under test, forwarded to `document_store`.
    :param tmp_path: pytest temporary directory, forwarded to `document_store`.
    """
    documents = [Document.from_dict(d) for d in DOCUMENTS]

    with document_store(name, documents, tmp_path) as ds:
        # the documents are handed to the store; now query it to see whether vectors were normalized
        query = np.random.rand(768).astype(np.float32)
        query_results = ds.query_by_embedding(
            query_emb=query, top_k=len(documents), return_embedding=True, scale_score=False
        )

        # a cosine similarity search asking for all documents must return all of them
        assert len(query_results) == len(documents)

        original_embeddings = {doc["content"]: doc["embedding"] for doc in DOCUMENTS}

        # document stores which only support dot product out of the box must store normalized embeddings
        expects_normalized = name in ["faiss", "weaviate", "opensearch_faiss"]

        for hit in query_results:
            result_emb = hit.embedding
            original_emb = original_embeddings[hit.content]
            expected_emb = original_emb / np.linalg.norm(original_emb) if expects_normalized else original_emb

            # check if the stored embedding was normalized or not
            # (the high tolerance dates back to when it was necessary for Milvus 2)
            np.testing.assert_allclose(expected_emb, result_emb, rtol=0.2, atol=5e-07)

            # check if the score is plausible for cosine similarity
            norm_product = np.linalg.norm(result_emb) * np.linalg.norm(query)
            cosine_score = np.dot(result_emb, query) / norm_product
            assert cosine_score == pytest.approx(hit.score, rel=0.01)
@pytest.mark.parametrize("name", ["faiss", "weaviate", "opensearch_faiss", "elasticsearch", "memory"])
def test_update_embeddings_cosine_similarity(name, tmp_path):
    """
    Verify embedding normalization and scores after `update_embeddings` with a mock retriever.

    The DOCUMENTS fixtures are handed to the store without embeddings. A mock retriever then
    assigns a random embedding to each document and records it, so that the embeddings
    returned by a subsequent query can be compared with what the retriever produced.

    :param name: identifier of the document store under test, forwarded to `document_store`.
    :param tmp_path: pytest temporary directory, forwarded to `document_store`.
    """
    # work on a deep copy so the module-level fixtures keep their embeddings
    stripped = deepcopy(DOCUMENTS)
    for raw_doc in stripped:
        del raw_doc["embedding"]
    documents = [Document.from_dict(d) for d in stripped]

    with document_store(name, documents, tmp_path) as ds:
        # filled by the mock retriever: document content -> embedding it generated
        original_embeddings = {}

        class MockRetriever:
            """Retriever stand-in that returns random embeddings and remembers them."""

            def embed_documents(self, docs):
                vectors = []
                for document in docs:
                    vector = np.random.rand(768).astype(np.float32)
                    original_embeddings[document.content] = vector
                    vectors.append(vector)
                return np.stack(vectors)

        # now check if vectors are normalized when updating embeddings
        ds.update_embeddings(retriever=MockRetriever())

        query = np.random.rand(768).astype(np.float32)
        query_results = ds.query_by_embedding(
            query_emb=query, top_k=len(DOCUMENTS), return_embedding=True, scale_score=False
        )

        # a cosine similarity search asking for all documents must return all of them
        assert len(query_results) == len(DOCUMENTS)

        # document stores which only support dot product out of the box must store normalized embeddings
        expects_normalized = name in ["faiss", "weaviate", "opensearch_faiss"]

        for hit in query_results:
            result_emb = hit.embedding
            original_emb = original_embeddings[hit.content]
            expected_emb = original_emb / np.linalg.norm(original_emb) if expects_normalized else original_emb

            # check if the stored embedding was normalized or not
            # (the high tolerance dates back to when it was necessary for Milvus 2)
            np.testing.assert_allclose(expected_emb, result_emb, rtol=0.2, atol=5e-07)

            # check if the score is plausible for cosine similarity
            norm_product = np.linalg.norm(result_emb) * np.linalg.norm(query)
            cosine_score = np.dot(result_emb, query) / norm_product
            assert cosine_score == pytest.approx(hit.score, rel=0.01)
@pytest.mark.parametrize("name", ["faiss", "weaviate", "memory", "elasticsearch", "opensearch_faiss"])
def test_cosine_sanity_check(name, tmp_path):
    """
    Compare the score returned by a store against a known cosine similarity value.

    A single 3-dimensional document is queried with a fixed 3-dimensional vector, once with
    `scale_score=True` and once with `scale_score=False`, and the returned score is compared
    with the precomputed reference within an absolute tolerance of 0.0002.

    :param name: identifier of the document store under test, forwarded to `document_store`.
    :param tmp_path: pytest temporary directory, forwarded to `document_store`.
    """
    doc_vector = np.array([0.1, 0.2, 0.3], dtype="float32")
    query_vector = np.array([0.4, 0.5, 0.6], dtype="float32")

    # Cosine similarity of the two vectors above, calculated using
    # sklearn.metrics.pairwise.cosine_similarity. Manual testing with faiss yielded 0.9746318.
    known_cosine = 0.9746317
    # The scaled score is normalized to yield a value between 0 and 1.
    known_scaled_cosine = (known_cosine + 1) / 2

    docs = [Document.from_dict({"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": doc_vector})]

    with document_store(name, docs, tmp_path, embedding_dim=3) as ds:
        # scaled score first, raw cosine second
        for scale_score, expected_score in ((True, known_scaled_cosine), (False, known_cosine)):
            query_results = ds.query_by_embedding(
                query_emb=query_vector, top_k=1, return_embedding=True, scale_score=scale_score
            )
            # check if the store returns the same cosine similarity as the reference
            assert math.isclose(query_results[0].score, expected_score, abs_tol=0.0002)