# haystack/e2e/document_stores/test_cosine_similarity.py

from copy import deepcopy
import math

import pytest
import numpy as np

from haystack.schema import Document

from ..conftest import document_store


# Six fixture documents shared by the tests in this module. Each one carries a
# "meta" dict, a "content" string and a random 768-dimensional "embedding".
# Every third embedding is float64 rather than float32. The vectors come from
# NumPy's global random state, so their values are not fixed across runs.
DOCUMENTS = [
    {
        "meta": {"name": f"name_{position}", "year": year, "month": month},
        "content": f"text_{position}",
        "embedding": np.random.rand(768).astype(dtype),
    }
    for position, (year, month, dtype) in enumerate(
        (
            ("2020", "01", np.float32),
            ("2020", "02", np.float32),
            ("2020", "03", np.float64),
            ("2021", "01", np.float32),
            ("2021", "02", np.float32),
            ("2021", "03", np.float64),
        ),
        start=1,
    )
]


@pytest.mark.parametrize("name", ["faiss", "weaviate", "opensearch_faiss", "elasticsearch", "memory"])
def test_cosine_similarity(name, tmp_path):
    """Query a store by embedding and validate returned embeddings and raw scores.

    For each hit, the returned embedding is compared against the fixture embedding
    (L2-normalised first for faiss, weaviate and opensearch_faiss), and the reported
    score is compared against a cosine similarity computed directly with NumPy.
    """
    # Per the original note: stores that only support dot product out of the box
    # must hold normalised embeddings.
    expects_unit_vectors = name in ("faiss", "weaviate", "opensearch_faiss")

    documents = list(map(Document.from_dict, DOCUMENTS))
    # NOTE(review): `document_store` comes from conftest; it is assumed to yield a
    # store that already contains `documents` - confirm against the helper.
    with document_store(name, documents, tmp_path) as ds:
        query = np.random.rand(768).astype(np.float32)
        hits = ds.query_by_embedding(
            query_emb=query, top_k=len(documents), return_embedding=True, scale_score=False
        )

        # Asking for as many hits as there are documents must return all of them.
        assert len(hits) == len(documents)

        # Fixture embeddings keyed by content, so each hit can be matched to its source.
        source_vectors = {entry["content"]: entry["embedding"] for entry in DOCUMENTS}
        query_norm = np.linalg.norm(query)

        for hit in hits:
            stored = hit.embedding
            reference = source_vectors[hit.content]
            if expects_unit_vectors:
                reference = reference / np.linalg.norm(reference)

            # Loose tolerance kept from the original (its note: needed for Milvus 2).
            np.testing.assert_allclose(reference, stored, rtol=0.2, atol=5e-07)

            # The raw (unscaled) score should agree with a hand-computed cosine.
            dot_product = np.dot(stored, query)
            cosine_score = dot_product / (np.linalg.norm(stored) * query_norm)
            assert cosine_score == pytest.approx(hit.score, rel=0.01)


@pytest.mark.parametrize("name", ["faiss", "weaviate", "opensearch_faiss", "elasticsearch", "memory"])
def test_update_embeddings_cosine_similarity(name, tmp_path):
    """Validate embeddings and raw scores after `update_embeddings` with a mock retriever.

    The documents are created without embeddings. A stub retriever then supplies
    random vectors and records them by document content. The check afterwards is
    the same as in the plain query test: returned embeddings must match the recorded
    ones (L2-normalised first for faiss, weaviate and opensearch_faiss), and scores
    must match a cosine similarity computed directly with NumPy.
    """
    # Work on a copy so the module-level fixture keeps its embeddings.
    stripped = deepcopy(DOCUMENTS)
    for entry in stripped:
        del entry["embedding"]
    documents = list(map(Document.from_dict, stripped))

    # Per the original note: stores that only support dot product out of the box
    # must hold normalised embeddings.
    expects_unit_vectors = name in ("faiss", "weaviate", "opensearch_faiss")

    with document_store(name, documents, tmp_path) as ds:
        # Filled by the stub retriever below: content -> vector it handed out.
        original_embeddings = {}

        class MockRetriever:
            def embed_documents(self, docs):
                # One fresh random float32 vector per document, remembered by content.
                pairs = [(item.content, np.random.rand(768).astype(np.float32)) for item in docs]
                original_embeddings.update(pairs)
                return np.stack([vector for _, vector in pairs])

        ds.update_embeddings(retriever=MockRetriever())

        query = np.random.rand(768).astype(np.float32)
        hits = ds.query_by_embedding(
            query_emb=query, top_k=len(DOCUMENTS), return_embedding=True, scale_score=False
        )

        # Asking for as many hits as there are fixture documents must return all of them.
        assert len(hits) == len(DOCUMENTS)

        query_norm = np.linalg.norm(query)
        for hit in hits:
            stored = hit.embedding
            reference = original_embeddings[hit.content]
            if expects_unit_vectors:
                reference = reference / np.linalg.norm(reference)

            # Loose tolerance kept from the original (its note: needed for Milvus 2).
            np.testing.assert_allclose(reference, stored, rtol=0.2, atol=5e-07)

            # The raw (unscaled) score should agree with a hand-computed cosine.
            dot_product = np.dot(stored, query)
            cosine_score = dot_product / (np.linalg.norm(stored) * query_norm)
            assert cosine_score == pytest.approx(hit.score, rel=0.01)


@pytest.mark.parametrize("name", ["faiss", "weaviate", "memory", "elasticsearch", "opensearch_faiss"])
def test_cosine_sanity_check(name, tmp_path):
    """Compare store scores for two fixed 3-d vectors against a known cosine value.

    One document with a fixed embedding is queried with a second fixed vector,
    first with `scale_score=True` and then with `scale_score=False`. Each score
    must be within 0.0002 of the matching reference value.
    """
    stored_vec = np.array([0.1, 0.2, 0.3], dtype="float32")
    query_vec = np.array([0.4, 0.5, 0.6], dtype="float32")

    # Reference cosine similarity of the two vectors; per the original note it was
    # computed with sklearn.metrics.pairwise.cosine_similarity (manual testing with
    # faiss yielded 0.9746318).
    known_cosine = 0.9746317
    # Scaled variant: the cosine shifted and halved so that it lies between 0 and 1.
    known_scaled_cosine = (known_cosine + 1) / 2

    docs = [Document.from_dict({"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": stored_vec})]
    with document_store(name, docs, tmp_path, embedding_dim=3) as ds:
        # Scaled score first, then the raw score - same order as the two original queries.
        for scale_score, expected in ((True, known_scaled_cosine), (False, known_cosine)):
            query_results = ds.query_by_embedding(
                query_emb=query_vec, top_k=1, return_embedding=True, scale_score=scale_score
            )
            assert math.isclose(query_results[0].score, expected, abs_tol=0.0002)