from copy import deepcopy
import math
from unittest.mock import Mock

import numpy as np
import pandas as pd
import pytest

from ..conftest import get_document_store, ensure_ids_are_correct_uuids
from haystack.document_stores import (
    InMemoryDocumentStore,
    WeaviateDocumentStore,
    MilvusDocumentStore,
    FAISSDocumentStore,
    ElasticsearchDocumentStore,
    OpenSearchDocumentStore,
)
from haystack.document_stores.base import BaseDocumentStore
from haystack.document_stores.es_converter import elasticsearch_index_to_document_store
from haystack.schema import Document, Label, Answer, Span
from haystack.nodes import EmbeddingRetriever, PreProcessor
from haystack.pipelines import DocumentSearchPipeline

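# Shared fixture data: six documents with 768-dim random embeddings. Note that two of the
# entries use float64 instead of float32, so dtype handling is exercised as well.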
DOCUMENTS = [
    {
        "meta": {"name": "name_1", "year": "2020", "month": "01"},
        "content": "text_1",
        "embedding": np.random.rand(768).astype(np.float32),
    },
    {
        "meta": {"name": "name_2", "year": "2020", "month": "02"},
        "content": "text_2",
        "embedding": np.random.rand(768).astype(np.float32),
    },
    {
        "meta": {"name": "name_3", "year": "2020", "month": "03"},
        "content": "text_3",
        "embedding": np.random.rand(768).astype(np.float64),
    },
    {
        "meta": {"name": "name_4", "year": "2021", "month": "01"},
        "content": "text_4",
        "embedding": np.random.rand(768).astype(np.float32),
    },
    {
        "meta": {"name": "name_5", "year": "2021", "month": "02"},
        "content": "text_5",
        "embedding": np.random.rand(768).astype(np.float32),
    },
    {
        "meta": {"name": "name_6", "year": "2021", "month": "03"},
        "content": "text_6",
        "embedding": np.random.rand(768).astype(np.float64),
    },
]


@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True)
|
|
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
|
|
def test_update_embeddings(document_store, retriever):
|
|
documents = []
|
|
for i in range(6):
|
|
documents.append({"content": f"text_{i}", "id": str(i), "meta_field": f"value_{i}"})
|
|
documents.append({"content": "text_0", "id": "6", "meta_field": "value_0"})
|
|
|
|
document_store.write_documents(documents)
|
|
document_store.update_embeddings(retriever, batch_size=3)
|
|
documents = document_store.get_all_documents(return_embedding=True)
|
|
assert len(documents) == 7
|
|
for doc in documents:
|
|
assert type(doc.embedding) is np.ndarray
|
|
|
|
documents = document_store.get_all_documents(filters={"meta_field": ["value_0"]}, return_embedding=True)
|
|
assert len(documents) == 2
|
|
for doc in documents:
|
|
assert doc.meta["meta_field"] == "value_0"
|
|
np.testing.assert_array_almost_equal(documents[0].embedding, documents[1].embedding, decimal=4)
|
|
|
|
documents = document_store.get_all_documents(filters={"meta_field": ["value_0", "value_5"]}, return_embedding=True)
|
|
documents_with_value_0 = [doc for doc in documents if doc.meta["meta_field"] == "value_0"]
|
|
documents_with_value_5 = [doc for doc in documents if doc.meta["meta_field"] == "value_5"]
|
|
np.testing.assert_raises(
|
|
AssertionError,
|
|
np.testing.assert_array_equal,
|
|
documents_with_value_0[0].embedding,
|
|
documents_with_value_5[0].embedding,
|
|
)
|
|
|
|
doc = {
|
|
"content": "text_7",
|
|
"id": "7",
|
|
"meta_field": "value_7",
|
|
"embedding": retriever.embed_queries(queries=["a random string"])[0],
|
|
}
|
|
document_store.write_documents([doc])
|
|
|
|
documents = []
|
|
for i in range(8, 11):
|
|
documents.append({"content": f"text_{i}", "id": str(i), "meta_field": f"value_{i}"})
|
|
document_store.write_documents(documents)
|
|
|
|
doc_before_update = document_store.get_all_documents(filters={"meta_field": ["value_7"]})[0]
|
|
embedding_before_update = doc_before_update.embedding
|
|
|
|
# test updating only documents without embeddings
|
|
if not isinstance(document_store, WeaviateDocumentStore):
|
|
# All the documents in Weaviate store have an embedding by default. "update_existing_embeddings=False" is not allowed
|
|
document_store.update_embeddings(retriever, batch_size=3, update_existing_embeddings=False)
|
|
doc_after_update = document_store.get_all_documents(filters={"meta_field": ["value_7"]})[0]
|
|
embedding_after_update = doc_after_update.embedding
|
|
np.testing.assert_array_equal(embedding_before_update, embedding_after_update)
|
|
|
|
# test updating with filters
|
|
if isinstance(document_store, FAISSDocumentStore):
|
|
with pytest.raises(Exception):
|
|
document_store.update_embeddings(
|
|
retriever, update_existing_embeddings=True, filters={"meta_field": ["value"]}
|
|
)
|
|
else:
|
|
document_store.update_embeddings(retriever, batch_size=3, filters={"meta_field": ["value_0", "value_1"]})
|
|
doc_after_update = document_store.get_all_documents(filters={"meta_field": ["value_7"]})[0]
|
|
embedding_after_update = doc_after_update.embedding
|
|
np.testing.assert_array_equal(embedding_before_update, embedding_after_update)
|
|
|
|
# test update all embeddings
|
|
document_store.update_embeddings(retriever, batch_size=3, update_existing_embeddings=True)
|
|
assert document_store.get_embedding_count() == 11
|
|
doc_after_update = document_store.get_all_documents(filters={"meta_field": ["value_7"]})[0]
|
|
embedding_after_update = doc_after_update.embedding
|
|
np.testing.assert_raises(
|
|
AssertionError, np.testing.assert_array_equal, embedding_before_update, embedding_after_update
|
|
)
|
|
|
|
# test update embeddings for newly added docs
|
|
documents = []
|
|
for i in range(12, 15):
|
|
documents.append({"content": f"text_{i}", "id": str(i), "meta_field": f"value_{i}"})
|
|
document_store.write_documents(documents)
|
|
|
|
if not isinstance(document_store, WeaviateDocumentStore):
|
|
# All the documents in Weaviate store have an embedding by default. "update_existing_embeddings=False" is not allowed
|
|
document_store.update_embeddings(retriever, batch_size=3, update_existing_embeddings=False)
|
|
assert document_store.get_embedding_count() == 14
|
|
|
|
|
|
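# Recap of the update_embeddings semantics exercised above:
#   - update_existing_embeddings=False embeds only documents that do not have an embedding yet
#     (not supported by Weaviate, where every document has an embedding by default)
#   - update_existing_embeddings=True recomputes all embeddings
#   - filters restrict the update to matching documents; FAISSDocumentStore raises when
#     filters are passed to update_embeddings
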
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
|
@pytest.mark.parametrize("retriever", ["table_text_retriever"], indirect=True)
|
|
@pytest.mark.embedding_dim(512)
|
|
def test_update_embeddings_table_text_retriever(document_store, retriever):
|
|
documents = []
|
|
for i in range(3):
|
|
documents.append(
|
|
{"content": f"text_{i}", "id": f"pssg_{i}", "meta_field": f"value_text_{i}", "content_type": "text"}
|
|
)
|
|
documents.append(
|
|
{
|
|
"content": pd.DataFrame(columns=[f"col_{i}", f"col_{i+1}"], data=[[f"cell_{i}", f"cell_{i+1}"]]),
|
|
"id": f"table_{i}",
|
|
f"meta_field": f"value_table_{i}",
|
|
"content_type": "table",
|
|
}
|
|
)
|
|
documents.append({"content": "text_0", "id": "pssg_4", "meta_field": "value_text_0", "content_type": "text"})
|
|
documents.append(
|
|
{
|
|
"content": pd.DataFrame(columns=["col_0", "col_1"], data=[["cell_0", "cell_1"]]),
|
|
"id": "table_4",
|
|
"meta_field": "value_table_0",
|
|
"content_type": "table",
|
|
}
|
|
)
|
|
|
|
document_store.write_documents(documents)
|
|
document_store.update_embeddings(retriever, batch_size=3)
|
|
documents = document_store.get_all_documents(return_embedding=True)
|
|
assert len(documents) == 8
|
|
for doc in documents:
|
|
assert type(doc.embedding) is np.ndarray
|
|
|
|
# Check if Documents with same content (text) get same embedding
|
|
documents = document_store.get_all_documents(filters={"meta_field": ["value_text_0"]}, return_embedding=True)
|
|
assert len(documents) == 2
|
|
for doc in documents:
|
|
assert doc.meta["meta_field"] == "value_text_0"
|
|
np.testing.assert_array_almost_equal(documents[0].embedding, documents[1].embedding, decimal=4)
|
|
|
|
# Check if Documents with same content (table) get same embedding
|
|
documents = document_store.get_all_documents(filters={"meta_field": ["value_table_0"]}, return_embedding=True)
|
|
assert len(documents) == 2
|
|
for doc in documents:
|
|
assert doc.meta["meta_field"] == "value_table_0"
|
|
np.testing.assert_array_almost_equal(documents[0].embedding, documents[1].embedding, decimal=4)
|
|
|
|
# Check if Documents wih different content (text) get different embedding
|
|
documents = document_store.get_all_documents(
|
|
filters={"meta_field": ["value_text_1", "value_text_2"]}, return_embedding=True
|
|
)
|
|
np.testing.assert_raises(
|
|
AssertionError, np.testing.assert_array_equal, documents[0].embedding, documents[1].embedding
|
|
)
|
|
|
|
# Check if Documents with different content (table) get different embeddings
|
|
documents = document_store.get_all_documents(
|
|
filters={"meta_field": ["value_table_1", "value_table_2"]}, return_embedding=True
|
|
)
|
|
np.testing.assert_raises(
|
|
AssertionError, np.testing.assert_array_equal, documents[0].embedding, documents[1].embedding
|
|
)
|
|
|
|
# Check if Documents with different content (table + text) get different embeddings
|
|
documents = document_store.get_all_documents(
|
|
filters={"meta_field": ["value_text_1", "value_table_1"]}, return_embedding=True
|
|
)
|
|
np.testing.assert_raises(
|
|
AssertionError, np.testing.assert_array_equal, documents[0].embedding, documents[1].embedding
|
|
)
|
|
|
|
|
|
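# Note: the last check above compares a text embedding against a table embedding directly,
# which presupposes that the table-text retriever maps both content types into one shared
# vector space.
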
@pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"])
|
|
def test_custom_embedding_field(document_store_type, tmp_path):
|
|
document_store = get_document_store(
|
|
document_store_type=document_store_type,
|
|
tmp_path=tmp_path,
|
|
embedding_field="custom_embedding_field",
|
|
index="custom_embedding_field",
|
|
)
|
|
doc_to_write = {"content": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
|
|
document_store.write_documents([doc_to_write])
|
|
documents = document_store.get_all_documents(return_embedding=True)
|
|
assert len(documents) == 1
|
|
assert documents[0].content == "test"
|
|
np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
|
|
|
|
|
|
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
|
def test_get_meta_values_by_key(document_store: BaseDocumentStore):
|
|
documents = [Document(content=f"Doc{i}", meta={"meta_key_1": f"{i}", "meta_key_2": f"{i}{i}"}) for i in range(20)]
|
|
document_store.write_documents(documents)
|
|
|
|
# test without filters or query
|
|
result = document_store.get_metadata_values_by_key(key="meta_key_1")
|
|
possible_values = [f"{i}" for i in range(20)]
|
|
assert len(result) == 20
|
|
for bucket in result:
|
|
assert bucket["value"] in possible_values
|
|
assert bucket["count"] == 1
|
|
|
|
# test with filters but no query
|
|
result = document_store.get_metadata_values_by_key(key="meta_key_1", filters={"meta_key_2": ["11", "22"]})
|
|
for bucket in result:
|
|
assert bucket["value"] in ["1", "2"]
|
|
assert bucket["count"] == 1
|
|
|
|
# test with filters & query
|
|
result = document_store.get_metadata_values_by_key(key="meta_key_1", query="Doc1")
|
|
for bucket in result:
|
|
assert bucket["value"] in ["1"]
|
|
assert bucket["count"] == 1
|
|
|
|
|
|
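# As exercised above, get_metadata_values_by_key returns aggregation buckets of the form
# [{"value": "<metadata value>", "count": <number of matching documents>}, ...].
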
@pytest.mark.parametrize(
    "document_store_with_docs", ["memory", "faiss", "milvus", "weaviate", "elasticsearch"], indirect=True
)
@pytest.mark.embedding_dim(384)
def test_similarity_score_sentence_transformers(document_store_with_docs):
    retriever = EmbeddingRetriever(
        document_store=document_store_with_docs, embedding_model="sentence-transformers/paraphrase-MiniLM-L3-v2"
    )
    document_store_with_docs.update_embeddings(retriever)
    pipeline = DocumentSearchPipeline(retriever)
    prediction = pipeline.run("Paul lives in New York")
    scores = [document.score for document in prediction["documents"]]
    assert [document.content for document in prediction["documents"]] == [
        "My name is Paul and I live in New York",
        "My name is Matteo and I live in Rome",
        "My name is Christelle and I live in Paris",
        "My name is Carla and I live in Berlin",
        "My name is Camila and I live in Madrid",
    ]
    assert scores == pytest.approx(
        [0.9149981737136841, 0.6895168423652649, 0.641706794500351, 0.6206043660640717, 0.5837393924593925], abs=1e-3
    )


@pytest.mark.parametrize(
    "document_store_with_docs", ["memory", "faiss", "milvus", "weaviate", "elasticsearch"], indirect=True
)
@pytest.mark.embedding_dim(384)
def test_similarity_score(document_store_with_docs):
    retriever = EmbeddingRetriever(
        document_store=document_store_with_docs,
        embedding_model="sentence-transformers/paraphrase-MiniLM-L3-v2",
        model_format="farm",
    )
    document_store_with_docs.update_embeddings(retriever)
    pipeline = DocumentSearchPipeline(retriever)
    prediction = pipeline.run("Paul lives in New York")
    scores = [document.score for document in prediction["documents"]]
    assert scores == pytest.approx(
        [0.9102507941407827, 0.6937791467877008, 0.6491682889305038, 0.6321622491318529, 0.5909129441370939], abs=1e-3
    )


@pytest.mark.parametrize(
    "document_store_with_docs", ["memory", "faiss", "milvus", "weaviate", "elasticsearch"], indirect=True
)
@pytest.mark.embedding_dim(384)
def test_similarity_score_without_scaling(document_store_with_docs):
    retriever = EmbeddingRetriever(
        document_store=document_store_with_docs,
        embedding_model="sentence-transformers/paraphrase-MiniLM-L3-v2",
        scale_score=False,
        model_format="farm",
    )
    document_store_with_docs.update_embeddings(retriever)
    pipeline = DocumentSearchPipeline(retriever)
    prediction = pipeline.run("Paul lives in New York")
    scores = [document.score for document in prediction["documents"]]
    assert scores == pytest.approx(
        [0.8205015882815654, 0.3875582935754016, 0.29833657786100765, 0.26432449826370585, 0.18182588827418789],
        abs=1e-3,
    )


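# Sanity check relating the two cosine tests above: with scale_score=True the raw cosine
# similarity is mapped into [0, 1] via (raw + 1) / 2, e.g. (0.8205015882815654 + 1) / 2
# == 0.9102507941407827, the top score asserted in test_similarity_score.
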
@pytest.mark.parametrize(
    "document_store_dot_product_with_docs", ["memory", "faiss", "milvus", "elasticsearch", "weaviate"], indirect=True
)
@pytest.mark.embedding_dim(384)
def test_similarity_score_dot_product(document_store_dot_product_with_docs):
    retriever = EmbeddingRetriever(
        document_store=document_store_dot_product_with_docs,
        embedding_model="sentence-transformers/paraphrase-MiniLM-L3-v2",
        model_format="farm",
    )
    document_store_dot_product_with_docs.update_embeddings(retriever)
    pipeline = DocumentSearchPipeline(retriever)
    prediction = pipeline.run("Paul lives in New York")
    scores = [document.score for document in prediction["documents"]]
    assert scores == pytest.approx(
        [0.5526494403409358, 0.5247784342375555, 0.5189836829440964, 0.5179697273254912, 0.5112024928228626], abs=1e-3
    )


@pytest.mark.parametrize(
    "document_store_dot_product_with_docs", ["memory", "faiss", "milvus", "elasticsearch", "weaviate"], indirect=True
)
@pytest.mark.embedding_dim(384)
def test_similarity_score_dot_product_without_scaling(document_store_dot_product_with_docs):
    retriever = EmbeddingRetriever(
        document_store=document_store_dot_product_with_docs,
        embedding_model="sentence-transformers/paraphrase-MiniLM-L3-v2",
        scale_score=False,
        model_format="farm",
    )
    document_store_dot_product_with_docs.update_embeddings(retriever)
    pipeline = DocumentSearchPipeline(retriever)
    prediction = pipeline.run("Paul lives in New York")
    scores = [document.score for document in prediction["documents"]]
    assert scores == pytest.approx(
        [21.13810000000001, 9.919499999999971, 7.597099999999955, 7.191000000000031, 4.481750000000034], abs=1e-3
    )


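# The dot-product pair of tests above appears consistent with sigmoid score scaling,
# i.e. scaled = expit(raw / 100) where expit(x) = 1 / (1 + e^-x): expit(21.1381 / 100)
# ≈ 0.55265, the top score asserted in test_similarity_score_dot_product.
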
def test_custom_headers(document_store_with_docs: BaseDocumentStore):
    mock_client = None
    if isinstance(document_store_with_docs, ElasticsearchDocumentStore):
        es_document_store: ElasticsearchDocumentStore = document_store_with_docs
        mock_client = Mock(wraps=es_document_store.client)
        es_document_store.client = mock_client
    custom_headers = {"X-My-Custom-Header": "header-value"}
    if not mock_client:
        # document stores without custom-header support are expected to raise
        with pytest.raises(NotImplementedError):
            documents = document_store_with_docs.get_all_documents(headers=custom_headers)
    else:
        # the headers must be forwarded verbatim to the underlying client's search call
        documents = document_store_with_docs.get_all_documents(headers=custom_headers)
        mock_client.search.assert_called_once()
        args, kwargs = mock_client.search.call_args
        assert "headers" in kwargs
        assert kwargs["headers"] == custom_headers
        assert len(documents) > 0


@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
|
|
def test_elasticsearch_brownfield_support(document_store_with_docs):
|
|
new_document_store = InMemoryDocumentStore()
|
|
new_document_store = elasticsearch_index_to_document_store(
|
|
document_store=new_document_store,
|
|
original_index_name="haystack_test",
|
|
original_content_field="content",
|
|
original_name_field="name",
|
|
included_metadata_fields=["date_field"],
|
|
index="test_brownfield_support",
|
|
id_hash_keys=["content", "meta"],
|
|
)
|
|
|
|
original_documents = document_store_with_docs.get_all_documents(index="haystack_test")
|
|
transferred_documents = new_document_store.get_all_documents(index="test_brownfield_support")
|
|
assert len(original_documents) == len(transferred_documents)
|
|
assert all("name" in doc.meta for doc in transferred_documents)
|
|
assert all("date_field" in doc.meta for doc in transferred_documents)
|
|
assert all("meta_field" not in doc.meta for doc in transferred_documents)
|
|
assert all("numeric_field" not in doc.meta for doc in transferred_documents)
|
|
assert all(doc.id == doc._get_id(["content", "meta"]) for doc in transferred_documents)
|
|
|
|
original_content = set([doc.content for doc in original_documents])
|
|
transferred_content = set([doc.content for doc in transferred_documents])
|
|
assert original_content == transferred_content
|
|
|
|
# Test transferring docs with PreProcessor
|
|
new_document_store = elasticsearch_index_to_document_store(
|
|
document_store=new_document_store,
|
|
original_index_name="haystack_test",
|
|
original_content_field="content",
|
|
excluded_metadata_fields=["date_field"],
|
|
index="test_brownfield_support_2",
|
|
preprocessor=PreProcessor(split_length=1, split_respect_sentence_boundary=False),
|
|
)
|
|
transferred_documents = new_document_store.get_all_documents(index="test_brownfield_support_2")
|
|
assert all("date_field" not in doc.meta for doc in transferred_documents)
|
|
assert all("name" in doc.meta for doc in transferred_documents)
|
|
assert all("meta_field" in doc.meta for doc in transferred_documents)
|
|
assert all("numeric_field" in doc.meta for doc in transferred_documents)
|
|
# Check if number of transferred_documents is equal to number of unique words.
|
|
assert len(transferred_documents) == len(set(" ".join(original_content).split()))
|
|
|
|
|
|
@pytest.mark.parametrize(
    "document_store", ["faiss", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True
)
def test_cosine_similarity(document_store: BaseDocumentStore):
    # below we will write documents to the store and then query it to see if vectors were normalized or not
    ensure_ids_are_correct_uuids(docs=DOCUMENTS, document_store=document_store)
    document_store.write_documents(documents=DOCUMENTS)

    query = np.random.rand(768).astype(np.float32)
    query_results = document_store.query_by_embedding(
        query_emb=query, top_k=len(DOCUMENTS), return_embedding=True, scale_score=False
    )

    # check if search with cosine similarity returns the correct number of results
    assert len(query_results) == len(DOCUMENTS)

    original_embeddings = {doc["content"]: doc["embedding"] for doc in DOCUMENTS}

    for doc in query_results:
        result_emb = doc.embedding
        original_emb = original_embeddings[doc.content]

        expected_emb = original_emb
        # embeddings of document stores which only support dot product out of the box must be normalized
        if isinstance(document_store, (FAISSDocumentStore, MilvusDocumentStore, WeaviateDocumentStore)) or (
            isinstance(document_store, OpenSearchDocumentStore) and document_store.knn_engine == "faiss"
        ):
            expected_emb = original_emb / np.linalg.norm(original_emb)

        # check if the stored embedding was normalized or not
        np.testing.assert_allclose(
            expected_emb, result_emb, rtol=0.2, atol=5e-07
        )  # high tolerance necessary for Milvus 2

        # check if the score is plausible for cosine similarity
        cosine_score = np.dot(result_emb, query) / (np.linalg.norm(result_emb) * np.linalg.norm(query))
        assert cosine_score == pytest.approx(doc.score, 0.01)


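# Why normalization emulates cosine similarity: for a stored vector d and query q,
# dot(q, d / ||d||) = ||q|| * cos(q, d). Since ||q|| is the same for every candidate of a
# given query, ranking by this dot product is equivalent to ranking by cosine similarity,
# which is why dot-product-only stores normalize vectors at indexing time.
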
@pytest.mark.parametrize(
    "document_store", ["faiss", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True
)
def test_update_embeddings_cosine_similarity(document_store: BaseDocumentStore):
    # below we will write documents to the store and then query it to see if vectors were normalized
    ensure_ids_are_correct_uuids(docs=DOCUMENTS, document_store=document_store)
    # clear embeddings
    docs = deepcopy(DOCUMENTS)
    for doc in docs:
        doc.pop("embedding")

    document_store.write_documents(documents=docs)
    original_embeddings = {}

    # now check if vectors are normalized when updating embeddings
    class MockRetriever:
        def embed_documents(self, docs):
            embeddings = []
            for doc in docs:
                embedding = np.random.rand(768).astype(np.float32)
                original_embeddings[doc.content] = embedding
                embeddings.append(embedding)
            return np.stack(embeddings)

    retriever = MockRetriever()
    document_store.update_embeddings(retriever=retriever)

    query = np.random.rand(768).astype(np.float32)
    query_results = document_store.query_by_embedding(
        query_emb=query, top_k=len(DOCUMENTS), return_embedding=True, scale_score=False
    )

    # check if search with cosine similarity returns the correct number of results
    assert len(query_results) == len(DOCUMENTS)

    for doc in query_results:
        result_emb = doc.embedding
        original_emb = original_embeddings[doc.content]

        expected_emb = original_emb
        # embeddings of document stores which only support dot product out of the box must be normalized
        if isinstance(document_store, (FAISSDocumentStore, MilvusDocumentStore, WeaviateDocumentStore)) or (
            isinstance(document_store, OpenSearchDocumentStore) and document_store.knn_engine == "faiss"
        ):
            expected_emb = original_emb / np.linalg.norm(original_emb)

        # check if the stored embedding was normalized or not
        np.testing.assert_allclose(
            expected_emb, result_emb, rtol=0.2, atol=5e-07
        )  # high tolerance necessary for Milvus 2

        # check if the score is plausible for cosine similarity
        cosine_score = np.dot(result_emb, query) / (np.linalg.norm(result_emb) * np.linalg.norm(query))
        assert cosine_score == pytest.approx(doc.score, 0.01)


@pytest.mark.parametrize(
    "document_store_small", ["faiss", "milvus", "weaviate", "memory", "elasticsearch", "opensearch"], indirect=True
)
def test_cosine_sanity_check(document_store_small):
    VEC_1 = np.array([0.1, 0.2, 0.3], dtype="float32")
    VEC_2 = np.array([0.4, 0.5, 0.6], dtype="float32")

    # This is the cosine similarity of VEC_1 and VEC_2 calculated using sklearn.metrics.pairwise.cosine_similarity
    KNOWN_COSINE = 0.9746317
    # The scaled score maps the cosine similarity to a value between 0 and 1
    KNOWN_SCALED_COSINE = (KNOWN_COSINE + 1) / 2

    docs = [{"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": VEC_1}]
    ensure_ids_are_correct_uuids(docs=docs, document_store=document_store_small)
    document_store_small.write_documents(documents=docs)

    query_results = document_store_small.query_by_embedding(
        query_emb=VEC_2, top_k=1, return_embedding=True, scale_score=True
    )

    # check if the document store returns the same cosine similarity. Manual testing with FAISS yielded 0.9746318
    assert math.isclose(query_results[0].score, KNOWN_SCALED_COSINE, abs_tol=0.0002)

    query_results = document_store_small.query_by_embedding(
        query_emb=VEC_2, top_k=1, return_embedding=True, scale_score=False
    )

    assert math.isclose(query_results[0].score, KNOWN_COSINE, abs_tol=0.0002)

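# Worked check of KNOWN_COSINE above:
#   dot(VEC_1, VEC_2) = 0.1*0.4 + 0.2*0.5 + 0.3*0.6 = 0.32
#   ||VEC_1|| = sqrt(0.14) ≈ 0.3741657, ||VEC_2|| = sqrt(0.77) ≈ 0.8774964
#   cosine = 0.32 / (0.3741657 * 0.8774964) ≈ 0.9746318
#   scaled = (0.9746318 + 1) / 2 ≈ 0.9873159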