haystack/test/document_stores/test_sql_based.py
Christian Clauss bf6d306d68
ci: Simplify Python code with ruff rules SIM (#5833)
* ci: Simplify Python code with ruff rules SIM

* Revert #5828

* ruff --select=I --fix haystack/modeling/infer.py

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
2023-09-20 08:32:44 +02:00

275 lines
11 KiB
Python

import pytest
import numpy as np
from haystack.schema import Document
from haystack.pipelines import DocumentSearchPipeline
from haystack.pipelines import Pipeline
# Shared fixture data: six documents named name_1..name_6 with content text_1..text_6,
# spread over the years 2020 and 2021 and the months 01-03 of each year.
# Every document gets a random 768-dimensional embedding; the "03" document of each
# year is float64 while all others are float32.
DOCUMENTS = [
    {
        "meta": {"name": f"name_{number}", "year": year, "month": month},
        "content": f"text_{number}",
        "embedding": np.random.rand(768).astype(dtype),
    }
    for number, (year, month, dtype) in enumerate(
        (
            ("2020", "01", np.float32),
            ("2020", "02", np.float32),
            ("2020", "03", np.float64),
            ("2021", "01", np.float32),
            ("2021", "02", np.float32),
            ("2021", "03", np.float64),
        ),
        start=1,
    )
]
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
@pytest.mark.parametrize("batch_size", [4, 6])
def test_update_docs(document_store, retriever, batch_size):
    """Write DOCUMENTS, update embeddings in batches, and compare each stored vector with a fresh embedding."""
    # Initial write, then compute embeddings with the parametrized batch size.
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=batch_size)
    indexed_docs = document_store.get_all_documents()
    assert len(indexed_docs) == len(DOCUMENTS)

    # Check that each indexed document is associated with the correct vector.
    for indexed in indexed_docs:
        matching_sources = [source for source in DOCUMENTS if source["content"] == indexed.content]
        source_doc = Document.from_dict(matching_sources[0])
        fresh_embedding = retriever.embed_documents([source_doc])

        name_filter = {"name": [indexed.meta["name"]]}
        stored = document_store.get_all_documents(filters=name_filter)[0]

        # Compare the freshly embedded vector with the stored one (ignore extra dim added by hnsw).
        # The fresh vector is normalized first because faiss only stores normalized vectors.
        normalized = fresh_embedding / np.linalg.norm(fresh_embedding)
        assert np.allclose(normalized[0], stored.embedding, rtol=0.2)  # high tolerance was necessary for Milvus 2
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_update_existing_docs(document_store, retriever):
    """Overwrite a document under the same id and check that content and embedding both change.

    The store is switched to ``duplicate_documents = "overwrite"`` so that the second
    write, which reuses the first document's id, is expected to leave a single
    document in the store.
    """
    document_store.duplicate_documents = "overwrite"
    old_document = Document(content="text_1")

    # Initial write
    document_store.write_documents([old_document])
    document_store.update_embeddings(retriever=retriever)
    old_documents_indexed = document_store.get_all_documents(return_embedding=True)
    assert len(old_documents_indexed) == 1

    # Update document data: the new document reuses the id of the old one.
    new_document = Document(content="text_2")
    new_document.id = old_document.id
    document_store.write_documents([new_document])
    document_store.update_embeddings(retriever=retriever)
    new_documents_indexed = document_store.get_all_documents(return_embedding=True)
    assert len(new_documents_indexed) == 1

    # Same id, different content, and an embedding that differs from the old one.
    assert old_documents_indexed[0].id == new_documents_indexed[0].id
    assert old_documents_indexed[0].content == "text_1"
    assert new_documents_indexed[0].content == "text_2"
    assert not np.allclose(old_documents_indexed[0].embedding, new_documents_indexed[0].embedding, rtol=0.01)
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_update_with_empty_store(document_store, retriever):
    """Call update_embeddings on an empty store, then verify a subsequent write stores all DOCUMENTS."""
    # Updating embeddings while the store holds no documents.
    document_store.update_embeddings(retriever=retriever)

    # First real write happens only after the update call.
    document_store.write_documents(DOCUMENTS)
    stored_docs = document_store.get_all_documents()
    expected_count = len(DOCUMENTS)
    assert len(stored_docs) == expected_count
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_finding(document_store, retriever):
    """Run a DocumentSearchPipeline query with top_k=1 and expect exactly one returned document."""
    document_store.write_documents(DOCUMENTS)
    search_pipeline = DocumentSearchPipeline(retriever=retriever)
    retriever_params = {"Retriever": {"top_k": 1}}
    result = search_pipeline.run(query="How to test this?", params=retriever_params)
    found_docs = result.get("documents", [])
    assert len(found_docs) == 1
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_with_filters_multivalue(document_store, retriever):
    """Delete by a single filter key with four values and check only name_5 and name_6 remain."""
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    names_to_remove = ["name_1", "name_2", "name_3", "name_4"]
    document_store.delete_documents(filters={"name": names_to_remove})

    survivors = document_store.get_all_documents()
    assert len(survivors) == 2
    assert document_store.get_embedding_count() == 2
    surviving_names = {doc.meta["name"] for doc in survivors}
    assert surviving_names == {"name_5", "name_6"}
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_with_filters(document_store, retriever):
    """Delete every document with year 2020 and check the three remaining ones are all from 2021."""
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    year_filter = {"year": ["2020"]}
    document_store.delete_documents(filters=year_filter)

    survivors = document_store.get_all_documents()
    assert len(survivors) == 3
    assert document_store.get_embedding_count() == 3
    only_2021_left = all(doc.meta["year"] == "2021" for doc in survivors)
    assert only_2021_left
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_with_many_filters(document_store, retriever):
    """Delete with a month and a year filter together and check that name_1 is gone and five documents remain."""
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    combined_filter = {"month": ["01"], "year": ["2020"]}
    document_store.delete_documents(filters=combined_filter)

    survivors = document_store.get_all_documents()
    assert len(survivors) == 5
    assert document_store.get_embedding_count() == 5
    surviving_names = {doc.meta["name"] for doc in survivors}
    assert "name_1" not in surviving_names
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_by_id(document_store, retriever):
    """Delete the first three returned document ids and check documents and embeddings shrink accordingly."""
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    all_ids = [doc.id for doc in document_store.get_all_documents()]
    doomed_ids = all_ids[:3]
    document_store.delete_documents(ids=doomed_ids)

    survivors = document_store.get_all_documents()
    expected_remaining = len(all_ids) - len(doomed_ids)
    assert len(survivors) == expected_remaining
    assert document_store.get_embedding_count() == expected_remaining

    # None of the deleted ids may still be present in the store.
    surviving_ids = [doc.id for doc in survivors]
    for doomed_id in doomed_ids:
        assert doomed_id not in surviving_ids
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_by_id_with_filters(document_store, retriever):
    """Delete with both ids and filters and check that only documents matching the ids are removed.

    The ids cover name_1 and name_2, while the filter additionally names name_3 and
    name_4; the assertions expect name_3..name_6 to survive.
    """
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    docs_to_delete = document_store.get_all_documents(filters={"name": ["name_1", "name_2"]})
    ids_to_delete = [doc.id for doc in docs_to_delete]
    docs_to_keep = document_store.get_all_documents(filters={"name": ["name_3", "name_4", "name_5", "name_6"]})
    ids_not_to_delete = [doc.id for doc in docs_to_keep]

    wider_filter = {"name": ["name_1", "name_2", "name_3", "name_4"]}
    document_store.delete_documents(ids=ids_to_delete, filters=wider_filter)

    survivors = document_store.get_all_documents()
    expected_remaining = len(DOCUMENTS) - len(ids_to_delete)
    assert len(survivors) == expected_remaining
    assert document_store.get_embedding_count() == expected_remaining

    surviving_names = [doc.meta["name"] for doc in survivors]
    assert "name_1" not in surviving_names
    assert "name_2" not in surviving_names

    # Every id that was not passed to delete_documents must still be there.
    surviving_ids = [doc.id for doc in survivors]
    for kept_id in ids_not_to_delete:
        assert kept_id in surviving_ids
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_get_docs_with_filters_one_value(document_store, retriever):
    """Fetch documents filtered on year 2020 and expect exactly the three 2020 documents."""
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    year_filter = {"year": ["2020"]}
    matches = document_store.get_all_documents(filters=year_filter)
    assert len(matches) == 3
    all_from_2020 = all(doc.meta["year"] == "2020" for doc in matches)
    assert all_from_2020
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_get_docs_with_filters_many_values(document_store, retriever):
    """Fetch documents filtered on two name values and expect exactly name_5 and name_6."""
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    wanted_names = ["name_5", "name_6"]
    matches = document_store.get_all_documents(filters={"name": wanted_names})
    assert len(matches) == 2
    matched_names = {doc.meta["name"] for doc in matches}
    assert matched_names == {"name_5", "name_6"}
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_get_docs_with_many_filters(document_store, retriever):
    """Fetch documents with a month and a year filter together and expect only name_1."""
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    combined_filter = {"month": ["01"], "year": ["2020"]}
    matches = document_store.get_all_documents(filters=combined_filter)
    assert len(matches) == 1

    only_match_meta = matches[0].meta
    assert only_match_meta["name"] == "name_1"
    assert only_match_meta["month"] == "01"
    assert only_match_meta["year"] == "2020"
@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_pipeline(document_store, retriever):
    """Register the retriever as a custom-named Pipeline node and expect top_k=3 to yield three documents."""
    # Four local documents with precomputed random embeddings; the third one is float64.
    sample_docs = [
        {"name": "name_1", "content": "text_1", "embedding": np.random.rand(768).astype(np.float32)},
        {"name": "name_2", "content": "text_2", "embedding": np.random.rand(768).astype(np.float32)},
        {"name": "name_3", "content": "text_3", "embedding": np.random.rand(768).astype(np.float64)},
        {"name": "name_4", "content": "text_4", "embedding": np.random.rand(768).astype(np.float32)},
    ]
    document_store.write_documents(sample_docs)

    query_pipeline = Pipeline()
    query_pipeline.add_node(component=retriever, name="FAISS", inputs=["Query"])
    node_params = {"FAISS": {"top_k": 3}}
    result = query_pipeline.run(query="How to test this?", params=node_params)
    assert len(result["documents"]) == 3