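"""Integration tests for the FAISS document store: writing documents, updating embeddings,
filtering, deleting, and retrieving through pipelines."""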
import numpy as np
import pytest

from haystack.pipelines import DocumentSearchPipeline, Pipeline
from haystack.schema import Document

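# Six fixture documents with year/month metadata and random 768-dim embeddings.
# Note: two embeddings are float64 rather than float32, presumably to exercise dtype handling in the store.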
DOCUMENTS = [
    {
        "meta": {"name": "name_1", "year": "2020", "month": "01"},
        "content": "text_1",
        "embedding": np.random.rand(768).astype(np.float32),
    },
    {
        "meta": {"name": "name_2", "year": "2020", "month": "02"},
        "content": "text_2",
        "embedding": np.random.rand(768).astype(np.float32),
    },
    {
        "meta": {"name": "name_3", "year": "2020", "month": "03"},
        "content": "text_3",
        "embedding": np.random.rand(768).astype(np.float64),
    },
    {
        "meta": {"name": "name_4", "year": "2021", "month": "01"},
        "content": "text_4",
        "embedding": np.random.rand(768).astype(np.float32),
    },
    {
        "meta": {"name": "name_5", "year": "2021", "month": "02"},
        "content": "text_5",
        "embedding": np.random.rand(768).astype(np.float32),
    },
    {
        "meta": {"name": "name_6", "year": "2021", "month": "03"},
        "content": "text_6",
        "embedding": np.random.rand(768).astype(np.float64),
    },
]

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
@pytest.mark.parametrize("batch_size", [4, 6])
def test_update_docs(document_store, retriever, batch_size):
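    """update_embeddings must compute and store one embedding per document, matching the retriever's normalized output."""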
    # initial write
    document_store.write_documents(DOCUMENTS)

    document_store.update_embeddings(retriever=retriever, batch_size=batch_size)
    documents_indexed = document_store.get_all_documents()
    assert len(documents_indexed) == len(DOCUMENTS)

    # test that the correct vectors are associated with the docs
    for doc in documents_indexed:
        original_doc = next(d for d in DOCUMENTS if d["content"] == doc.content)
        updated_embedding = retriever.embed_documents([Document.from_dict(original_doc)])
        stored_doc = document_store.get_all_documents(filters={"name": [doc.meta["name"]]})[0]
        # compare the original input vector with the stored one (ignoring the extra dim added by HNSW);
        # the input vector is normalized first, as FAISS only stores normalized vectors
        normalized_embedding = updated_embedding / np.linalg.norm(updated_embedding)
        assert np.allclose(normalized_embedding[0], stored_doc.embedding, rtol=0.2)  # high tolerance was necessary for Milvus 2

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_update_existing_docs(document_store, retriever):
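    """Overwriting a document under the same id must replace both its content and its embedding."""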
    document_store.duplicate_documents = "overwrite"
    old_document = Document(content="text_1")
    # initial write
    document_store.write_documents([old_document])
    document_store.update_embeddings(retriever=retriever)
    old_documents_indexed = document_store.get_all_documents(return_embedding=True)
    assert len(old_documents_indexed) == 1

    # Update document data
    new_document = Document(content="text_2")
    new_document.id = old_document.id
    document_store.write_documents([new_document])
    document_store.update_embeddings(retriever=retriever)
    new_documents_indexed = document_store.get_all_documents(return_embedding=True)
    assert len(new_documents_indexed) == 1

    assert old_documents_indexed[0].id == new_documents_indexed[0].id
    assert old_documents_indexed[0].content == "text_1"
    assert new_documents_indexed[0].content == "text_2"
    assert not np.allclose(old_documents_indexed[0].embedding, new_documents_indexed[0].embedding, rtol=0.01)

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_update_with_empty_store(document_store, retriever):
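    """Calling update_embeddings on an empty store must be a no-op and must not break subsequent writes."""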
    # Call update with an empty doc store
    document_store.update_embeddings(retriever=retriever)

    # initial write
    document_store.write_documents(DOCUMENTS)

    documents_indexed = document_store.get_all_documents()

    assert len(documents_indexed) == len(DOCUMENTS)

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_finding(document_store, retriever):
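    """A DocumentSearchPipeline over the store must return exactly top_k documents for a query."""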
    document_store.write_documents(DOCUMENTS)
    pipe = DocumentSearchPipeline(retriever=retriever)

    prediction = pipe.run(query="How to test this?", params={"Retriever": {"top_k": 1}})

    assert len(prediction.get("documents", [])) == 1

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_with_filters_multivalue(document_store, retriever):
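    """Deleting with a multi-value filter must remove every matching document and its embedding."""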
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    document_store.delete_documents(filters={"name": ["name_1", "name_2", "name_3", "name_4"]})

    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert document_store.get_embedding_count() == 2
    assert {doc.meta["name"] for doc in documents} == {"name_5", "name_6"}

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_with_filters(document_store, retriever):
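    """Deleting with a single-value filter must remove only the matching documents."""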
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    document_store.delete_documents(filters={"year": ["2020"]})

    documents = document_store.get_all_documents()
    assert len(documents) == 3
    assert document_store.get_embedding_count() == 3
    assert all(doc.meta["year"] == "2021" for doc in documents)

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_with_many_filters(document_store, retriever):
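    """Deleting with several filters must remove only documents that match all of them."""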
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    document_store.delete_documents(filters={"month": ["01"], "year": ["2020"]})

    documents = document_store.get_all_documents()
    assert len(documents) == 5
    assert document_store.get_embedding_count() == 5
    assert "name_1" not in {doc.meta["name"] for doc in documents}

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_by_id(document_store, retriever):
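    """Deleting by id must remove exactly the given documents and their embeddings."""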
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6
    doc_ids = [doc.id for doc in document_store.get_all_documents()]
    ids_to_delete = doc_ids[:3]

    document_store.delete_documents(ids=ids_to_delete)

    documents = document_store.get_all_documents()
    assert len(documents) == len(doc_ids) - len(ids_to_delete)
    assert document_store.get_embedding_count() == len(doc_ids) - len(ids_to_delete)

    remaining_ids = [doc.id for doc in documents]
    assert all(doc_id not in remaining_ids for doc_id in ids_to_delete)

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_delete_docs_by_id_with_filters(document_store, retriever):
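    """When both ids and filters are given, only documents matching both must be deleted."""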
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    ids_to_delete = [doc.id for doc in document_store.get_all_documents(filters={"name": ["name_1", "name_2"]})]
    ids_not_to_delete = [
        doc.id for doc in document_store.get_all_documents(filters={"name": ["name_3", "name_4", "name_5", "name_6"]})
    ]

    document_store.delete_documents(ids=ids_to_delete, filters={"name": ["name_1", "name_2", "name_3", "name_4"]})

    documents = document_store.get_all_documents()
    assert len(documents) == len(DOCUMENTS) - len(ids_to_delete)
    assert document_store.get_embedding_count() == len(DOCUMENTS) - len(ids_to_delete)

    assert all(doc.meta["name"] != "name_1" for doc in documents)
    assert all(doc.meta["name"] != "name_2" for doc in documents)

    all_ids_left = [doc.id for doc in documents]
    assert all(doc_id in all_ids_left for doc_id in ids_not_to_delete)

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_get_docs_with_filters_one_value(document_store, retriever):
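    """get_all_documents with a single-value filter must return only matching documents."""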
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    documents = document_store.get_all_documents(filters={"year": ["2020"]})

    assert len(documents) == 3
    assert all(doc.meta["year"] == "2020" for doc in documents)

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_get_docs_with_filters_many_values(document_store, retriever):
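    """get_all_documents with a multi-value filter must return documents matching any of the values."""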
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    documents = document_store.get_all_documents(filters={"name": ["name_5", "name_6"]})

    assert len(documents) == 2
    assert {doc.meta["name"] for doc in documents} == {"name_5", "name_6"}

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_get_docs_with_many_filters(document_store, retriever):
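    """get_all_documents with several filters must return only documents matching all of them."""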
    document_store.write_documents(DOCUMENTS)
    document_store.update_embeddings(retriever=retriever, batch_size=4)
    assert document_store.get_embedding_count() == 6

    documents = document_store.get_all_documents(filters={"month": ["01"], "year": ["2020"]})

    assert len(documents) == 1
    assert documents[0].meta["name"] == "name_1"
    assert documents[0].meta["month"] == "01"
    assert documents[0].meta["year"] == "2020"

@pytest.mark.integration
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
def test_pipeline(document_store, retriever):
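    """A hand-built Pipeline with the retriever as a node must return top_k documents."""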
    documents = [
        {"name": "name_1", "content": "text_1", "embedding": np.random.rand(768).astype(np.float32)},
        {"name": "name_2", "content": "text_2", "embedding": np.random.rand(768).astype(np.float32)},
        {"name": "name_3", "content": "text_3", "embedding": np.random.rand(768).astype(np.float64)},
        {"name": "name_4", "content": "text_4", "embedding": np.random.rand(768).astype(np.float32)},
    ]
    document_store.write_documents(documents)
    pipeline = Pipeline()
    pipeline.add_node(component=retriever, name="FAISS", inputs=["Query"])
    output = pipeline.run(query="How to test this?", params={"FAISS": {"top_k": 3}})
    assert len(output["documents"]) == 3