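"""Tests for FAISSDocumentStore."""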
import faiss
import pytest
import numpy as np

from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.testing import DocumentStoreBaseTestAbstract

from haystack.pipelines import Pipeline

from ..conftest import MockDenseRetriever

class TestFAISSDocumentStore(DocumentStoreBaseTestAbstract):
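    # `ds` overrides the fixture from DocumentStoreBaseTestAbstract so the
    # inherited test suite runs against a FAISS store backed by a temporary SQLite DB.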
    @pytest.fixture
    def ds(self, tmp_path):
        return FAISSDocumentStore(
            sql_url=f"sqlite:///{tmp_path}/haystack_test.db",
            return_embedding=True,
            isolation_level="AUTOCOMMIT",
            progress_bar=False,
            similarity="cosine",
        )

    @pytest.fixture
    def documents_with_embeddings(self, documents):
        # drop documents without embeddings from the original fixture
        return [d for d in documents if d.embedding is not None]

    @pytest.mark.unit
    def test_index_mutual_exclusive_args(self, tmp_path):
        with pytest.raises(ValueError, match="faiss_index_path"):
            FAISSDocumentStore(
                sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
                faiss_index_path=f"{tmp_path/'haystack_test'}",
                isolation_level="AUTOCOMMIT",
            )

        with pytest.raises(ValueError, match="faiss_index_path"):
            FAISSDocumentStore(
                f"sqlite:////{tmp_path/'haystack_test.db'}",
                faiss_index_path=f"{tmp_path/'haystack_test'}",
                isolation_level="AUTOCOMMIT",
            )

    @pytest.mark.integration
    def test_delete_index(self, ds, documents):
        """Contrary to other Document Stores, FAISSDocumentStore doesn't raise if the index is empty"""
        ds.write_documents(documents)
        assert ds.get_document_count() == len(documents)
        ds.delete_index(ds.index)
        assert ds.get_document_count() == 0

    @pytest.mark.integration
    @pytest.mark.parametrize("config_path", [None, "custom_path.json"])
    def test_index_save_and_load(self, ds, documents_with_embeddings, tmp_path, config_path):
        if config_path:
            config_path = tmp_path / config_path

        ds.write_documents(documents_with_embeddings)

        # test saving the index
        ds.save(index_path=tmp_path / "haystack_test_faiss", config_path=config_path)

        # clear existing faiss_index
        ds.faiss_indexes[ds.index].reset()

        # test faiss index is cleared
        assert ds.faiss_indexes[ds.index].ntotal == 0

        # test loading the index
        new_document_store = FAISSDocumentStore.load(
            index_path=tmp_path / "haystack_test_faiss", config_path=config_path
        )

        # check faiss index is restored
        assert new_document_store.faiss_indexes[ds.index].ntotal == len(documents_with_embeddings)
        # check if documents are restored
        assert len(new_document_store.get_all_documents()) == len(documents_with_embeddings)
        # Check if the init parameters are kept
        assert not new_document_store.progress_bar

        # test saving and loading the loaded faiss index
        new_document_store.save(tmp_path / "haystack_test_faiss", config_path=config_path)
        reloaded_document_store = FAISSDocumentStore.load(tmp_path / "haystack_test_faiss", config_path=config_path)

        # check faiss index is restored
        assert reloaded_document_store.faiss_indexes[ds.index].ntotal == len(documents_with_embeddings)
        # check if documents are restored
        assert len(reloaded_document_store.get_all_documents()) == len(documents_with_embeddings)
        # Check if the init parameters are kept
        assert not reloaded_document_store.progress_bar

        # test loading the index via init
        new_document_store = FAISSDocumentStore(
            faiss_index_path=tmp_path / "haystack_test_faiss", faiss_config_path=config_path
        )

        # check faiss index is restored
        assert new_document_store.faiss_indexes[ds.index].ntotal == len(documents_with_embeddings)
        # check if documents are restored
        assert len(new_document_store.get_all_documents()) == len(documents_with_embeddings)
        # Check if the init parameters are kept
        assert not new_document_store.progress_bar

    @pytest.mark.integration
    @pytest.mark.parametrize("index_buffer_size", [10_000, 2])
    @pytest.mark.parametrize("index_factory", ["Flat", "HNSW", "IVF1,Flat"])
    def test_write_index_docs(self, documents_with_embeddings, tmp_path, index_buffer_size, index_factory):
        document_store = FAISSDocumentStore(
            sql_url=f"sqlite:///{tmp_path}/test_faiss_retrieving_{index_factory}.db",
            faiss_index_factory_str=index_factory,
            isolation_level="AUTOCOMMIT",
            return_embedding=True,
        )
        batch_size = 2
        document_store.index_buffer_size = index_buffer_size
        document_store.delete_all_documents(index=document_store.index)
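        # IVF-style indexes must be trained before vectors can be added; the
        # direct map lets FAISS look up stored vectors by id, which is needed
        # to return embeddings alongside the documents.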
if "ivf" in index_factory.lower():
|
|
document_store.train_index(documents_with_embeddings)
|
|
document_store.faiss_indexes[document_store.index].make_direct_map()
|
|
|
|
# Write in batches
|
|
for i in range(0, len(documents_with_embeddings), batch_size):
|
|
document_store.write_documents(documents_with_embeddings[i : i + batch_size])
|
|
|
|
documents_indexed = document_store.get_all_documents()
|
|
assert len(documents_indexed) == len(documents_with_embeddings)
|
|
assert all(doc.embedding is not None for doc in documents_indexed)
|
|
# Check that get_embedding_count works as expected
|
|
assert document_store.get_embedding_count() == len(documents_with_embeddings)
|
|
|
|
    @pytest.mark.integration
    def test_write_docs_no_training(self, documents_with_embeddings, tmp_path, caplog):
        document_store = FAISSDocumentStore(
            sql_url=f"sqlite:///{tmp_path}/test_write_docs_no_training.db",
            faiss_index_factory_str="IVF1,Flat",
            isolation_level="AUTOCOMMIT",
            return_embedding=True,
        )
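        # The IVF index above was never trained, so writing documents with
        # embeddings should fail loudly instead of silently indexing nothing.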
        with pytest.raises(ValueError, match="must be trained before adding vectors"):
            document_store.write_documents(documents_with_embeddings)

    @pytest.mark.integration
    def test_train_index_from_docs(self, documents_with_embeddings, tmp_path):
        document_store = FAISSDocumentStore(
            sql_url=f"sqlite:///{tmp_path}/test_faiss_retrieving.db",
            faiss_index_factory_str="IVF1,Flat",
            isolation_level="AUTOCOMMIT",
            return_embedding=True,
        )
        document_store.delete_all_documents(index=document_store.index)

        assert not document_store.faiss_indexes[document_store.index].is_trained
        document_store.train_index(documents_with_embeddings)
        assert document_store.faiss_indexes[document_store.index].is_trained

    @pytest.mark.integration
    def test_train_index_from_embeddings(self, documents_with_embeddings, tmp_path):
        document_store = FAISSDocumentStore(
            sql_url=f"sqlite:///{tmp_path}/test_faiss_retrieving.db",
            faiss_index_factory_str="IVF1,Flat",
            isolation_level="AUTOCOMMIT",
            return_embedding=True,
        )
        document_store.delete_all_documents(index=document_store.index)
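        # train_index also accepts a raw numpy array of embeddings as an
        # alternative to passing Document objects.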
        embeddings = np.array([doc.embedding for doc in documents_with_embeddings])
        assert not document_store.faiss_indexes[document_store.index].is_trained
        document_store.train_index(embeddings=embeddings)
        assert document_store.faiss_indexes[document_store.index].is_trained

    @pytest.mark.integration
    def test_write_docs_different_indexes(self, ds, documents_with_embeddings):
        docs_a = documents_with_embeddings[:2]
        docs_b = documents_with_embeddings[2:]
        ds.write_documents(docs_a, index="index_a")
        ds.write_documents(docs_b, index="index_b")
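        # vector_ids are assigned per FAISS index, so each index numbers its
        # vectors independently starting from 0.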
        docs_from_index_a = ds.get_all_documents(index="index_a", return_embedding=False)
        assert len(docs_from_index_a) == len(docs_a)
        assert {int(doc.meta["vector_id"]) for doc in docs_from_index_a} == {0, 1}

        docs_from_index_b = ds.get_all_documents(index="index_b", return_embedding=False)
        assert len(docs_from_index_b) == len(docs_b)
        assert {int(doc.meta["vector_id"]) for doc in docs_from_index_b} == {0, 1, 2, 3}

    @pytest.mark.integration
    def test_update_docs_different_indexes(self, ds, documents_with_embeddings):
        retriever = MockDenseRetriever(document_store=ds)

        docs_a = documents_with_embeddings[:2]
        docs_b = documents_with_embeddings[2:]
        ds.write_documents(docs_a, index="index_a")
        ds.write_documents(docs_b, index="index_b")

        ds.update_embeddings(retriever=retriever, update_existing_embeddings=True, index="index_a")
        ds.update_embeddings(retriever=retriever, update_existing_embeddings=True, index="index_b")

        docs_from_index_a = ds.get_all_documents(index="index_a", return_embedding=False)
        assert len(docs_from_index_a) == len(docs_a)
        assert {int(doc.meta["vector_id"]) for doc in docs_from_index_a} == {0, 1}

        docs_from_index_b = ds.get_all_documents(index="index_b", return_embedding=False)
        assert len(docs_from_index_b) == len(docs_b)
        assert {int(doc.meta["vector_id"]) for doc in docs_from_index_b} == {0, 1, 2, 3}

    @pytest.mark.integration
    def test_dont_update_existing_embeddings(self, ds, docs):
        retriever = MockDenseRetriever(document_store=ds)
        first_doc_id = docs[0].id
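        # Write one more document per iteration: with update_existing_embeddings=False
        # only newly written documents get embeddings, so the first document's
        # embedding must stay identical across iterations.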
        for i in range(1, 4):
            ds.write_documents(docs[:i])
            ds.update_embeddings(retriever=retriever, update_existing_embeddings=False)

            assert ds.get_document_count() == i
            assert ds.get_embedding_count() == i
            assert ds.get_document_by_id(id=first_doc_id).meta["vector_id"] == "0"

            # Check if the embeddings of the first document remain unchanged after multiple updates
            if i == 1:
                first_doc_embedding = ds.get_document_by_id(id=first_doc_id).embedding
            else:
                assert np.array_equal(ds.get_document_by_id(id=first_doc_id).embedding, first_doc_embedding)

    @pytest.mark.integration
    def test_passing_index_from_outside(self, documents_with_embeddings, tmp_path):
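        # Build an IVF index by hand (768-dim vectors, 2 clusters, inner-product
        # metric) and hand it to the document store; the hashtable direct map is
        # required so stored vectors can be retrieved by id.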
        d = 768
        nlist = 2
        quantizer = faiss.IndexFlatIP(d)
        index = "haystack_test_1"
        faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
        faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable)
        faiss_index.nprobe = 2
        document_store = FAISSDocumentStore(
            sql_url="sqlite:///", faiss_index=faiss_index, index=index, isolation_level="AUTOCOMMIT"
        )

        document_store.delete_documents()
        # as it is an IVF index we need to train it before adding docs
        document_store.train_index(documents_with_embeddings)

        document_store.write_documents(documents=documents_with_embeddings)
        documents_indexed = document_store.get_all_documents()

        # test if vector ids are associated with docs
        for doc in documents_indexed:
            assert 0 <= int(doc.meta["vector_id"]) <= 7

    @pytest.mark.integration
    def test_pipeline_with_existing_faiss_docstore(self, ds, documents_with_embeddings, tmp_path):
        ds.write_documents(documents_with_embeddings)
        ds.save(tmp_path / "existing_faiss_document_store")
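        # Reload the saved store through a pipeline config; the "ignore" version
        # string is presumably used here to sidestep version pinning in tests.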
        pipeline_config = {
            "version": "ignore",
            "components": [
                {
                    "name": "DPRRetriever",
                    "type": "MockDenseRetriever",
                    "params": {"document_store": "ExistingFAISSDocumentStore"},
                },
                {
                    "name": "ExistingFAISSDocumentStore",
                    "type": "FAISSDocumentStore",
                    "params": {"faiss_index_path": f"{tmp_path / 'existing_faiss_document_store'}"},
                },
            ],
            "pipelines": [{"name": "query_pipeline", "nodes": [{"name": "DPRRetriever", "inputs": ["Query"]}]}],
        }
        pipeline = Pipeline.load_from_config(pipeline_config)
        existing_document_store = pipeline.get_document_store()
        faiss_index = existing_document_store.faiss_indexes[ds.index]
        assert faiss_index.ntotal == len(documents_with_embeddings)

    # See TestSQLDocumentStore about why we have to skip these tests

    @pytest.mark.skip
    @pytest.mark.integration
    def test_ne_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_nin_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_comparison_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_nested_condition_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_nested_condition_not_filters(self, ds, documents):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_delete_labels_by_filter(self, ds, labels):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_delete_labels_by_filter_id(self, ds, labels):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_multilabel_filter_aggregations(self):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_multilabel_meta_aggregations(self):
        pass

    @pytest.mark.skip(reason="tested in test_write_index_docs")
    @pytest.mark.integration
    def test_get_embedding_count(self):
        pass

    @pytest.mark.skip(reason="can't store embeddings in SQL")
    @pytest.mark.integration
    def test_custom_embedding_field(self, ds):
        pass