import faiss
import pytest
import numpy as np
from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.testing import DocumentStoreBaseTestAbstract
from haystack.pipelines import Pipeline
from ..conftest import MockDenseRetriever
class TestFAISSDocumentStore(DocumentStoreBaseTestAbstract):
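    """FAISS-specific document store tests; the generic cases are inherited from DocumentStoreBaseTestAbstract."""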
@pytest.fixture
def ds(self, tmp_path):
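        """Fresh FAISSDocumentStore backed by a temporary SQLite database, using cosine similarity and returning embeddings."""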
return FAISSDocumentStore(
sql_url=f"sqlite:///{tmp_path}/haystack_test.db",
return_embedding=True,
isolation_level="AUTOCOMMIT",
progress_bar=False,
similarity="cosine",
)
@pytest.fixture
def documents_with_embeddings(self, documents):
# drop documents without embeddings from the original fixture
return [d for d in documents if d.embedding is not None]
@pytest.mark.unit
def test_index_mutual_exclusive_args(self, tmp_path):
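        """Passing both an SQL URL and faiss_index_path (positionally or by keyword) must raise a ValueError."""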
with pytest.raises(ValueError, match="faiss_index_path"):
            FAISSDocumentStore(
                sql_url=f"sqlite:///{tmp_path/'haystack_test.db'}",
                faiss_index_path=f"{tmp_path/'haystack_test'}",
                isolation_level="AUTOCOMMIT",
            )
with pytest.raises(ValueError, match="faiss_index_path"):
            FAISSDocumentStore(
                f"sqlite:///{tmp_path/'haystack_test.db'}",
                faiss_index_path=f"{tmp_path/'haystack_test'}",
                isolation_level="AUTOCOMMIT",
            )
@pytest.mark.integration
    def test_delete_index(self, ds, documents):
        """Unlike other document stores, FAISSDocumentStore does not raise if the index is empty."""
ds.write_documents(documents)
assert ds.get_document_count() == len(documents)
ds.delete_index(ds.index)
assert ds.get_document_count() == 0
@pytest.mark.integration
@pytest.mark.parametrize("config_path", [None, "custom_path.json"])
def test_index_save_and_load(self, ds, documents_with_embeddings, tmp_path, config_path):
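        """Saving and reloading the index (via save/load and via init) restores the FAISS vectors, the documents, and the init parameters."""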
if config_path:
config_path = tmp_path / config_path
ds.write_documents(documents_with_embeddings)
# test saving the index
ds.save(index_path=tmp_path / "haystack_test_faiss", config_path=config_path)
# clear existing faiss_index
ds.faiss_indexes[ds.index].reset()
# test faiss index is cleared
assert ds.faiss_indexes[ds.index].ntotal == 0
# test loading the index
new_document_store = FAISSDocumentStore.load(
index_path=tmp_path / "haystack_test_faiss", config_path=config_path
)
# check faiss index is restored
assert new_document_store.faiss_indexes[ds.index].ntotal == len(documents_with_embeddings)
# check if documents are restored
assert len(new_document_store.get_all_documents()) == len(documents_with_embeddings)
# Check if the init parameters are kept
assert not new_document_store.progress_bar
# test saving and loading the loaded faiss index
new_document_store.save(tmp_path / "haystack_test_faiss", config_path=config_path)
reloaded_document_store = FAISSDocumentStore.load(tmp_path / "haystack_test_faiss", config_path=config_path)
# check faiss index is restored
assert reloaded_document_store.faiss_indexes[ds.index].ntotal == len(documents_with_embeddings)
# check if documents are restored
assert len(reloaded_document_store.get_all_documents()) == len(documents_with_embeddings)
# Check if the init parameters are kept
assert not reloaded_document_store.progress_bar
# test loading the index via init
new_document_store = FAISSDocumentStore(
faiss_index_path=tmp_path / "haystack_test_faiss", faiss_config_path=config_path
)
# check faiss index is restored
assert new_document_store.faiss_indexes[ds.index].ntotal == len(documents_with_embeddings)
# check if documents are restored
assert len(new_document_store.get_all_documents()) == len(documents_with_embeddings)
# Check if the init parameters are kept
assert not new_document_store.progress_bar
@pytest.mark.integration
@pytest.mark.parametrize("index_buffer_size", [10_000, 2])
@pytest.mark.parametrize("index_factory", ["Flat", "HNSW", "IVF1,Flat"])
def test_write_index_docs(self, documents_with_embeddings, tmp_path, index_buffer_size, index_factory):
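        """Documents written in batches are fully indexed with their embeddings, for several index factories and buffer sizes."""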
document_store = FAISSDocumentStore(
sql_url=f"sqlite:///{tmp_path}/test_faiss_retrieving_{index_factory}.db",
faiss_index_factory_str=index_factory,
isolation_level="AUTOCOMMIT",
return_embedding=True,
)
batch_size = 2
document_store.index_buffer_size = index_buffer_size
document_store.delete_all_documents(index=document_store.index)
if "ivf" in index_factory.lower():
document_store.train_index(documents_with_embeddings)
document_store.faiss_indexes[document_store.index].make_direct_map()
# Write in batches
for i in range(0, len(documents_with_embeddings), batch_size):
document_store.write_documents(documents_with_embeddings[i : i + batch_size])
documents_indexed = document_store.get_all_documents()
assert len(documents_indexed) == len(documents_with_embeddings)
assert all(doc.embedding is not None for doc in documents_indexed)
# Check that get_embedding_count works as expected
assert document_store.get_embedding_count() == len(documents_with_embeddings)
@pytest.mark.integration
def test_write_docs_no_training(self, documents_with_embeddings, tmp_path, caplog):
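        """Writing documents to an untrained IVF index must raise a ValueError."""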
document_store = FAISSDocumentStore(
sql_url=f"sqlite:///{tmp_path}/test_write_docs_no_training.db",
faiss_index_factory_str="IVF1,Flat",
isolation_level="AUTOCOMMIT",
return_embedding=True,
)
with pytest.raises(ValueError, match="must be trained before adding vectors"):
document_store.write_documents(documents_with_embeddings)
@pytest.mark.integration
def test_train_index_from_docs(self, documents_with_embeddings, tmp_path):
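        """Training an IVF index from documents sets its is_trained flag."""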
document_store = FAISSDocumentStore(
sql_url=f"sqlite:///{tmp_path}/test_faiss_retrieving.db",
faiss_index_factory_str="IVF1,Flat",
isolation_level="AUTOCOMMIT",
return_embedding=True,
)
document_store.delete_all_documents(index=document_store.index)
assert not document_store.faiss_indexes[document_store.index].is_trained
document_store.train_index(documents_with_embeddings)
assert document_store.faiss_indexes[document_store.index].is_trained
@pytest.mark.integration
def test_train_index_from_embeddings(self, documents_with_embeddings, tmp_path):
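        """Training an IVF index directly from an embeddings array sets its is_trained flag."""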
document_store = FAISSDocumentStore(
sql_url=f"sqlite:///{tmp_path}/test_faiss_retrieving.db",
faiss_index_factory_str="IVF1,Flat",
isolation_level="AUTOCOMMIT",
return_embedding=True,
)
document_store.delete_all_documents(index=document_store.index)
embeddings = np.array([doc.embedding for doc in documents_with_embeddings])
assert not document_store.faiss_indexes[document_store.index].is_trained
document_store.train_index(embeddings=embeddings)
assert document_store.faiss_indexes[document_store.index].is_trained
@pytest.mark.integration
def test_write_docs_different_indexes(self, ds, documents_with_embeddings):
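        """Writing to separate indexes gives each index its own vector_id sequence starting from 0."""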
docs_a = documents_with_embeddings[:2]
docs_b = documents_with_embeddings[2:]
ds.write_documents(docs_a, index="index_a")
ds.write_documents(docs_b, index="index_b")
docs_from_index_a = ds.get_all_documents(index="index_a", return_embedding=False)
assert len(docs_from_index_a) == len(docs_a)
assert {int(doc.meta["vector_id"]) for doc in docs_from_index_a} == {0, 1}
docs_from_index_b = ds.get_all_documents(index="index_b", return_embedding=False)
assert len(docs_from_index_b) == len(docs_b)
assert {int(doc.meta["vector_id"]) for doc in docs_from_index_b} == {0, 1, 2, 3}
@pytest.mark.integration
def test_update_docs_different_indexes(self, ds, documents_with_embeddings):
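        """Updating embeddings per index keeps the vector_id sequences of the two indexes independent."""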
retriever = MockDenseRetriever(document_store=ds)
docs_a = documents_with_embeddings[:2]
docs_b = documents_with_embeddings[2:]
ds.write_documents(docs_a, index="index_a")
ds.write_documents(docs_b, index="index_b")
ds.update_embeddings(retriever=retriever, update_existing_embeddings=True, index="index_a")
ds.update_embeddings(retriever=retriever, update_existing_embeddings=True, index="index_b")
docs_from_index_a = ds.get_all_documents(index="index_a", return_embedding=False)
assert len(docs_from_index_a) == len(docs_a)
assert {int(doc.meta["vector_id"]) for doc in docs_from_index_a} == {0, 1}
docs_from_index_b = ds.get_all_documents(index="index_b", return_embedding=False)
assert len(docs_from_index_b) == len(docs_b)
assert {int(doc.meta["vector_id"]) for doc in docs_from_index_b} == {0, 1, 2, 3}
@pytest.mark.integration
def test_dont_update_existing_embeddings(self, ds, docs):
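        """With update_existing_embeddings=False, embeddings written earlier are not recomputed by later update calls."""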
retriever = MockDenseRetriever(document_store=ds)
first_doc_id = docs[0].id
for i in range(1, 4):
ds.write_documents(docs[:i])
ds.update_embeddings(retriever=retriever, update_existing_embeddings=False)
assert ds.get_document_count() == i
assert ds.get_embedding_count() == i
assert ds.get_document_by_id(id=first_doc_id).meta["vector_id"] == "0"
# Check if the embeddings of the first document remain unchanged after multiple updates
if i == 1:
first_doc_embedding = ds.get_document_by_id(id=first_doc_id).embedding
else:
assert np.array_equal(ds.get_document_by_id(id=first_doc_id).embedding, first_doc_embedding)
@pytest.mark.integration
def test_passing_index_from_outside(self, documents_with_embeddings, tmp_path):
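        """A pre-built faiss.IndexIVFFlat passed at init is used as-is; written documents get valid vector_ids."""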
d = 768
nlist = 2
quantizer = faiss.IndexFlatIP(d)
index = "haystack_test_1"
faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable)
faiss_index.nprobe = 2
document_store = FAISSDocumentStore(
sql_url="sqlite:///", faiss_index=faiss_index, index=index, isolation_level="AUTOCOMMIT"
)
document_store.delete_documents()
        # as it is an IVF index we need to train it before adding docs
document_store.train_index(documents_with_embeddings)
document_store.write_documents(documents=documents_with_embeddings)
documents_indexed = document_store.get_all_documents()
        # test if vector ids are associated with docs
for doc in documents_indexed:
assert 0 <= int(doc.meta["vector_id"]) <= 7
@pytest.mark.integration
def test_pipeline_with_existing_faiss_docstore(self, ds, documents_with_embeddings, tmp_path):
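        """A pipeline config pointing at a saved FAISS index loads a document store that still contains all indexed vectors."""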
ds.write_documents(documents_with_embeddings)
ds.save(tmp_path / "existing_faiss_document_store")
pipeline_config = {
"version": "ignore",
"components": [
{
"name": "DPRRetriever",
"type": "MockDenseRetriever",
"params": {"document_store": "ExistingFAISSDocumentStore"},
},
{
"name": "ExistingFAISSDocumentStore",
"type": "FAISSDocumentStore",
"params": {"faiss_index_path": f"{tmp_path / 'existing_faiss_document_store'}"},
},
],
"pipelines": [{"name": "query_pipeline", "nodes": [{"name": "DPRRetriever", "inputs": ["Query"]}]}],
}
pipeline = Pipeline.load_from_config(pipeline_config)
existing_document_store = pipeline.get_document_store()
faiss_index = existing_document_store.faiss_indexes[ds.index]
assert faiss_index.ntotal == len(documents_with_embeddings)
    # See TestSQLDocumentStore for why these tests have to be skipped
@pytest.mark.skip
@pytest.mark.integration
def test_ne_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_nin_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_comparison_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_nested_condition_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_nested_condition_not_filters(self, ds, documents):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_delete_labels_by_filter(self, ds, labels):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_delete_labels_by_filter_id(self, ds, labels):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_multilabel_filter_aggregations(self):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_multilabel_meta_aggregations(self):
pass
@pytest.mark.skip(reason="tested in test_write_index_docs")
@pytest.mark.integration
def test_get_embedding_count(self):
pass
@pytest.mark.skip(reason="can't store embeddings in SQL")
@pytest.mark.integration
def test_custom_embedding_field(self, ds):
pass