Mirror of https://github.com/deepset-ai/haystack.git
Fix: FAISSDocumentStore - make write_documents properly work in combination w update_embeddings (#5221)
* Update VERSION.txt
* first draft
* simplify method and test
* rm unnecessary pb.close
* integrate feedback
This commit is contained in:
parent aee862833e
commit 1be39367ac
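For context, here is a minimal usage sketch (not part of the commit) of the workflow this fix targets: writing documents with `duplicate_documents="overwrite"` and then embedding only the documents that still lack embeddings via `update_embeddings(update_existing_embeddings=False)`. It assumes Haystack v1.x with the `faiss` and `sentence-transformers` extras installed; the SQLite URL and the embedding model name are placeholders, not taken from the commit.

```python
# Minimal sketch of the fixed workflow; the SQLite file and the embedding model
# below are assumptions for illustration, not part of the commit.
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.schema import Document

document_store = FAISSDocumentStore(
    sql_url="sqlite:///faiss_document_store.db",  # assumed local SQL backing store
    faiss_index_factory_str="Flat",
    embedding_dim=384,  # must match the embedding size of the assumed model below
    duplicate_documents="overwrite",
)
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # assumed model
)

# First pass: write a document, then embed everything that lacks an embedding.
document_store.write_documents([Document(content="first doc", id="1")])
document_store.update_embeddings(retriever=retriever, update_existing_embeddings=False)

# Second pass re-writes doc "1" (overwrite) and adds doc "2". With this fix,
# doc "1" keeps its existing FAISS vector_id, so the next update_embeddings call
# only embeds doc "2" instead of re-embedding or orphaning doc "1".
document_store.write_documents(
    [Document(content="first doc", id="1"), Document(content="second doc", id="2")]
)
document_store.update_embeddings(retriever=retriever, update_existing_embeddings=False)

assert document_store.get_document_count() == 2
assert document_store.get_embedding_count() == 2
```

The diff below shows the corresponding changes to `FAISSDocumentStore.write_documents`, followed by the new regression test.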
@@ -257,31 +257,33 @@ class FAISSDocumentStore(SQLDocumentStore):
         document_objects = self._handle_duplicate_documents(
             documents=document_objects, index=index, duplicate_documents=duplicate_documents
         )
-        if len(document_objects) > 0:
-            add_vectors = all(doc.embedding is not None for doc in document_objects)
-
-            if self.duplicate_documents == "overwrite" and add_vectors:
-                logger.warning(
-                    "You have to provide `duplicate_documents = 'overwrite'` arg and "
-                    "`FAISSDocumentStore` does not support update in existing `faiss_index`.\n"
-                    "Please call `update_embeddings` method to repopulate `faiss_index`"
-                )
-
-            vector_id = self.faiss_indexes[index].ntotal
-            with tqdm(
-                total=len(document_objects), disable=not self.progress_bar, position=0, desc="Writing Documents"
-            ) as progress_bar:
-                for i in range(0, len(document_objects), batch_size):
-                    if add_vectors:
-                        if not self.faiss_indexes[index].is_trained:
-                            raise ValueError(
-                                "FAISS index of type {} must be trained before adding vectors. Call `train_index()` "
-                                "method before adding the vectors. For details, refer to the documentation: "
-                                "[FAISSDocumentStore API](https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstoretrain_index)."
-                                "".format(self.faiss_index_factory_str)
-                            )
-
-                        embeddings = [doc.embedding for doc in document_objects[i : i + batch_size]]
-                        embeddings_to_index = np.array(embeddings, dtype="float32")
-
-                        if self.similarity == "cosine":
+        if len(document_objects) == 0:
+            return
+
+        vector_id = self.faiss_indexes[index].ntotal
+        add_vectors = all(doc.embedding is not None for doc in document_objects)
+
+        if vector_id > 0 and self.duplicate_documents == "overwrite" and add_vectors:
+            logger.warning(
+                "`FAISSDocumentStore` is adding new vectors to an existing `faiss_index`.\n"
+                "Please call `update_embeddings` method to correctly repopulate `faiss_index`"
+            )
+
+        with tqdm(
+            total=len(document_objects), disable=not self.progress_bar, position=0, desc="Writing Documents"
+        ) as progress_bar:
+            for i in range(0, len(document_objects), batch_size):
+                batch_documents = document_objects[i : i + batch_size]
+                if add_vectors:
+                    if not self.faiss_indexes[index].is_trained:
+                        raise ValueError(
+                            f"FAISS index of type {self.faiss_index_factory_str} must be trained before adding vectors. Call `train_index()` "
+                            "method before adding the vectors. For details, refer to the documentation: "
+                            "[FAISSDocumentStore API](https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstoretrain_index)."
+                        )
+
+                    embeddings = [doc.embedding for doc in batch_documents]
+                    embeddings_to_index = np.array(embeddings, dtype="float32")
+
+                    if self.similarity == "cosine":
@@ -289,22 +291,29 @@ class FAISSDocumentStore(SQLDocumentStore):

-                        self.faiss_indexes[index].add(embeddings_to_index)
-
-                    docs_to_write_in_sql = []
-                    for doc in document_objects[i : i + batch_size]:
-                        meta = doc.meta
-                        if add_vectors:
-                            meta["vector_id"] = vector_id
-                            vector_id += 1
-                        docs_to_write_in_sql.append(doc)
-
-                    super(FAISSDocumentStore, self).write_documents(
-                        docs_to_write_in_sql,
-                        index=index,
-                        duplicate_documents=duplicate_documents,
-                        batch_size=batch_size,
-                    )
-                    progress_bar.update(batch_size)
-            progress_bar.close()
+                self.faiss_indexes[index].add(embeddings_to_index)
+
+                # write_documents method (duplicate_documents="overwrite") should properly work in combination with
+                # update_embeddings method (update_existing_embeddings=False).
+                # If no new embeddings are provided, we save the existing FAISS vector ids
+                elif self.duplicate_documents == "overwrite":
+                    existing_docs = self.get_documents_by_id(ids=[doc.id for doc in batch_documents], index=index)
+                    existing_docs_vector_ids = {
+                        doc.id: doc.meta["vector_id"] for doc in existing_docs if doc.meta and "vector_id" in doc.meta
+                    }
+
+                docs_to_write_in_sql = []
+                for doc in batch_documents:
+                    meta = doc.meta
+                    if add_vectors:
+                        meta["vector_id"] = vector_id
+                        vector_id += 1
+                    elif self.duplicate_documents == "overwrite" and doc.id in existing_docs_vector_ids:
+                        meta["vector_id"] = existing_docs_vector_ids[doc.id]
+                    docs_to_write_in_sql.append(doc)
+
+                super(FAISSDocumentStore, self).write_documents(
+                    docs_to_write_in_sql, index=index, duplicate_documents=duplicate_documents, batch_size=batch_size
+                )
+                progress_bar.update(batch_size)

     def _create_document_field_map(self) -> Dict:
         return {self.index: self.embedding_field}
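The `elif self.duplicate_documents == "overwrite"` branch added above looks up the FAISS `vector_id` of documents that are re-written without a fresh embedding and copies it onto the new document, so the link between the SQL record and its existing FAISS vector survives the overwrite. A minimal sketch of that guarantee (again not code from the commit, and under the same Haystack v1.x assumptions as above, with a throwaway SQLite file):

```python
# Sketch of the guarantee provided by the new branch: re-writing an already
# embedded document without a fresh embedding keeps its FAISS vector_id.
# Assumptions: Haystack v1.x with the faiss extra; scratch SQLite database.
import numpy as np
from haystack.document_stores import FAISSDocumentStore
from haystack.schema import Document

document_store = FAISSDocumentStore(
    sql_url="sqlite:///faiss_keep_vector_id.db",  # assumed scratch database
    embedding_dim=4,  # tiny dimension, just for the sketch
    duplicate_documents="overwrite",
)

# First write carries an embedding, so it lands in the FAISS index as vector 0.
document_store.write_documents(
    [Document(content="hello", id="doc-1", embedding=np.random.rand(4).astype("float32"))]
)

# Second write has no embedding; with this fix the existing vector_id is copied
# over from the stored document instead of being dropped.
document_store.write_documents([Document(content="hello, edited", id="doc-1")])

assert document_store.get_embedding_count() == 1
assert document_store.get_document_by_id("doc-1").meta["vector_id"] == "0"
```

The new integration test below exercises the same behaviour through repeated `write_documents` / `update_embeddings(update_existing_embeddings=False)` calls.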
@@ -204,6 +204,25 @@ class TestFAISSDocumentStore(DocumentStoreBaseTestAbstract):
         assert len(docs_from_index_b) == len(docs_b)
         assert {int(doc.meta["vector_id"]) for doc in docs_from_index_b} == {0, 1, 2, 3}

+    @pytest.mark.integration
+    def test_dont_update_existing_embeddings(self, ds, docs):
+        retriever = MockDenseRetriever(document_store=ds)
+        first_doc_id = docs[0].id
+
+        for i in range(1, 4):
+            ds.write_documents(docs[:i])
+            ds.update_embeddings(retriever=retriever, update_existing_embeddings=False)
+
+            assert ds.get_document_count() == i
+            assert ds.get_embedding_count() == i
+            assert ds.get_document_by_id(id=first_doc_id).meta["vector_id"] == "0"
+
+            # Check if the embeddings of the first document remain unchanged after multiple updates
+            if i == 1:
+                first_doc_embedding = ds.get_document_by_id(id=first_doc_id).embedding
+            else:
+                assert np.array_equal(ds.get_document_by_id(id=first_doc_id).embedding, first_doc_embedding)
+
     @pytest.mark.integration
     def test_passing_index_from_outside(self, documents_with_embeddings, tmp_path):
         d = 768