From 24405f851c22b98ab61fea7a80f7d059683a4ffd Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
Date: Tue, 14 Feb 2023 17:43:11 +0100
Subject: [PATCH] refactor: `InMemoryDocumentStore` - manage documents without
 embedding & fix mypy errors (#4113)

* refactoring and test

* try to replace error with warning

* more expressive and robust get_scores methods

* make get_scores methods internal
---
 haystack/document_stores/memory.py  | 92 ++++++++++++++++++-----------
 test/document_stores/test_memory.py | 13 ++++
 2 files changed, 69 insertions(+), 36 deletions(-)

diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py
index bf3029c74..4a6a4d36d 100644
--- a/haystack/document_stores/memory.py
+++ b/haystack/document_stores/memory.py
@@ -281,47 +281,60 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         else:
             return None
 
-    def get_documents_by_id(self, ids: List[str], index: Optional[str] = None) -> List[Document]:  # type: ignore
+    def get_documents_by_id(
+        self,
+        ids: List[str],
+        index: Optional[str] = None,
+        batch_size: Optional[int] = None,
+        headers: Optional[Dict[str, str]] = None,
+    ) -> List[Document]:
         """
         Fetch documents by specifying a list of text id strings.
         """
+        if headers:
+            raise NotImplementedError("InMemoryDocumentStore does not support headers.")
+        if batch_size:
+            logger.warning(
+                "InMemoryDocumentStore does not support batching in the `get_documents_by_id` method. This parameter is ignored."
+            )
         index = index or self.index
         documents = [self.indexes[index][id] for id in ids]
         return documents
 
-    def get_scores_torch(self, query_emb: np.ndarray, document_to_search: List[Document]) -> List[float]:
+    def _get_scores_torch(self, query_emb: np.ndarray, documents_to_search: List[Document]) -> List[float]:
         """
         Calculate similarity scores between query embedding and a list of documents using torch.
 
         :param query_emb: Embedding of the query (e.g. gathered from DPR)
-        :param document_to_search: List of documents to compare `query_emb` against.
+        :param documents_to_search: List of documents to compare `query_emb` against.
""" - query_emb = torch.tensor(query_emb, dtype=torch.float).to(self.main_device) # type: ignore [assignment] - if len(query_emb.shape) == 1: - query_emb = query_emb.unsqueeze(dim=0) # type: ignore [attr-defined] + query_emb_tensor = torch.tensor(query_emb, dtype=torch.float).to(self.main_device) + if query_emb_tensor.ndim == 1: + query_emb_tensor = query_emb_tensor.unsqueeze(dim=0) - doc_embeds = np.array([doc.embedding for doc in document_to_search]) - doc_embeds = torch.as_tensor(doc_embeds, dtype=torch.float) # type: ignore [assignment] - if len(doc_embeds.shape) == 1 and doc_embeds.shape[0] == 1: - doc_embeds = doc_embeds.unsqueeze(dim=0) # type: ignore [attr-defined] - elif len(doc_embeds.shape) == 1 and doc_embeds.shape[0] == 0: - return [] + doc_embeds = np.array([doc.embedding for doc in documents_to_search]) + doc_embeds_tensor = torch.as_tensor(doc_embeds, dtype=torch.float) + if doc_embeds_tensor.ndim == 1: + # if there are no embeddings, return an empty list + if doc_embeds_tensor.shape[0] == 0: + return [] + doc_embeds_tensor = doc_embeds_tensor.unsqueeze(dim=0) if self.similarity == "cosine": # cosine similarity is just a normed dot product - query_emb_norm = torch.norm(query_emb, dim=1) - query_emb = torch.div(query_emb, query_emb_norm) # type: ignore [assignment,arg-type] + query_emb_norm = torch.norm(query_emb_tensor, dim=1) + query_emb_tensor = torch.div(query_emb_tensor, query_emb_norm) - doc_embeds_norms = torch.norm(doc_embeds, dim=1) - doc_embeds = torch.div(doc_embeds.T, doc_embeds_norms).T # type: ignore [assignment,arg-type] + doc_embeds_norms = torch.norm(doc_embeds_tensor, dim=1) + doc_embeds_tensor = torch.div(doc_embeds_tensor.T, doc_embeds_norms).T curr_pos = 0 - scores = [] # type: ignore [var-annotated] - while curr_pos < len(doc_embeds): - doc_embeds_slice = doc_embeds[curr_pos : curr_pos + self.scoring_batch_size] - doc_embeds_slice = doc_embeds_slice.to(self.main_device) # type: ignore [attr-defined] + scores: List[float] = [] + while curr_pos < len(doc_embeds_tensor): + doc_embeds_slice = doc_embeds_tensor[curr_pos : curr_pos + self.scoring_batch_size] + doc_embeds_slice = doc_embeds_slice.to(self.main_device) with torch.inference_mode(): - slice_scores = torch.matmul(doc_embeds_slice, query_emb.T).cpu() # type: ignore [arg-type,arg-type] + slice_scores = torch.matmul(doc_embeds_slice, query_emb_tensor.T).cpu() slice_scores = slice_scores.squeeze(dim=1) slice_scores = slice_scores.numpy().tolist() @@ -330,21 +343,22 @@ class InMemoryDocumentStore(KeywordDocumentStore): return scores - def get_scores_numpy(self, query_emb: np.ndarray, document_to_search: List[Document]) -> List[float]: + def _get_scores_numpy(self, query_emb: np.ndarray, documents_to_search: List[Document]) -> List[float]: """ Calculate similarity scores between query embedding and a list of documents using numpy. :param query_emb: Embedding of the query (e.g. gathered from DPR) - :param document_to_search: List of documents to compare `query_emb` against. + :param documents_to_search: List of documents to compare `query_emb` against. 
""" - if len(query_emb.shape) == 1: - query_emb = np.expand_dims(query_emb, 0) + if query_emb.ndim == 1: + query_emb = np.expand_dims(a=query_emb, axis=0) - doc_embeds = np.array([doc.embedding for doc in document_to_search]) - if len(doc_embeds.shape) == 1 and doc_embeds.shape[0] == 1: - doc_embeds = doc_embeds.unsqueeze(dim=0) # type: ignore [attr-defined] - elif len(doc_embeds.shape) == 1 and doc_embeds.shape[0] == 0: - return [] + doc_embeds = np.array([doc.embedding for doc in documents_to_search]) + if doc_embeds.ndim == 1: + # if there are no embeddings, return an empty list + if doc_embeds.shape[0] == 0: + return [] + doc_embeds = np.expand_dims(a=doc_embeds, axis=0) if self.similarity == "cosine": # cosine similarity is just a normed dot product @@ -360,11 +374,11 @@ class InMemoryDocumentStore(KeywordDocumentStore): return scores - def get_scores(self, query_emb: np.ndarray, document_to_search: List[Document]) -> List[float]: + def _get_scores(self, query_emb: np.ndarray, documents_to_search: List[Document]) -> List[float]: if self.main_device.type == "cuda": - scores = self.get_scores_torch(query_emb, document_to_search) + scores = self._get_scores_torch(query_emb, documents_to_search) else: - scores = self.get_scores_numpy(query_emb, document_to_search) + scores = self._get_scores_numpy(query_emb, documents_to_search) return scores @@ -460,11 +474,17 @@ class InMemoryDocumentStore(KeywordDocumentStore): if query_emb is None: return [] - document_to_search = self.get_all_documents(index=index, filters=filters, return_embedding=True) - scores = self.get_scores(query_emb, document_to_search) + documents = self.get_all_documents(index=index, filters=filters, return_embedding=True) + documents_with_embeddings = [doc for doc in documents if doc.embedding is not None] + if len(documents) != len(documents_with_embeddings): + logger.warning( + "Skipping some of your documents that don't have embeddings. " + "To generate embeddings, run the document store's update_embeddings() method." + ) + scores = self._get_scores(query_emb, documents_with_embeddings) candidate_docs = [] - for doc, score in zip(document_to_search, scores): + for doc, score in zip(documents_with_embeddings, scores): curr_meta = deepcopy(doc.meta) new_document = Document( id=doc.id, content=doc.content, content_type=doc.content_type, meta=curr_meta, embedding=doc.embedding diff --git a/test/document_stores/test_memory.py b/test/document_stores/test_memory.py index 92d3196f4..d04bb1fd3 100644 --- a/test/document_stores/test_memory.py +++ b/test/document_stores/test_memory.py @@ -3,6 +3,7 @@ import logging import pandas as pd import pytest from rank_bm25 import BM25 +import numpy as np from haystack.document_stores.memory import InMemoryDocumentStore from haystack.schema import Document @@ -112,3 +113,15 @@ class TestInMemoryDocumentStore(DocumentStoreBaseTestAbstract): for docs, query_emb in zip(docs_batch, query_embs): assert len(docs) == 5 assert (docs[0].embedding == query_emb).all() + + @pytest.mark.integration + def test_memory_query_by_embedding_docs_wo_embeddings(self, ds, caplog): + # write document but don't update embeddings + ds.write_documents([Document(content="test Document")]) + + query_embedding = np.random.rand(768).astype(np.float32) + + with caplog.at_level(logging.WARNING): + docs = ds.query_by_embedding(query_emb=query_embedding, top_k=1) + assert "Skipping some of your documents that don't have embeddings" in caplog.text + assert len(docs) == 0