diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py
index a787a8cc9..d9cfc9f85 100644
--- a/haystack/document_stores/faiss.py
+++ b/haystack/document_stores/faiss.py
@@ -1,3 +1,4 @@
+import copy
 from typing import Union, List, Optional, Dict, Generator
 
 import json
@@ -634,16 +635,18 @@ class FAISSDocumentStore(SQLDocumentStore):
         scores_for_vector_ids: Dict[str, float] = {
             str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])
         }
+        return_documents = []
         for doc in documents:
             score = scores_for_vector_ids[doc.meta["vector_id"]]
             if scale_score:
                 score = self.scale_to_unit_interval(score, self.similarity)
             doc.score = score
-
             if return_embedding is True:
                 doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"]))
+            return_document = copy.copy(doc)
+            return_documents.append(return_document)
 
-        return documents
+        return return_documents
 
     def save(self, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None):
         """
diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py
index 313a0a74a..0482f4022 100644
--- a/haystack/document_stores/memory.py
+++ b/haystack/document_stores/memory.py
@@ -1,3 +1,4 @@
+import copy
 from typing import Any, Dict, List, Optional, Union, Generator, Literal
 
 import time
@@ -959,7 +960,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         scale_score: bool = True,
     ) -> List[Document]:
         """
-        Scan through documents in DocumentStore and return a small number documents
+        Scan through documents in DocumentStore and return a small number of documents
         that are most relevant to the query as defined by the BM25 algorithm.
         :param query: The query.
         :param top_k: How many documents to return per query.
@@ -995,13 +996,13 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         top_docs_positions = np.argsort(docs_scores)[::-1][:top_k]
 
         textual_docs_list = [doc for doc in self.indexes[index].values() if doc.content_type in ["text", "table"]]
-        top_docs = []
+        return_documents = []
         for i in top_docs_positions:
             doc = textual_docs_list[i]
             doc.score = docs_scores[i]
-            top_docs.append(doc)
-
-        return top_docs
+            return_document = copy.copy(doc)
+            return_documents.append(return_document)
+        return return_documents
 
     def query_batch(
         self,
@@ -1015,7 +1016,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
         scale_score: bool = True,
     ) -> List[List[Document]]:
         """
-        Scan through documents in DocumentStore and return a small number documents
+        Scan through documents in DocumentStore and return a small number of documents
         that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
         This method lets you find relevant documents for list of query strings (output: List of Lists of Documents).
         :param query: The query.
diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py
index e600b617c..0be92adf0 100644
--- a/haystack/document_stores/pinecone.py
+++ b/haystack/document_stores/pinecone.py
@@ -1,3 +1,4 @@
+import copy
 import json
 
 from typing import Set, Union, List, Optional, Dict, Generator, Any
@@ -1181,13 +1182,16 @@ class PineconeDocumentStore(BaseDocumentStore):
 
         # assign query score to each document
         scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)}
+        return_documents = []
         for doc in documents:
             score = scores_for_vector_ids[doc.id]
             if scale_score:
                 score = self.scale_to_unit_interval(score, self.similarity)
             doc.score = score
+            return_document = copy.copy(doc)
+            return_documents.append(return_document)
 
-        return documents
+        return return_documents
 
     def _get_documents_by_meta(
         self,
diff --git a/test/document_stores/test_memory.py b/test/document_stores/test_memory.py
index ab8035ca5..5e71a35e7 100644
--- a/test/document_stores/test_memory.py
+++ b/test/document_stores/test_memory.py
@@ -1,4 +1,5 @@
 import logging
+from copy import deepcopy
 
 import pandas as pd
 import pytest
@@ -6,6 +7,7 @@ from rank_bm25 import BM25
 import numpy as np
 
 from haystack.document_stores.memory import InMemoryDocumentStore
+from haystack.nodes import BM25Retriever
 from haystack.schema import Document
 from haystack.testing import DocumentStoreBaseTestAbstract
 
@@ -124,3 +126,17 @@ class TestInMemoryDocumentStore(DocumentStoreBaseTestAbstract):
         docs = ds.query_by_embedding(query_emb=query_embedding, top_k=1)
         assert "Skipping some of your documents that don't have embeddings" in caplog.text
         assert len(docs) == 0
+
+    @pytest.mark.integration
+    def test_bm25_scores_not_changing_across_queries(self, ds, documents):
+        """Test that computed scores which are returned to the user should not change when running multiple queries."""
+        ds.write_documents(documents)
+        retriever = BM25Retriever(ds, scale_score=False)
+        queries = ["What is a Foo Document?", "What is a Bar Document?", "Tell me about a document without embeddings"]
+        results_direct = []
+        results_direct = [retriever.retrieve(query) for query in queries]
+        results_copied = [deepcopy(retriever.retrieve(query)) for query in queries]
+        scores_direct = [rd.score for rds in results_direct for rd in rds]
+        scores_copied = [rc.score for rcs in results_copied for rc in rcs]
+
+        assert scores_direct == scores_copied