fix: changing document scores (#5090)

* #4653 fix changing scores by returning new document objects from document store queries

* added integration test for InMemoryDocumentStore demonstrating the desired behavior

* Update test/document_stores/test_memory.py
This commit is contained in:
Ben Heckmann 2023-06-14 17:35:46 +02:00 committed by GitHub
parent 58c022ef86
commit 60e5d73424
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 9 deletions

View File

@ -1,3 +1,4 @@
import copy
from typing import Union, List, Optional, Dict, Generator
import json
@ -634,16 +635,18 @@ class FAISSDocumentStore(SQLDocumentStore):
scores_for_vector_ids: Dict[str, float] = {
str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])
}
return_documents = []
for doc in documents:
score = scores_for_vector_ids[doc.meta["vector_id"]]
if scale_score:
score = self.scale_to_unit_interval(score, self.similarity)
doc.score = score
if return_embedding is True:
doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"]))
return_document = copy.copy(doc)
return_documents.append(return_document)
return documents
return return_documents
def save(self, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None):
"""

View File

@ -1,3 +1,4 @@
import copy
from typing import Any, Dict, List, Optional, Union, Generator, Literal
import time
@ -959,7 +960,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
scale_score: bool = True,
) -> List[Document]:
"""
Scan through documents in DocumentStore and return a small number documents
Scan through documents in DocumentStore and return a small number of documents
that are most relevant to the query as defined by the BM25 algorithm.
:param query: The query.
:param top_k: How many documents to return per query.
@ -995,13 +996,13 @@ class InMemoryDocumentStore(KeywordDocumentStore):
top_docs_positions = np.argsort(docs_scores)[::-1][:top_k]
textual_docs_list = [doc for doc in self.indexes[index].values() if doc.content_type in ["text", "table"]]
top_docs = []
return_documents = []
for i in top_docs_positions:
doc = textual_docs_list[i]
doc.score = docs_scores[i]
top_docs.append(doc)
return top_docs
return_document = copy.copy(doc)
return_documents.append(return_document)
return return_documents
def query_batch(
self,
@ -1015,7 +1016,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
scale_score: bool = True,
) -> List[List[Document]]:
"""
Scan through documents in DocumentStore and return a small number documents
Scan through documents in DocumentStore and return a small number of documents
that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
This method lets you find relevant documents for a list of query strings (output: List of Lists of Documents).
:param query: The query.

View File

@ -1,3 +1,4 @@
import copy
import json
from typing import Set, Union, List, Optional, Dict, Generator, Any
@ -1181,13 +1182,16 @@ class PineconeDocumentStore(BaseDocumentStore):
# assign query score to each document
scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)}
return_documents = []
for doc in documents:
score = scores_for_vector_ids[doc.id]
if scale_score:
score = self.scale_to_unit_interval(score, self.similarity)
doc.score = score
return_document = copy.copy(doc)
return_documents.append(return_document)
return documents
return return_documents
def _get_documents_by_meta(
self,

View File

@ -1,4 +1,5 @@
import logging
from copy import deepcopy
import pandas as pd
import pytest
@ -6,6 +7,7 @@ from rank_bm25 import BM25
import numpy as np
from haystack.document_stores.memory import InMemoryDocumentStore
from haystack.nodes import BM25Retriever
from haystack.schema import Document
from haystack.testing import DocumentStoreBaseTestAbstract
@ -124,3 +126,17 @@ class TestInMemoryDocumentStore(DocumentStoreBaseTestAbstract):
docs = ds.query_by_embedding(query_emb=query_embedding, top_k=1)
assert "Skipping some of your documents that don't have embeddings" in caplog.text
assert len(docs) == 0
@pytest.mark.integration
def test_bm25_scores_not_changing_across_queries(self, ds, documents):
    """
    Test that computed scores which are returned to the user do not change when running multiple queries.

    Each query is executed twice against the same document store: once keeping the returned
    documents as they are, and once deep-copying them right away, which freezes the scores as
    they were at return time. If a later query overwrote the score of a document that had
    already been returned, the two flattened score lists would differ.

    :param ds: The document store fixture under test.
    :param documents: The documents fixture written to the store before querying.
    """
    ds.write_documents(documents)
    # Compare the unscaled scores so the check does not depend on score scaling.
    retriever = BM25Retriever(ds, scale_score=False)
    queries = ["What is a Foo Document?", "What is a Bar Document?", "Tell me about a document without embeddings"]

    # Results kept by reference: these would reflect any later mutation of the returned documents.
    results_direct = [retriever.retrieve(query) for query in queries]
    # Results snapshotted immediately: these cannot be affected by later queries.
    results_copied = [deepcopy(retriever.retrieve(query)) for query in queries]

    scores_direct = [rd.score for rds in results_direct for rd in rds]
    scores_copied = [rc.score for rcs in results_copied for rc in rcs]
    assert scores_direct == scores_copied