mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-28 16:28:47 +00:00
fix: changing document scores (#5090)
* #4653 fix changing scores by returning new document objects from document store queries
* added integration test for InMemoryDocumentStore demonstrating the desired behavior
* Update test/document_stores/test_memory.py
This commit is contained in:
parent
58c022ef86
commit
60e5d73424
@ -1,3 +1,4 @@
|
||||
import copy
|
||||
from typing import Union, List, Optional, Dict, Generator
|
||||
|
||||
import json
|
||||
@ -634,16 +635,18 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
scores_for_vector_ids: Dict[str, float] = {
|
||||
str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])
|
||||
}
|
||||
return_documents = []
|
||||
for doc in documents:
|
||||
score = scores_for_vector_ids[doc.meta["vector_id"]]
|
||||
if scale_score:
|
||||
score = self.scale_to_unit_interval(score, self.similarity)
|
||||
doc.score = score
|
||||
|
||||
if return_embedding is True:
|
||||
doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"]))
|
||||
return_document = copy.copy(doc)
|
||||
return_documents.append(return_document)
|
||||
|
||||
return documents
|
||||
return return_documents
|
||||
|
||||
def save(self, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None):
|
||||
"""
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import copy
|
||||
from typing import Any, Dict, List, Optional, Union, Generator, Literal
|
||||
|
||||
import time
|
||||
@ -959,7 +960,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
|
||||
scale_score: bool = True,
|
||||
) -> List[Document]:
|
||||
"""
|
||||
Scan through documents in DocumentStore and return a small number documents
|
||||
Scan through documents in DocumentStore and return a small number of documents
|
||||
that are most relevant to the query as defined by the BM25 algorithm.
|
||||
:param query: The query.
|
||||
:param top_k: How many documents to return per query.
|
||||
@ -995,13 +996,13 @@ class InMemoryDocumentStore(KeywordDocumentStore):
|
||||
top_docs_positions = np.argsort(docs_scores)[::-1][:top_k]
|
||||
|
||||
textual_docs_list = [doc for doc in self.indexes[index].values() if doc.content_type in ["text", "table"]]
|
||||
top_docs = []
|
||||
return_documents = []
|
||||
for i in top_docs_positions:
|
||||
doc = textual_docs_list[i]
|
||||
doc.score = docs_scores[i]
|
||||
top_docs.append(doc)
|
||||
|
||||
return top_docs
|
||||
return_document = copy.copy(doc)
|
||||
return_documents.append(return_document)
|
||||
return return_documents
|
||||
|
||||
def query_batch(
|
||||
self,
|
||||
@ -1015,7 +1016,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
|
||||
scale_score: bool = True,
|
||||
) -> List[List[Document]]:
|
||||
"""
|
||||
Scan through documents in DocumentStore and return a small number documents
|
||||
Scan through documents in DocumentStore and return a small number of documents
|
||||
that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
|
||||
This method lets you find relevant documents for list of query strings (output: List of Lists of Documents).
|
||||
:param query: The query.
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import copy
|
||||
import json
|
||||
from typing import Set, Union, List, Optional, Dict, Generator, Any
|
||||
|
||||
@ -1181,13 +1182,16 @@ class PineconeDocumentStore(BaseDocumentStore):
|
||||
|
||||
# assign query score to each document
|
||||
scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)}
|
||||
return_documents = []
|
||||
for doc in documents:
|
||||
score = scores_for_vector_ids[doc.id]
|
||||
if scale_score:
|
||||
score = self.scale_to_unit_interval(score, self.similarity)
|
||||
doc.score = score
|
||||
return_document = copy.copy(doc)
|
||||
return_documents.append(return_document)
|
||||
|
||||
return documents
|
||||
return return_documents
|
||||
|
||||
def _get_documents_by_meta(
|
||||
self,
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from copy import deepcopy
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
@ -6,6 +7,7 @@ from rank_bm25 import BM25
|
||||
import numpy as np
|
||||
|
||||
from haystack.document_stores.memory import InMemoryDocumentStore
|
||||
from haystack.nodes import BM25Retriever
|
||||
from haystack.schema import Document
|
||||
from haystack.testing import DocumentStoreBaseTestAbstract
|
||||
|
||||
@ -124,3 +126,17 @@ class TestInMemoryDocumentStore(DocumentStoreBaseTestAbstract):
|
||||
docs = ds.query_by_embedding(query_emb=query_embedding, top_k=1)
|
||||
assert "Skipping some of your documents that don't have embeddings" in caplog.text
|
||||
assert len(docs) == 0
|
||||
|
||||
@pytest.mark.integration
def test_bm25_scores_not_changing_across_queries(self, ds, documents):
    """Test that computed scores which are returned to the user should not change when running multiple queries.

    Every query is run twice: once keeping the returned documents as they are, and once deep-copying
    them immediately after retrieval. The two flattened score lists are then compared, so any score
    on an already-returned document that changes during a later query makes the assertion fail.

    :param ds: Document store fixture that the documents are written to and queried from.
    :param documents: Fixture with the documents to index.
    """
    ds.write_documents(documents)
    retriever = BM25Retriever(ds, scale_score=False)
    queries = [
        "What is a Foo Document?",
        "What is a Bar Document?",
        "Tell me about a document without embeddings",
    ]

    # Hold on to the returned objects themselves while the remaining queries run.
    results_direct = [retriever.retrieve(query) for query in queries]
    # Deep-copy each result right after its own query, before any later query runs.
    results_copied = [deepcopy(retriever.retrieve(query)) for query in queries]

    scores_direct = [rd.score for rds in results_direct for rd in rds]
    scores_copied = [rc.score for rcs in results_copied for rc in rcs]

    assert scores_direct == scores_copied
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user