mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-01 18:29:32 +00:00
fix: changing document scores (#5090)
* #4653: fix changing scores by returning new document objects from document store queries
* Added an integration test for InMemoryDocumentStore demonstrating the desired behavior
* Update test/document_stores/test_memory.py
This commit is contained in:
parent
58c022ef86
commit
60e5d73424
@ -1,3 +1,4 @@
|
|||||||
|
import copy
|
||||||
from typing import Union, List, Optional, Dict, Generator
|
from typing import Union, List, Optional, Dict, Generator
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@ -634,16 +635,18 @@ class FAISSDocumentStore(SQLDocumentStore):
|
|||||||
scores_for_vector_ids: Dict[str, float] = {
|
scores_for_vector_ids: Dict[str, float] = {
|
||||||
str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])
|
str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])
|
||||||
}
|
}
|
||||||
|
return_documents = []
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
score = scores_for_vector_ids[doc.meta["vector_id"]]
|
score = scores_for_vector_ids[doc.meta["vector_id"]]
|
||||||
if scale_score:
|
if scale_score:
|
||||||
score = self.scale_to_unit_interval(score, self.similarity)
|
score = self.scale_to_unit_interval(score, self.similarity)
|
||||||
doc.score = score
|
doc.score = score
|
||||||
|
|
||||||
if return_embedding is True:
|
if return_embedding is True:
|
||||||
doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"]))
|
doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"]))
|
||||||
|
return_document = copy.copy(doc)
|
||||||
|
return_documents.append(return_document)
|
||||||
|
|
||||||
return documents
|
return return_documents
|
||||||
|
|
||||||
def save(self, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None):
|
def save(self, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
import copy
|
||||||
from typing import Any, Dict, List, Optional, Union, Generator, Literal
|
from typing import Any, Dict, List, Optional, Union, Generator, Literal
|
||||||
|
|
||||||
import time
|
import time
|
||||||
@ -959,7 +960,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
|
|||||||
scale_score: bool = True,
|
scale_score: bool = True,
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
"""
|
"""
|
||||||
Scan through documents in DocumentStore and return a small number documents
|
Scan through documents in DocumentStore and return a small number of documents
|
||||||
that are most relevant to the query as defined by the BM25 algorithm.
|
that are most relevant to the query as defined by the BM25 algorithm.
|
||||||
:param query: The query.
|
:param query: The query.
|
||||||
:param top_k: How many documents to return per query.
|
:param top_k: How many documents to return per query.
|
||||||
@ -995,13 +996,13 @@ class InMemoryDocumentStore(KeywordDocumentStore):
|
|||||||
top_docs_positions = np.argsort(docs_scores)[::-1][:top_k]
|
top_docs_positions = np.argsort(docs_scores)[::-1][:top_k]
|
||||||
|
|
||||||
textual_docs_list = [doc for doc in self.indexes[index].values() if doc.content_type in ["text", "table"]]
|
textual_docs_list = [doc for doc in self.indexes[index].values() if doc.content_type in ["text", "table"]]
|
||||||
top_docs = []
|
return_documents = []
|
||||||
for i in top_docs_positions:
|
for i in top_docs_positions:
|
||||||
doc = textual_docs_list[i]
|
doc = textual_docs_list[i]
|
||||||
doc.score = docs_scores[i]
|
doc.score = docs_scores[i]
|
||||||
top_docs.append(doc)
|
return_document = copy.copy(doc)
|
||||||
|
return_documents.append(return_document)
|
||||||
return top_docs
|
return return_documents
|
||||||
|
|
||||||
def query_batch(
|
def query_batch(
|
||||||
self,
|
self,
|
||||||
@ -1015,7 +1016,7 @@ class InMemoryDocumentStore(KeywordDocumentStore):
|
|||||||
scale_score: bool = True,
|
scale_score: bool = True,
|
||||||
) -> List[List[Document]]:
|
) -> List[List[Document]]:
|
||||||
"""
|
"""
|
||||||
Scan through documents in DocumentStore and return a small number documents
|
Scan through documents in DocumentStore and return a small number of documents
|
||||||
that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
|
that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
|
||||||
This method lets you find relevant documents for list of query strings (output: List of Lists of Documents).
|
This method lets you find relevant documents for list of query strings (output: List of Lists of Documents).
|
||||||
:param query: The query.
|
:param query: The query.
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
import copy
|
||||||
import json
|
import json
|
||||||
from typing import Set, Union, List, Optional, Dict, Generator, Any
|
from typing import Set, Union, List, Optional, Dict, Generator, Any
|
||||||
|
|
||||||
@ -1181,13 +1182,16 @@ class PineconeDocumentStore(BaseDocumentStore):
|
|||||||
|
|
||||||
# assign query score to each document
|
# assign query score to each document
|
||||||
scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)}
|
scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)}
|
||||||
|
return_documents = []
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
score = scores_for_vector_ids[doc.id]
|
score = scores_for_vector_ids[doc.id]
|
||||||
if scale_score:
|
if scale_score:
|
||||||
score = self.scale_to_unit_interval(score, self.similarity)
|
score = self.scale_to_unit_interval(score, self.similarity)
|
||||||
doc.score = score
|
doc.score = score
|
||||||
|
return_document = copy.copy(doc)
|
||||||
|
return_documents.append(return_document)
|
||||||
|
|
||||||
return documents
|
return return_documents
|
||||||
|
|
||||||
def _get_documents_by_meta(
|
def _get_documents_by_meta(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pytest
|
import pytest
|
||||||
@ -6,6 +7,7 @@ from rank_bm25 import BM25
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from haystack.document_stores.memory import InMemoryDocumentStore
|
from haystack.document_stores.memory import InMemoryDocumentStore
|
||||||
|
from haystack.nodes import BM25Retriever
|
||||||
from haystack.schema import Document
|
from haystack.schema import Document
|
||||||
from haystack.testing import DocumentStoreBaseTestAbstract
|
from haystack.testing import DocumentStoreBaseTestAbstract
|
||||||
|
|
||||||
@ -124,3 +126,17 @@ class TestInMemoryDocumentStore(DocumentStoreBaseTestAbstract):
|
|||||||
docs = ds.query_by_embedding(query_emb=query_embedding, top_k=1)
|
docs = ds.query_by_embedding(query_emb=query_embedding, top_k=1)
|
||||||
assert "Skipping some of your documents that don't have embeddings" in caplog.text
|
assert "Skipping some of your documents that don't have embeddings" in caplog.text
|
||||||
assert len(docs) == 0
|
assert len(docs) == 0
|
||||||
|
|
||||||
|
@pytest.mark.integration
def test_bm25_scores_not_changing_across_queries(self, ds, documents):
    """
    Test that computed scores which are returned to the user should not change when running multiple queries.

    Every query is retrieved twice: once keeping the returned documents as they are, and once
    deep-copying them right after retrieval, so the copies hold the scores exactly as they were
    when each query returned. The two flattened score lists must be equal.

    :param ds: Document store fixture the documents are written to and the retriever queries.
    :param documents: Documents fixture that is written to the document store.
    """
    ds.write_documents(documents)
    retriever = BM25Retriever(ds, scale_score=False)
    queries = ["What is a Foo Document?", "What is a Bar Document?", "Tell me about a document without embeddings"]

    # Keep the returned objects untouched; their scores are only read after all queries have run.
    results_direct = [retriever.retrieve(query) for query in queries]
    # Snapshot each result list immediately so later queries cannot affect the recorded scores.
    results_copied = [deepcopy(retriever.retrieve(query)) for query in queries]

    scores_direct = [rd.score for rds in results_direct for rd in rds]
    scores_copied = [rc.score for rcs in results_copied for rc in rcs]

    assert scores_direct == scores_copied
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user