mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-04 19:17:26 +00:00
Move retriever probability calculations to document_store (#389)
This commit is contained in:
parent
03fa4a8740
commit
06243dbda4
@ -6,6 +6,7 @@ from typing import List, Optional, Union, Dict, Any
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch.helpers import bulk, scan
|
||||
import numpy as np
|
||||
from scipy.special import expit
|
||||
|
||||
from haystack.document_store.base import BaseDocumentStore
|
||||
from haystack import Document, Label
|
||||
@ -211,7 +212,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
_doc["_id"] = str(_doc.pop("id"))
|
||||
|
||||
# don't index query score and empty fields
|
||||
_ = _doc.pop("query_score", None)
|
||||
_ = _doc.pop("score", None)
|
||||
_ = _doc.pop("probability", None)
|
||||
_doc = {k:v for k,v in _doc.items() if v is not None}
|
||||
|
||||
# In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
|
||||
@ -414,21 +416,31 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
logger.debug(f"Retriever query: {body}")
|
||||
result = self.client.search(index=index, body=body, request_timeout=300)["hits"]["hits"]
|
||||
|
||||
documents = [self._convert_es_hit_to_document(hit, score_adjustment=-1) for hit in result]
|
||||
documents = [self._convert_es_hit_to_document(hit, adapt_score_for_embedding=True) for hit in result]
|
||||
return documents
|
||||
|
||||
def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> Document:
|
||||
def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool = False) -> Document:
|
||||
# We put all additional data of the doc into meta_data and return it in the API
|
||||
meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
|
||||
name = meta_data.pop(self.name_field, None)
|
||||
if name:
|
||||
meta_data["name"] = name
|
||||
|
||||
score = hit["_score"] if hit["_score"] else None
|
||||
if score:
|
||||
if adapt_score_for_embedding:
|
||||
score -= 1
|
||||
probability = (score + 1) / 2 # scaling probability from cosine similarity
|
||||
else:
|
||||
probability = float(expit(np.asarray(score / 8))) # scaling probability from TFIDF/BM25
|
||||
else:
|
||||
probability = None
|
||||
document = Document(
|
||||
id=hit["_id"],
|
||||
text=hit["_source"].get(self.text_field),
|
||||
meta=meta_data,
|
||||
query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
|
||||
score=score,
|
||||
probability=probability,
|
||||
question=hit["_source"].get(self.faq_question_field),
|
||||
embedding=hit["_source"].get(self.embedding_field)
|
||||
)
|
||||
|
||||
@ -159,8 +159,8 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
# assign query score to each document
|
||||
scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])}
|
||||
for doc in documents:
|
||||
doc.query_score = scores_for_vector_ids[doc.meta["vector_id"]] # type: ignore
|
||||
|
||||
doc.score = scores_for_vector_ids[doc.meta["vector_id"]] # type: ignore
|
||||
doc.probability = (doc.score + 1) / 2
|
||||
return documents
|
||||
|
||||
def save(self, file_path: Union[str, Path]):
|
||||
|
||||
@ -85,12 +85,13 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
|
||||
candidate_docs = []
|
||||
for idx, doc in self.indexes[index].items():
|
||||
doc.query_score = dot(query_emb, doc.embedding) / (
|
||||
doc.score = dot(query_emb, doc.embedding) / (
|
||||
norm(query_emb) * norm(doc.embedding)
|
||||
)
|
||||
doc.probability = (doc.score + 1) / 2
|
||||
candidate_docs.append(doc)
|
||||
|
||||
return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]
|
||||
return sorted(candidate_docs, key=lambda x: x.score, reverse=True)[0:top_k]
|
||||
|
||||
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
|
||||
"""
|
||||
|
||||
@ -4,12 +4,9 @@ from statistics import mean
|
||||
from typing import Optional, Dict, Any, List
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from scipy.special import expit
|
||||
|
||||
from haystack.reader.base import BaseReader
|
||||
from haystack.retriever.base import BaseRetriever
|
||||
from haystack import MultiLabel, Document
|
||||
from haystack import MultiLabel
|
||||
from haystack.eval import calculate_average_precision, eval_counts_reader_batch, calculate_reader_metrics, \
|
||||
eval_counts_reader
|
||||
|
||||
@ -99,17 +96,13 @@ class Finder:
|
||||
"answer": doc.text,
|
||||
"document_id": doc.id,
|
||||
"context": doc.text,
|
||||
"score": doc.query_score,
|
||||
"score": doc.score,
|
||||
"probability": doc.probability,
|
||||
"offset_start": 0,
|
||||
"offset_end": len(doc.text),
|
||||
"meta": doc.meta
|
||||
}
|
||||
if self.retriever.embedding_model: # type: ignore
|
||||
probability = (doc.query_score + 1) / 2 # type: ignore
|
||||
else:
|
||||
probability = float(expit(np.asarray(doc.query_score / 8))) # type: ignore
|
||||
|
||||
cur_answer["probability"] = probability
|
||||
results["answers"].append(cur_answer)
|
||||
|
||||
return results
|
||||
|
||||
@ -7,7 +7,8 @@ import numpy as np
|
||||
class Document:
|
||||
def __init__(self, text: str,
|
||||
id: str = None,
|
||||
query_score: Optional[float] = None,
|
||||
score: Optional[float] = None,
|
||||
probability: Optional[float] = None,
|
||||
question: Optional[str] = None,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
embedding: Optional[np.array] = None):
|
||||
@ -21,7 +22,8 @@ class Document:
|
||||
|
||||
:param id: ID used within the DocumentStore
|
||||
:param text: Text of the document
|
||||
:param query_score: Retriever's query score for a retrieved document
|
||||
:param score: Retriever's query score for a retrieved document
|
||||
:param probability: a pseudo probability obtained by scaling the score into the range 0 to 1
|
||||
:param question: Question text for FAQs.
|
||||
:param meta: Meta fields for a document like name, url, or author.
|
||||
:param embedding: Vector encoding of the text
|
||||
@ -34,7 +36,8 @@ class Document:
|
||||
else:
|
||||
self.id = str(uuid4())
|
||||
|
||||
self.query_score = query_score
|
||||
self.score = score
|
||||
self.probability = probability
|
||||
self.question = question
|
||||
self.meta = meta
|
||||
self.embedding = embedding
|
||||
@ -50,7 +53,7 @@ class Document:
|
||||
@classmethod
|
||||
def from_dict(cls, dict, field_map={}):
|
||||
_doc = dict.copy()
|
||||
init_args = ["text", "id", "query_score", "question", "meta", "embedding"]
|
||||
init_args = ["text", "id", "score", "probability", "question", "meta", "embedding"]
|
||||
if "meta" not in _doc.keys():
|
||||
_doc["meta"] = {}
|
||||
# copy additional fields into "meta"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user