Move retriever probability calculations to document_store (#389)

Tanay Soni 2020-09-17 16:25:46 +02:00 committed by GitHub
parent 03fa4a8740
commit 06243dbda4
5 changed files with 31 additions and 22 deletions

View File

@@ -6,6 +6,7 @@ from typing import List, Optional, Union, Dict, Any
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk, scan
 import numpy as np
+from scipy.special import expit
 
 from haystack.document_store.base import BaseDocumentStore
 from haystack import Document, Label
@@ -211,7 +212,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
             _doc["_id"] = str(_doc.pop("id"))
 
             # don't index query score and empty fields
-            _ = _doc.pop("query_score", None)
+            _ = _doc.pop("score", None)
+            _ = _doc.pop("probability", None)
             _doc = {k:v for k,v in _doc.items() if v is not None}
 
             # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
@@ -414,21 +416,31 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         logger.debug(f"Retriever query: {body}")
         result = self.client.search(index=index, body=body, request_timeout=300)["hits"]["hits"]
 
-        documents = [self._convert_es_hit_to_document(hit, score_adjustment=-1) for hit in result]
+        documents = [self._convert_es_hit_to_document(hit, adapt_score_for_embedding=True) for hit in result]
         return documents
 
-    def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> Document:
+    def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool = False) -> Document:
         # We put all additional data of the doc into meta_data and return it in the API
         meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
         name = meta_data.pop(self.name_field, None)
         if name:
             meta_data["name"] = name
 
+        score = hit["_score"] if hit["_score"] else None
+        if score:
+            if adapt_score_for_embedding:
+                score -= 1
+                probability = (score + 1) / 2  # scaling probability from cosine similarity
+            else:
+                probability = float(expit(np.asarray(score / 8)))  # scaling probability from TFIDF/BM25
+        else:
+            probability = None
+
         document = Document(
             id=hit["_id"],
             text=hit["_source"].get(self.text_field),
             meta=meta_data,
-            query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
+            score=score,
+            probability=probability,
             question=hit["_source"].get(self.faq_question_field),
             embedding=hit["_source"].get(self.embedding_field)
         )
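
For reference, a minimal standalone sketch of the two scalings introduced above. The helper name scale_to_probability is illustrative, not part of the commit; note that the embedding branch in the diff first subtracts the +1 offset that the Elasticsearch script_score query adds to keep scores non-negative, before applying the cosine mapping.

import numpy as np
from scipy.special import expit

def scale_to_probability(score: float, from_cosine: bool) -> float:
    """Map a raw retriever score to a pseudo probability in [0, 1]."""
    if from_cosine:
        # cosine similarity lies in [-1, 1]; shift and halve to reach [0, 1]
        return (score + 1) / 2
    # BM25/TF-IDF scores are unbounded; squash them with a sigmoid,
    # dividing by 8 (the constant used in this commit) to soften the slope
    return float(expit(np.asarray(score / 8)))

print(scale_to_probability(1.0, from_cosine=True))    # 1.0
print(scale_to_probability(10.0, from_cosine=False))  # ~0.78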

View File

@@ -159,8 +159,8 @@ class FAISSDocumentStore(SQLDocumentStore):
         # assign query score to each document
         scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])}
         for doc in documents:
-            doc.query_score = scores_for_vector_ids[doc.meta["vector_id"]]  # type: ignore
-
+            doc.score = scores_for_vector_ids[doc.meta["vector_id"]]  # type: ignore
+            doc.probability = (doc.score + 1) / 2
         return documents
 
     def save(self, file_path: Union[str, Path]):
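
The score lookup above keys FAISS results by vector id; a runnable sketch with made-up FAISS output showing the same dict construction and the (score + 1) / 2 rescaling, which is valid when the index returns cosine-like similarities in [-1, 1], e.g. inner product over normalized embeddings:

import numpy as np

# Made-up FAISS search output for a single query:
# top-k vector ids and their inner-product scores
vector_id_matrix = np.array([[0, 1]])
score_matrix = np.array([[0.91, 0.34]])

scores_for_vector_ids = {str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])}
print({v: (s, (s + 1) / 2) for v, s in scores_for_vector_ids.items()})
# {'0': (0.91, 0.955), '1': (0.34, 0.67)}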

View File

@@ -85,12 +85,13 @@ class InMemoryDocumentStore(BaseDocumentStore):
         candidate_docs = []
         for idx, doc in self.indexes[index].items():
-            doc.query_score = dot(query_emb, doc.embedding) / (
+            doc.score = dot(query_emb, doc.embedding) / (
                 norm(query_emb) * norm(doc.embedding)
             )
+            doc.probability = (doc.score + 1) / 2
             candidate_docs.append(doc)
 
-        return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]
+        return sorted(candidate_docs, key=lambda x: x.score, reverse=True)[0:top_k]
 
     def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
         """

View File

@@ -4,12 +4,9 @@ from statistics import mean
 from typing import Optional, Dict, Any, List
 from collections import defaultdict
 
-import numpy as np
-from scipy.special import expit
-
 from haystack.reader.base import BaseReader
 from haystack.retriever.base import BaseRetriever
-from haystack import MultiLabel, Document
+from haystack import MultiLabel
 from haystack.eval import calculate_average_precision, eval_counts_reader_batch, calculate_reader_metrics, \
     eval_counts_reader
@@ -99,17 +96,13 @@ class Finder:
                 "answer": doc.text,
                 "document_id": doc.id,
                 "context": doc.text,
-                "score": doc.query_score,
+                "score": doc.score,
+                "probability": doc.probability,
                 "offset_start": 0,
                 "offset_end": len(doc.text),
                 "meta": doc.meta
             }
-            if self.retriever.embedding_model:  # type: ignore
-                probability = (doc.query_score + 1) / 2  # type: ignore
-            else:
-                probability = float(expit(np.asarray(doc.query_score / 8)))  # type: ignore
-            cur_answer["probability"] = probability
             results["answers"].append(cur_answer)
 
         return results
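
After this change the Finder no longer branches on the retriever type to compute a probability; it simply copies the values the document store already set. A minimal illustration with a hand-built Document (the field values are made up):

from haystack import Document

doc = Document(text="Berlin is the capital of Germany.", score=0.42, probability=0.71)
cur_answer = {
    "answer": doc.text,
    "score": doc.score,              # raw retriever score, set by the document store
    "probability": doc.probability,  # [0, 1] rescaling, also set by the document store
}
print(cur_answer)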

View File

@@ -7,7 +7,8 @@ import numpy as np
 class Document:
     def __init__(self, text: str,
                  id: str = None,
-                 query_score: Optional[float] = None,
+                 score: Optional[float] = None,
+                 probability: Optional[float] = None,
                  question: Optional[str] = None,
                  meta: Optional[Dict[str, Any]] = None,
                  embedding: Optional[np.array] = None):
@@ -21,7 +22,8 @@ class Document:
         :param id: ID used within the DocumentStore
         :param text: Text of the document
-        :param query_score: Retriever's query score for a retrieved document
+        :param score: Retriever's query score for a retrieved document
+        :param probability: a pseudo probability obtained by scaling the score to the range 0 to 1
         :param question: Question text for FAQs.
         :param meta: Meta fields for a document like name, url, or author.
         :param embedding: Vector encoding of the text
@@ -34,7 +36,8 @@ class Document:
         else:
             self.id = str(uuid4())
 
-        self.query_score = query_score
+        self.score = score
+        self.probability = probability
         self.question = question
         self.meta = meta
         self.embedding = embedding
@@ -50,7 +53,7 @@ class Document:
     @classmethod
     def from_dict(cls, dict, field_map={}):
         _doc = dict.copy()
-        init_args = ["text", "id", "query_score", "question", "meta", "embedding"]
+        init_args = ["text", "id", "score", "probability", "question", "meta", "embedding"]
         if "meta" not in _doc.keys():
             _doc["meta"] = {}
 
         # copy additional fields into "meta"
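
Since score and probability are now first-class init args listed in init_args, they survive a from_dict round trip; a small sketch with made-up values:

from haystack import Document

d = Document.from_dict({"text": "some text", "score": 0.9, "probability": 0.95})
print(d.score, d.probability)  # 0.9 0.95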