diff --git a/haystack/document_store/elasticsearch.py b/haystack/document_store/elasticsearch.py
index 7c84d92f5..665acba50 100644
--- a/haystack/document_store/elasticsearch.py
+++ b/haystack/document_store/elasticsearch.py
@@ -6,6 +6,7 @@ from typing import List, Optional, Union, Dict, Any
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk, scan
 import numpy as np
+from scipy.special import expit
 
 from haystack.document_store.base import BaseDocumentStore
 from haystack import Document, Label
@@ -211,7 +212,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
             _doc["_id"] = str(_doc.pop("id"))
 
             # don't index query score and empty fields
-            _ = _doc.pop("query_score", None)
+            _ = _doc.pop("score", None)
+            _ = _doc.pop("probability", None)
             _doc = {k:v for k,v in _doc.items() if v is not None}
 
             # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
@@ -414,21 +416,31 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         logger.debug(f"Retriever query: {body}")
         result = self.client.search(index=index, body=body, request_timeout=300)["hits"]["hits"]
 
-        documents = [self._convert_es_hit_to_document(hit, score_adjustment=-1) for hit in result]
+        documents = [self._convert_es_hit_to_document(hit, adapt_score_for_embedding=True) for hit in result]
         return documents
 
-    def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> Document:
+    def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool = False) -> Document:
         # We put all additional data of the doc into meta_data and return it in the API
         meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
         name = meta_data.pop(self.name_field, None)
         if name:
             meta_data["name"] = name
+        score = hit["_score"] if hit["_score"] else None
+        if score:
+            if adapt_score_for_embedding:
+                score -= 1
+                probability = (score + 1) / 2  # scaling probability from cosine similarity
+            else:
+                probability = float(expit(np.asarray(score / 8)))  # scaling probability from TFIDF/BM25
+        else:
+            probability = None
         document = Document(
             id=hit["_id"],
             text=hit["_source"].get(self.text_field),
             meta=meta_data,
-            query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
+            score=score,
+            probability=probability,
             question=hit["_source"].get(self.faq_question_field),
             embedding=hit["_source"].get(self.embedding_field)
         )
diff --git a/haystack/document_store/faiss.py b/haystack/document_store/faiss.py
index 5e718659e..866b6bb45 100644
--- a/haystack/document_store/faiss.py
+++ b/haystack/document_store/faiss.py
@@ -159,8 +159,8 @@ class FAISSDocumentStore(SQLDocumentStore):
         # assign query score to each document
         scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])}
         for doc in documents:
-            doc.query_score = scores_for_vector_ids[doc.meta["vector_id"]]  # type: ignore
-
+            doc.score = scores_for_vector_ids[doc.meta["vector_id"]]  # type: ignore
+            doc.probability = (doc.score + 1) / 2
         return documents
 
     def save(self, file_path: Union[str, Path]):
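The core of this change is that the probability scaling moves out of `Finder` and into the document stores: cosine-similarity scores are bounded in [-1, 1] and get a linear rescaling, while unbounded TF-IDF/BM25 scores go through a sigmoid. Below is a minimal standalone sketch of that logic; the helper name `scale_to_probability` and the `similarity_based` flag are illustrative, not part of this PR:

```python
import numpy as np
from scipy.special import expit


def scale_to_probability(score: float, similarity_based: bool = False) -> float:
    """Map a raw retrieval score onto [0, 1] as a pseudo probability.

    Cosine similarity is already bounded in [-1, 1], so a linear
    shift-and-scale is enough. TF-IDF/BM25 scores are unbounded, so a
    sigmoid squashes them instead; the divisor 8 mirrors the constant
    used in the diff above.
    """
    if similarity_based:
        return (score + 1) / 2
    return float(expit(np.asarray(score / 8)))


print(scale_to_probability(0.6, similarity_based=True))  # (0.6 + 1) / 2 = 0.8
print(scale_to_probability(12.0))                        # expit(1.5) ~ 0.82
```

Since BM25 scores are non-negative, `expit(score / 8)` always lands in [0.5, 1); the divisor 8 keeps typical scores from saturating at 1.0.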
diff --git a/haystack/document_store/memory.py b/haystack/document_store/memory.py
index a13f13d09..d82e93191 100644
--- a/haystack/document_store/memory.py
+++ b/haystack/document_store/memory.py
@@ -85,12 +85,13 @@ class InMemoryDocumentStore(BaseDocumentStore):
 
         candidate_docs = []
         for idx, doc in self.indexes[index].items():
-            doc.query_score = dot(query_emb, doc.embedding) / (
+            doc.score = dot(query_emb, doc.embedding) / (
                 norm(query_emb) * norm(doc.embedding)
             )
+            doc.probability = (doc.score + 1) / 2
             candidate_docs.append(doc)
 
-        return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]
+        return sorted(candidate_docs, key=lambda x: x.score, reverse=True)[0:top_k]
 
     def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
         """
diff --git a/haystack/finder.py b/haystack/finder.py
index eb7b9e145..7bb2c373e 100644
--- a/haystack/finder.py
+++ b/haystack/finder.py
@@ -4,12 +4,9 @@ from statistics import mean
 from typing import Optional, Dict, Any, List
 from collections import defaultdict
 
-import numpy as np
-from scipy.special import expit
-
 from haystack.reader.base import BaseReader
 from haystack.retriever.base import BaseRetriever
-from haystack import MultiLabel, Document
+from haystack import MultiLabel
 from haystack.eval import calculate_average_precision, eval_counts_reader_batch, calculate_reader_metrics, \
     eval_counts_reader
@@ -99,17 +96,13 @@ class Finder:
                 "answer": doc.text,
                 "document_id": doc.id,
                 "context": doc.text,
-                "score": doc.query_score,
+                "score": doc.score,
+                "probability": doc.probability,
                 "offset_start": 0,
                 "offset_end": len(doc.text),
                 "meta": doc.meta
             }
-            if self.retriever.embedding_model:  # type: ignore
-                probability = (doc.query_score + 1) / 2  # type: ignore
-            else:
-                probability = float(expit(np.asarray(doc.query_score / 8)))  # type: ignore
-            cur_answer["probability"] = probability
             results["answers"].append(cur_answer)
 
         return results
diff --git a/haystack/schema.py b/haystack/schema.py
index 449e176ad..a70e1e23b 100644
--- a/haystack/schema.py
+++ b/haystack/schema.py
@@ -7,7 +7,8 @@ import numpy as np
 class Document:
     def __init__(self, text: str,
                  id: str = None,
-                 query_score: Optional[float] = None,
+                 score: Optional[float] = None,
+                 probability: Optional[float] = None,
                  question: Optional[str] = None,
                  meta: Optional[Dict[str, Any]] = None,
                  embedding: Optional[np.array] = None):
@@ -21,7 +22,8 @@
 
         :param id: ID used within the DocumentStore
         :param text: Text of the document
-        :param query_score: Retriever's query score for a retrieved document
+        :param score: Retriever's query score for a retrieved document
+        :param probability: a pseudo probability, derived by scaling the score to the range 0 to 1
         :param question: Question text for FAQs.
         :param meta: Meta fields for a document like name, url, or author.
         :param embedding: Vector encoding of the text
@@ -34,7 +36,8 @@
         else:
             self.id = str(uuid4())
 
-        self.query_score = query_score
+        self.score = score
+        self.probability = probability
         self.question = question
         self.meta = meta
         self.embedding = embedding
@@ -50,7 +53,7 @@
     @classmethod
     def from_dict(cls, dict, field_map={}):
         _doc = dict.copy()
-        init_args = ["text", "id", "query_score", "question", "meta", "embedding"]
+        init_args = ["text", "id", "score", "probability", "question", "meta", "embedding"]
         if "meta" not in _doc.keys():
             _doc["meta"] = {}
         # copy additional fields into "meta"
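With `score` and `probability` now first-class fields on `Document`, both survive the `from_dict` round trip, which previously only whitelisted `query_score`. A quick usage sketch with made-up values (7.3 is a hypothetical raw BM25 score; 0.71 is roughly `expit(7.3 / 8)`):

```python
from haystack import Document

# Construct a Document from a plain dict, as the document stores do internally.
doc = Document.from_dict({
    "id": "42",
    "text": "Haystack stores now return a raw score and a scaled probability.",
    "score": 7.3,          # raw retrieval score from the document store
    "probability": 0.71,   # pseudo probability, already scaled by the store
    "meta": {"name": "example.txt"},
})

assert doc.score == 7.3
assert doc.probability == 0.71
```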