Move retriever probability calculations to document_store (#389)

Tanay Soni 2020-09-17 16:25:46 +02:00 committed by GitHub
parent 03fa4a8740
commit 06243dbda4
5 changed files with 31 additions and 22 deletions

View File

@@ -6,6 +6,7 @@ from typing import List, Optional, Union, Dict, Any
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk, scan
 import numpy as np
+from scipy.special import expit
 
 from haystack.document_store.base import BaseDocumentStore
 from haystack import Document, Label
@@ -211,7 +212,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
             _doc["_id"] = str(_doc.pop("id"))
 
             # don't index query score and empty fields
-            _ = _doc.pop("query_score", None)
+            _ = _doc.pop("score", None)
+            _ = _doc.pop("probability", None)
             _doc = {k:v for k,v in _doc.items() if v is not None}
 
             # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
@@ -414,21 +416,31 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         logger.debug(f"Retriever query: {body}")
         result = self.client.search(index=index, body=body, request_timeout=300)["hits"]["hits"]
 
-        documents = [self._convert_es_hit_to_document(hit, score_adjustment=-1) for hit in result]
+        documents = [self._convert_es_hit_to_document(hit, adapt_score_for_embedding=True) for hit in result]
         return documents
 
-    def _convert_es_hit_to_document(self, hit: dict, score_adjustment: int = 0) -> Document:
+    def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool = False) -> Document:
         # We put all additional data of the doc into meta_data and return it in the API
         meta_data = {k:v for k,v in hit["_source"].items() if k not in (self.text_field, self.faq_question_field, self.embedding_field)}
         name = meta_data.pop(self.name_field, None)
         if name:
             meta_data["name"] = name
 
+        score = hit["_score"] if hit["_score"] else None
+        if score:
+            if adapt_score_for_embedding:
+                score -= 1
+                probability = (score + 1) / 2  # scaling probability from cosine similarity
+            else:
+                probability = float(expit(np.asarray(score / 8)))  # scaling probability from TFIDF/BM25
+        else:
+            probability = None
+
         document = Document(
             id=hit["_id"],
             text=hit["_source"].get(self.text_field),
             meta=meta_data,
-            query_score=hit["_score"] + score_adjustment if hit["_score"] else None,
+            score=score,
+            probability=probability,
             question=hit["_source"].get(self.faq_question_field),
             embedding=hit["_source"].get(self.embedding_field)
         )
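
For reference, a minimal standalone sketch of the two scalings introduced above. The helper name scale_to_probability is illustrative, not part of the commit; note that the embedding branch in the diff first subtracts the +1 offset that the Elasticsearch script_score query adds to keep scores non-negative, before applying the cosine mapping.

import numpy as np
from scipy.special import expit

def scale_to_probability(score: float, from_cosine: bool) -> float:
    """Map a raw retriever score to a pseudo probability in [0, 1]."""
    if from_cosine:
        # cosine similarity lies in [-1, 1]; shift and halve to reach [0, 1]
        return (score + 1) / 2
    # BM25/TF-IDF scores are unbounded; squash them with a sigmoid,
    # dividing by 8 (the constant used in this commit) to soften the slope
    return float(expit(np.asarray(score / 8)))

print(scale_to_probability(1.0, from_cosine=True))    # 1.0
print(scale_to_probability(10.0, from_cosine=False))  # ~0.78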

View File

@@ -159,8 +159,8 @@ class FAISSDocumentStore(SQLDocumentStore):
         # assign query score to each document
         scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])}
         for doc in documents:
-            doc.query_score = scores_for_vector_ids[doc.meta["vector_id"]]  # type: ignore
-
+            doc.score = scores_for_vector_ids[doc.meta["vector_id"]]  # type: ignore
+            doc.probability = (doc.score + 1) / 2
         return documents
 
     def save(self, file_path: Union[str, Path]):
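
The score lookup above keys FAISS results by vector id; a runnable sketch with made-up FAISS output showing the same dict construction and the (score + 1) / 2 rescaling, which is valid when the index returns cosine-like similarities in [-1, 1], e.g. inner product over normalized embeddings:

import numpy as np

# Made-up FAISS search output for a single query:
# top-k vector ids and their inner-product scores
vector_id_matrix = np.array([[0, 1]])
score_matrix = np.array([[0.91, 0.34]])

scores_for_vector_ids = {str(v_id): s for v_id, s in zip(vector_id_matrix[0], score_matrix[0])}
print({v: (s, (s + 1) / 2) for v, s in scores_for_vector_ids.items()})
# {'0': (0.91, 0.955), '1': (0.34, 0.67)}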

View File

@@ -85,12 +85,13 @@ class InMemoryDocumentStore(BaseDocumentStore):
         candidate_docs = []
         for idx, doc in self.indexes[index].items():
-            doc.query_score = dot(query_emb, doc.embedding) / (
+            doc.score = dot(query_emb, doc.embedding) / (
                 norm(query_emb) * norm(doc.embedding)
             )
+            doc.probability = (doc.score + 1) / 2
             candidate_docs.append(doc)
 
-        return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]
+        return sorted(candidate_docs, key=lambda x: x.score, reverse=True)[0:top_k]
 
     def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
         """

View File

@@ -4,12 +4,9 @@ from statistics import mean
 from typing import Optional, Dict, Any, List
 from collections import defaultdict
 
-import numpy as np
-from scipy.special import expit
-
 from haystack.reader.base import BaseReader
 from haystack.retriever.base import BaseRetriever
-from haystack import MultiLabel, Document
+from haystack import MultiLabel
 from haystack.eval import calculate_average_precision, eval_counts_reader_batch, calculate_reader_metrics, \
     eval_counts_reader
@@ -99,17 +96,13 @@ class Finder:
                 "answer": doc.text,
                 "document_id": doc.id,
                 "context": doc.text,
-                "score": doc.query_score,
+                "score": doc.score,
+                "probability": doc.probability,
                 "offset_start": 0,
                 "offset_end": len(doc.text),
                 "meta": doc.meta
             }
-            if self.retriever.embedding_model:  # type: ignore
-                probability = (doc.query_score + 1) / 2  # type: ignore
-            else:
-                probability = float(expit(np.asarray(doc.query_score / 8)))  # type: ignore
-            cur_answer["probability"] = probability
             results["answers"].append(cur_answer)
 
         return results
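
After this change the Finder no longer branches on the retriever type to compute a probability; it simply copies the values the document store already set. A minimal illustration with a hand-built Document (the field values are made up):

from haystack import Document

doc = Document(text="Berlin is the capital of Germany.", score=0.42, probability=0.71)
cur_answer = {
    "answer": doc.text,
    "score": doc.score,              # raw retriever score, set by the document store
    "probability": doc.probability,  # [0, 1] rescaling, also set by the document store
}
print(cur_answer)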

View File

@@ -7,7 +7,8 @@ import numpy as np
 class Document:
     def __init__(self, text: str,
                  id: str = None,
-                 query_score: Optional[float] = None,
+                 score: Optional[float] = None,
+                 probability: Optional[float] = None,
                  question: Optional[str] = None,
                  meta: Optional[Dict[str, Any]] = None,
                  embedding: Optional[np.array] = None):
@@ -21,7 +22,8 @@ class Document:
         :param id: ID used within the DocumentStore
         :param text: Text of the document
-        :param query_score: Retriever's query score for a retrieved document
+        :param score: Retriever's query score for a retrieved document
+        :param probability: a pseudo probability obtained by scaling the score to the range 0 to 1
         :param question: Question text for FAQs.
         :param meta: Meta fields for a document like name, url, or author.
         :param embedding: Vector encoding of the text
@@ -34,7 +36,8 @@ class Document:
         else:
             self.id = str(uuid4())
 
-        self.query_score = query_score
+        self.score = score
+        self.probability = probability
         self.question = question
         self.meta = meta
         self.embedding = embedding
@@ -50,7 +53,7 @@ class Document:
     @classmethod
     def from_dict(cls, dict, field_map={}):
         _doc = dict.copy()
-        init_args = ["text", "id", "query_score", "question", "meta", "embedding"]
+        init_args = ["text", "id", "score", "probability", "question", "meta", "embedding"]
         if "meta" not in _doc.keys():
             _doc["meta"] = {}
 
         # copy additional fields into "meta"
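
Since score and probability are now first-class init args listed in init_args, they survive a from_dict round trip; a small sketch with made-up values:

from haystack import Document

d = Document.from_dict({"text": "some text", "score": 0.9, "probability": 0.95})
print(d.score, d.probability)  # 0.9 0.95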