From 05aa42c6875be0f5f692dbca766fc29f99c6c4ad Mon Sep 17 00:00:00 2001 From: Malte Pietsch Date: Sun, 22 Mar 2020 18:28:35 +0100 Subject: [PATCH] fix scaling of pseudo probs for es scores. fix filtering of embedding retrieval(#46) --- haystack/database/elasticsearch.py | 6 +++++- haystack/finder.py | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/haystack/database/elasticsearch.py b/haystack/database/elasticsearch.py index 5f65587ab..150b68c71 100644 --- a/haystack/database/elasticsearch.py +++ b/haystack/database/elasticsearch.py @@ -184,7 +184,11 @@ class ElasticsearchDocumentStore(BaseDocumentStore): } if candidate_doc_ids: - body["query"]["bool"]["filter"] = [{"terms": {"_id": candidate_doc_ids}}] + body["query"]["script_score"]["query"] = { + "bool": { + "should": [{"match_all": {}}], + "filter": [{"terms": {"_id": candidate_doc_ids}}] + }} if self.excluded_meta_data: body["_source"] = {"excludes": self.excluded_meta_data} diff --git a/haystack/finder.py b/haystack/finder.py index a0aa2c92b..5ff2bc48e 100644 --- a/haystack/finder.py +++ b/haystack/finder.py @@ -1,4 +1,8 @@ import logging +from scipy.special import expit +import numpy as np + + logger = logging.getLogger(__name__) @@ -101,8 +105,9 @@ class Finder: "probability": (meta["score"]+1)/2, "offset_start": 0, "offset_end": len(answer), "meta": meta} else: + pseudo_prob = float(expit(np.asarray(meta["score"]) / 8)) cur_answer = {"question": meta["question"], "answer": answer, "context": answer, "score": meta["score"], - "probability": meta["score"]/ 10, "offset_start": 0, "offset_end": len(answer), "meta": meta} + "probability": pseudo_prob, "offset_start": 0, "offset_end": len(answer), "meta": meta} results["answers"].append(cur_answer) return results