From 05aa42c6875be0f5f692dbca766fc29f99c6c4ad Mon Sep 17 00:00:00 2001
From: Malte Pietsch <malte.pietsch@deepset.ai>
Date: Sun, 22 Mar 2020 18:28:35 +0100
Subject: [PATCH] Fix scaling of pseudo probs for ES scores. Fix filtering of
 embedding retrieval (#46)

---
 haystack/database/elasticsearch.py | 6 +++++-
 haystack/finder.py                 | 7 ++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/haystack/database/elasticsearch.py b/haystack/database/elasticsearch.py
index 5f65587ab..150b68c71 100644
--- a/haystack/database/elasticsearch.py
+++ b/haystack/database/elasticsearch.py
@@ -184,7 +184,11 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
             }
 
             if candidate_doc_ids:
-                body["query"]["bool"]["filter"] = [{"terms": {"_id": candidate_doc_ids}}]
+                body["query"]["script_score"]["query"] = {
+                    "bool": {
+                        "should": [{"match_all": {}}],
+                        "filter": [{"terms": {"_id": candidate_doc_ids}}]
+                }}
 
             if self.excluded_meta_data:
                 body["_source"] = {"excludes": self.excluded_meta_data}
diff --git a/haystack/finder.py b/haystack/finder.py
index a0aa2c92b..5ff2bc48e 100644
--- a/haystack/finder.py
+++ b/haystack/finder.py
@@ -1,4 +1,8 @@
 import logging
+from scipy.special import expit
+import numpy as np
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -101,8 +105,9 @@ class Finder:
                               "probability": (meta["score"]+1)/2, "offset_start": 0, "offset_end": len(answer),
                               "meta": meta}
             else:
+                pseudo_prob = float(expit(np.asarray(meta["score"]) / 8))
                 cur_answer = {"question": meta["question"], "answer": answer, "context": answer, "score": meta["score"],
-                              "probability": meta["score"]/ 10, "offset_start": 0, "offset_end": len(answer), "meta": meta}
+                              "probability": pseudo_prob, "offset_start": 0, "offset_end": len(answer), "meta": meta}
             results["answers"].append(cur_answer)
 
         return results