Fix scoring in Elasticsearch for dot product (#517)

This commit is contained in:
Tanay Soni 2020-10-23 17:50:49 +02:00 committed by GitHub
parent def8fd617a
commit db4151bbc0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 23 additions and 22 deletions

View File

@ -24,7 +24,7 @@ jobs:
- name: Run Elasticsearch
uses: elastic/elastic-github-actions/elasticsearch@25ad91e35aeee806711d335fc9dec7927ae49bc6
with:
stack-version: 7.6.0
stack-version: 7.9.2
- name: Run Apache Tika
run: docker run -d -p 9998:9998 apache/tika:1.24.1

View File

@ -460,7 +460,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
"script_score": {
"query": {"match_all": {}},
"script": {
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1.0",
# offset score to ensure a positive range as required by Elasticsearch
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1000",
"params": {
"query_vector": query_emb.tolist()
}
@ -497,7 +498,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
score = hit["_score"] if hit["_score"] else None
if score:
if adapt_score_for_embedding:
score -= 1
score -= 1000
probability = (score + 1) / 2 # scaling probability from cosine similarity
else:
probability = float(expit(np.asarray(score / 8))) # scaling probability from TFIDF/BM25

View File

@ -33,7 +33,7 @@ def elasticsearch_fixture():
shell=True
)
status = subprocess.run(
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.1'],
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'],
shell=True
)
if status.returncode:
@ -160,12 +160,12 @@ def document_store(request, test_docs_xs, elasticsearch_fixture):
return get_document_store(request.param)
@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
def retriever(request, document_store):
return get_retriever(request.param, document_store)
@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
def retriever_with_docs(request, document_store_with_docs):
return get_retriever(request.param, document_store_with_docs)
@ -206,7 +206,7 @@ def get_retriever(retriever_type, document_store):
retriever = EmbeddingRetriever(document_store=document_store,
embedding_model="deepset/sentence_bert",
use_gpu=False)
elif retriever_type == "elsticsearch":
elif retriever_type == "elasticsearch":
retriever = ElasticsearchRetriever(document_store=document_store)
elif retriever_type == "es_filter_only":
retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)

View File

@ -2,7 +2,7 @@ import pytest
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
assert res[0].text == "My name is Carla and I live in Berlin"
@ -11,7 +11,7 @@ def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
def test_elasticsearch_retrieval_filters(retriever_with_docs, document_store_with_docs):
res = retriever_with_docs.retrieve(query="Who lives in Berlin?", filters={"name": ["filename1"]})
assert res[0].text == "My name is Carla and I live in Berlin"

View File

@ -61,7 +61,7 @@ def test_eval_reader(reader, document_store: BaseDocumentStore):
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("open_domain", [True, False])
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain, retriever):
# add eval data (SQUAD format)
document_store.delete_all_documents(index="test_eval_document")
@ -81,7 +81,7 @@ def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain,
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):
finder = Finder(reader=reader, retriever=retriever)

View File

@ -71,7 +71,7 @@
"outputs": [],
"source": [
"# Recommended: Start Elasticsearch using Docker\n",
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.9.2"
]
},
{
@ -81,13 +81,13 @@
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
"\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
" stdout=PIPE, stderr=STDOUT,\n",
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
" )\n",

View File

@ -26,7 +26,7 @@ LAUNCH_ELASTICSEARCH=True
if LAUNCH_ELASTICSEARCH:
logging.info("Starting Elasticsearch ...")
status = subprocess.run(
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
)
if status.returncode:
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"

View File

@ -65,13 +65,13 @@
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
"\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
" stdout=PIPE, stderr=STDOUT,\n",
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
" )\n",

View File

@ -34,7 +34,7 @@ device, n_gpu = initialize_device_settings(use_cuda=True)
if LAUNCH_ELASTICSEARCH:
logging.info("Starting Elasticsearch ...")
status = subprocess.run(
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
)
if status.returncode:
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"