mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-04 02:57:34 +00:00
Fix scoring in Elasticsearch for dot product (#517)
This commit is contained in:
parent
def8fd617a
commit
db4151bbc0
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@ -24,7 +24,7 @@ jobs:
|
||||
- name: Run Elasticsearch
|
||||
uses: elastic/elastic-github-actions/elasticsearch@25ad91e35aeee806711d335fc9dec7927ae49bc6
|
||||
with:
|
||||
stack-version: 7.6.0
|
||||
stack-version: 7.9.2
|
||||
|
||||
- name: Run Apache Tika
|
||||
run: docker run -d -p 9998:9998 apache/tika:1.24.1
|
||||
|
||||
@ -460,7 +460,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
"script_score": {
|
||||
"query": {"match_all": {}},
|
||||
"script": {
|
||||
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1.0",
|
||||
# offset score to ensure a positive range as required by Elasticsearch
|
||||
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1000",
|
||||
"params": {
|
||||
"query_vector": query_emb.tolist()
|
||||
}
|
||||
@ -497,7 +498,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
score = hit["_score"] if hit["_score"] else None
|
||||
if score:
|
||||
if adapt_score_for_embedding:
|
||||
score -= 1
|
||||
score -= 1000
|
||||
probability = (score + 1) / 2 # scaling probability from cosine similarity
|
||||
else:
|
||||
probability = float(expit(np.asarray(score / 8))) # scaling probability from TFIDF/BM25
|
||||
|
||||
@ -33,7 +33,7 @@ def elasticsearch_fixture():
|
||||
shell=True
|
||||
)
|
||||
status = subprocess.run(
|
||||
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.1'],
|
||||
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'],
|
||||
shell=True
|
||||
)
|
||||
if status.returncode:
|
||||
@ -160,12 +160,12 @@ def document_store(request, test_docs_xs, elasticsearch_fixture):
|
||||
return get_document_store(request.param)
|
||||
|
||||
|
||||
@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
|
||||
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
|
||||
def retriever(request, document_store):
|
||||
return get_retriever(request.param, document_store)
|
||||
|
||||
|
||||
@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
|
||||
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
|
||||
def retriever_with_docs(request, document_store_with_docs):
|
||||
return get_retriever(request.param, document_store_with_docs)
|
||||
|
||||
@ -206,7 +206,7 @@ def get_retriever(retriever_type, document_store):
|
||||
retriever = EmbeddingRetriever(document_store=document_store,
|
||||
embedding_model="deepset/sentence_bert",
|
||||
use_gpu=False)
|
||||
elif retriever_type == "elsticsearch":
|
||||
elif retriever_type == "elasticsearch":
|
||||
retriever = ElasticsearchRetriever(document_store=document_store)
|
||||
elif retriever_type == "es_filter_only":
|
||||
retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
|
||||
|
||||
@ -2,7 +2,7 @@ import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
|
||||
def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
|
||||
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
|
||||
assert res[0].text == "My name is Carla and I live in Berlin"
|
||||
@ -11,7 +11,7 @@ def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
|
||||
def test_elasticsearch_retrieval_filters(retriever_with_docs, document_store_with_docs):
|
||||
res = retriever_with_docs.retrieve(query="Who lives in Berlin?", filters={"name": ["filename1"]})
|
||||
assert res[0].text == "My name is Carla and I live in Berlin"
|
||||
|
||||
@ -61,7 +61,7 @@ def test_eval_reader(reader, document_store: BaseDocumentStore):
|
||||
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("open_domain", [True, False])
|
||||
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
|
||||
def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain, retriever):
|
||||
# add eval data (SQUAD format)
|
||||
document_store.delete_all_documents(index="test_eval_document")
|
||||
@ -81,7 +81,7 @@ def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain,
|
||||
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
|
||||
def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):
|
||||
finder = Finder(reader=reader, retriever=retriever)
|
||||
|
||||
|
||||
@ -71,7 +71,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Recommended: Start Elasticsearch using Docker\n",
|
||||
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
|
||||
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.9.2"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -81,13 +81,13 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
|
||||
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
|
||||
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
|
||||
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
|
||||
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
|
||||
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
|
||||
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from subprocess import Popen, PIPE, STDOUT\n",
|
||||
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
|
||||
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
|
||||
" stdout=PIPE, stderr=STDOUT,\n",
|
||||
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
|
||||
" )\n",
|
||||
|
||||
@ -26,7 +26,7 @@ LAUNCH_ELASTICSEARCH=True
|
||||
if LAUNCH_ELASTICSEARCH:
|
||||
logging.info("Starting Elasticsearch ...")
|
||||
status = subprocess.run(
|
||||
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
|
||||
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
|
||||
)
|
||||
if status.returncode:
|
||||
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
|
||||
|
||||
@ -65,13 +65,13 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
|
||||
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
|
||||
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
|
||||
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
|
||||
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
|
||||
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
|
||||
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from subprocess import Popen, PIPE, STDOUT\n",
|
||||
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
|
||||
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
|
||||
" stdout=PIPE, stderr=STDOUT,\n",
|
||||
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
|
||||
" )\n",
|
||||
|
||||
@ -34,7 +34,7 @@ device, n_gpu = initialize_device_settings(use_cuda=True)
|
||||
if LAUNCH_ELASTICSEARCH:
|
||||
logging.info("Starting Elasticsearch ...")
|
||||
status = subprocess.run(
|
||||
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
|
||||
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
|
||||
)
|
||||
if status.returncode:
|
||||
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user