haystack/test/test_document_store.py
Malte Pietsch e641bff7a6
Allow more options for elasticsearch client (auth, multiple hosts) (#845)
* allow more options for elasticsearch client (auth, multiple hosts)

* Add latest docstring and tutorial changes

* fix mypy

* Add latest docstring and tutorial changes

* test client connection via ping()

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2021-02-19 14:29:59 +01:00

561 lines
22 KiB
Python

import numpy as np
import pytest
from elasticsearch import Elasticsearch
from conftest import get_document_store
from haystack import Document, Label
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
@pytest.mark.elasticsearch
def test_init_elastic_client():
    """Try several host/port/api-key argument combinations for ElasticsearchDocumentStore.

    Each case is a pair of constructor keyword arguments and a flag telling
    whether the constructor is expected to raise.
    """
    cases = [
        # defaults only
        ({}, False),
        # several hosts sharing a single port
        ({"host": ["localhost", "127.0.0.1"], "port": 9200}, False),
        # two hosts but a one-element port list (wrong)
        ({"host": ["localhost", "127.0.0.1"], "port": [9200]}, True),
        # one port per host
        ({"host": ["localhost", "127.0.0.1"], "port": [9200, 9200]}, False),
        # api_key without api_key_id (wrong)
        ({"host": ["localhost"], "port": [9200], "api_key": "test"}, True),
        # api_key together with api_key_id
        ({"host": ["localhost"], "port": [9200], "api_key": "test", "api_key_id": "test"}, False),
    ]

    for kwargs, must_raise in cases:
        if must_raise:
            with pytest.raises(Exception):
                ElasticsearchDocumentStore(**kwargs)
        else:
            ElasticsearchDocumentStore(**kwargs)
@pytest.mark.elasticsearch
def test_get_all_documents_without_filters(document_store_with_docs):
    """An unfiltered fetch yields three Document objects with the expected meta values."""
    fetched = document_store_with_docs.get_all_documents()

    for item in fetched:
        assert isinstance(item, Document)
    assert len(fetched) == 3

    names = {item.meta["name"] for item in fetched}
    assert names == {"filename1", "filename2", "filename3"}

    meta_fields = {item.meta["meta_field"] for item in fetched}
    assert meta_fields == {"test1", "test2", "test3"}
@pytest.mark.elasticsearch
def test_get_all_document_filter_duplicate_value(document_store):
    """Filter on one meta key while the same value also appears under other keys."""
    to_write = [
        Document(text="Doc1", meta={"f1": "0"}),
        Document(text="Doc1", meta={"f1": "1", "meta_id": "0"}),
        Document(text="Doc2", meta={"f3": "0"}),
    ]
    document_store.write_documents(to_write)

    # Only the second document carries f1 == "1".
    matches = document_store.get_all_documents(filters={"f1": ["1"]})
    assert matches[0].text == "Doc1"
    assert len(matches) == 1
    meta_ids = {match.meta["meta_id"] for match in matches}
    assert meta_ids == {"0"}
@pytest.mark.elasticsearch
def test_get_all_documents_with_correct_filters(document_store_with_docs):
    """Filtering on existing meta_field values returns exactly the matching documents."""
    # One allowed value -> one hit.
    single_hit = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
    assert len(single_hit) == 1
    assert single_hit[0].meta["name"] == "filename2"

    # Two allowed values -> two hits.
    two_hits = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]})
    assert len(two_hits) == 2
    found_names = {hit.meta["name"] for hit in two_hits}
    assert found_names == {"filename1", "filename3"}
    found_fields = {hit.meta["meta_field"] for hit in two_hits}
    assert found_fields == {"test1", "test3"}
@pytest.mark.parametrize("document_store_with_docs", ["sql"], indirect=True)
def test_get_all_documents_with_correct_filters_legacy_sqlite(document_store_with_docs):
    """Same filter checks as the generic test, with use_windowed_query switched off on the SQL store."""
    document_store_with_docs.use_windowed_query = False

    # One allowed value -> one hit.
    single_hit = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
    assert len(single_hit) == 1
    assert single_hit[0].meta["name"] == "filename2"

    # Two allowed values -> two hits.
    two_hits = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]})
    assert len(two_hits) == 2
    found_names = {hit.meta["name"] for hit in two_hits}
    assert found_names == {"filename1", "filename3"}
    found_fields = {hit.meta["meta_field"] for hit in two_hits}
    assert found_fields == {"test1", "test3"}
@pytest.mark.elasticsearch
def test_get_all_documents_with_incorrect_filter_name(document_store_with_docs):
    """A filter on a meta key that no document has matches nothing."""
    unknown_key_filter = {"incorrect_meta_field": ["test2"]}
    hits = document_store_with_docs.get_all_documents(filters=unknown_key_filter)
    assert len(hits) == 0
@pytest.mark.elasticsearch
def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs):
    """A filter on an existing meta key with an unused value matches nothing."""
    unknown_value_filter = {"meta_field": ["incorrect_value"]}
    hits = document_store_with_docs.get_all_documents(filters=unknown_value_filter)
    assert len(hits) == 0
@pytest.mark.elasticsearch
def test_get_documents_by_id(document_store_with_docs):
    """Looking a document up by its id returns the same id and text."""
    all_docs = document_store_with_docs.get_all_documents()
    expected = all_docs[0]

    fetched = document_store_with_docs.get_document_by_id(all_docs[0].id)

    assert fetched.id == expected.id
    assert fetched.text == expected.text
@pytest.mark.elasticsearch
def test_get_document_count(document_store):
    """get_document_count() reports totals with and without a meta filter."""
    # One document tagged "a", three tagged "b"; ids and texts are numbered from 1.
    tags = ["a", "b", "b", "b"]
    to_write = [
        {"text": f"text{number}", "id": str(number), "meta_field_for_count": tag}
        for number, tag in enumerate(tags, start=1)
    ]
    document_store.write_documents(to_write)

    total = document_store.get_document_count()
    assert total == 4

    tagged_a = document_store.get_document_count(filters={"meta_field_for_count": ["a"]})
    assert tagged_a == 1

    tagged_b = document_store.get_document_count(filters={"meta_field_for_count": ["b"]})
    assert tagged_b == 3
@pytest.mark.elasticsearch
def test_get_all_documents_generator(document_store):
    """The generator yields all five documents when asked for batches of two."""
    tags = ["a", "b", "b", "b", "b"]
    to_write = [
        {"text": f"text{number}", "id": str(number), "meta_field_for_count": tag}
        for number, tag in enumerate(tags, start=1)
    ]
    document_store.write_documents(to_write)

    # Five documents do not divide evenly into batches of two.
    yielded = list(document_store.get_all_documents_generator(batch_size=2))
    assert len(yielded) == 5
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch", "sql", "faiss", "milvus"], indirect=True)
@pytest.mark.parametrize("update_existing_documents", [True, False])
def test_update_existing_documents(document_store, update_existing_documents):
    """Re-writing a document id overwrites it or raises, depending on update_existing_documents."""
    first_version = [
        {"text": "text1_orig", "id": "1", "meta_field_for_count": "a"},
    ]
    second_version = [
        {"text": "text1_new", "id": "1", "meta_field_for_count": "a"},
    ]

    document_store.update_existing_documents = update_existing_documents
    document_store.write_documents(first_version)
    assert document_store.get_document_count() == 1

    # Second write with the same id: rejected unless updates are enabled.
    if not update_existing_documents:
        with pytest.raises(Exception):
            document_store.write_documents(second_version)
    else:
        document_store.write_documents(second_version)

    stored = document_store.get_all_documents()
    assert len(stored) == 1
    if not update_existing_documents:
        assert stored[0].text == first_version[0]["text"]
    else:
        assert stored[0].text == second_version[0]["text"]
@pytest.mark.elasticsearch
def test_write_document_meta(document_store):
    """Meta data survives a write for both dict inputs and Document inputs."""
    plain_dict = {"text": "dict_without_meta", "id": "1"}
    dict_with_meta = {"text": "dict_with_meta", "meta_field": "test2", "name": "filename2", "id": "2"}
    plain_object = Document(text="document_object_without_meta", id="3")
    object_with_meta = Document(
        text="document_object_with_meta",
        meta={"meta_field": "test4", "name": "filename3"},
        id="4",
    )
    document_store.write_documents([plain_dict, dict_with_meta, plain_object, object_with_meta])

    stored = document_store.get_all_documents()
    assert len(stored) == 4

    # Inputs without meta come back with empty meta; the others keep their meta_field.
    assert not document_store.get_document_by_id("1").meta
    assert document_store.get_document_by_id("2").meta["meta_field"] == "test2"
    assert not document_store.get_document_by_id("3").meta
    assert document_store.get_document_by_id("4").meta["meta_field"] == "test4"
@pytest.mark.elasticsearch
def test_write_document_index(document_store):
    """Documents written to a named index stay in that index only."""
    first_doc = {"text": "text1", "id": "1"}
    second_doc = {"text": "text2", "id": "2"}

    document_store.write_documents([first_doc], index="haystack_test_1")
    in_first = document_store.get_all_documents(index="haystack_test_1")
    assert len(in_first) == 1

    document_store.write_documents([second_doc], index="haystack_test_2")
    in_second = document_store.get_all_documents(index="haystack_test_2")
    assert len(in_second) == 1

    # The first index is unaffected by the second write.
    in_first = document_store.get_all_documents(index="haystack_test_1")
    assert len(in_first) == 1

    # Nothing was written to the default index.
    in_default = document_store.get_all_documents()
    assert len(in_default) == 0
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
def test_document_with_embeddings(document_store):
    """Embeddings given as float32/float64 arrays or plain lists can be written and read back."""
    target_index = "haystack_test_1"

    def random_vector(dtype):
        # 768 random values cast to the requested numpy dtype.
        return np.random.rand(768).astype(dtype)

    to_write = [
        {"text": "text1", "id": "1", "embedding": random_vector(np.float32)},
        {"text": "text2", "id": "2", "embedding": random_vector(np.float64)},
        {"text": "text3", "id": "3", "embedding": random_vector(np.float32).tolist()},
        {"text": "text4", "id": "4", "embedding": random_vector(np.float32)},
    ]
    document_store.write_documents(to_write, index=target_index)
    assert len(document_store.get_all_documents(index=target_index)) == 4

    # return_embedding=False must strip the vector ...
    stripped = document_store.get_all_documents(index=target_index, return_embedding=False)
    assert stripped[0].embedding is None

    # ... and return_embedding=True must hand it back as a list or an array.
    complete = document_store.get_all_documents(index=target_index, return_embedding=True)
    assert isinstance(complete[0].embedding, (list, np.ndarray))
@pytest.mark.parametrize("retriever", ["dpr", "embedding"], indirect=True)
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
def test_update_embeddings(document_store, retriever):
    """Exercise update_embeddings() on the "haystack_test_1" index.

    Covers a full update, an update restricted to documents without
    embeddings, an update restricted by filters, and a forced update of
    all embeddings.
    """
    documents = []
    for i in range(6):
        documents.append({"text": f"text_{i}", "id": str(i), "meta_field": f"value_{i}"})
    # A seventh document that duplicates the text and meta value of document "0".
    documents.append({"text": "text_0", "id": "6", "meta_field": "value_0"})

    document_store.write_documents(documents, index="haystack_test_1")
    document_store.update_embeddings(retriever, index="haystack_test_1", batch_size=3)
    documents = document_store.get_all_documents(index="haystack_test_1", return_embedding=True)
    assert len(documents) == 7
    for doc in documents:
        assert isinstance(doc.embedding, np.ndarray)

    # The two documents with identical text must get (almost) identical embeddings.
    documents = document_store.get_all_documents(
        index="haystack_test_1",
        filters={"meta_field": ["value_0"]},
        return_embedding=True,
    )
    assert len(documents) == 2
    for doc in documents:
        assert doc.meta["meta_field"] == "value_0"
    np.testing.assert_array_almost_equal(documents[0].embedding, documents[1].embedding, decimal=4)

    # Documents with different texts must get different embeddings. The filter
    # matches three documents (two "value_0", one "value_5"), so pick one per
    # meta value instead of relying on the order in which the store returns them.
    documents = document_store.get_all_documents(
        index="haystack_test_1",
        filters={"meta_field": ["value_0", "value_5"]},
        return_embedding=True,
    )
    documents_with_value_0 = [doc for doc in documents if doc.meta["meta_field"] == "value_0"]
    documents_with_value_5 = [doc for doc in documents if doc.meta["meta_field"] == "value_5"]
    np.testing.assert_raises(
        AssertionError,
        np.testing.assert_array_equal,
        documents_with_value_0[0].embedding,
        documents_with_value_5[0].embedding,
    )

    # Add one document that already carries an embedding (of an unrelated
    # string) and three documents without any embedding.
    doc = {
        "text": "text_7",
        "id": "7",
        "meta_field": "value_7",
        "embedding": retriever.embed_queries(texts=["a random string"])[0],
    }
    document_store.write_documents([doc], index="haystack_test_1")

    documents = []
    for i in range(8, 11):
        documents.append({"text": f"text_{i}", "id": str(i), "meta_field": f"value_{i}"})
    document_store.write_documents(documents, index="haystack_test_1")

    doc_before_update = document_store.get_all_documents(
        index="haystack_test_1", filters={"meta_field": ["value_7"]}
    )[0]
    embedding_before_update = doc_before_update.embedding

    # test updating only documents without embeddings
    document_store.update_embeddings(
        retriever, index="haystack_test_1", batch_size=3, update_existing_embeddings=False
    )
    doc_after_update = document_store.get_all_documents(
        index="haystack_test_1", filters={"meta_field": ["value_7"]}
    )[0]
    embedding_after_update = doc_after_update.embedding
    np.testing.assert_array_equal(embedding_before_update, embedding_after_update)

    # test updating with filters: document "7" is outside the filter and must stay untouched
    document_store.update_embeddings(
        retriever, index="haystack_test_1", batch_size=3, filters={"meta_field": ["value_0", "value_1"]}
    )
    doc_after_update = document_store.get_all_documents(
        index="haystack_test_1", filters={"meta_field": ["value_7"]}
    )[0]
    embedding_after_update = doc_after_update.embedding
    np.testing.assert_array_equal(embedding_before_update, embedding_after_update)

    # test update all embeddings: the precomputed embedding of document "7" must now change
    document_store.update_embeddings(
        retriever, index="haystack_test_1", batch_size=3, update_existing_embeddings=True
    )
    doc_after_update = document_store.get_all_documents(
        index="haystack_test_1", filters={"meta_field": ["value_7"]}
    )[0]
    embedding_after_update = doc_after_update.embedding
    np.testing.assert_raises(
        AssertionError, np.testing.assert_array_equal, embedding_before_update, embedding_after_update
    )
@pytest.mark.elasticsearch
def test_delete_all_documents(document_store_with_docs):
    """delete_all_documents() without arguments empties the store."""
    before = document_store_with_docs.get_all_documents()
    assert len(before) == 3

    document_store_with_docs.delete_all_documents()

    after = document_store_with_docs.get_all_documents()
    assert len(after) == 0
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_delete_documents_with_filters(document_store_with_docs):
    """A filtered delete removes only the matching documents."""
    to_remove = {"meta_field": ["test1", "test2"]}
    document_store_with_docs.delete_all_documents(filters=to_remove)

    survivors = document_store_with_docs.get_all_documents()
    assert len(survivors) == 1
    assert survivors[0].meta["meta_field"] == "test3"
@pytest.mark.elasticsearch
def test_labels(document_store):
    """A label written to a named index is found there and not in the default label index."""
    label_index = "haystack_test_label"
    gold = Label(
        question="question",
        answer="answer",
        is_correct_answer=True,
        is_correct_document=True,
        document_id="123",
        offset_start_in_doc=12,
        no_answer=False,
        origin="gold_label",
    )
    document_store.write_labels([gold], index=label_index)

    stored_in_index = document_store.get_all_labels(index=label_index)
    assert len(stored_in_index) == 1

    stored_in_default = document_store.get_all_labels()
    assert len(stored_in_default) == 0
@pytest.mark.elasticsearch
def test_multilabel(document_store):
    """Five labels for one question are aggregated into a single MultiLabel with three answers."""
    label_index = "haystack_test_multilabel"

    def make_label(answer, document_id, offset, is_correct_answer=True, no_answer=False):
        # All labels share the question, document correctness and origin.
        return Label(
            question="question",
            answer=answer,
            is_correct_answer=is_correct_answer,
            is_correct_document=True,
            document_id=document_id,
            offset_start_in_doc=offset,
            no_answer=no_answer,
            origin="gold_label",
        )

    labels = [
        make_label("answer1", "123", 12),
        # different answer in same doc
        make_label("answer2", "123", 42),
        # answer in different doc
        make_label("answer3", "321", 7),
        # 'no answer', should be excluded from MultiLabel
        make_label("", "777", 0, no_answer=True),
        # is_correct_answer=False, should be excluded from MultiLabel
        make_label("answer5", "123", 99, is_correct_answer=False, no_answer=True),
    ]
    document_store.write_labels(labels, index=label_index)

    aggregated = document_store.get_all_labels_aggregated(index=label_index)
    raw_labels = document_store.get_all_labels(index=label_index)

    assert len(aggregated) == 1
    assert len(raw_labels) == 5

    merged = aggregated[0]
    assert len(merged.multiple_answers) == 3
    # Answers, document ids and offsets are parallel lists.
    assert len(merged.multiple_answers) == len(merged.multiple_document_ids)
    assert len(merged.multiple_document_ids) == len(merged.multiple_offset_start_in_docs)

    # The default label index was not written to.
    aggregated = document_store.get_all_labels_aggregated()
    assert len(aggregated) == 0

    # clean up
    document_store.delete_all_documents(index=label_index)
@pytest.mark.elasticsearch
def test_multilabel_no_answer(document_store):
    """'No answer' labels are aggregated into one MultiLabel covering two document ids."""
    label_index = "haystack_test_multilabel_no_answer"

    def no_answer_label(document_id, is_correct_answer=True):
        # Every label here is an empty-answer label at offset 0.
        return Label(
            question="question",
            answer="",
            is_correct_answer=is_correct_answer,
            is_correct_document=True,
            document_id=document_id,
            offset_start_in_doc=0,
            no_answer=True,
            origin="gold_label",
        )

    labels = [
        no_answer_label("777"),
        # no answer in different doc
        no_answer_label("123"),
        # no answer in same doc, should be excluded
        no_answer_label("777"),
        # no answer with is_correct_answer=False, should be excluded
        no_answer_label("321", is_correct_answer=False),
    ]
    document_store.write_labels(labels, index=label_index)

    aggregated = document_store.get_all_labels_aggregated(index=label_index)
    raw_labels = document_store.get_all_labels(index=label_index)

    assert len(aggregated) == 1
    assert len(raw_labels) == 4

    merged = aggregated[0]
    assert len(merged.multiple_document_ids) == 2
    # Answers, document ids and offsets are parallel lists.
    assert len(merged.multiple_answers) == len(merged.multiple_document_ids)
    assert len(merged.multiple_document_ids) == len(merged.multiple_offset_start_in_docs)

    # clean up
    document_store.delete_all_documents(index=label_index)
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "sql"], indirect=True)
def test_update_meta(document_store):
    """update_document_meta() replaces the stored meta values of one document."""
    to_write = [
        Document(text="Doc1", meta={"meta_key_1": "1", "meta_key_2": "1"}),
        Document(text="Doc2", meta={"meta_key_1": "2", "meta_key_2": "2"}),
        Document(text="Doc3", meta={"meta_key_1": "3", "meta_key_2": "3"}),
    ]
    document_store.write_documents(to_write)

    # Locate the second document through its meta value, then change meta_key_1.
    target = document_store.get_all_documents(filters={"meta_key_2": ["2"]})[0]
    new_meta = {"meta_key_1": "99", "meta_key_2": "2"}
    document_store.update_document_meta(target.id, meta=new_meta)

    refreshed = document_store.get_document_by_id(target.id)
    assert len(refreshed.meta.keys()) == 2
    assert refreshed.meta["meta_key_1"] == "99"
    assert refreshed.meta["meta_key_2"] == "2"
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"])
def test_custom_embedding_field(document_store_type):
    """A store configured with a custom embedding field reads the vector from that field."""
    store = get_document_store(
        document_store_type=document_store_type,
        embedding_field="custom_embedding_field",
    )
    vector = np.random.rand(768).astype(np.float32)
    doc_to_write = {"text": "test", "custom_embedding_field": vector}
    store.write_documents([doc_to_write])

    stored = store.get_all_documents(return_embedding=True)
    assert len(stored) == 1
    only_doc = stored[0]
    assert only_doc.text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], stored[0].embedding)
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
def test_get_meta_values_by_key(document_store):
    """get_metadata_values_by_key() returns one bucket per distinct value of the key.

    The returned buckets are checked as a whole (exact set of values) rather
    than one by one, so that an empty or incomplete result fails the test
    instead of passing vacuously.
    """
    documents = [
        Document(
            text="Doc1",
            meta={"meta_key_1": "1", "meta_key_2": "11"}
        ),
        Document(
            text="Doc2",
            meta={"meta_key_1": "2", "meta_key_2": "22"}
        ),
        Document(
            text="Doc3",
            meta={"meta_key_1": "3", "meta_key_2": "33"}
        )
    ]
    document_store.write_documents(documents)

    # test without filters or query
    result = list(document_store.get_metadata_values_by_key(key="meta_key_1"))
    assert len(result) == 3
    assert {bucket["value"] for bucket in result} == {"1", "2", "3"}
    for bucket in result:
        assert bucket["count"] == 1

    # test with filters but no query
    result = list(
        document_store.get_metadata_values_by_key(key="meta_key_1", filters={"meta_key_2": ["11", "22"]})
    )
    assert len(result) == 2
    assert {bucket["value"] for bucket in result} == {"1", "2"}
    for bucket in result:
        assert bucket["count"] == 1

    # test with query but no filters
    result = list(document_store.get_metadata_values_by_key(key="meta_key_1", query="Doc1"))
    assert len(result) == 1
    assert {bucket["value"] for bucket in result} == {"1"}
    for bucket in result:
        assert bucket["count"] == 1
@pytest.mark.elasticsearch
def test_elasticsearch_custom_fields(elasticsearch_fixture):
    """Custom text and embedding field names are honoured by ElasticsearchDocumentStore."""
    # Drop the index first; a missing index (404) is not an error.
    es_client = Elasticsearch()
    es_client.indices.delete(index='haystack_test_custom', ignore=[404])

    store = ElasticsearchDocumentStore(
        index="haystack_test_custom",
        text_field="custom_text_field",
        embedding_field="custom_embedding_field",
    )

    vector = np.random.rand(768).astype(np.float32)
    doc_to_write = {"custom_text_field": "test", "custom_embedding_field": vector}
    store.write_documents([doc_to_write])

    stored = store.get_all_documents(return_embedding=True)
    assert len(stored) == 1
    assert stored[0].text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], stored[0].embedding)