chore: remove redundant tests (#3620)

* remove redundant tests

* skip test on win

* fix missing import

* revert mistake

* revert
Massimiliano Pippi 2022-11-25 16:25:21 +01:00 committed by GitHub
parent ed7d03665d
commit c6890c3e86
2 changed files with 16 additions and 457 deletions


@@ -1,3 +1,5 @@
+import sys
+
 import pytest
 import numpy as np
@@ -460,6 +462,20 @@ class DocumentStoreBaseTestAbstract:
         assert doc.meta["year"] == "2099"
         assert doc.meta["month"] == "12"
 
+    @pytest.mark.integration
+    @pytest.mark.skipif(sys.platform == "win32", reason="_get_documents_meta() fails with 'too many SQL variables'")
+    def test_get_all_documents_large_quantities(self, ds):
+        # Test to exclude situations like Weaviate not returning more than 100 docs by default
+        # https://github.com/deepset-ai/haystack/issues/1893
+        docs_to_write = [
+            {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)}
+            for i in range(1000)
+        ]
+        ds.write_documents(docs_to_write)
+        documents = ds.get_all_documents()
+        assert all(isinstance(d, Document) for d in documents)
+        assert len(documents) == len(docs_to_write)
+
     #
     # Unit tests
     #


@@ -1,7 +1,5 @@
 from copy import deepcopy
 import math
-import sys
-from uuid import uuid4
 
 import numpy as np
 import pandas as pd
@@ -62,17 +60,6 @@ DOCUMENTS = [
 ]
 
 
-def test_write_with_duplicate_doc_ids(document_store: BaseDocumentStore):
-    duplicate_documents = [
-        Document(content="Doc1", id_hash_keys=["content"]),
-        Document(content="Doc1", id_hash_keys=["content"]),
-    ]
-    document_store.write_documents(duplicate_documents, duplicate_documents="skip")
-    assert len(document_store.get_all_documents()) == 1
-    with pytest.raises(Exception):
-        document_store.write_documents(duplicate_documents, duplicate_documents="fail")
-
-
 @pytest.mark.parametrize(
     "document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"], indirect=True
 )
@@ -100,299 +87,6 @@ def test_write_with_duplicate_doc_ids_custom_index(document_store: BaseDocumentStore):
         document_store.write_documents(duplicate_documents, duplicate_documents="fail")
 
 
-def test_get_all_documents_without_filters(document_store_with_docs):
-    documents = document_store_with_docs.get_all_documents()
-    assert all(isinstance(d, Document) for d in documents)
-    assert len(documents) == 5
-    assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3", "filename4", "filename5"}
-    assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3", "test4", "test5"}
-
-
-@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Test fails on Windows with an SQLite exception")
-def test_get_all_documents_large_quantities(document_store: BaseDocumentStore):
-    # Test to exclude situations like Weaviate not returning more than 100 docs by default
-    # https://github.com/deepset-ai/haystack/issues/1893
-    docs_to_write = [
-        {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)}
-        for i in range(1000)
-    ]
-    document_store.write_documents(docs_to_write)
-    documents = document_store.get_all_documents()
-    assert all(isinstance(d, Document) for d in documents)
-    assert len(documents) == len(docs_to_write)
-
-
-def test_get_all_document_filter_duplicate_text_value(document_store: BaseDocumentStore):
-    documents = [
-        Document(content="Doc1", meta={"meta_field": "0"}, id_hash_keys=["meta"]),
-        Document(content="Doc1", meta={"meta_field": "1", "name": "file.txt"}, id_hash_keys=["meta"]),
-        Document(content="Doc2", meta={"name": "file_2.txt"}, id_hash_keys=["meta"]),
-    ]
-    document_store.write_documents(documents)
-
-    documents = document_store.get_all_documents(filters={"meta_field": ["1"]})
-    assert documents[0].content == "Doc1"
-    assert len(documents) == 1
-    assert {d.meta["name"] for d in documents} == {"file.txt"}
-
-    documents = document_store.get_all_documents(filters={"meta_field": ["0"]})
-    assert documents[0].content == "Doc1"
-    assert len(documents) == 1
-    assert documents[0].meta.get("name") is None
-
-    documents = document_store.get_all_documents(filters={"name": ["file_2.txt"]})
-    assert documents[0].content == "Doc2"
-    assert len(documents) == 1
-    assert documents[0].meta.get("meta_field") is None
-
-
-def test_get_all_documents_with_correct_filters(document_store_with_docs):
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
-    assert len(documents) == 1
-    assert documents[0].meta["name"] == "filename2"
-
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]})
-    assert len(documents) == 2
-    assert {d.meta["name"] for d in documents} == {"filename1", "filename3"}
-    assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"}
-
-
-def test_get_all_documents_with_incorrect_filter_name(document_store_with_docs):
-    documents = document_store_with_docs.get_all_documents(filters={"incorrect_meta_field": ["test2"]})
-    assert len(documents) == 0
-
-
-def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs):
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["incorrect_value"]})
-    assert len(documents) == 0
-
-
-# See test_pinecone.py
-@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "weaviate", "memory"], indirect=True)
-def test_extended_filter(document_store_with_docs):
-    # Test comparison operators individually
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}})
-    assert len(documents) == 1
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": "test1"})
-    assert len(documents) == 1
-
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$in": ["test1", "test2", "n.a."]}})
-    assert len(documents) == 2
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test2", "n.a."]})
-    assert len(documents) == 2
-
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$ne": "test1"}})
-    assert len(documents) == 4
-
-    documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$nin": ["test1", "test2", "n.a."]}})
-    assert len(documents) == 3
-
-    documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})
-    assert len(documents) == 3
-    documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})
-    assert len(documents) == 4
-    documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})
-    assert len(documents) == 1
-    documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})
-    assert len(documents) == 2
-
-    # Test compound filters
-    filters = {"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}}
-    documents = document_store_with_docs.get_all_documents(filters=filters)
-    assert len(documents) == 3
-
-    filters = {
-        "$and": {
-            "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
-            "name": {"$in": ["filename5", "filename3"]},
-        }
-    }
-    documents = document_store_with_docs.get_all_documents(filters=filters)
-    assert len(documents) == 1
-    filters_simplified = {
-        "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
-        "name": ["filename5", "filename3"],
-    }
-    documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
-    # Order of returned documents might differ
-    assert len(documents) == len(documents_simplified_filter) and all(
-        doc in documents_simplified_filter for doc in documents
-    )
-
-    filters = {
-        "$and": {
-            "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
-            "$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5.0}},
-        }
-    }
-    documents = document_store_with_docs.get_all_documents(filters=filters)
-    assert len(documents) == 2
-    filters_simplified = {
-        "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
-        "$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5.0}},
-    }
-    documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
-    assert len(documents) == len(documents_simplified_filter) and all(
-        doc in documents_simplified_filter for doc in documents
-    )
-
-    filters = {
-        "$and": {
-            "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
-            "$or": {
-                "name": {"$in": ["filename5", "filename3"]},
-                "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test2"}}},
-            },
-        }
-    }
-    documents = document_store_with_docs.get_all_documents(filters=filters)
-    assert len(documents) == 1
-    filters_simplified = {
-        "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
-        "$or": {
-            "name": ["filename5", "filename3"],
-            "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test2"}},
-        },
-    }
-    documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
-    assert len(documents) == len(documents_simplified_filter) and all(
-        doc in documents_simplified_filter for doc in documents
-    )
-
-    # Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentstore
-    filters = {
-        "$not": {
-            "$or": {
-                "$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test3"}},
-                "$not": {"date_field": {"$lt": "2020-01-01"}},
-            }
-        }
-    }
-    documents = document_store_with_docs.get_all_documents(filters=filters)
-    docs_meta = [doc.meta["meta_field"] for doc in documents]
-    assert len(documents) == 2
-    assert "test3" in docs_meta
-    assert "test5" in docs_meta
-
-    # Test same logical operator twice on same level
-    filters = {
-        "$or": [
-            {"$and": {"meta_field": {"$in": ["test1", "test2"]}, "date_field": {"$gte": "2020-01-01"}}},
-            {"$and": {"meta_field": {"$in": ["test3", "test4"]}, "date_field": {"$lt": "2020-01-01"}}},
-        ]
-    }
-    documents = document_store_with_docs.get_all_documents(filters=filters)
-    docs_meta = [doc.meta["meta_field"] for doc in documents]
-    assert len(documents) == 2
-    assert "test1" in docs_meta
-    assert "test3" in docs_meta
-
-
-def test_get_document_by_id(document_store_with_docs):
-    documents = document_store_with_docs.get_all_documents()
-    doc = document_store_with_docs.get_document_by_id(documents[0].id)
-    assert doc.id == documents[0].id
-    assert doc.content == documents[0].content
-
-
-def test_get_documents_by_id(document_store: BaseDocumentStore):
-    # generate more documents than the elasticsearch default query size limit of 10
-    docs_to_generate = 15
-    documents = [{"content": "doc-" + str(i)} for i in range(docs_to_generate)]
-    document_store.write_documents(documents)
-
-    all_docs = document_store.get_all_documents()
-    all_ids = [doc.id for doc in all_docs]
-
-    retrieved_by_id = document_store.get_documents_by_id(all_ids)
-    retrieved_ids = [doc.id for doc in retrieved_by_id]
-
-    # all documents in the index should be retrieved when passing all document ids in the index
-    assert set(retrieved_ids) == set(all_ids)
-
-
-def test_get_document_count(document_store: BaseDocumentStore):
-    documents = [
-        {"content": "text1", "id": "1", "meta_field_for_count": "c"},
-        {"content": "text2", "id": "2", "meta_field_for_count": "b"},
-        {"content": "text3", "id": "3", "meta_field_for_count": "b"},
-        {"content": "text4", "id": "4", "meta_field_for_count": "b"},
-    ]
-    document_store.write_documents(documents)
-    assert document_store.get_document_count() == 4
-    assert document_store.get_document_count(filters={"meta_field_for_count": ["c"]}) == 1
-    assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3
-
-
-def test_get_all_documents_generator(document_store: BaseDocumentStore):
-    documents = [
-        {"content": "text1", "id": "1", "meta_field_for_count": "a"},
-        {"content": "text2", "id": "2", "meta_field_for_count": "b"},
-        {"content": "text3", "id": "3", "meta_field_for_count": "b"},
-        {"content": "text4", "id": "4", "meta_field_for_count": "b"},
-        {"content": "text5", "id": "5", "meta_field_for_count": "b"},
-    ]
-    document_store.write_documents(documents)
-    assert len(list(document_store.get_all_documents_generator(batch_size=2))) == 5
-
-
-@pytest.mark.parametrize("update_existing_documents", [True, False])
-def test_update_existing_documents(document_store, update_existing_documents):
-    original_docs = [{"content": "text1_orig", "id": "1", "meta_field_for_count": "a"}]
-    updated_docs = [{"content": "text1_new", "id": "1", "meta_field_for_count": "a"}]
-
-    document_store.write_documents(original_docs)
-    assert document_store.get_document_count() == 1
-
-    if update_existing_documents:
-        document_store.write_documents(updated_docs, duplicate_documents="overwrite")
-    else:
-        with pytest.raises(Exception):
-            document_store.write_documents(updated_docs, duplicate_documents="fail")
-
-    stored_docs = document_store.get_all_documents()
-    assert len(stored_docs) == 1
-    if update_existing_documents:
-        assert stored_docs[0].content == updated_docs[0]["content"]
-    else:
-        assert stored_docs[0].content == original_docs[0]["content"]
-
-
-def test_write_document_meta(document_store: BaseDocumentStore):
-    documents = [
-        {"content": "dict_without_meta", "id": "1"},
-        {"content": "dict_with_meta", "meta_field": "test2", "name": "filename2", "id": "2"},
-        Document(content="document_object_without_meta", id="3"),
-        Document(content="document_object_with_meta", meta={"meta_field": "test4", "name": "filename3"}, id="4"),
-    ]
-    document_store.write_documents(documents)
-    documents_in_store = document_store.get_all_documents()
-    assert len(documents_in_store) == 4
-
-    assert not document_store.get_document_by_id("1").meta
-    assert document_store.get_document_by_id("2").meta["meta_field"] == "test2"
-    assert not document_store.get_document_by_id("3").meta
-    assert document_store.get_document_by_id("4").meta["meta_field"] == "test4"
-
-
-def test_write_document_index(document_store: BaseDocumentStore):
-    document_store.delete_index("haystack_test_one")
-    document_store.delete_index("haystack_test_two")
-    documents = [{"content": "text1", "id": "1"}, {"content": "text2", "id": "2"}]
-
-    document_store.write_documents([documents[0]], index="haystack_test_one")
-    assert len(document_store.get_all_documents(index="haystack_test_one")) == 1
-
-    document_store.write_documents([documents[1]], index="haystack_test_two")
-    assert len(document_store.get_all_documents(index="haystack_test_two")) == 1
-    assert len(document_store.get_all_documents(index="haystack_test_one")) == 1
-
-    assert len(document_store.get_all_documents()) == 0
-
-
 @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True)
 def test_document_with_embeddings(document_store: BaseDocumentStore):
     documents = [
@@ -574,49 +268,6 @@ def test_update_embeddings_table_text_retriever(document_store, retriever):
     )
 
 
-def test_delete_all_documents(document_store_with_docs):
-    assert len(document_store_with_docs.get_all_documents()) == 5
-
-    document_store_with_docs.delete_documents()
-    documents = document_store_with_docs.get_all_documents()
-    assert len(documents) == 0
-
-
-def test_delete_documents(document_store_with_docs):
-    assert len(document_store_with_docs.get_all_documents()) == 5
-
-    document_store_with_docs.delete_documents()
-    documents = document_store_with_docs.get_all_documents()
-    assert len(documents) == 0
-
-
-def test_delete_documents_with_filters(document_store_with_docs):
-    document_store_with_docs.delete_documents(filters={"meta_field": ["test1", "test2", "test4", "test5"]})
-    documents = document_store_with_docs.get_all_documents()
-    assert len(documents) == 1
-    assert documents[0].meta["meta_field"] == "test3"
-
-
-def test_delete_documents_by_id(document_store_with_docs):
-    import logging
-
-    logging.info(len(document_store_with_docs.get_all_documents()))
-    docs_to_delete = document_store_with_docs.get_all_documents(
-        filters={"meta_field": ["test1", "test2", "test4", "test5"]}
-    )
-    logging.info(len(docs_to_delete))
-    docs_not_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test3"]})
-    logging.info(len(docs_not_to_delete))
-    document_store_with_docs.delete_documents(ids=[doc.id for doc in docs_to_delete])
-    all_docs_left = document_store_with_docs.get_all_documents()
-    assert len(all_docs_left) == 1
-    assert all_docs_left[0].meta["meta_field"] == "test3"
-
-    all_ids_left = [doc.id for doc in all_docs_left]
-    assert all(doc.id in all_ids_left for doc in docs_not_to_delete)
-
-
 def test_delete_documents_by_id_with_filters(document_store_with_docs):
     docs_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test2"]})
     docs_not_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test3"]})
@@ -631,98 +282,6 @@ def test_delete_documents_by_id_with_filters(document_store_with_docs):
     assert all(doc.id in all_ids_left for doc in docs_not_to_delete)
 
 
-# exclude weaviate because it does not support storing labels
-@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "pinecone"], indirect=True)
-def test_labels(document_store: BaseDocumentStore):
-    label = Label(
-        query="question1",
-        answer=Answer(
-            answer="answer",
-            type="extractive",
-            score=0.0,
-            context="something",
-            offsets_in_document=[Span(start=12, end=14)],
-            offsets_in_context=[Span(start=12, end=14)],
-        ),
-        is_correct_answer=True,
-        is_correct_document=True,
-        document=Document(content="something", id="123"),
-        origin="gold-label",
-    )
-    document_store.write_labels([label])
-    labels = document_store.get_all_labels()
-    assert len(labels) == 1
-    assert label == labels[0]
-
-    # different index
-    document_store.write_labels([label], index="another_index")
-    labels = document_store.get_all_labels(index="another_index")
-    assert len(labels) == 1
-    document_store.delete_labels(index="another_index")
-    labels = document_store.get_all_labels(index="another_index")
-    assert len(labels) == 0
-    labels = document_store.get_all_labels()
-    assert len(labels) == 1
-
-    # write second label + duplicate
-    label2 = Label(
-        query="question2",
-        answer=Answer(
-            answer="another answer",
-            type="extractive",
-            score=0.0,
-            context="something",
-            offsets_in_document=[Span(start=12, end=14)],
-            offsets_in_context=[Span(start=12, end=14)],
-        ),
-        is_correct_answer=True,
-        is_correct_document=True,
-        document=Document(content="something", id="324"),
-        origin="gold-label",
-    )
-    document_store.write_labels([label, label2])
-    labels = document_store.get_all_labels()
-
-    # check that second label has been added but not the duplicate
-    assert len(labels) == 2
-    assert label in labels
-    assert label2 in labels
-
-    # delete filtered label2 by id
-    document_store.delete_labels(ids=[label2.id])
-    labels = document_store.get_all_labels()
-    assert label == labels[0]
-    assert len(labels) == 1
-
-    # re-add label2
-    document_store.write_labels([label2])
-    labels = document_store.get_all_labels()
-    assert len(labels) == 2
-
-    # delete filtered label2 by query text
-    document_store.delete_labels(filters={"query": [label2.query]})
-    labels = document_store.get_all_labels()
-    assert label == labels[0]
-    assert len(labels) == 1
-
-    # re-add label2
-    document_store.write_labels([label2])
-    labels = document_store.get_all_labels()
-    assert len(labels) == 2
-
-    # delete intersection of filters and ids, which is empty
-    document_store.delete_labels(ids=[label.id], filters={"query": [label2.query]})
-    labels = document_store.get_all_labels()
-    assert len(labels) == 2
-    assert label in labels
-    assert label2 in labels
-
-    # delete all labels
-    document_store.delete_labels()
-    labels = document_store.get_all_labels()
-    assert len(labels) == 0
-
-
 @pytest.mark.parametrize("document_store", ["elasticsearch", "opensearch"], indirect=True)
 def test_labels_with_long_texts(document_store: BaseDocumentStore):
     document_store.delete_index("label")
@@ -1084,22 +643,6 @@ def test_multilabel_meta_aggregations(document_store: BaseDocumentStore):
         assert multi_label.filters == l.filters
 
 
-@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
-def test_update_meta(document_store: BaseDocumentStore):
-    documents = [
-        Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "1"}),
-        Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "2"}),
-        Document(content="Doc3", meta={"meta_key_1": "3", "meta_key_2": "3"}),
-    ]
-    document_store.write_documents(documents)
-    document_2 = document_store.get_all_documents(filters={"meta_key_2": ["2"]})[0]
-    document_store.update_document_meta(document_2.id, meta={"meta_key_1": "99", "meta_key_2": "2"})
-    updated_document = document_store.get_document_by_id(document_2.id)
-    assert len(updated_document.meta.keys()) == 2
-    assert updated_document.meta["meta_key_1"] == "99"
-    assert updated_document.meta["meta_key_2"] == "2"
-
-
 @pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"])
 def test_custom_embedding_field(document_store_type, tmp_path):
     document_store = get_document_store(