chore: remove redundant tests (#3620)

* remove redundant tests

* skip test on win

* fix missing import

* revert mistake

* revert
Massimiliano Pippi 2022-11-25 16:25:21 +01:00 committed by GitHub
parent ed7d03665d
commit c6890c3e86
2 changed files with 16 additions and 457 deletions


@@ -1,3 +1,5 @@
import sys

import pytest
import numpy as np

@@ -460,6 +462,20 @@ class DocumentStoreBaseTestAbstract:
        assert doc.meta["year"] == "2099"
        assert doc.meta["month"] == "12"

    @pytest.mark.integration
    @pytest.mark.skipif(sys.platform == "win32", reason="_get_documents_meta() fails with 'too many SQL variables'")
    def test_get_all_documents_large_quantities(self, ds):
        # Test to exclude situations like Weaviate not returning more than 100 docs by default
        # https://github.com/deepset-ai/haystack/issues/1893
        docs_to_write = [
            {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)}
            for i in range(1000)
        ]
        ds.write_documents(docs_to_write)
        documents = ds.get_all_documents()
        assert all(isinstance(d, Document) for d in documents)
        assert len(documents) == len(docs_to_write)

    #
    # Unit tests
    #
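# [Editor's note: the sketch below is illustrative and not part of this commit.]
# The skipif above exists because SQLite caps the number of bound parameters
# per statement (999 in older builds, such as the one the Windows runner uses),
# so fetching the metadata of 1000 documents through a single "IN (?, ?, ...)"
# query raises "too many SQL variables". One hypothetical workaround is to
# chunk the ids; the table and column names here are made up:

import sqlite3
from typing import List, Tuple

SQLITE_MAX_VARS = 900  # stay safely under the 999-parameter default


def fetch_meta_chunked(conn: sqlite3.Connection, ids: List[str]) -> List[Tuple]:
    """Run SELECT ... WHERE id IN (...) in chunks small enough for SQLite."""
    rows: List[Tuple] = []
    for start in range(0, len(ids), SQLITE_MAX_VARS):
        chunk = ids[start : start + SQLITE_MAX_VARS]
        placeholders = ", ".join("?" for _ in chunk)
        query = f"SELECT document_id, name, value FROM meta WHERE document_id IN ({placeholders})"
        rows.extend(conn.execute(query, chunk).fetchall())
    return rows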


@@ -1,7 +1,5 @@
from copy import deepcopy
import math
import sys
from uuid import uuid4

import numpy as np
import pandas as pd

@@ -62,17 +60,6 @@ DOCUMENTS = [
]


def test_write_with_duplicate_doc_ids(document_store: BaseDocumentStore):
    duplicate_documents = [
        Document(content="Doc1", id_hash_keys=["content"]),
        Document(content="Doc1", id_hash_keys=["content"]),
    ]
    document_store.write_documents(duplicate_documents, duplicate_documents="skip")
    assert len(document_store.get_all_documents()) == 1
    with pytest.raises(Exception):
        document_store.write_documents(duplicate_documents, duplicate_documents="fail")
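# [Editor's note: the sketch below is illustrative and not part of this commit.]
# The removed test exercised the duplicate_documents policy of write_documents:
# "skip" silently drops documents whose id already exists, "fail" raises, and
# "overwrite" replaces the stored copy. A minimal, store-agnostic sketch of
# that dispatch (the helper and its arguments are hypothetical):


def resolve_duplicates(existing_ids: set, incoming: list, duplicate_documents: str = "fail") -> list:
    accepted = {}  # id -> document; reassignment keeps the last write under "overwrite"
    for doc in incoming:
        is_duplicate = doc.id in existing_ids or doc.id in accepted
        if is_duplicate and duplicate_documents == "fail":
            raise ValueError(f"Duplicate document id {doc.id}")
        if is_duplicate and duplicate_documents == "skip":
            continue
        accepted[doc.id] = doc
    return list(accepted.values())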
@pytest.mark.parametrize(
    "document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"], indirect=True
)
@@ -100,299 +87,6 @@ def test_write_with_duplicate_doc_ids_custom_index(document_store: BaseDocumentStore):
    document_store.write_documents(duplicate_documents, duplicate_documents="fail")


def test_get_all_documents_without_filters(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents()
    assert all(isinstance(d, Document) for d in documents)
    assert len(documents) == 5
    assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3", "filename4", "filename5"}
    assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3", "test4", "test5"}


@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Test fails on Windows with an SQLite exception")
def test_get_all_documents_large_quantities(document_store: BaseDocumentStore):
    # Test to exclude situations like Weaviate not returning more than 100 docs by default
    # https://github.com/deepset-ai/haystack/issues/1893
    docs_to_write = [
        {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)}
        for i in range(1000)
    ]
    document_store.write_documents(docs_to_write)
    documents = document_store.get_all_documents()
    assert all(isinstance(d, Document) for d in documents)
    assert len(documents) == len(docs_to_write)
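# [Editor's note: the sketch below is illustrative and not part of this commit.]
# The removed test guarded against backends that cap result sets by default,
# e.g. Weaviate returning at most 100 objects unless an explicit limit is set
# (https://github.com/deepset-ai/haystack/issues/1893). A store-agnostic way
# to drain everything is offset/limit pagination; store.query() below is a
# hypothetical API, not Haystack's:


def iter_all_documents(store, page_size: int = 100):
    offset = 0
    while True:
        page = store.query(limit=page_size, offset=offset)
        if not page:
            return
        yield from page
        offset += len(page)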
def test_get_all_document_filter_duplicate_text_value(document_store: BaseDocumentStore):
    documents = [
        Document(content="Doc1", meta={"meta_field": "0"}, id_hash_keys=["meta"]),
        Document(content="Doc1", meta={"meta_field": "1", "name": "file.txt"}, id_hash_keys=["meta"]),
        Document(content="Doc2", meta={"name": "file_2.txt"}, id_hash_keys=["meta"]),
    ]
    document_store.write_documents(documents)
    documents = document_store.get_all_documents(filters={"meta_field": ["1"]})
    assert documents[0].content == "Doc1"
    assert len(documents) == 1
    assert {d.meta["name"] for d in documents} == {"file.txt"}
    documents = document_store.get_all_documents(filters={"meta_field": ["0"]})
    assert documents[0].content == "Doc1"
    assert len(documents) == 1
    assert documents[0].meta.get("name") is None
    documents = document_store.get_all_documents(filters={"name": ["file_2.txt"]})
    assert documents[0].content == "Doc2"
    assert len(documents) == 1
    assert documents[0].meta.get("meta_field") is None


def test_get_all_documents_with_correct_filters(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
    assert len(documents) == 1
    assert documents[0].meta["name"] == "filename2"
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]})
    assert len(documents) == 2
    assert {d.meta["name"] for d in documents} == {"filename1", "filename3"}
    assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"}


def test_get_all_documents_with_incorrect_filter_name(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents(filters={"incorrect_meta_field": ["test2"]})
    assert len(documents) == 0


def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["incorrect_value"]})
    assert len(documents) == 0
# See test_pinecone.py
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "weaviate", "memory"], indirect=True)
def test_extended_filter(document_store_with_docs):
    # Test comparison operators individually
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}})
    assert len(documents) == 1
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": "test1"})
    assert len(documents) == 1
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$in": ["test1", "test2", "n.a."]}})
    assert len(documents) == 2
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test2", "n.a."]})
    assert len(documents) == 2
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$ne": "test1"}})
    assert len(documents) == 4
    documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$nin": ["test1", "test2", "n.a."]}})
    assert len(documents) == 3
    documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})
    assert len(documents) == 3
    documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})
    assert len(documents) == 4
    documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})
    assert len(documents) == 1
    documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})
    assert len(documents) == 2

    # Test compound filters
    filters = {"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}}
    documents = document_store_with_docs.get_all_documents(filters=filters)
    assert len(documents) == 3

    filters = {
        "$and": {
            "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "name": {"$in": ["filename5", "filename3"]},
        }
    }
    documents = document_store_with_docs.get_all_documents(filters=filters)
    assert len(documents) == 1
    filters_simplified = {
        "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
        "name": ["filename5", "filename3"],
    }
    documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
    # Order of returned documents might differ
    assert len(documents) == len(documents_simplified_filter) and all(
        doc in documents_simplified_filter for doc in documents
    )

    filters = {
        "$and": {
            "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5.0}},
        }
    }
    documents = document_store_with_docs.get_all_documents(filters=filters)
    assert len(documents) == 2
    filters_simplified = {
        "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
        "$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5.0}},
    }
    documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
    assert len(documents) == len(documents_simplified_filter) and all(
        doc in documents_simplified_filter for doc in documents
    )

    filters = {
        "$and": {
            "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "$or": {
                "name": {"$in": ["filename5", "filename3"]},
                "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test2"}}},
            },
        }
    }
    documents = document_store_with_docs.get_all_documents(filters=filters)
    assert len(documents) == 1
    filters_simplified = {
        "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
        "$or": {
            "name": ["filename5", "filename3"],
            "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test2"}},
        },
    }
    documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
    assert len(documents) == len(documents_simplified_filter) and all(
        doc in documents_simplified_filter for doc in documents
    )

    # Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentstore
    filters = {
        "$not": {
            "$or": {
                "$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test3"}},
                "$not": {"date_field": {"$lt": "2020-01-01"}},
            }
        }
    }
    documents = document_store_with_docs.get_all_documents(filters=filters)
    docs_meta = [doc.meta["meta_field"] for doc in documents]
    assert len(documents) == 2
    assert "test3" in docs_meta
    assert "test5" in docs_meta

    # Test same logical operator twice on same level
    filters = {
        "$or": [
            {"$and": {"meta_field": {"$in": ["test1", "test2"]}, "date_field": {"$gte": "2020-01-01"}}},
            {"$and": {"meta_field": {"$in": ["test3", "test4"]}, "date_field": {"$lt": "2020-01-01"}}},
        ]
    }
    documents = document_store_with_docs.get_all_documents(filters=filters)
    docs_meta = [doc.meta["meta_field"] for doc in documents]
    assert len(documents) == 2
    assert "test1" in docs_meta
    assert "test3" in docs_meta
def test_get_document_by_id(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents()
    doc = document_store_with_docs.get_document_by_id(documents[0].id)
    assert doc.id == documents[0].id
    assert doc.content == documents[0].content


def test_get_documents_by_id(document_store: BaseDocumentStore):
    # generate more documents than the elasticsearch default query size limit of 10
    docs_to_generate = 15
    documents = [{"content": "doc-" + str(i)} for i in range(docs_to_generate)]
    document_store.write_documents(documents)
    all_docs = document_store.get_all_documents()
    all_ids = [doc.id for doc in all_docs]
    retrieved_by_id = document_store.get_documents_by_id(all_ids)
    retrieved_ids = [doc.id for doc in retrieved_by_id]
    # all documents in the index should be retrieved when passing all document ids in the index
    assert set(retrieved_ids) == set(all_ids)


def test_get_document_count(document_store: BaseDocumentStore):
    documents = [
        {"content": "text1", "id": "1", "meta_field_for_count": "c"},
        {"content": "text2", "id": "2", "meta_field_for_count": "b"},
        {"content": "text3", "id": "3", "meta_field_for_count": "b"},
        {"content": "text4", "id": "4", "meta_field_for_count": "b"},
    ]
    document_store.write_documents(documents)
    assert document_store.get_document_count() == 4
    assert document_store.get_document_count(filters={"meta_field_for_count": ["c"]}) == 1
    assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3


def test_get_all_documents_generator(document_store: BaseDocumentStore):
    documents = [
        {"content": "text1", "id": "1", "meta_field_for_count": "a"},
        {"content": "text2", "id": "2", "meta_field_for_count": "b"},
        {"content": "text3", "id": "3", "meta_field_for_count": "b"},
        {"content": "text4", "id": "4", "meta_field_for_count": "b"},
        {"content": "text5", "id": "5", "meta_field_for_count": "b"},
    ]
    document_store.write_documents(documents)
    assert len(list(document_store.get_all_documents_generator(batch_size=2))) == 5
@pytest.mark.parametrize("update_existing_documents", [True, False])
def test_update_existing_documents(document_store, update_existing_documents):
original_docs = [{"content": "text1_orig", "id": "1", "meta_field_for_count": "a"}]
updated_docs = [{"content": "text1_new", "id": "1", "meta_field_for_count": "a"}]
document_store.write_documents(original_docs)
assert document_store.get_document_count() == 1
if update_existing_documents:
document_store.write_documents(updated_docs, duplicate_documents="overwrite")
else:
with pytest.raises(Exception):
document_store.write_documents(updated_docs, duplicate_documents="fail")
stored_docs = document_store.get_all_documents()
assert len(stored_docs) == 1
if update_existing_documents:
assert stored_docs[0].content == updated_docs[0]["content"]
else:
assert stored_docs[0].content == original_docs[0]["content"]
def test_write_document_meta(document_store: BaseDocumentStore):
documents = [
{"content": "dict_without_meta", "id": "1"},
{"content": "dict_with_meta", "meta_field": "test2", "name": "filename2", "id": "2"},
Document(content="document_object_without_meta", id="3"),
Document(content="document_object_with_meta", meta={"meta_field": "test4", "name": "filename3"}, id="4"),
]
document_store.write_documents(documents)
documents_in_store = document_store.get_all_documents()
assert len(documents_in_store) == 4
assert not document_store.get_document_by_id("1").meta
assert document_store.get_document_by_id("2").meta["meta_field"] == "test2"
assert not document_store.get_document_by_id("3").meta
assert document_store.get_document_by_id("4").meta["meta_field"] == "test4"
def test_write_document_index(document_store: BaseDocumentStore):
document_store.delete_index("haystack_test_one")
document_store.delete_index("haystack_test_two")
documents = [{"content": "text1", "id": "1"}, {"content": "text2", "id": "2"}]
document_store.write_documents([documents[0]], index="haystack_test_one")
assert len(document_store.get_all_documents(index="haystack_test_one")) == 1
document_store.write_documents([documents[1]], index="haystack_test_two")
assert len(document_store.get_all_documents(index="haystack_test_two")) == 1
assert len(document_store.get_all_documents(index="haystack_test_one")) == 1
assert len(document_store.get_all_documents()) == 0
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True)
def test_document_with_embeddings(document_store: BaseDocumentStore):
documents = [
@@ -574,49 +268,6 @@ def test_update_embeddings_table_text_retriever(document_store, retriever):
    )


def test_delete_all_documents(document_store_with_docs):
    assert len(document_store_with_docs.get_all_documents()) == 5
    document_store_with_docs.delete_documents()
    documents = document_store_with_docs.get_all_documents()
    assert len(documents) == 0


def test_delete_documents(document_store_with_docs):
    assert len(document_store_with_docs.get_all_documents()) == 5
    document_store_with_docs.delete_documents()
    documents = document_store_with_docs.get_all_documents()
    assert len(documents) == 0


def test_delete_documents_with_filters(document_store_with_docs):
    document_store_with_docs.delete_documents(filters={"meta_field": ["test1", "test2", "test4", "test5"]})
    documents = document_store_with_docs.get_all_documents()
    assert len(documents) == 1
    assert documents[0].meta["meta_field"] == "test3"


def test_delete_documents_by_id(document_store_with_docs):
    import logging

    logging.info(len(document_store_with_docs.get_all_documents()))
    docs_to_delete = document_store_with_docs.get_all_documents(
        filters={"meta_field": ["test1", "test2", "test4", "test5"]}
    )
    logging.info(len(docs_to_delete))
    docs_not_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test3"]})
    logging.info(len(docs_not_to_delete))
    document_store_with_docs.delete_documents(ids=[doc.id for doc in docs_to_delete])
    all_docs_left = document_store_with_docs.get_all_documents()
    assert len(all_docs_left) == 1
    assert all_docs_left[0].meta["meta_field"] == "test3"
    all_ids_left = [doc.id for doc in all_docs_left]
    assert all(doc.id in all_ids_left for doc in docs_not_to_delete)


def test_delete_documents_by_id_with_filters(document_store_with_docs):
    docs_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test2"]})
    docs_not_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test3"]})

@@ -631,98 +282,6 @@ def test_delete_documents_by_id_with_filters(document_store_with_docs):
    assert all(doc.id in all_ids_left for doc in docs_not_to_delete)
# exclude weaviate because it does not support storing labels
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "pinecone"], indirect=True)
def test_labels(document_store: BaseDocumentStore):
    label = Label(
        query="question1",
        answer=Answer(
            answer="answer",
            type="extractive",
            score=0.0,
            context="something",
            offsets_in_document=[Span(start=12, end=14)],
            offsets_in_context=[Span(start=12, end=14)],
        ),
        is_correct_answer=True,
        is_correct_document=True,
        document=Document(content="something", id="123"),
        origin="gold-label",
    )
    document_store.write_labels([label])
    labels = document_store.get_all_labels()
    assert len(labels) == 1
    assert label == labels[0]

    # different index
    document_store.write_labels([label], index="another_index")
    labels = document_store.get_all_labels(index="another_index")
    assert len(labels) == 1
    document_store.delete_labels(index="another_index")
    labels = document_store.get_all_labels(index="another_index")
    assert len(labels) == 0
    labels = document_store.get_all_labels()
    assert len(labels) == 1

    # write second label + duplicate
    label2 = Label(
        query="question2",
        answer=Answer(
            answer="another answer",
            type="extractive",
            score=0.0,
            context="something",
            offsets_in_document=[Span(start=12, end=14)],
            offsets_in_context=[Span(start=12, end=14)],
        ),
        is_correct_answer=True,
        is_correct_document=True,
        document=Document(content="something", id="324"),
        origin="gold-label",
    )
    document_store.write_labels([label, label2])
    labels = document_store.get_all_labels()
    # check that second label has been added but not the duplicate
    assert len(labels) == 2
    assert label in labels
    assert label2 in labels

    # delete filtered label2 by id
    document_store.delete_labels(ids=[label2.id])
    labels = document_store.get_all_labels()
    assert label == labels[0]
    assert len(labels) == 1

    # re-add label2
    document_store.write_labels([label2])
    labels = document_store.get_all_labels()
    assert len(labels) == 2

    # delete filtered label2 by query text
    document_store.delete_labels(filters={"query": [label2.query]})
    labels = document_store.get_all_labels()
    assert label == labels[0]
    assert len(labels) == 1

    # re-add label2
    document_store.write_labels([label2])
    labels = document_store.get_all_labels()
    assert len(labels) == 2

    # delete intersection of filters and ids, which is empty
    document_store.delete_labels(ids=[label.id], filters={"query": [label2.query]})
    labels = document_store.get_all_labels()
    assert len(labels) == 2
    assert label in labels
    assert label2 in labels

    # delete all labels
    document_store.delete_labels()
    labels = document_store.get_all_labels()
    assert len(labels) == 0
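# [Editor's note: the sketch below is illustrative and not part of this commit.]
# Writing [label, label2] above adds only label2 because the store detects
# that an identical label is already present. One hypothetical way to get
# that behaviour is content-addressed ids, where re-writing an identical
# label is a no-op (this is not necessarily how Haystack implements it):

import hashlib
import json


def content_id(label_dict: dict) -> str:
    canonical = json.dumps(label_dict, sort_keys=True, default=str)
    return hashlib.md5(canonical.encode("utf-8")).hexdigest()


def write_labels_deduplicated(store: dict, label_dicts: list) -> None:
    for label_dict in label_dicts:
        store.setdefault(content_id(label_dict), label_dict)  # duplicates are no-ops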
@pytest.mark.parametrize("document_store", ["elasticsearch", "opensearch"], indirect=True)
def test_labels_with_long_texts(document_store: BaseDocumentStore):
document_store.delete_index("label")
@ -1084,22 +643,6 @@ def test_multilabel_meta_aggregations(document_store: BaseDocumentStore):
assert multi_label.filters == l.filters
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
def test_update_meta(document_store: BaseDocumentStore):
documents = [
Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "1"}),
Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "2"}),
Document(content="Doc3", meta={"meta_key_1": "3", "meta_key_2": "3"}),
]
document_store.write_documents(documents)
document_2 = document_store.get_all_documents(filters={"meta_key_2": ["2"]})[0]
document_store.update_document_meta(document_2.id, meta={"meta_key_1": "99", "meta_key_2": "2"})
updated_document = document_store.get_document_by_id(document_2.id)
assert len(updated_document.meta.keys()) == 2
assert updated_document.meta["meta_key_1"] == "99"
assert updated_document.meta["meta_key_2"] == "2"
@pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"])
def test_custom_embedding_field(document_store_type, tmp_path):
document_store = get_document_store(