From c6890c3e867cc5fdcf64431a5c4422764845d995 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 25 Nov 2022 16:25:21 +0100 Subject: [PATCH] chore: remove redundant tests (#3620) * remove redundant tests * skip test on win * fix missing import * revert mistake * revert --- test/document_stores/test_base.py | 16 + test/document_stores/test_document_store.py | 457 -------------------- 2 files changed, 16 insertions(+), 457 deletions(-) diff --git a/test/document_stores/test_base.py b/test/document_stores/test_base.py index b398aaa31..f881097f4 100644 --- a/test/document_stores/test_base.py +++ b/test/document_stores/test_base.py @@ -1,3 +1,5 @@ +import sys + import pytest import numpy as np @@ -460,6 +462,20 @@ class DocumentStoreBaseTestAbstract: assert doc.meta["year"] == "2099" assert doc.meta["month"] == "12" + @pytest.mark.integration + @pytest.mark.skipif(sys.platform == "win32", reason="_get_documents_meta() fails with 'too many SQL variables'") + def test_get_all_documents_large_quantities(self, ds): + # Test to exclude situations like Weaviate not returning more than 100 docs by default + # https://github.com/deepset-ai/haystack/issues/1893 + docs_to_write = [ + {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)} + for i in range(1000) + ] + ds.write_documents(docs_to_write) + documents = ds.get_all_documents() + assert all(isinstance(d, Document) for d in documents) + assert len(documents) == len(docs_to_write) + # # Unit tests # diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py index 73cd22c02..f76339a32 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -1,7 +1,5 @@ from copy import deepcopy import math -import sys -from uuid import uuid4 import numpy as np import pandas as pd @@ -62,17 +60,6 @@ DOCUMENTS = [ ] -def test_write_with_duplicate_doc_ids(document_store: BaseDocumentStore): - duplicate_documents = [ - Document(content="Doc1", id_hash_keys=["content"]), - Document(content="Doc1", id_hash_keys=["content"]), - ] - document_store.write_documents(duplicate_documents, duplicate_documents="skip") - assert len(document_store.get_all_documents()) == 1 - with pytest.raises(Exception): - document_store.write_documents(duplicate_documents, duplicate_documents="fail") - - @pytest.mark.parametrize( "document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"], indirect=True ) @@ -100,299 +87,6 @@ def test_write_with_duplicate_doc_ids_custom_index(document_store: BaseDocumentS document_store.write_documents(duplicate_documents, duplicate_documents="fail") -def test_get_all_documents_without_filters(document_store_with_docs): - documents = document_store_with_docs.get_all_documents() - assert all(isinstance(d, Document) for d in documents) - assert len(documents) == 5 - assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3", "filename4", "filename5"} - assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3", "test4", "test5"} - - -@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Test fails on Windows with an SQLite exception") -def test_get_all_documents_large_quantities(document_store: BaseDocumentStore): - # Test to exclude situations like Weaviate not returning more than 100 docs by default - # https://github.com/deepset-ai/haystack/issues/1893 - docs_to_write = [ - {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)} - for i in range(1000) - ] - document_store.write_documents(docs_to_write) - documents = document_store.get_all_documents() - assert all(isinstance(d, Document) for d in documents) - assert len(documents) == len(docs_to_write) - - -def test_get_all_document_filter_duplicate_text_value(document_store: BaseDocumentStore): - documents = [ - Document(content="Doc1", meta={"meta_field": "0"}, id_hash_keys=["meta"]), - Document(content="Doc1", meta={"meta_field": "1", "name": "file.txt"}, id_hash_keys=["meta"]), - Document(content="Doc2", meta={"name": "file_2.txt"}, id_hash_keys=["meta"]), - ] - document_store.write_documents(documents) - documents = document_store.get_all_documents(filters={"meta_field": ["1"]}) - assert documents[0].content == "Doc1" - assert len(documents) == 1 - assert {d.meta["name"] for d in documents} == {"file.txt"} - - documents = document_store.get_all_documents(filters={"meta_field": ["0"]}) - assert documents[0].content == "Doc1" - assert len(documents) == 1 - assert documents[0].meta.get("name") is None - - documents = document_store.get_all_documents(filters={"name": ["file_2.txt"]}) - assert documents[0].content == "Doc2" - assert len(documents) == 1 - assert documents[0].meta.get("meta_field") is None - - -def test_get_all_documents_with_correct_filters(document_store_with_docs): - documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]}) - assert len(documents) == 1 - assert documents[0].meta["name"] == "filename2" - - documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]}) - assert len(documents) == 2 - assert {d.meta["name"] for d in documents} == {"filename1", "filename3"} - assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"} - - -def test_get_all_documents_with_incorrect_filter_name(document_store_with_docs): - documents = document_store_with_docs.get_all_documents(filters={"incorrect_meta_field": ["test2"]}) - assert len(documents) == 0 - - -def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs): - documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["incorrect_value"]}) - assert len(documents) == 0 - - -# See test_pinecone.py -@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "weaviate", "memory"], indirect=True) -def test_extended_filter(document_store_with_docs): - # Test comparison operators individually - documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}}) - assert len(documents) == 1 - documents = document_store_with_docs.get_all_documents(filters={"meta_field": "test1"}) - assert len(documents) == 1 - - documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$in": ["test1", "test2", "n.a."]}}) - assert len(documents) == 2 - documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test2", "n.a."]}) - assert len(documents) == 2 - - documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$ne": "test1"}}) - assert len(documents) == 4 - - documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$nin": ["test1", "test2", "n.a."]}}) - assert len(documents) == 3 - - documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}}) - assert len(documents) == 3 - - documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}}) - assert len(documents) == 4 - - documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}}) - assert len(documents) == 1 - - documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}}) - assert len(documents) == 2 - - # Test compound filters - filters = {"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}} - documents = document_store_with_docs.get_all_documents(filters=filters) - assert len(documents) == 3 - - filters = { - "$and": { - "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, - "name": {"$in": ["filename5", "filename3"]}, - } - } - documents = document_store_with_docs.get_all_documents(filters=filters) - assert len(documents) == 1 - filters_simplified = { - "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, - "name": ["filename5", "filename3"], - } - documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified) - # Order of returned documents might differ - assert len(documents) == len(documents_simplified_filter) and all( - doc in documents_simplified_filter for doc in documents - ) - - filters = { - "$and": { - "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, - "$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5.0}}, - } - } - documents = document_store_with_docs.get_all_documents(filters=filters) - assert len(documents) == 2 - filters_simplified = { - "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, - "$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5.0}}, - } - documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified) - assert len(documents) == len(documents_simplified_filter) and all( - doc in documents_simplified_filter for doc in documents - ) - - filters = { - "$and": { - "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, - "$or": { - "name": {"$in": ["filename5", "filename3"]}, - "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test2"}}}, - }, - } - } - documents = document_store_with_docs.get_all_documents(filters=filters) - assert len(documents) == 1 - filters_simplified = { - "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, - "$or": { - "name": ["filename5", "filename3"], - "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test2"}}, - }, - } - documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified) - assert len(documents) == len(documents_simplified_filter) and all( - doc in documents_simplified_filter for doc in documents - ) - - # Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentstore - filters = { - "$not": { - "$or": { - "$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test3"}}, - "$not": {"date_field": {"$lt": "2020-01-01"}}, - } - } - } - documents = document_store_with_docs.get_all_documents(filters=filters) - docs_meta = [doc.meta["meta_field"] for doc in documents] - assert len(documents) == 2 - assert "test3" in docs_meta - assert "test5" in docs_meta - - # Test same logical operator twice on same level - filters = { - "$or": [ - {"$and": {"meta_field": {"$in": ["test1", "test2"]}, "date_field": {"$gte": "2020-01-01"}}}, - {"$and": {"meta_field": {"$in": ["test3", "test4"]}, "date_field": {"$lt": "2020-01-01"}}}, - ] - } - documents = document_store_with_docs.get_all_documents(filters=filters) - docs_meta = [doc.meta["meta_field"] for doc in documents] - assert len(documents) == 2 - assert "test1" in docs_meta - assert "test3" in docs_meta - - -def test_get_document_by_id(document_store_with_docs): - documents = document_store_with_docs.get_all_documents() - doc = document_store_with_docs.get_document_by_id(documents[0].id) - assert doc.id == documents[0].id - assert doc.content == documents[0].content - - -def test_get_documents_by_id(document_store: BaseDocumentStore): - # generate more documents than the elasticsearch default query size limit of 10 - docs_to_generate = 15 - documents = [{"content": "doc-" + str(i)} for i in range(docs_to_generate)] - document_store.write_documents(documents) - - all_docs = document_store.get_all_documents() - all_ids = [doc.id for doc in all_docs] - - retrieved_by_id = document_store.get_documents_by_id(all_ids) - retrieved_ids = [doc.id for doc in retrieved_by_id] - - # all documents in the index should be retrieved when passing all document ids in the index - assert set(retrieved_ids) == set(all_ids) - - -def test_get_document_count(document_store: BaseDocumentStore): - documents = [ - {"content": "text1", "id": "1", "meta_field_for_count": "c"}, - {"content": "text2", "id": "2", "meta_field_for_count": "b"}, - {"content": "text3", "id": "3", "meta_field_for_count": "b"}, - {"content": "text4", "id": "4", "meta_field_for_count": "b"}, - ] - document_store.write_documents(documents) - assert document_store.get_document_count() == 4 - assert document_store.get_document_count(filters={"meta_field_for_count": ["c"]}) == 1 - assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3 - - -def test_get_all_documents_generator(document_store: BaseDocumentStore): - documents = [ - {"content": "text1", "id": "1", "meta_field_for_count": "a"}, - {"content": "text2", "id": "2", "meta_field_for_count": "b"}, - {"content": "text3", "id": "3", "meta_field_for_count": "b"}, - {"content": "text4", "id": "4", "meta_field_for_count": "b"}, - {"content": "text5", "id": "5", "meta_field_for_count": "b"}, - ] - - document_store.write_documents(documents) - assert len(list(document_store.get_all_documents_generator(batch_size=2))) == 5 - - -@pytest.mark.parametrize("update_existing_documents", [True, False]) -def test_update_existing_documents(document_store, update_existing_documents): - original_docs = [{"content": "text1_orig", "id": "1", "meta_field_for_count": "a"}] - - updated_docs = [{"content": "text1_new", "id": "1", "meta_field_for_count": "a"}] - - document_store.write_documents(original_docs) - assert document_store.get_document_count() == 1 - - if update_existing_documents: - document_store.write_documents(updated_docs, duplicate_documents="overwrite") - else: - with pytest.raises(Exception): - document_store.write_documents(updated_docs, duplicate_documents="fail") - - stored_docs = document_store.get_all_documents() - assert len(stored_docs) == 1 - if update_existing_documents: - assert stored_docs[0].content == updated_docs[0]["content"] - else: - assert stored_docs[0].content == original_docs[0]["content"] - - -def test_write_document_meta(document_store: BaseDocumentStore): - documents = [ - {"content": "dict_without_meta", "id": "1"}, - {"content": "dict_with_meta", "meta_field": "test2", "name": "filename2", "id": "2"}, - Document(content="document_object_without_meta", id="3"), - Document(content="document_object_with_meta", meta={"meta_field": "test4", "name": "filename3"}, id="4"), - ] - document_store.write_documents(documents) - documents_in_store = document_store.get_all_documents() - assert len(documents_in_store) == 4 - - assert not document_store.get_document_by_id("1").meta - assert document_store.get_document_by_id("2").meta["meta_field"] == "test2" - assert not document_store.get_document_by_id("3").meta - assert document_store.get_document_by_id("4").meta["meta_field"] == "test4" - - -def test_write_document_index(document_store: BaseDocumentStore): - document_store.delete_index("haystack_test_one") - document_store.delete_index("haystack_test_two") - documents = [{"content": "text1", "id": "1"}, {"content": "text2", "id": "2"}] - document_store.write_documents([documents[0]], index="haystack_test_one") - assert len(document_store.get_all_documents(index="haystack_test_one")) == 1 - - document_store.write_documents([documents[1]], index="haystack_test_two") - assert len(document_store.get_all_documents(index="haystack_test_two")) == 1 - - assert len(document_store.get_all_documents(index="haystack_test_one")) == 1 - assert len(document_store.get_all_documents()) == 0 - - @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True) def test_document_with_embeddings(document_store: BaseDocumentStore): documents = [ @@ -574,49 +268,6 @@ def test_update_embeddings_table_text_retriever(document_store, retriever): ) -def test_delete_all_documents(document_store_with_docs): - assert len(document_store_with_docs.get_all_documents()) == 5 - - document_store_with_docs.delete_documents() - documents = document_store_with_docs.get_all_documents() - assert len(documents) == 0 - - -def test_delete_documents(document_store_with_docs): - assert len(document_store_with_docs.get_all_documents()) == 5 - - document_store_with_docs.delete_documents() - documents = document_store_with_docs.get_all_documents() - assert len(documents) == 0 - - -def test_delete_documents_with_filters(document_store_with_docs): - document_store_with_docs.delete_documents(filters={"meta_field": ["test1", "test2", "test4", "test5"]}) - documents = document_store_with_docs.get_all_documents() - assert len(documents) == 1 - assert documents[0].meta["meta_field"] == "test3" - - -def test_delete_documents_by_id(document_store_with_docs): - import logging - - logging.info(len(document_store_with_docs.get_all_documents())) - docs_to_delete = document_store_with_docs.get_all_documents( - filters={"meta_field": ["test1", "test2", "test4", "test5"]} - ) - logging.info(len(docs_to_delete)) - docs_not_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test3"]}) - logging.info(len(docs_not_to_delete)) - - document_store_with_docs.delete_documents(ids=[doc.id for doc in docs_to_delete]) - all_docs_left = document_store_with_docs.get_all_documents() - assert len(all_docs_left) == 1 - assert all_docs_left[0].meta["meta_field"] == "test3" - - all_ids_left = [doc.id for doc in all_docs_left] - assert all(doc.id in all_ids_left for doc in docs_not_to_delete) - - def test_delete_documents_by_id_with_filters(document_store_with_docs): docs_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test2"]}) docs_not_to_delete = document_store_with_docs.get_all_documents(filters={"meta_field": ["test3"]}) @@ -631,98 +282,6 @@ def test_delete_documents_by_id_with_filters(document_store_with_docs): assert all(doc.id in all_ids_left for doc in docs_not_to_delete) -# exclude weaviate because it does not support storing labels -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "pinecone"], indirect=True) -def test_labels(document_store: BaseDocumentStore): - label = Label( - query="question1", - answer=Answer( - answer="answer", - type="extractive", - score=0.0, - context="something", - offsets_in_document=[Span(start=12, end=14)], - offsets_in_context=[Span(start=12, end=14)], - ), - is_correct_answer=True, - is_correct_document=True, - document=Document(content="something", id="123"), - origin="gold-label", - ) - document_store.write_labels([label]) - labels = document_store.get_all_labels() - assert len(labels) == 1 - assert label == labels[0] - - # different index - document_store.write_labels([label], index="another_index") - labels = document_store.get_all_labels(index="another_index") - assert len(labels) == 1 - document_store.delete_labels(index="another_index") - labels = document_store.get_all_labels(index="another_index") - assert len(labels) == 0 - labels = document_store.get_all_labels() - assert len(labels) == 1 - - # write second label + duplicate - label2 = Label( - query="question2", - answer=Answer( - answer="another answer", - type="extractive", - score=0.0, - context="something", - offsets_in_document=[Span(start=12, end=14)], - offsets_in_context=[Span(start=12, end=14)], - ), - is_correct_answer=True, - is_correct_document=True, - document=Document(content="something", id="324"), - origin="gold-label", - ) - document_store.write_labels([label, label2]) - labels = document_store.get_all_labels() - - # check that second label has been added but not the duplicate - assert len(labels) == 2 - assert label in labels - assert label2 in labels - - # delete filtered label2 by id - document_store.delete_labels(ids=[label2.id]) - labels = document_store.get_all_labels() - assert label == labels[0] - assert len(labels) == 1 - - # re-add label2 - document_store.write_labels([label2]) - labels = document_store.get_all_labels() - assert len(labels) == 2 - - # delete filtered label2 by query text - document_store.delete_labels(filters={"query": [label2.query]}) - labels = document_store.get_all_labels() - assert label == labels[0] - assert len(labels) == 1 - - # re-add label2 - document_store.write_labels([label2]) - labels = document_store.get_all_labels() - assert len(labels) == 2 - - # delete intersection of filters and ids, which is empty - document_store.delete_labels(ids=[label.id], filters={"query": [label2.query]}) - labels = document_store.get_all_labels() - assert len(labels) == 2 - assert label in labels - assert label2 in labels - - # delete all labels - document_store.delete_labels() - labels = document_store.get_all_labels() - assert len(labels) == 0 - - @pytest.mark.parametrize("document_store", ["elasticsearch", "opensearch"], indirect=True) def test_labels_with_long_texts(document_store: BaseDocumentStore): document_store.delete_index("label") @@ -1084,22 +643,6 @@ def test_multilabel_meta_aggregations(document_store: BaseDocumentStore): assert multi_label.filters == l.filters -@pytest.mark.parametrize("document_store", ["memory"], indirect=True) -def test_update_meta(document_store: BaseDocumentStore): - documents = [ - Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "1"}), - Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "2"}), - Document(content="Doc3", meta={"meta_key_1": "3", "meta_key_2": "3"}), - ] - document_store.write_documents(documents) - document_2 = document_store.get_all_documents(filters={"meta_key_2": ["2"]})[0] - document_store.update_document_meta(document_2.id, meta={"meta_key_1": "99", "meta_key_2": "2"}) - updated_document = document_store.get_document_by_id(document_2.id) - assert len(updated_document.meta.keys()) == 2 - assert updated_document.meta["meta_key_1"] == "99" - assert updated_document.meta["meta_key_2"] == "2" - - @pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"]) def test_custom_embedding_field(document_store_type, tmp_path): document_store = get_document_store(