import logging
import json
from pathlib import Path
from uuid import uuid4

import pytest
import responses
import numpy as np
from responses import matchers

from haystack.document_stores import DeepsetCloudDocumentStore
from haystack.utils import DeepsetCloudError
from haystack.schema import Document, Label, Answer


DC_API_ENDPOINT = "https://dc.example.com/v1"
DC_TEST_INDEX = "document_retrieval_1"
DC_API_KEY = "NO_KEY"


@pytest.fixture
def dc_api_mock(request):
    """
    Register the default deepset Cloud responses and activate the `responses` mock.

    When the `--mock-dc` option is set, canned answers for the test index and for the pipelines
    listing are registered and the default `responses` mock stays active while the test runs.

    When `--mock-dc` is not set, nothing is registered or activated and the fixture yields `None`,
    so requests made by the tests are not intercepted by this fixture.
    """
    if not request.config.getoption("--mock-dc"):
        # A generator fixture must yield exactly once on every path, otherwise pytest
        # fails the test with "did not yield a value".
        yield None
        return

    auth_header = matchers.header_matcher({"authorization": f"Bearer {DC_API_KEY}"})
    indexing_status = {"status": "INDEXED", "pending_file_count": 0, "total_file_count": 31}

    responses.add(
        method=responses.GET,
        url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}",
        match=[auth_header],
        json={"indexing": indexing_status},
        status=200,
    )

    responses.add(
        method=responses.GET,
        url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
        match=[auth_header],
        json={
            "data": [{"name": DC_TEST_INDEX, "status": "DEPLOYED", "indexing": indexing_status}],
            "has_more": False,
            "total": 1,
        },
    )

    # activate the default mock, same as using the @responses.activate everywhere
    with responses.mock as m:
        yield m


@pytest.mark.document_store
@pytest.mark.integration
@pytest.mark.usefixtures("dc_api_mock")
class TestDeepsetCloudDocumentStore:
    """
    Tests for `DeepsetCloudDocumentStore` against the mocked deepset Cloud API.

    The `usefixtures("dc_api_mock")` marker requests the mocking fixture for every test in the
    class, so individual tests do not have to list it among their parameters.
    """

    # Fixtures

    @pytest.fixture
    def ds(self):
        """
        Return a document store configured for the test endpoint, API key and index.

        Mocking is requested through the class-level `usefixtures("dc_api_mock")` marker, so this
        fixture does not need to depend on `dc_api_mock` itself.
        """
        return DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX)

    # Integration tests

    def test_init_with_dot_product(self, ds):
        """A store created without explicit options uses dot product and returns no embeddings."""
        assert ds.return_embedding is False
        assert ds.similarity == "dot_product"

    def test_init_with_cosine(self):
        """Explicit `similarity` and `return_embedding` arguments are kept on the store."""
        document_store = DeepsetCloudDocumentStore(
            api_endpoint=DC_API_ENDPOINT,
            api_key=DC_API_KEY,
            index=DC_TEST_INDEX,
            similarity="cosine",
            return_embedding=True,
        )
        assert document_store.return_embedding is True
        assert document_store.similarity == "cosine"

    def test_invalid_token(self):
        """An HTTP 500 on the pipelines endpoint surfaces as a `DeepsetCloudError`."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
            match=[matchers.header_matcher({"authorization": "Bearer invalid_token"})],
            body="Internal Server Error",
            status=500,
        )

        with pytest.raises(
            DeepsetCloudError,
            match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}/workspaces/default/pipelines failed: "
            "HTTP 500 - Internal Server Error",
        ):
            DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key="invalid_token", index=DC_TEST_INDEX)

    def test_invalid_api_endpoint(self):
        """An HTTP 404 on the pipelines endpoint surfaces as a `DeepsetCloudError`."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}00/workspaces/default/pipelines",
            body="Not Found",
            status=404,
        )

        with pytest.raises(
            DeepsetCloudError,
            match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}00/workspaces/default/pipelines failed: "
            "HTTP 404 - Not Found\nNot Found",
        ):
            DeepsetCloudDocumentStore(api_endpoint=f"{DC_API_ENDPOINT}00", api_key=DC_API_KEY, index=DC_TEST_INDEX)

    def test_invalid_index(self, caplog):
        """Creating a store for an index that answers 404 logs a message instead of raising."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/invalid_index",
            body="Not Found",
            status=404,
        )

        with caplog.at_level(logging.INFO):
            DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index="invalid_index")
            assert (
                "You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud."
                in caplog.text
            )

    def test_documents(self, ds, samples_path):
        """Documents can be listed, streamed, filtered by `file_id` and fetched by id."""
        # Explicit encoding so that reading the JSON sample does not depend on the platform default.
        with open(samples_path / "dc" / "documents-stream.response", "r", encoding="utf-8") as f:
            documents_stream_response = f.read()

        # The sample holds one JSON document per line.
        raw_docs = [json.loads(line) for line in documents_stream_response.splitlines()]
        first_file_id = raw_docs[0]["meta"]["file_id"]
        raw_filtered_docs = [doc for doc in raw_docs if doc["meta"]["file_id"] == first_file_id]
        documents_stream_filtered_response = "\n".join([json.dumps(d) for d in raw_filtered_docs])

        # Unfiltered stream: registered without a body matcher.
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
            body=documents_stream_response,
            status=200,
        )

        # Filtered stream: only answers requests carrying exactly this JSON payload.
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
            match=[matchers.json_params_matcher({"filters": {"file_id": [first_file_id]}, "return_embedding": False})],
            body=documents_stream_filtered_response,
            status=200,
        )

        for doc in raw_filtered_docs:
            responses.add(
                method=responses.GET,
                url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents/{doc['id']}",
                json=doc,
                status=200,
            )

        docs = ds.get_all_documents()
        assert len(docs) > 1
        assert isinstance(docs[0], Document)

        first_doc = next(ds.get_all_documents_generator())
        assert isinstance(first_doc, Document)
        assert first_doc.meta["file_id"] is not None

        filtered_docs = ds.get_all_documents(filters={"file_id": [first_doc.meta["file_id"]]})
        assert len(filtered_docs) > 0
        assert len(filtered_docs) < len(docs)

        ids = [doc.id for doc in filtered_docs]
        single_doc_by_id = ds.get_document_by_id(ids[0])
        assert single_doc_by_id is not None
        assert single_doc_by_id.meta["file_id"] == first_doc.meta["file_id"]

        docs_by_id = ds.get_documents_by_id(ids)
        assert len(docs_by_id) == len(filtered_docs)
        for doc in docs_by_id:
            assert doc.meta["file_id"] == first_doc.meta["file_id"]

    def test_query(self, ds, samples_path):
        """A filtered query returns fewer documents than the same query without filters."""
        # Explicit encoding so that reading the JSON sample does not depend on the platform default.
        with open(samples_path / "dc" / "query_winterfell.response", "r", encoding="utf-8") as f:
            query_winterfell_response = f.read()

        query_winterfell_docs = json.loads(query_winterfell_response)
        first_file_id = query_winterfell_docs[0]["meta"]["file_id"]
        query_winterfell_filtered_docs = [
            doc for doc in query_winterfell_docs if doc["meta"]["file_id"] == first_file_id
        ]
        query_winterfell_filtered_response = json.dumps(query_winterfell_filtered_docs)

        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {"query": "winterfell", "top_k": 50, "all_terms_must_match": False, "scale_score": True}
                )
            ],
            status=200,
            body=query_winterfell_response,
        )

        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {
                        "query": "winterfell",
                        "top_k": 50,
                        "filters": {"file_id": [first_file_id]},
                        "all_terms_must_match": False,
                        "scale_score": True,
                    }
                )
            ],
            status=200,
            body=query_winterfell_filtered_response,
        )

        docs = ds.query("winterfell", top_k=50)
        assert docs is not None
        assert len(docs) > 0

        first_doc = docs[0]
        filtered_docs = ds.query("winterfell", top_k=50, filters={"file_id": [first_doc.meta["file_id"]]})
        assert len(filtered_docs) > 0
        assert len(filtered_docs) < len(docs)

    @pytest.mark.parametrize(
        "body, expected_count",
        [
            (
                {
                    "data": [
                        {
                            "evaluation_set_id": str(uuid4()),
                            "name": DC_TEST_INDEX,
                            "created_at": "2022-03-22T13:40:27.535Z",
                            "matched_labels": 2,
                            "total_labels": 10,
                        }
                    ],
                    "has_more": False,
                    "total": 1,
                },
                10,
            ),
            (
                {
                    "data": [
                        {
                            "evaluation_set_id": str(uuid4()),
                            "name": DC_TEST_INDEX,
                            "created_at": "2022-03-22T13:40:27.535Z",
                            "matched_labels": 0,
                            "total_labels": 0,
                        }
                    ],
                    "has_more": False,
                    "total": 1,
                },
                0,
            ),
        ],
    )
    def test_count_of_labels_for_evaluation_set(self, ds, body: dict, expected_count: int):
        """`get_label_count` returns the expected count for each mocked evaluation set listing."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps(body),
        )

        count = ds.get_label_count(index=DC_TEST_INDEX)
        assert count == expected_count

    def test_count_of_labels_for_evaluation_set_raises_DC_error_when_nothing_found(self, ds):
        """`get_label_count` raises `DeepsetCloudError` when the listing is empty."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps({"data": [], "has_more": False, "total": 0}),
        )

        with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
            ds.get_label_count(index=DC_TEST_INDEX)

    def test_lists_evaluation_sets(self, ds):
        """`get_evaluation_sets` returns the entries of the listing's `data` field."""
        response_evaluation_set = {
            "evaluation_set_id": str(uuid4()),
            "name": DC_TEST_INDEX,
            "created_at": "2022-03-22T13:40:27.535Z",
            "matched_labels": 2,
            "total_labels": 10,
        }

        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps({"data": [response_evaluation_set], "has_more": False, "total": 1}),
        )

        evaluation_sets = ds.get_evaluation_sets()
        assert evaluation_sets == [response_evaluation_set]

    def test_fetches_labels_for_evaluation_set(self, ds):
        """`get_all_labels` converts the mocked evaluation set entry into a `Label`."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
            status=200,
            body=json.dumps(
                [
                    {
                        "label_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                        "query": "What is berlin?",
                        "answer": "biggest city in germany",
                        "answer_start": 0,
                        "answer_end": 0,
                        "meta": {},
                        "context": "Berlin is the biggest city in germany.",
                        "external_file_name": "string",
                        "file_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                        "state": "Label matching status",
                        "candidates": "Candidates that were found in the label <-> file matching",
                    }
                ]
            ),
        )

        labels = ds.get_all_labels(index=DC_TEST_INDEX)
        assert labels == [
            Label(
                query="What is berlin?",
                document=Document(content="Berlin is the biggest city in germany."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="user-feedback",
                answer=Answer("biggest city in germany"),
                id="3fa85f64-5717-4562-b3fc-2c963f66afa6",
                pipeline_id=None,
                created_at=None,
                updated_at=None,
                meta={},
                filters={},
            )
        ]

    def test_fetches_labels_for_evaluation_set_raises_deepsetclouderror_when_nothing_found(self, ds):
        """`get_all_labels` raises `DeepsetCloudError` when the evaluation set answers 404."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
            status=404,
        )

        with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
            ds.get_all_labels(index=DC_TEST_INDEX)

    def test_query_by_embedding(self, ds):
        """`query_by_embedding` sends the expected JSON payload and returns the (empty) result."""
        query_emb = np.random.randn(768)
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {
                        "query_emb": query_emb.tolist(),
                        "top_k": 10,
                        "return_embedding": False,
                        "scale_score": True,
                        "use_prefiltering": False,
                    }
                )
            ],
            json=[],
            status=200,
        )

        emb_docs = ds.query_by_embedding(query_emb)
        assert len(emb_docs) == 0

    def test_get_all_docs_without_index(self):
        """Without an index, `get_all_documents` returns an empty list."""
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_all_documents() == []

    def test_get_all_docs_generator_without_index(self):
        """Without an index, `get_all_documents_generator` yields nothing."""
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert list(document_store.get_all_documents_generator()) == []

    def test_get_doc_by_id_without_index(self):
        """Without an index, `get_document_by_id` returns `None`."""
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_document_by_id(id="some id") is None

    def test_get_docs_by_id_without_index(self):
        """Without an index, `get_documents_by_id` returns an empty list."""
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_documents_by_id(ids=["some id"]) == []

    def test_get_doc_count_without_index(self):
        """Without an index, `get_document_count` returns 0."""
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_document_count() == 0

    def test_query_by_emb_without_index(self):
        """Without an index, `query_by_embedding` returns an empty list."""
        query_emb = np.random.randn(768)
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.query_by_embedding(query_emb=query_emb) == []

    def test_query_without_index(self):
        """Without an index, `query` returns an empty list."""
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.query(query="some query") == []