From 255072d8d548a19a1678ddc46b41d41cf5d09bc4 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 4 Nov 2022 17:05:10 +0100 Subject: [PATCH] refactor: move dC tests to their own module and job (#3529) * move dC tests to their own module and job * restore global var * revert --- .github/workflows/tests.yml | 27 ++ conftest.py | 3 + test/conftest.py | 11 - test/document_stores/test_deepsetcloud.py | 419 ++++++++++++++++++ test/document_stores/test_document_store.py | 447 +------------------- 5 files changed, 451 insertions(+), 456 deletions(-) create mode 100644 test/document_stores/test_deepsetcloud.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 883b5808f..0e1a2fe85 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -224,6 +224,33 @@ jobs: channel: '#haystack' if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' + integration-tests-dc: + name: Integration / dC / ${{ matrix.os }} + needs: + - unit-tests + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest,macos-latest,windows-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + + - name: Setup Python + uses: ./.github/actions/python_cache/ + + - name: Install Haystack + run: pip install -U . + + - name: Run tests + run: | + pytest --maxfail=5 -m "document_store and integration" test/document_stores/test_deepsetcloud.py + + - uses: act10ns/slack@v1 + with: + status: ${{ job.status }} + channel: '#haystack' + if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' # # TODO: the following steps need to be revisited # diff --git a/conftest.py b/conftest.py index a381d802f..9a3a5fd79 100644 --- a/conftest.py +++ b/conftest.py @@ -4,6 +4,9 @@ def pytest_addoption(parser): action="store", default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone", ) + parser.addoption( + "--mock-dc", action="store_true", default=True, help="Mock HTTP requests to dC while running tests" + ) def pytest_generate_tests(metafunc): diff --git a/test/conftest.py b/test/conftest.py index e61c790aa..3d5268c8b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -26,7 +26,6 @@ import requests from haystack import Answer, BaseComponent from haystack.document_stores import ( BaseDocumentStore, - DeepsetCloudDocumentStore, InMemoryDocumentStore, ElasticsearchDocumentStore, WeaviateDocumentStore, @@ -86,11 +85,7 @@ from .mocks import pinecone as pinecone_mock # To manually run the tests with default PostgreSQL instead of SQLite, switch the lines below SQL_TYPE = "sqlite" -# SQL_TYPE = "postgres" - SAMPLES_PATH = Path(__file__).parent / "samples" - -# to run tests against Deepset Cloud set MOCK_DC to False and set the following params DC_API_ENDPOINT = "https://DC_API/v1" DC_TEST_INDEX = "document_retrieval_1" DC_API_KEY = "NO_KEY" @@ -603,12 +598,6 @@ def deepset_cloud_fixture(): responses.add_passthru(DC_API_ENDPOINT) -@pytest.fixture -@responses.activate -def deepset_cloud_document_store(deepset_cloud_fixture): - return DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX) - - @pytest.fixture def rag_generator(): return RAGenerator(model_name_or_path="facebook/rag-token-nq", generator_type="token", max_length=20) diff --git a/test/document_stores/test_deepsetcloud.py b/test/document_stores/test_deepsetcloud.py new file mode 100644 index 000000000..4e5c2995e --- /dev/null +++ b/test/document_stores/test_deepsetcloud.py @@ -0,0 +1,419 @@ +import logging +import json +from pathlib import Path +from uuid import uuid4 + +import pytest +import responses +import numpy as np + +from responses import matchers + +from haystack.document_stores import DeepsetCloudDocumentStore +from haystack.utils import DeepsetCloudError +from haystack.schema import Document, Label, Answer + + +DC_API_ENDPOINT = "https://dc.example.com/v1" +DC_TEST_INDEX = "document_retrieval_1" +DC_API_KEY = "NO_KEY" +SAMPLES_PATH = Path(__file__).parent.parent / "samples" + + +@pytest.fixture +def dc_api_mock(request): + """ + This fixture contains responses activation, so either this one or ds() below must be + passed to tests that require mocking. + + If `--mock-dc` was False, responses are never activated and it doesn't matter if the + fixture is passed or not. + """ + if request.config.getoption("--mock-dc"): + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}", + match=[responses.matchers.header_matcher({"authorization": f"Bearer {DC_API_KEY}"})], + json={"indexing": {"status": "INDEXED", "pending_file_count": 0, "total_file_count": 31}}, + status=200, + ) + + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines", + match=[responses.matchers.header_matcher({"authorization": f"Bearer {DC_API_KEY}"})], + json={ + "data": [ + { + "name": DC_TEST_INDEX, + "status": "DEPLOYED", + "indexing": {"status": "INDEXED", "pending_file_count": 0, "total_file_count": 31}, + } + ], + "has_more": False, + "total": 1, + }, + ) + + # activate the default mock, same as using the @responses.activate everywhere + with responses.mock as m: + yield m + + +@pytest.mark.document_store +@pytest.mark.integration +@pytest.mark.usefixtures("dc_api_mock") +class TestDeepsetCloudDocumentStore: + + # Fixtures + + @pytest.fixture + def ds(self): + """ + We make this fixture depend on `dc_api_mock` so that passing the document store will + activate the mocking and we spare one function parameter. + """ + return DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX) + + # Integration tests + + def test_init_with_dot_product(self, ds): + assert ds.return_embedding == False + assert ds.similarity == "dot_product" + + def test_init_with_cosine(self): + document_store = DeepsetCloudDocumentStore( + api_endpoint=DC_API_ENDPOINT, + api_key=DC_API_KEY, + index=DC_TEST_INDEX, + similarity="cosine", + return_embedding=True, + ) + assert document_store.return_embedding == True + assert document_store.similarity == "cosine" + + def test_invalid_token(self): + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines", + match=[matchers.header_matcher({"authorization": "Bearer invalid_token"})], + body="Internal Server Error", + status=500, + ) + + with pytest.raises( + DeepsetCloudError, + match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}/workspaces/default/pipelines failed: HTTP 500 - Internal Server Error", + ): + DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key="invalid_token", index=DC_TEST_INDEX) + + def test_invalid_api_endpoint(self): + responses.add( + method=responses.GET, url=f"{DC_API_ENDPOINT}00/workspaces/default/pipelines", body="Not Found", status=404 + ) + + with pytest.raises( + DeepsetCloudError, + match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}00/workspaces/default/pipelines failed: " + f"HTTP 404 - Not Found\nNot Found", + ): + DeepsetCloudDocumentStore(api_endpoint=f"{DC_API_ENDPOINT}00", api_key=DC_API_KEY, index=DC_TEST_INDEX) + + def test_invalid_index(self, caplog): + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/invalid_index", + body="Not Found", + status=404, + ) + + with caplog.at_level(logging.INFO): + DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index="invalid_index") + assert ( + "You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud." + in caplog.text + ) + + def test_documents(self, ds): + with open(SAMPLES_PATH / "dc" / "documents-stream.response", "r") as f: + documents_stream_response = f.read() + docs = [json.loads(l) for l in documents_stream_response.splitlines()] + filtered_docs = [doc for doc in docs if doc["meta"]["file_id"] == docs[0]["meta"]["file_id"]] + documents_stream_filtered_response = "\n".join([json.dumps(d) for d in filtered_docs]) + + responses.add( + method=responses.POST, + url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream", + body=documents_stream_response, + status=200, + ) + + responses.add( + method=responses.POST, + url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream", + match=[ + matchers.json_params_matcher( + {"filters": {"file_id": [docs[0]["meta"]["file_id"]]}, "return_embedding": False} + ) + ], + body=documents_stream_filtered_response, + status=200, + ) + + for doc in filtered_docs: + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents/{doc['id']}", + json=doc, + status=200, + ) + + docs = ds.get_all_documents() + assert len(docs) > 1 + assert isinstance(docs[0], Document) + + first_doc = next(ds.get_all_documents_generator()) + assert isinstance(first_doc, Document) + assert first_doc.meta["file_id"] is not None + + filtered_docs = ds.get_all_documents(filters={"file_id": [first_doc.meta["file_id"]]}) + assert len(filtered_docs) > 0 + assert len(filtered_docs) < len(docs) + + ids = [doc.id for doc in filtered_docs] + single_doc_by_id = ds.get_document_by_id(ids[0]) + assert single_doc_by_id is not None + assert single_doc_by_id.meta["file_id"] == first_doc.meta["file_id"] + + docs_by_id = ds.get_documents_by_id(ids) + assert len(docs_by_id) == len(filtered_docs) + for doc in docs_by_id: + assert doc.meta["file_id"] == first_doc.meta["file_id"] + + def test_query(self, ds): + + with open(SAMPLES_PATH / "dc" / "query_winterfell.response", "r") as f: + query_winterfell_response = f.read() + query_winterfell_docs = json.loads(query_winterfell_response) + query_winterfell_filtered_docs = [ + doc + for doc in query_winterfell_docs + if doc["meta"]["file_id"] == query_winterfell_docs[0]["meta"]["file_id"] + ] + query_winterfell_filtered_response = json.dumps(query_winterfell_filtered_docs) + + responses.add( + method=responses.POST, + url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query", + match=[ + matchers.json_params_matcher( + {"query": "winterfell", "top_k": 50, "all_terms_must_match": False, "scale_score": True} + ) + ], + status=200, + body=query_winterfell_response, + ) + + responses.add( + method=responses.POST, + url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query", + match=[ + matchers.json_params_matcher( + { + "query": "winterfell", + "top_k": 50, + "filters": {"file_id": [query_winterfell_docs[0]["meta"]["file_id"]]}, + "all_terms_must_match": False, + "scale_score": True, + } + ) + ], + status=200, + body=query_winterfell_filtered_response, + ) + + docs = ds.query("winterfell", top_k=50) + assert docs is not None + assert len(docs) > 0 + + first_doc = docs[0] + filtered_docs = ds.query("winterfell", top_k=50, filters={"file_id": [first_doc.meta["file_id"]]}) + assert len(filtered_docs) > 0 + assert len(filtered_docs) < len(docs) + + @pytest.mark.parametrize( + "body, expected_count", + [ + ( + { + "data": [ + { + "evaluation_set_id": str(uuid4()), + "name": DC_TEST_INDEX, + "created_at": "2022-03-22T13:40:27.535Z", + "matched_labels": 2, + "total_labels": 10, + } + ], + "has_more": False, + "total": 1, + }, + 10, + ), + ( + { + "data": [ + { + "evaluation_set_id": str(uuid4()), + "name": DC_TEST_INDEX, + "created_at": "2022-03-22T13:40:27.535Z", + "matched_labels": 0, + "total_labels": 0, + } + ], + "has_more": False, + "total": 1, + }, + 0, + ), + ], + ) + def test_count_of_labels_for_evaluation_set(self, ds, body: dict, expected_count: int): + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets", + status=200, + body=json.dumps(body), + ) + + count = ds.get_label_count(index=DC_TEST_INDEX) + assert count == expected_count + + def test_count_of_labels_for_evaluation_set_raises_DC_error_when_nothing_found(self, ds): + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets", + status=200, + body=json.dumps({"data": [], "has_more": False, "total": 0}), + ) + + with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"): + ds.get_label_count(index=DC_TEST_INDEX) + + def test_lists_evaluation_sets(self, ds): + response_evaluation_set = { + "evaluation_set_id": str(uuid4()), + "name": DC_TEST_INDEX, + "created_at": "2022-03-22T13:40:27.535Z", + "matched_labels": 2, + "total_labels": 10, + } + + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets", + status=200, + body=json.dumps({"data": [response_evaluation_set], "has_more": False, "total": 1}), + ) + + evaluation_sets = ds.get_evaluation_sets() + assert evaluation_sets == [response_evaluation_set] + + def test_fetches_labels_for_evaluation_set(self, ds): + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}", + status=200, + body=json.dumps( + [ + { + "label_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6", + "query": "What is berlin?", + "answer": "biggest city in germany", + "answer_start": 0, + "answer_end": 0, + "meta": {}, + "context": "Berlin is the biggest city in germany.", + "external_file_name": "string", + "file_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6", + "state": "Label matching status", + "candidates": "Candidates that were found in the label <-> file matching", + } + ] + ), + ) + + labels = ds.get_all_labels(index=DC_TEST_INDEX) + assert labels == [ + Label( + query="What is berlin?", + document=Document(content="Berlin is the biggest city in germany."), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", + answer=Answer("biggest city in germany"), + id="3fa85f64-5717-4562-b3fc-2c963f66afa6", + pipeline_id=None, + created_at=None, + updated_at=None, + meta={}, + filters={}, + ) + ] + + def test_fetches_labels_for_evaluation_set_raises_deepsetclouderror_when_nothing_found(self, ds): + responses.add( + method=responses.GET, + url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}", + status=404, + ) + + with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"): + ds.get_all_labels(index=DC_TEST_INDEX) + + def test_query_by_embedding(self, ds): + query_emb = np.random.randn(768) + + responses.add( + method=responses.POST, + url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query", + match=[ + matchers.json_params_matcher( + {"query_emb": query_emb.tolist(), "top_k": 10, "return_embedding": False, "scale_score": True} + ) + ], + json=[], + status=200, + ) + + emb_docs = ds.query_by_embedding(query_emb) + assert len(emb_docs) == 0 + + def test_get_all_docs_without_index(self): + document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) + assert document_store.get_all_documents() == [] + + def test_get_all_docs_generator_without_index(self): + document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) + assert list(document_store.get_all_documents_generator()) == [] + + def test_get_doc_by_id_without_index(self): + document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) + assert document_store.get_document_by_id(id="some id") == None + + def test_get_docs_by_id_without_index(self): + document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) + assert document_store.get_documents_by_id(ids=["some id"]) == [] + + def test_get_doc_count_without_index(self): + document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) + assert document_store.get_document_count() == 0 + + def test_query_by_emb_without_index(self): + query_emb = np.random.randn(768) + document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) + assert document_store.query_by_embedding(query_emb=query_emb) == [] + + def test_query_without_index(self): + document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) + assert document_store.query(query="some query") == [] diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py index d5e3076c8..8b2045899 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -1,5 +1,4 @@ from copy import deepcopy -import logging import math import sys from uuid import uuid4 @@ -7,27 +6,12 @@ from uuid import uuid4 import numpy as np import pandas as pd import pytest -import json -import responses -from responses import matchers from unittest.mock import Mock -from elasticsearch import Elasticsearch -from elasticsearch.exceptions import RequestError -from ..conftest import ( - deepset_cloud_fixture, - get_document_store, - ensure_ids_are_correct_uuids, - MOCK_DC, - DC_API_ENDPOINT, - DC_API_KEY, - DC_TEST_INDEX, - SAMPLES_PATH, -) + +from ..conftest import get_document_store, ensure_ids_are_correct_uuids from haystack.document_stores import ( WeaviateDocumentStore, - DeepsetCloudDocumentStore, - InMemoryDocumentStore, MilvusDocumentStore, FAISSDocumentStore, ElasticsearchDocumentStore, @@ -40,7 +24,6 @@ from haystack.errors import DuplicateDocumentError from haystack.schema import Document, Label, Answer, Span from haystack.nodes import EmbeddingRetriever, PreProcessor from haystack.pipelines import DocumentSearchPipeline -from haystack.utils import DeepsetCloudError DOCUMENTS = [ @@ -1286,432 +1269,6 @@ def test_custom_headers(document_store_with_docs: BaseDocumentStore): assert len(documents) > 0 -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_init_with_dot_product(): - document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX) - assert document_store.return_embedding == False - assert document_store.similarity == "dot_product" - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_init_with_cosine(): - document_store = DeepsetCloudDocumentStore( - api_endpoint=DC_API_ENDPOINT, - api_key=DC_API_KEY, - index=DC_TEST_INDEX, - similarity="cosine", - return_embedding=True, - ) - assert document_store.return_embedding == True - assert document_store.similarity == "cosine" - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_invalid_token(): - if MOCK_DC: - responses.add( - method=responses.GET, - url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines", - match=[matchers.header_matcher({"authorization": "Bearer invalid_token"})], - body="Internal Server Error", - status=500, - ) - - with pytest.raises( - DeepsetCloudError, - match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}/workspaces/default/pipelines failed: HTTP 500 - Internal Server Error", - ): - DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key="invalid_token", index=DC_TEST_INDEX) - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_invalid_api_endpoint(): - if MOCK_DC: - responses.add( - method=responses.GET, url=f"{DC_API_ENDPOINT}00/workspaces/default/pipelines", body="Not Found", status=404 - ) - - with pytest.raises( - DeepsetCloudError, - match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}00/workspaces/default/pipelines failed: " - f"HTTP 404 - Not Found\nNot Found", - ): - DeepsetCloudDocumentStore(api_endpoint=f"{DC_API_ENDPOINT}00", api_key=DC_API_KEY, index=DC_TEST_INDEX) - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_invalid_index(caplog): - if MOCK_DC: - responses.add( - method=responses.GET, - url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/invalid_index", - body="Not Found", - status=404, - ) - - with caplog.at_level(logging.INFO): - DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index="invalid_index") - assert ( - "You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud." - in caplog.text - ) - - -@responses.activate -def test_DeepsetCloudDocumentStore_documents(deepset_cloud_document_store): - if MOCK_DC: - with open(SAMPLES_PATH / "dc" / "documents-stream.response", "r") as f: - documents_stream_response = f.read() - docs = [json.loads(l) for l in documents_stream_response.splitlines()] - filtered_docs = [doc for doc in docs if doc["meta"]["file_id"] == docs[0]["meta"]["file_id"]] - documents_stream_filtered_response = "\n".join([json.dumps(d) for d in filtered_docs]) - - responses.add( - method=responses.POST, - url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream", - body=documents_stream_response, - status=200, - ) - - responses.add( - method=responses.POST, - url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream", - match=[ - matchers.json_params_matcher( - {"filters": {"file_id": [docs[0]["meta"]["file_id"]]}, "return_embedding": False} - ) - ], - body=documents_stream_filtered_response, - status=200, - ) - - for doc in filtered_docs: - responses.add( - method=responses.GET, - url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents/{doc['id']}", - json=doc, - status=200, - ) - else: - responses.add_passthru(DC_API_ENDPOINT) - - docs = deepset_cloud_document_store.get_all_documents() - assert len(docs) > 1 - assert isinstance(docs[0], Document) - - first_doc = next(deepset_cloud_document_store.get_all_documents_generator()) - assert isinstance(first_doc, Document) - assert first_doc.meta["file_id"] is not None - - filtered_docs = deepset_cloud_document_store.get_all_documents(filters={"file_id": [first_doc.meta["file_id"]]}) - assert len(filtered_docs) > 0 - assert len(filtered_docs) < len(docs) - - ids = [doc.id for doc in filtered_docs] - single_doc_by_id = deepset_cloud_document_store.get_document_by_id(ids[0]) - assert single_doc_by_id is not None - assert single_doc_by_id.meta["file_id"] == first_doc.meta["file_id"] - - docs_by_id = deepset_cloud_document_store.get_documents_by_id(ids) - assert len(docs_by_id) == len(filtered_docs) - for doc in docs_by_id: - assert doc.meta["file_id"] == first_doc.meta["file_id"] - - -@responses.activate -def test_DeepsetCloudDocumentStore_query(deepset_cloud_document_store): - if MOCK_DC: - with open(SAMPLES_PATH / "dc" / "query_winterfell.response", "r") as f: - query_winterfell_response = f.read() - query_winterfell_docs = json.loads(query_winterfell_response) - query_winterfell_filtered_docs = [ - doc - for doc in query_winterfell_docs - if doc["meta"]["file_id"] == query_winterfell_docs[0]["meta"]["file_id"] - ] - query_winterfell_filtered_response = json.dumps(query_winterfell_filtered_docs) - - responses.add( - method=responses.POST, - url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query", - match=[ - matchers.json_params_matcher( - {"query": "winterfell", "top_k": 50, "all_terms_must_match": False, "scale_score": True} - ) - ], - status=200, - body=query_winterfell_response, - ) - - responses.add( - method=responses.POST, - url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query", - match=[ - matchers.json_params_matcher( - { - "query": "winterfell", - "top_k": 50, - "filters": {"file_id": [query_winterfell_docs[0]["meta"]["file_id"]]}, - "all_terms_must_match": False, - "scale_score": True, - } - ) - ], - status=200, - body=query_winterfell_filtered_response, - ) - else: - responses.add_passthru(DC_API_ENDPOINT) - - docs = deepset_cloud_document_store.query("winterfell", top_k=50) - assert docs is not None - assert len(docs) > 0 - - first_doc = docs[0] - filtered_docs = deepset_cloud_document_store.query( - "winterfell", top_k=50, filters={"file_id": [first_doc.meta["file_id"]]} - ) - assert len(filtered_docs) > 0 - assert len(filtered_docs) < len(docs) - - -@pytest.mark.parametrize( - "body, expected_count", - [ - ( - { - "data": [ - { - "evaluation_set_id": str(uuid4()), - "name": DC_TEST_INDEX, - "created_at": "2022-03-22T13:40:27.535Z", - "matched_labels": 2, - "total_labels": 10, - } - ], - "has_more": False, - "total": 1, - }, - 10, - ), - ( - { - "data": [ - { - "evaluation_set_id": str(uuid4()), - "name": DC_TEST_INDEX, - "created_at": "2022-03-22T13:40:27.535Z", - "matched_labels": 0, - "total_labels": 0, - } - ], - "has_more": False, - "total": 1, - }, - 0, - ), - ], -) -@responses.activate -def test_DeepsetCloudDocumentStore_count_of_labels_for_evaluation_set( - deepset_cloud_document_store, body: dict, expected_count: int -): - if MOCK_DC: - responses.add( - method=responses.GET, - url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets", - status=200, - body=json.dumps(body), - ) - else: - responses.add_passthru(DC_API_ENDPOINT) - - count = deepset_cloud_document_store.get_label_count(index=DC_TEST_INDEX) - assert count == expected_count - - -@responses.activate -def test_DeepsetCloudDocumentStore_count_of_labels_for_evaluation_set_raises_DC_error_when_nothing_found( - deepset_cloud_document_store, -): - if MOCK_DC: - responses.add( - method=responses.GET, - url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets", - status=200, - body=json.dumps({"data": [], "has_more": False, "total": 0}), - ) - else: - responses.add_passthru(DC_API_ENDPOINT) - - with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"): - deepset_cloud_document_store.get_label_count(index=DC_TEST_INDEX) - - -@responses.activate -def test_DeepsetCloudDocumentStore_lists_evaluation_sets(deepset_cloud_document_store): - response_evaluation_set = { - "evaluation_set_id": str(uuid4()), - "name": DC_TEST_INDEX, - "created_at": "2022-03-22T13:40:27.535Z", - "matched_labels": 2, - "total_labels": 10, - } - if MOCK_DC: - responses.add( - method=responses.GET, - url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets", - status=200, - body=json.dumps({"data": [response_evaluation_set], "has_more": False, "total": 1}), - ) - else: - responses.add_passthru(DC_API_ENDPOINT) - - evaluation_sets = deepset_cloud_document_store.get_evaluation_sets() - assert evaluation_sets == [response_evaluation_set] - - -@responses.activate -def test_DeepsetCloudDocumentStore_fetches_labels_for_evaluation_set(deepset_cloud_document_store): - if MOCK_DC: - responses.add( - method=responses.GET, - url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}", - status=200, - body=json.dumps( - [ - { - "label_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6", - "query": "What is berlin?", - "answer": "biggest city in germany", - "answer_start": 0, - "answer_end": 0, - "meta": {}, - "context": "Berlin is the biggest city in germany.", - "external_file_name": "string", - "file_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6", - "state": "Label matching status", - "candidates": "Candidates that were found in the label <-> file matching", - } - ] - ), - ) - else: - responses.add_passthru(DC_API_ENDPOINT) - - labels = deepset_cloud_document_store.get_all_labels(index=DC_TEST_INDEX) - assert labels == [ - Label( - query="What is berlin?", - document=Document(content="Berlin is the biggest city in germany."), - is_correct_answer=True, - is_correct_document=True, - origin="user-feedback", - answer=Answer("biggest city in germany"), - id="3fa85f64-5717-4562-b3fc-2c963f66afa6", - pipeline_id=None, - created_at=None, - updated_at=None, - meta={}, - filters={}, - ) - ] - - -@responses.activate -def test_DeepsetCloudDocumentStore_fetches_labels_for_evaluation_set_raises_deepsetclouderror_when_nothing_found( - deepset_cloud_document_store, -): - if MOCK_DC: - responses.add( - method=responses.GET, - url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}", - status=404, - ) - else: - responses.add_passthru(DC_API_ENDPOINT) - - with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"): - deepset_cloud_document_store.get_all_labels(index=DC_TEST_INDEX) - - -@responses.activate -def test_DeepsetCloudDocumentStore_query_by_embedding(deepset_cloud_document_store): - query_emb = np.random.randn(768) - if MOCK_DC: - responses.add( - method=responses.POST, - url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query", - match=[ - matchers.json_params_matcher( - {"query_emb": query_emb.tolist(), "top_k": 10, "return_embedding": False, "scale_score": True} - ) - ], - json=[], - status=200, - ) - else: - responses.add_passthru(DC_API_ENDPOINT) - - emb_docs = deepset_cloud_document_store.query_by_embedding(query_emb) - assert len(emb_docs) == 0 - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_get_all_docs_without_index(): - document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) - assert document_store.get_all_documents() == [] - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_get_all_docs_generator_without_index(): - document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) - assert list(document_store.get_all_documents_generator()) == [] - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_get_doc_by_id_without_index(): - document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) - assert document_store.get_document_by_id(id="some id") == None - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_get_docs_by_id_without_index(): - document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) - assert document_store.get_documents_by_id(ids=["some id"]) == [] - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_get_doc_count_without_index(): - document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) - assert document_store.get_document_count() == 0 - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_query_by_emb_without_index(): - query_emb = np.random.randn(768) - document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) - assert document_store.query_by_embedding(query_emb=query_emb) == [] - - -@pytest.mark.usefixtures(deepset_cloud_fixture.__name__) -@responses.activate -def test_DeepsetCloudDocumentStore_query_without_index(): - document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None) - assert document_store.query(query="some query") == [] - - @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) def test_elasticsearch_brownfield_support(document_store_with_docs): new_document_store = InMemoryDocumentStore()