mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-14 09:28:56 +00:00
refactor: move dC tests to their own module and job (#3529)
* move dC tests to their own module and job * restore global var * revert
This commit is contained in:
parent
815017ad5b
commit
255072d8d5
27
.github/workflows/tests.yml
vendored
27
.github/workflows/tests.yml
vendored
@ -224,6 +224,33 @@ jobs:
|
||||
channel: '#haystack'
|
||||
if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
|
||||
|
||||
integration-tests-dc:
  # Runs the deepset Cloud document store tests on every supported OS,
  # once the unit-tests job has completed successfully.
  name: Integration / dC / ${{ matrix.os }}
  needs:
    - unit-tests
  strategy:
    fail-fast: false
    matrix:
      os:
        - ubuntu-latest
        - macos-latest
        - windows-latest
  runs-on: ${{ matrix.os }}
  steps:
    - uses: actions/checkout@v3

    - name: Setup Python
      uses: ./.github/actions/python_cache/

    - name: Install Haystack
      run: pip install -U .

    - name: Run tests
      run: |
        pytest --maxfail=5 -m "document_store and integration" test/document_stores/test_deepsetcloud.py

    # Notify Slack only for failures on main in the upstream repository.
    - if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'
      uses: act10ns/slack@v1
      with:
        status: ${{ job.status }}
        channel: '#haystack'
|
||||
#
|
||||
# TODO: the following steps need to be revisited
|
||||
#
|
||||
|
@ -4,6 +4,9 @@ def pytest_addoption(parser):
|
||||
action="store",
|
||||
default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone",
|
||||
)
|
||||
parser.addoption(
|
||||
"--mock-dc", action="store_true", default=True, help="Mock HTTP requests to dC while running tests"
|
||||
)
|
||||
|
||||
|
||||
def pytest_generate_tests(metafunc):
|
||||
|
@ -26,7 +26,6 @@ import requests
|
||||
from haystack import Answer, BaseComponent
|
||||
from haystack.document_stores import (
|
||||
BaseDocumentStore,
|
||||
DeepsetCloudDocumentStore,
|
||||
InMemoryDocumentStore,
|
||||
ElasticsearchDocumentStore,
|
||||
WeaviateDocumentStore,
|
||||
@ -86,11 +85,7 @@ from .mocks import pinecone as pinecone_mock
|
||||
|
||||
# To manually run the tests with default PostgreSQL instead of SQLite, switch the lines below
|
||||
SQL_TYPE = "sqlite"
|
||||
# SQL_TYPE = "postgres"
|
||||
|
||||
SAMPLES_PATH = Path(__file__).parent / "samples"
|
||||
|
||||
# to run tests against Deepset Cloud set MOCK_DC to False and set the following params
|
||||
DC_API_ENDPOINT = "https://DC_API/v1"
|
||||
DC_TEST_INDEX = "document_retrieval_1"
|
||||
DC_API_KEY = "NO_KEY"
|
||||
@ -603,12 +598,6 @@ def deepset_cloud_fixture():
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@responses.activate
|
||||
def deepset_cloud_document_store(deepset_cloud_fixture):
|
||||
return DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX)
|
||||
|
||||
|
||||
@pytest.fixture
def rag_generator():
    """
    Build a RAGenerator from the `facebook/rag-token-nq` checkpoint, configured as a
    token-type generator with `max_length=20`.
    """
    generator = RAGenerator(
        model_name_or_path="facebook/rag-token-nq",
        generator_type="token",
        max_length=20,
    )
    return generator
|
||||
|
419
test/document_stores/test_deepsetcloud.py
Normal file
419
test/document_stores/test_deepsetcloud.py
Normal file
@ -0,0 +1,419 @@
|
||||
import logging
|
||||
import json
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
import pytest
|
||||
import responses
|
||||
import numpy as np
|
||||
|
||||
from responses import matchers
|
||||
|
||||
from haystack.document_stores import DeepsetCloudDocumentStore
|
||||
from haystack.utils import DeepsetCloudError
|
||||
from haystack.schema import Document, Label, Answer
|
||||
|
||||
|
||||
DC_API_ENDPOINT = "https://dc.example.com/v1"
|
||||
DC_TEST_INDEX = "document_retrieval_1"
|
||||
DC_API_KEY = "NO_KEY"
|
||||
SAMPLES_PATH = Path(__file__).parent.parent / "samples"
|
||||
|
||||
|
||||
@pytest.fixture
def dc_api_mock(request):
    """
    Register the default deepset Cloud responses and activate HTTP mocking for one test.

    This fixture contains the `responses` activation, so either this one or `ds()` must be
    requested by tests that require mocking.

    If `--mock-dc` is set, two GET endpoints are registered on the default `responses` mock,
    both matching only requests that carry the `Bearer DC_API_KEY` authorization header:

    - the index status of `DC_TEST_INDEX`
    - the pipeline list of the default workspace, containing one deployed pipeline

    The default mock is then activated for the duration of the test and yielded.

    If `--mock-dc` is not set, nothing is registered or activated and the fixture yields
    `None`, so it doesn't matter whether the fixture is requested or not.
    """
    if not request.config.getoption("--mock-dc"):
        # A generator fixture has to yield on every code path, otherwise pytest fails the
        # test during setup instead of running it unmocked.
        # NOTE(review): conftest declares `--mock-dc` with action="store_true" and
        # default=True, so this branch looks unreachable from the command line - confirm.
        yield None
        return

    auth_header = matchers.header_matcher({"authorization": f"Bearer {DC_API_KEY}"})
    indexing_status = {"status": "INDEXED", "pending_file_count": 0, "total_file_count": 31}

    responses.add(
        method=responses.GET,
        url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}",
        match=[auth_header],
        json={"indexing": indexing_status},
        status=200,
    )

    responses.add(
        method=responses.GET,
        url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
        match=[auth_header],
        json={
            "data": [{"name": DC_TEST_INDEX, "status": "DEPLOYED", "indexing": indexing_status}],
            "has_more": False,
            "total": 1,
        },
    )

    # activate the default mock, same as using the @responses.activate everywhere
    with responses.mock as m:
        yield m
|
||||
|
||||
|
||||
@pytest.mark.document_store
@pytest.mark.integration
@pytest.mark.usefixtures("dc_api_mock")
class TestDeepsetCloudDocumentStore:
    """
    Tests for DeepsetCloudDocumentStore.

    The class-level `usefixtures("dc_api_mock")` requests the mocking fixture for every
    test, including those that build their own document store instead of using `ds`.
    """

    # Fixtures

    @pytest.fixture
    def ds(self, dc_api_mock):
        """
        Document store pointing at `DC_TEST_INDEX`.

        The fixture explicitly depends on `dc_api_mock`, so the default responses are
        registered before the store is created and tests only need to request `ds`.
        """
        return DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX)

    # Integration tests

    def test_init_with_dot_product(self, ds):
        """A store created without extra arguments uses dot product and returns no embeddings."""
        assert ds.return_embedding is False
        assert ds.similarity == "dot_product"

    def test_init_with_cosine(self):
        """`similarity` and `return_embedding` passed to the constructor are kept on the store."""
        document_store = DeepsetCloudDocumentStore(
            api_endpoint=DC_API_ENDPOINT,
            api_key=DC_API_KEY,
            index=DC_TEST_INDEX,
            similarity="cosine",
            return_embedding=True,
        )
        assert document_store.return_embedding is True
        assert document_store.similarity == "cosine"

    def test_invalid_token(self):
        """An HTTP 500 on the pipelines endpoint surfaces as DeepsetCloudError."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
            match=[matchers.header_matcher({"authorization": "Bearer invalid_token"})],
            body="Internal Server Error",
            status=500,
        )

        with pytest.raises(
            DeepsetCloudError,
            match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}/workspaces/default/pipelines failed: HTTP 500 - Internal Server Error",
        ):
            DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key="invalid_token", index=DC_TEST_INDEX)

    def test_invalid_api_endpoint(self):
        """An HTTP 404 on the pipelines endpoint of a wrong API URL surfaces as DeepsetCloudError."""
        responses.add(
            method=responses.GET, url=f"{DC_API_ENDPOINT}00/workspaces/default/pipelines", body="Not Found", status=404
        )

        with pytest.raises(
            DeepsetCloudError,
            match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}00/workspaces/default/pipelines failed: "
            "HTTP 404 - Not Found\nNot Found",
        ):
            DeepsetCloudDocumentStore(api_endpoint=f"{DC_API_ENDPOINT}00", api_key=DC_API_KEY, index=DC_TEST_INDEX)

    def test_invalid_index(self, caplog):
        """A 404 for the index does not raise; a message about the missing index is logged."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/invalid_index",
            body="Not Found",
            status=404,
        )

        with caplog.at_level(logging.INFO):
            DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index="invalid_index")
            assert (
                "You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud."
                in caplog.text
            )

    def test_documents(self, ds):
        """Documents can be listed, streamed, filtered by `file_id` and fetched by id."""
        # Explicit encoding: the job also runs on Windows, where the locale default is not UTF-8.
        with open(SAMPLES_PATH / "dc" / "documents-stream.response", "r", encoding="utf-8") as f:
            documents_stream_response = f.read()

        # The sample holds one JSON document per line.
        raw_docs = [json.loads(line) for line in documents_stream_response.splitlines()]
        first_file_id = raw_docs[0]["meta"]["file_id"]
        raw_filtered_docs = [doc for doc in raw_docs if doc["meta"]["file_id"] == first_file_id]
        documents_stream_filtered_response = "\n".join([json.dumps(d) for d in raw_filtered_docs])

        # Unfiltered stream: no matcher, so it answers any POST to the endpoint.
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
            body=documents_stream_response,
            status=200,
        )

        # Filtered stream: only answers when the request body carries the file_id filter.
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
            match=[
                matchers.json_params_matcher({"filters": {"file_id": [first_file_id]}, "return_embedding": False})
            ],
            body=documents_stream_filtered_response,
            status=200,
        )

        for doc in raw_filtered_docs:
            responses.add(
                method=responses.GET,
                url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents/{doc['id']}",
                json=doc,
                status=200,
            )

        docs = ds.get_all_documents()
        assert len(docs) > 1
        assert isinstance(docs[0], Document)

        first_doc = next(ds.get_all_documents_generator())
        assert isinstance(first_doc, Document)
        assert first_doc.meta["file_id"] is not None

        filtered_docs = ds.get_all_documents(filters={"file_id": [first_doc.meta["file_id"]]})
        assert len(filtered_docs) > 0
        assert len(filtered_docs) < len(docs)

        ids = [doc.id for doc in filtered_docs]
        single_doc_by_id = ds.get_document_by_id(ids[0])
        assert single_doc_by_id is not None
        assert single_doc_by_id.meta["file_id"] == first_doc.meta["file_id"]

        docs_by_id = ds.get_documents_by_id(ids)
        assert len(docs_by_id) == len(filtered_docs)
        for doc in docs_by_id:
            assert doc.meta["file_id"] == first_doc.meta["file_id"]

    def test_query(self, ds):
        """A filtered query returns a non-empty, strictly smaller result than the unfiltered one."""
        # Explicit encoding: the job also runs on Windows, where the locale default is not UTF-8.
        with open(SAMPLES_PATH / "dc" / "query_winterfell.response", "r", encoding="utf-8") as f:
            query_winterfell_response = f.read()

        query_winterfell_docs = json.loads(query_winterfell_response)
        first_file_id = query_winterfell_docs[0]["meta"]["file_id"]
        query_winterfell_filtered_docs = [
            doc for doc in query_winterfell_docs if doc["meta"]["file_id"] == first_file_id
        ]
        query_winterfell_filtered_response = json.dumps(query_winterfell_filtered_docs)

        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {"query": "winterfell", "top_k": 50, "all_terms_must_match": False, "scale_score": True}
                )
            ],
            status=200,
            body=query_winterfell_response,
        )

        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {
                        "query": "winterfell",
                        "top_k": 50,
                        "filters": {"file_id": [first_file_id]},
                        "all_terms_must_match": False,
                        "scale_score": True,
                    }
                )
            ],
            status=200,
            body=query_winterfell_filtered_response,
        )

        docs = ds.query("winterfell", top_k=50)
        assert docs is not None
        assert len(docs) > 0

        first_doc = docs[0]
        filtered_docs = ds.query("winterfell", top_k=50, filters={"file_id": [first_doc.meta["file_id"]]})
        assert len(filtered_docs) > 0
        assert len(filtered_docs) < len(docs)

    @pytest.mark.parametrize(
        "body, expected_count",
        [
            (
                {
                    "data": [
                        {
                            "evaluation_set_id": str(uuid4()),
                            "name": DC_TEST_INDEX,
                            "created_at": "2022-03-22T13:40:27.535Z",
                            "matched_labels": 2,
                            "total_labels": 10,
                        }
                    ],
                    "has_more": False,
                    "total": 1,
                },
                10,
            ),
            (
                {
                    "data": [
                        {
                            "evaluation_set_id": str(uuid4()),
                            "name": DC_TEST_INDEX,
                            "created_at": "2022-03-22T13:40:27.535Z",
                            "matched_labels": 0,
                            "total_labels": 0,
                        }
                    ],
                    "has_more": False,
                    "total": 1,
                },
                0,
            ),
        ],
    )
    def test_count_of_labels_for_evaluation_set(self, ds, body: dict, expected_count: int):
        """`get_label_count` returns the `total_labels` value of the mocked evaluation set."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps(body),
        )

        count = ds.get_label_count(index=DC_TEST_INDEX)
        assert count == expected_count

    def test_count_of_labels_for_evaluation_set_raises_DC_error_when_nothing_found(self, ds):
        """`get_label_count` raises DeepsetCloudError when the evaluation set list is empty."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps({"data": [], "has_more": False, "total": 0}),
        )

        with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
            ds.get_label_count(index=DC_TEST_INDEX)

    def test_lists_evaluation_sets(self, ds):
        """`get_evaluation_sets` returns the `data` entries of the mocked response unchanged."""
        response_evaluation_set = {
            "evaluation_set_id": str(uuid4()),
            "name": DC_TEST_INDEX,
            "created_at": "2022-03-22T13:40:27.535Z",
            "matched_labels": 2,
            "total_labels": 10,
        }

        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps({"data": [response_evaluation_set], "has_more": False, "total": 1}),
        )

        evaluation_sets = ds.get_evaluation_sets()
        assert evaluation_sets == [response_evaluation_set]

    def test_fetches_labels_for_evaluation_set(self, ds):
        """`get_all_labels` converts the mocked evaluation set entry into the expected Label."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
            status=200,
            body=json.dumps(
                [
                    {
                        "label_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                        "query": "What is berlin?",
                        "answer": "biggest city in germany",
                        "answer_start": 0,
                        "answer_end": 0,
                        "meta": {},
                        "context": "Berlin is the biggest city in germany.",
                        "external_file_name": "string",
                        "file_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                        "state": "Label matching status",
                        "candidates": "Candidates that were found in the label <-> file matching",
                    }
                ]
            ),
        )

        labels = ds.get_all_labels(index=DC_TEST_INDEX)
        assert labels == [
            Label(
                query="What is berlin?",
                document=Document(content="Berlin is the biggest city in germany."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="user-feedback",
                answer=Answer("biggest city in germany"),
                id="3fa85f64-5717-4562-b3fc-2c963f66afa6",
                pipeline_id=None,
                created_at=None,
                updated_at=None,
                meta={},
                filters={},
            )
        ]

    def test_fetches_labels_for_evaluation_set_raises_deepsetclouderror_when_nothing_found(self, ds):
        """`get_all_labels` raises DeepsetCloudError when the evaluation set endpoint returns 404."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
            status=404,
        )

        with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
            ds.get_all_labels(index=DC_TEST_INDEX)

    def test_query_by_embedding(self, ds):
        """`query_by_embedding` posts the embedding as a list and returns the (empty) mocked result."""
        query_emb = np.random.randn(768)

        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {"query_emb": query_emb.tolist(), "top_k": 10, "return_embedding": False, "scale_score": True}
                )
            ],
            json=[],
            status=200,
        )

        emb_docs = ds.query_by_embedding(query_emb)
        assert len(emb_docs) == 0

    # Stores created with index=None: the tests below register no document endpoints
    # and expect empty results.

    def test_get_all_docs_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_all_documents() == []

    def test_get_all_docs_generator_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert list(document_store.get_all_documents_generator()) == []

    def test_get_doc_by_id_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_document_by_id(id="some id") is None

    def test_get_docs_by_id_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_documents_by_id(ids=["some id"]) == []

    def test_get_doc_count_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_document_count() == 0

    def test_query_by_emb_without_index(self):
        query_emb = np.random.randn(768)
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.query_by_embedding(query_emb=query_emb) == []

    def test_query_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.query(query="some query") == []
|
@ -1,5 +1,4 @@
|
||||
from copy import deepcopy
|
||||
import logging
|
||||
import math
|
||||
import sys
|
||||
from uuid import uuid4
|
||||
@ -7,27 +6,12 @@ from uuid import uuid4
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import json
|
||||
import responses
|
||||
from responses import matchers
|
||||
from unittest.mock import Mock
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch.exceptions import RequestError
|
||||
|
||||
from ..conftest import (
|
||||
deepset_cloud_fixture,
|
||||
get_document_store,
|
||||
ensure_ids_are_correct_uuids,
|
||||
MOCK_DC,
|
||||
DC_API_ENDPOINT,
|
||||
DC_API_KEY,
|
||||
DC_TEST_INDEX,
|
||||
SAMPLES_PATH,
|
||||
)
|
||||
|
||||
from ..conftest import get_document_store, ensure_ids_are_correct_uuids
|
||||
from haystack.document_stores import (
|
||||
WeaviateDocumentStore,
|
||||
DeepsetCloudDocumentStore,
|
||||
InMemoryDocumentStore,
|
||||
MilvusDocumentStore,
|
||||
FAISSDocumentStore,
|
||||
ElasticsearchDocumentStore,
|
||||
@ -40,7 +24,6 @@ from haystack.errors import DuplicateDocumentError
|
||||
from haystack.schema import Document, Label, Answer, Span
|
||||
from haystack.nodes import EmbeddingRetriever, PreProcessor
|
||||
from haystack.pipelines import DocumentSearchPipeline
|
||||
from haystack.utils import DeepsetCloudError
|
||||
|
||||
|
||||
DOCUMENTS = [
|
||||
@ -1286,432 +1269,6 @@ def test_custom_headers(document_store_with_docs: BaseDocumentStore):
|
||||
assert len(documents) > 0
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_init_with_dot_product():
|
||||
document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX)
|
||||
assert document_store.return_embedding == False
|
||||
assert document_store.similarity == "dot_product"
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_init_with_cosine():
|
||||
document_store = DeepsetCloudDocumentStore(
|
||||
api_endpoint=DC_API_ENDPOINT,
|
||||
api_key=DC_API_KEY,
|
||||
index=DC_TEST_INDEX,
|
||||
similarity="cosine",
|
||||
return_embedding=True,
|
||||
)
|
||||
assert document_store.return_embedding == True
|
||||
assert document_store.similarity == "cosine"
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_invalid_token():
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.GET,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
|
||||
match=[matchers.header_matcher({"authorization": "Bearer invalid_token"})],
|
||||
body="Internal Server Error",
|
||||
status=500,
|
||||
)
|
||||
|
||||
with pytest.raises(
|
||||
DeepsetCloudError,
|
||||
match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}/workspaces/default/pipelines failed: HTTP 500 - Internal Server Error",
|
||||
):
|
||||
DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key="invalid_token", index=DC_TEST_INDEX)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_invalid_api_endpoint():
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.GET, url=f"{DC_API_ENDPOINT}00/workspaces/default/pipelines", body="Not Found", status=404
|
||||
)
|
||||
|
||||
with pytest.raises(
|
||||
DeepsetCloudError,
|
||||
match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}00/workspaces/default/pipelines failed: "
|
||||
f"HTTP 404 - Not Found\nNot Found",
|
||||
):
|
||||
DeepsetCloudDocumentStore(api_endpoint=f"{DC_API_ENDPOINT}00", api_key=DC_API_KEY, index=DC_TEST_INDEX)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_invalid_index(caplog):
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.GET,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/invalid_index",
|
||||
body="Not Found",
|
||||
status=404,
|
||||
)
|
||||
|
||||
with caplog.at_level(logging.INFO):
|
||||
DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index="invalid_index")
|
||||
assert (
|
||||
"You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud."
|
||||
in caplog.text
|
||||
)
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_documents(deepset_cloud_document_store):
|
||||
if MOCK_DC:
|
||||
with open(SAMPLES_PATH / "dc" / "documents-stream.response", "r") as f:
|
||||
documents_stream_response = f.read()
|
||||
docs = [json.loads(l) for l in documents_stream_response.splitlines()]
|
||||
filtered_docs = [doc for doc in docs if doc["meta"]["file_id"] == docs[0]["meta"]["file_id"]]
|
||||
documents_stream_filtered_response = "\n".join([json.dumps(d) for d in filtered_docs])
|
||||
|
||||
responses.add(
|
||||
method=responses.POST,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
|
||||
body=documents_stream_response,
|
||||
status=200,
|
||||
)
|
||||
|
||||
responses.add(
|
||||
method=responses.POST,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
|
||||
match=[
|
||||
matchers.json_params_matcher(
|
||||
{"filters": {"file_id": [docs[0]["meta"]["file_id"]]}, "return_embedding": False}
|
||||
)
|
||||
],
|
||||
body=documents_stream_filtered_response,
|
||||
status=200,
|
||||
)
|
||||
|
||||
for doc in filtered_docs:
|
||||
responses.add(
|
||||
method=responses.GET,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents/{doc['id']}",
|
||||
json=doc,
|
||||
status=200,
|
||||
)
|
||||
else:
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
docs = deepset_cloud_document_store.get_all_documents()
|
||||
assert len(docs) > 1
|
||||
assert isinstance(docs[0], Document)
|
||||
|
||||
first_doc = next(deepset_cloud_document_store.get_all_documents_generator())
|
||||
assert isinstance(first_doc, Document)
|
||||
assert first_doc.meta["file_id"] is not None
|
||||
|
||||
filtered_docs = deepset_cloud_document_store.get_all_documents(filters={"file_id": [first_doc.meta["file_id"]]})
|
||||
assert len(filtered_docs) > 0
|
||||
assert len(filtered_docs) < len(docs)
|
||||
|
||||
ids = [doc.id for doc in filtered_docs]
|
||||
single_doc_by_id = deepset_cloud_document_store.get_document_by_id(ids[0])
|
||||
assert single_doc_by_id is not None
|
||||
assert single_doc_by_id.meta["file_id"] == first_doc.meta["file_id"]
|
||||
|
||||
docs_by_id = deepset_cloud_document_store.get_documents_by_id(ids)
|
||||
assert len(docs_by_id) == len(filtered_docs)
|
||||
for doc in docs_by_id:
|
||||
assert doc.meta["file_id"] == first_doc.meta["file_id"]
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_query(deepset_cloud_document_store):
|
||||
if MOCK_DC:
|
||||
with open(SAMPLES_PATH / "dc" / "query_winterfell.response", "r") as f:
|
||||
query_winterfell_response = f.read()
|
||||
query_winterfell_docs = json.loads(query_winterfell_response)
|
||||
query_winterfell_filtered_docs = [
|
||||
doc
|
||||
for doc in query_winterfell_docs
|
||||
if doc["meta"]["file_id"] == query_winterfell_docs[0]["meta"]["file_id"]
|
||||
]
|
||||
query_winterfell_filtered_response = json.dumps(query_winterfell_filtered_docs)
|
||||
|
||||
responses.add(
|
||||
method=responses.POST,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
|
||||
match=[
|
||||
matchers.json_params_matcher(
|
||||
{"query": "winterfell", "top_k": 50, "all_terms_must_match": False, "scale_score": True}
|
||||
)
|
||||
],
|
||||
status=200,
|
||||
body=query_winterfell_response,
|
||||
)
|
||||
|
||||
responses.add(
|
||||
method=responses.POST,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
|
||||
match=[
|
||||
matchers.json_params_matcher(
|
||||
{
|
||||
"query": "winterfell",
|
||||
"top_k": 50,
|
||||
"filters": {"file_id": [query_winterfell_docs[0]["meta"]["file_id"]]},
|
||||
"all_terms_must_match": False,
|
||||
"scale_score": True,
|
||||
}
|
||||
)
|
||||
],
|
||||
status=200,
|
||||
body=query_winterfell_filtered_response,
|
||||
)
|
||||
else:
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
docs = deepset_cloud_document_store.query("winterfell", top_k=50)
|
||||
assert docs is not None
|
||||
assert len(docs) > 0
|
||||
|
||||
first_doc = docs[0]
|
||||
filtered_docs = deepset_cloud_document_store.query(
|
||||
"winterfell", top_k=50, filters={"file_id": [first_doc.meta["file_id"]]}
|
||||
)
|
||||
assert len(filtered_docs) > 0
|
||||
assert len(filtered_docs) < len(docs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"body, expected_count",
|
||||
[
|
||||
(
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"evaluation_set_id": str(uuid4()),
|
||||
"name": DC_TEST_INDEX,
|
||||
"created_at": "2022-03-22T13:40:27.535Z",
|
||||
"matched_labels": 2,
|
||||
"total_labels": 10,
|
||||
}
|
||||
],
|
||||
"has_more": False,
|
||||
"total": 1,
|
||||
},
|
||||
10,
|
||||
),
|
||||
(
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"evaluation_set_id": str(uuid4()),
|
||||
"name": DC_TEST_INDEX,
|
||||
"created_at": "2022-03-22T13:40:27.535Z",
|
||||
"matched_labels": 0,
|
||||
"total_labels": 0,
|
||||
}
|
||||
],
|
||||
"has_more": False,
|
||||
"total": 1,
|
||||
},
|
||||
0,
|
||||
),
|
||||
],
|
||||
)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_count_of_labels_for_evaluation_set(
|
||||
deepset_cloud_document_store, body: dict, expected_count: int
|
||||
):
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.GET,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
|
||||
status=200,
|
||||
body=json.dumps(body),
|
||||
)
|
||||
else:
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
count = deepset_cloud_document_store.get_label_count(index=DC_TEST_INDEX)
|
||||
assert count == expected_count
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_count_of_labels_for_evaluation_set_raises_DC_error_when_nothing_found(
|
||||
deepset_cloud_document_store,
|
||||
):
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.GET,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
|
||||
status=200,
|
||||
body=json.dumps({"data": [], "has_more": False, "total": 0}),
|
||||
)
|
||||
else:
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
|
||||
deepset_cloud_document_store.get_label_count(index=DC_TEST_INDEX)
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_lists_evaluation_sets(deepset_cloud_document_store):
|
||||
response_evaluation_set = {
|
||||
"evaluation_set_id": str(uuid4()),
|
||||
"name": DC_TEST_INDEX,
|
||||
"created_at": "2022-03-22T13:40:27.535Z",
|
||||
"matched_labels": 2,
|
||||
"total_labels": 10,
|
||||
}
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.GET,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
|
||||
status=200,
|
||||
body=json.dumps({"data": [response_evaluation_set], "has_more": False, "total": 1}),
|
||||
)
|
||||
else:
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
evaluation_sets = deepset_cloud_document_store.get_evaluation_sets()
|
||||
assert evaluation_sets == [response_evaluation_set]
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_fetches_labels_for_evaluation_set(deepset_cloud_document_store):
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.GET,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
|
||||
status=200,
|
||||
body=json.dumps(
|
||||
[
|
||||
{
|
||||
"label_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
|
||||
"query": "What is berlin?",
|
||||
"answer": "biggest city in germany",
|
||||
"answer_start": 0,
|
||||
"answer_end": 0,
|
||||
"meta": {},
|
||||
"context": "Berlin is the biggest city in germany.",
|
||||
"external_file_name": "string",
|
||||
"file_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
|
||||
"state": "Label matching status",
|
||||
"candidates": "Candidates that were found in the label <-> file matching",
|
||||
}
|
||||
]
|
||||
),
|
||||
)
|
||||
else:
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
labels = deepset_cloud_document_store.get_all_labels(index=DC_TEST_INDEX)
|
||||
assert labels == [
|
||||
Label(
|
||||
query="What is berlin?",
|
||||
document=Document(content="Berlin is the biggest city in germany."),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
origin="user-feedback",
|
||||
answer=Answer("biggest city in germany"),
|
||||
id="3fa85f64-5717-4562-b3fc-2c963f66afa6",
|
||||
pipeline_id=None,
|
||||
created_at=None,
|
||||
updated_at=None,
|
||||
meta={},
|
||||
filters={},
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_fetches_labels_for_evaluation_set_raises_deepsetclouderror_when_nothing_found(
|
||||
deepset_cloud_document_store,
|
||||
):
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.GET,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
|
||||
status=404,
|
||||
)
|
||||
else:
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
|
||||
deepset_cloud_document_store.get_all_labels(index=DC_TEST_INDEX)
|
||||
|
||||
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_query_by_embedding(deepset_cloud_document_store):
|
||||
query_emb = np.random.randn(768)
|
||||
if MOCK_DC:
|
||||
responses.add(
|
||||
method=responses.POST,
|
||||
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
|
||||
match=[
|
||||
matchers.json_params_matcher(
|
||||
{"query_emb": query_emb.tolist(), "top_k": 10, "return_embedding": False, "scale_score": True}
|
||||
)
|
||||
],
|
||||
json=[],
|
||||
status=200,
|
||||
)
|
||||
else:
|
||||
responses.add_passthru(DC_API_ENDPOINT)
|
||||
|
||||
emb_docs = deepset_cloud_document_store.query_by_embedding(query_emb)
|
||||
assert len(emb_docs) == 0
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_get_all_docs_without_index():
|
||||
document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
|
||||
assert document_store.get_all_documents() == []
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_get_all_docs_generator_without_index():
|
||||
document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
|
||||
assert list(document_store.get_all_documents_generator()) == []
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_get_doc_by_id_without_index():
|
||||
document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
|
||||
assert document_store.get_document_by_id(id="some id") == None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_get_docs_by_id_without_index():
|
||||
document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
|
||||
assert document_store.get_documents_by_id(ids=["some id"]) == []
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_get_doc_count_without_index():
|
||||
document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
|
||||
assert document_store.get_document_count() == 0
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_query_by_emb_without_index():
|
||||
query_emb = np.random.randn(768)
|
||||
document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
|
||||
assert document_store.query_by_embedding(query_emb=query_emb) == []
|
||||
|
||||
|
||||
@pytest.mark.usefixtures(deepset_cloud_fixture.__name__)
|
||||
@responses.activate
|
||||
def test_DeepsetCloudDocumentStore_query_without_index():
|
||||
document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
|
||||
assert document_store.query(query="some query") == []
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
|
||||
def test_elasticsearch_brownfield_support(document_store_with_docs):
|
||||
new_document_store = InMemoryDocumentStore()
|
||||
|
Loading…
x
Reference in New Issue
Block a user