Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-07-29 11:50:34 +00:00.

* Split root conftest into multiple ones and remove unused fixtures * Remove some constants and make them fixtures * Remove unnecessary fixture scoping * Fix failing whisper tests * Fix image_file_paths fixture
423 lines · 16 KiB · Python
import logging
|
|
import json
|
|
from pathlib import Path
|
|
from uuid import uuid4
|
|
|
|
import pytest
|
|
import responses
|
|
import numpy as np
|
|
|
|
from responses import matchers
|
|
|
|
from haystack.document_stores import DeepsetCloudDocumentStore
|
|
from haystack.utils import DeepsetCloudError
|
|
from haystack.schema import Document, Label, Answer
|
|
|
|
|
|
# Base URL of the (mocked) deepset Cloud API that every request in this module targets.
DC_API_ENDPOINT = "https://dc.example.com/v1"

# Name of the index/pipeline the mocked API pretends to serve.
DC_TEST_INDEX = "document_retrieval_1"

# Placeholder API key; the mocks match requests against the "Bearer NO_KEY" header.
DC_API_KEY = "NO_KEY"
|
|
|
|
|
|
@pytest.fixture
def dc_api_mock(request):
    """
    Register and activate `responses` mocking for the deepset Cloud API.

    Either this fixture or the document-store fixture that depends on it must be
    passed to tests that require mocking. When `--mock-dc` is False, no mock
    responses are registered, so it does not matter whether the fixture is passed.
    """
    if request.config.getoption("--mock-dc"):
        auth_match = [matchers.header_matcher({"authorization": f"Bearer {DC_API_KEY}"})]

        # GET .../indexes/<index>: report the test index as fully indexed.
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}",
            match=auth_match,
            json={"indexing": {"status": "INDEXED", "pending_file_count": 0, "total_file_count": 31}},
            status=200,
        )

        # GET .../pipelines: report a single deployed pipeline for the test index.
        pipelines_payload = {
            "data": [
                {
                    "name": DC_TEST_INDEX,
                    "status": "DEPLOYED",
                    "indexing": {"status": "INDEXED", "pending_file_count": 0, "total_file_count": 31},
                }
            ],
            "has_more": False,
            "total": 1,
        }
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
            match=auth_match,
            json=pipelines_payload,
        )

    # Activate the default mock — equivalent to decorating every test with @responses.activate.
    with responses.mock as mocked_api:
        yield mocked_api
|
|
|
|
|
|
@pytest.mark.document_store
@pytest.mark.integration
@pytest.mark.usefixtures("dc_api_mock")
class TestDeepsetCloudDocumentStore:
    """Integration tests for ``DeepsetCloudDocumentStore`` against the mocked deepset Cloud API."""

    # Fixtures

    @pytest.fixture
    def ds(self):
        """
        We make this fixture depend on `dc_api_mock` so that passing the document store will
        activate the mocking and we spare one function parameter.
        """
        return DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX)

    # Integration tests

    def test_init_with_dot_product(self, ds):
        """Default init: embeddings are not returned and similarity is dot product."""
        # `is` instead of `==`: we expect a genuine bool, not merely a falsy value.
        assert ds.return_embedding is False
        assert ds.similarity == "dot_product"

    def test_init_with_cosine(self):
        """Explicitly configured similarity and return_embedding are stored as given."""
        document_store = DeepsetCloudDocumentStore(
            api_endpoint=DC_API_ENDPOINT,
            api_key=DC_API_KEY,
            index=DC_TEST_INDEX,
            similarity="cosine",
            return_embedding=True,
        )
        assert document_store.return_embedding is True
        assert document_store.similarity == "cosine"

    def test_invalid_token(self):
        """An HTTP 500 from the pipelines endpoint surfaces as a DeepsetCloudError."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
            match=[matchers.header_matcher({"authorization": "Bearer invalid_token"})],
            body="Internal Server Error",
            status=500,
        )

        with pytest.raises(
            DeepsetCloudError,
            match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}/workspaces/default/pipelines failed: HTTP 500 - Internal Server Error",
        ):
            DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key="invalid_token", index=DC_TEST_INDEX)

    def test_invalid_api_endpoint(self):
        """An HTTP 404 from a wrong API endpoint surfaces as a DeepsetCloudError."""
        responses.add(
            method=responses.GET, url=f"{DC_API_ENDPOINT}00/workspaces/default/pipelines", body="Not Found", status=404
        )

        with pytest.raises(
            DeepsetCloudError,
            match=f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}00/workspaces/default/pipelines failed: "
            f"HTTP 404 - Not Found\nNot Found",
        ):
            DeepsetCloudDocumentStore(api_endpoint=f"{DC_API_ENDPOINT}00", api_key=DC_API_KEY, index=DC_TEST_INDEX)

    def test_invalid_index(self, caplog):
        """A nonexistent index does not fail hard but logs an informative message."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/invalid_index",
            body="Not Found",
            status=404,
        )

        with caplog.at_level(logging.INFO):
            DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index="invalid_index")
            assert (
                "You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud."
                in caplog.text
            )

    def test_documents(self, ds, samples_path):
        """Document retrieval: all documents, generator access, filters, and lookups by id."""
        with open(samples_path / "dc" / "documents-stream.response", "r") as f:
            documents_stream_response = f.read()
        # The sample response is newline-delimited JSON, one document per line.
        docs = [json.loads(line) for line in documents_stream_response.splitlines()]
        filtered_docs = [doc for doc in docs if doc["meta"]["file_id"] == docs[0]["meta"]["file_id"]]
        documents_stream_filtered_response = "\n".join([json.dumps(d) for d in filtered_docs])

        # Unfiltered documents-stream request returns every document.
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
            body=documents_stream_response,
            status=200,
        )

        # Request filtered by the first document's file_id returns only that subset.
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
            match=[
                matchers.json_params_matcher(
                    {"filters": {"file_id": [docs[0]["meta"]["file_id"]]}, "return_embedding": False}
                )
            ],
            body=documents_stream_filtered_response,
            status=200,
        )

        # Each filtered document can also be fetched individually by its id.
        for doc in filtered_docs:
            responses.add(
                method=responses.GET,
                url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents/{doc['id']}",
                json=doc,
                status=200,
            )

        docs = ds.get_all_documents()
        assert len(docs) > 1
        assert isinstance(docs[0], Document)

        first_doc = next(ds.get_all_documents_generator())
        assert isinstance(first_doc, Document)
        assert first_doc.meta["file_id"] is not None

        filtered_docs = ds.get_all_documents(filters={"file_id": [first_doc.meta["file_id"]]})
        assert len(filtered_docs) > 0
        assert len(filtered_docs) < len(docs)

        ids = [doc.id for doc in filtered_docs]
        single_doc_by_id = ds.get_document_by_id(ids[0])
        assert single_doc_by_id is not None
        assert single_doc_by_id.meta["file_id"] == first_doc.meta["file_id"]

        docs_by_id = ds.get_documents_by_id(ids)
        assert len(docs_by_id) == len(filtered_docs)
        for doc in docs_by_id:
            assert doc.meta["file_id"] == first_doc.meta["file_id"]

    def test_query(self, ds, samples_path):
        """Keyword query with and without filters against the documents-query endpoint."""
        with open(samples_path / "dc" / "query_winterfell.response", "r") as f:
            query_winterfell_response = f.read()
        query_winterfell_docs = json.loads(query_winterfell_response)
        query_winterfell_filtered_docs = [
            doc
            for doc in query_winterfell_docs
            if doc["meta"]["file_id"] == query_winterfell_docs[0]["meta"]["file_id"]
        ]
        query_winterfell_filtered_response = json.dumps(query_winterfell_filtered_docs)

        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {"query": "winterfell", "top_k": 50, "all_terms_must_match": False, "scale_score": True}
                )
            ],
            status=200,
            body=query_winterfell_response,
        )

        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {
                        "query": "winterfell",
                        "top_k": 50,
                        "filters": {"file_id": [query_winterfell_docs[0]["meta"]["file_id"]]},
                        "all_terms_must_match": False,
                        "scale_score": True,
                    }
                )
            ],
            status=200,
            body=query_winterfell_filtered_response,
        )

        docs = ds.query("winterfell", top_k=50)
        assert docs is not None
        assert len(docs) > 0

        first_doc = docs[0]
        filtered_docs = ds.query("winterfell", top_k=50, filters={"file_id": [first_doc.meta["file_id"]]})
        assert len(filtered_docs) > 0
        assert len(filtered_docs) < len(docs)

    @pytest.mark.parametrize(
        "body, expected_count",
        [
            (
                {
                    "data": [
                        {
                            "evaluation_set_id": str(uuid4()),
                            "name": DC_TEST_INDEX,
                            "created_at": "2022-03-22T13:40:27.535Z",
                            "matched_labels": 2,
                            "total_labels": 10,
                        }
                    ],
                    "has_more": False,
                    "total": 1,
                },
                10,
            ),
            (
                {
                    "data": [
                        {
                            "evaluation_set_id": str(uuid4()),
                            "name": DC_TEST_INDEX,
                            "created_at": "2022-03-22T13:40:27.535Z",
                            "matched_labels": 0,
                            "total_labels": 0,
                        }
                    ],
                    "has_more": False,
                    "total": 1,
                },
                0,
            ),
        ],
    )
    def test_count_of_labels_for_evaluation_set(self, ds, body: dict, expected_count: int):
        """get_label_count returns the evaluation set's total_labels value from the API payload."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps(body),
        )

        count = ds.get_label_count(index=DC_TEST_INDEX)
        assert count == expected_count

    def test_count_of_labels_for_evaluation_set_raises_DC_error_when_nothing_found(self, ds):
        """get_label_count raises DeepsetCloudError when no evaluation set matches the index."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps({"data": [], "has_more": False, "total": 0}),
        )

        with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
            ds.get_label_count(index=DC_TEST_INDEX)

    def test_lists_evaluation_sets(self, ds):
        """get_evaluation_sets returns the API payload's data entries unchanged."""
        response_evaluation_set = {
            "evaluation_set_id": str(uuid4()),
            "name": DC_TEST_INDEX,
            "created_at": "2022-03-22T13:40:27.535Z",
            "matched_labels": 2,
            "total_labels": 10,
        }

        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps({"data": [response_evaluation_set], "has_more": False, "total": 1}),
        )

        evaluation_sets = ds.get_evaluation_sets()
        assert evaluation_sets == [response_evaluation_set]

    def test_fetches_labels_for_evaluation_set(self, ds):
        """get_all_labels converts API label payloads into Label objects."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
            status=200,
            body=json.dumps(
                [
                    {
                        "label_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                        "query": "What is berlin?",
                        "answer": "biggest city in germany",
                        "answer_start": 0,
                        "answer_end": 0,
                        "meta": {},
                        "context": "Berlin is the biggest city in germany.",
                        "external_file_name": "string",
                        "file_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                        "state": "Label matching status",
                        "candidates": "Candidates that were found in the label <-> file matching",
                    }
                ]
            ),
        )

        labels = ds.get_all_labels(index=DC_TEST_INDEX)
        assert labels == [
            Label(
                query="What is berlin?",
                document=Document(content="Berlin is the biggest city in germany."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="user-feedback",
                answer=Answer("biggest city in germany"),
                id="3fa85f64-5717-4562-b3fc-2c963f66afa6",
                pipeline_id=None,
                created_at=None,
                updated_at=None,
                meta={},
                filters={},
            )
        ]

    def test_fetches_labels_for_evaluation_set_raises_deepsetclouderror_when_nothing_found(self, ds):
        """get_all_labels raises DeepsetCloudError when the evaluation set endpoint returns 404."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
            status=404,
        )

        with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
            ds.get_all_labels(index=DC_TEST_INDEX)

    def test_query_by_embedding(self, ds):
        """Embedding queries POST the embedding plus default params; empty results are handled."""
        query_emb = np.random.randn(768)

        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {
                        "query_emb": query_emb.tolist(),
                        "top_k": 10,
                        "return_embedding": False,
                        "scale_score": True,
                        "use_prefiltering": False,
                    }
                )
            ],
            json=[],
            status=200,
        )

        emb_docs = ds.query_by_embedding(query_emb)
        assert len(emb_docs) == 0

    # Without an index configured, read operations are no-ops that return empty results.

    def test_get_all_docs_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_all_documents() == []

    def test_get_all_docs_generator_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert list(document_store.get_all_documents_generator()) == []

    def test_get_doc_by_id_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_document_by_id(id="some id") is None

    def test_get_docs_by_id_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_documents_by_id(ids=["some id"]) == []

    def test_get_doc_count_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.get_document_count() == 0

    def test_query_by_emb_without_index(self):
        query_emb = np.random.randn(768)
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.query_by_embedding(query_emb=query_emb) == []

    def test_query_without_index(self):
        document_store = DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)
        assert document_store.query(query="some query") == []
|