mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 01:39:45 +00:00 
			
		
		
		
	 5ac3dffbef
			
		
	
	
		5ac3dffbef
		
			
		
	
	
	
	
		
			
			* Split root conftest into multiple ones and remove unused fixtures * Remove some constants and make them fixtures * Remove unnecessary fixture scoping * Fix failing whisper tests * Fix image_file_paths fixture
		
			
				
	
	
		
			423 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			423 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import logging
 | |
| import json
 | |
| from pathlib import Path
 | |
| from uuid import uuid4
 | |
| 
 | |
| import pytest
 | |
| import responses
 | |
| import numpy as np
 | |
| 
 | |
| from responses import matchers
 | |
| 
 | |
| from haystack.document_stores import DeepsetCloudDocumentStore
 | |
| from haystack.utils import DeepsetCloudError
 | |
| from haystack.schema import Document, Label, Answer
 | |
| 
 | |
| 
 | |
# Connection settings shared by the fixtures and tests in this module.
# The endpoint is a placeholder host: requests to it are expected to be served by `responses` mocks.
DC_API_ENDPOINT = "https://dc.example.com/v1"
# Index name used by the default document store; also reused as pipeline and evaluation set name.
DC_TEST_INDEX = "document_retrieval_1"
# Dummy API key; the default mocked endpoints match on the "Bearer NO_KEY" authorization header.
DC_API_KEY = "NO_KEY"
| 
 | |
| 
 | |
@pytest.fixture
def dc_api_mock(request):
    """
    Register the default deepset Cloud endpoints and activate the `responses` mock.

    This fixture contains responses activation, so it must be requested (directly or through
    `pytest.mark.usefixtures`) by tests that require mocking.

    If `--mock-dc` was False, responses are never activated, the fixture yields `None` and it
    doesn't matter if the fixture is passed or not.
    """
    if not request.config.getoption("--mock-dc"):
        # A generator fixture has to yield exactly once. Without this branch the generator
        # would finish without yielding and pytest would fail every dependent test with
        # "did not yield a value" whenever mocking is switched off.
        yield None
        return

    # Status of the default test index
    responses.add(
        method=responses.GET,
        url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}",
        match=[responses.matchers.header_matcher({"authorization": f"Bearer {DC_API_KEY}"})],
        json={"indexing": {"status": "INDEXED", "pending_file_count": 0, "total_file_count": 31}},
        status=200,
    )

    # Pipeline listing containing a single deployed pipeline named after the test index
    responses.add(
        method=responses.GET,
        url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
        match=[responses.matchers.header_matcher({"authorization": f"Bearer {DC_API_KEY}"})],
        json={
            "data": [
                {
                    "name": DC_TEST_INDEX,
                    "status": "DEPLOYED",
                    "indexing": {"status": "INDEXED", "pending_file_count": 0, "total_file_count": 31},
                }
            ],
            "has_more": False,
            "total": 1,
        },
    )

    # activate the default mock, same as using the @responses.activate everywhere
    with responses.mock as m:
        yield m
| 
 | |
| 
 | |
@pytest.mark.document_store
@pytest.mark.integration
@pytest.mark.usefixtures("dc_api_mock")
class TestDeepsetCloudDocumentStore:
    """
    Tests for `DeepsetCloudDocumentStore` that talk to endpoints registered through `responses`.

    The class-level `usefixtures("dc_api_mock")` marker requests the mocking fixture for every
    test in the class, so individual tests only register the extra endpoints they need.
    """

    # Fixtures

    @pytest.fixture
    def ds(self):
        """
        Document store bound to the default test index.

        Mock activation comes from the class-level `usefixtures("dc_api_mock")` marker, so this
        fixture does not request `dc_api_mock` itself.
        """
        return DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=DC_TEST_INDEX)

    @pytest.fixture
    def ds_without_index(self):
        """Document store created with `index=None`, shared by the `*_without_index` tests."""
        return DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index=None)

    # Integration tests

    def test_init_with_dot_product(self, ds):
        """The store created without extra arguments reports dot product and no embeddings."""
        assert ds.return_embedding is False
        assert ds.similarity == "dot_product"

    def test_init_with_cosine(self):
        """Explicit `similarity` and `return_embedding` arguments are kept on the instance."""
        document_store = DeepsetCloudDocumentStore(
            api_endpoint=DC_API_ENDPOINT,
            api_key=DC_API_KEY,
            index=DC_TEST_INDEX,
            similarity="cosine",
            return_embedding=True,
        )
        assert document_store.return_embedding is True
        assert document_store.similarity == "cosine"

    def test_invalid_token(self):
        """A 500 answer for the pipelines listing surfaces as `DeepsetCloudError`."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/pipelines",
            match=[matchers.header_matcher({"authorization": "Bearer invalid_token"})],
            body="Internal Server Error",
            status=500,
        )

        # NOTE: `match` is applied as a regular expression search on the error message.
        with pytest.raises(
            DeepsetCloudError,
            match=(
                f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}/workspaces/default/pipelines failed: "
                "HTTP 500 - Internal Server Error"
            ),
        ):
            DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key="invalid_token", index=DC_TEST_INDEX)

    def test_invalid_api_endpoint(self):
        """A 404 answer from a wrong endpoint surfaces as `DeepsetCloudError`."""
        responses.add(
            method=responses.GET, url=f"{DC_API_ENDPOINT}00/workspaces/default/pipelines", body="Not Found", status=404
        )

        with pytest.raises(
            DeepsetCloudError,
            match=(
                f"Could not connect to deepset Cloud:\nGET {DC_API_ENDPOINT}00/workspaces/default/pipelines failed: "
                "HTTP 404 - Not Found\nNot Found"
            ),
        ):
            DeepsetCloudDocumentStore(api_endpoint=f"{DC_API_ENDPOINT}00", api_key=DC_API_KEY, index=DC_TEST_INDEX)

    def test_invalid_index(self, caplog):
        """Creating a store for an index that answers 404 logs a message instead of raising."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/invalid_index",
            body="Not Found",
            status=404,
        )

        with caplog.at_level(logging.INFO):
            DeepsetCloudDocumentStore(api_endpoint=DC_API_ENDPOINT, api_key=DC_API_KEY, index="invalid_index")
            assert (
                "You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud."
                in caplog.text
            )

    def test_documents(self, ds, samples_path):
        """Documents can be listed, streamed, filtered by `file_id` and fetched by id."""
        # Read the sample once and close the file before registering any mock
        with open(samples_path / "dc" / "documents-stream.response", "r", encoding="utf-8") as f:
            documents_stream_response = f.read()

        # The sample holds one JSON document per line
        docs = [json.loads(line) for line in documents_stream_response.splitlines()]
        first_file_id = docs[0]["meta"]["file_id"]
        filtered_docs = [doc for doc in docs if doc["meta"]["file_id"] == first_file_id]
        documents_stream_filtered_response = "\n".join([json.dumps(d) for d in filtered_docs])

        # Unfiltered stream: answers any POST to the endpoint
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
            body=documents_stream_response,
            status=200,
        )

        # Filtered stream: answers only when the request body carries the `file_id` filter
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-stream",
            match=[matchers.json_params_matcher({"filters": {"file_id": [first_file_id]}, "return_embedding": False})],
            body=documents_stream_filtered_response,
            status=200,
        )

        # One endpoint per document for the lookups by id
        for doc in filtered_docs:
            responses.add(
                method=responses.GET,
                url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents/{doc['id']}",
                json=doc,
                status=200,
            )

        docs = ds.get_all_documents()
        assert len(docs) > 1
        assert isinstance(docs[0], Document)

        first_doc = next(ds.get_all_documents_generator())
        assert isinstance(first_doc, Document)
        assert first_doc.meta["file_id"] is not None

        filtered_docs = ds.get_all_documents(filters={"file_id": [first_doc.meta["file_id"]]})
        assert len(filtered_docs) > 0
        assert len(filtered_docs) < len(docs)

        ids = [doc.id for doc in filtered_docs]
        single_doc_by_id = ds.get_document_by_id(ids[0])
        assert single_doc_by_id is not None
        assert single_doc_by_id.meta["file_id"] == first_doc.meta["file_id"]

        docs_by_id = ds.get_documents_by_id(ids)
        assert len(docs_by_id) == len(filtered_docs)
        for doc in docs_by_id:
            assert doc.meta["file_id"] == first_doc.meta["file_id"]

    def test_query(self, ds, samples_path):
        """A filtered query returns a non-empty, smaller result than the unfiltered one."""
        with open(samples_path / "dc" / "query_winterfell.response", "r", encoding="utf-8") as f:
            query_winterfell_response = f.read()

        query_winterfell_docs = json.loads(query_winterfell_response)
        first_file_id = query_winterfell_docs[0]["meta"]["file_id"]
        query_winterfell_filtered_docs = [
            doc for doc in query_winterfell_docs if doc["meta"]["file_id"] == first_file_id
        ]
        query_winterfell_filtered_response = json.dumps(query_winterfell_filtered_docs)

        # Unfiltered query
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {"query": "winterfell", "top_k": 50, "all_terms_must_match": False, "scale_score": True}
                )
            ],
            status=200,
            body=query_winterfell_response,
        )

        # Same query restricted to the file of the first document
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {
                        "query": "winterfell",
                        "top_k": 50,
                        "filters": {"file_id": [first_file_id]},
                        "all_terms_must_match": False,
                        "scale_score": True,
                    }
                )
            ],
            status=200,
            body=query_winterfell_filtered_response,
        )

        docs = ds.query("winterfell", top_k=50)
        assert docs is not None
        assert len(docs) > 0

        first_doc = docs[0]
        filtered_docs = ds.query("winterfell", top_k=50, filters={"file_id": [first_doc.meta["file_id"]]})
        assert len(filtered_docs) > 0
        assert len(filtered_docs) < len(docs)

    @pytest.mark.parametrize(
        "body, expected_count",
        [
            (
                {
                    "data": [
                        {
                            "evaluation_set_id": str(uuid4()),
                            "name": DC_TEST_INDEX,
                            "created_at": "2022-03-22T13:40:27.535Z",
                            "matched_labels": 2,
                            "total_labels": 10,
                        }
                    ],
                    "has_more": False,
                    "total": 1,
                },
                10,
            ),
            (
                {
                    "data": [
                        {
                            "evaluation_set_id": str(uuid4()),
                            "name": DC_TEST_INDEX,
                            "created_at": "2022-03-22T13:40:27.535Z",
                            "matched_labels": 0,
                            "total_labels": 0,
                        }
                    ],
                    "has_more": False,
                    "total": 1,
                },
                0,
            ),
        ],
    )
    def test_count_of_labels_for_evaluation_set(self, ds, body: dict, expected_count: int):
        """`get_label_count` returns the expected count for each parametrized listing."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps(body),
        )

        count = ds.get_label_count(index=DC_TEST_INDEX)
        assert count == expected_count

    def test_count_of_labels_for_evaluation_set_raises_DC_error_when_nothing_found(self, ds):
        """An empty evaluation set listing makes `get_label_count` raise `DeepsetCloudError`."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps({"data": [], "has_more": False, "total": 0}),
        )

        with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
            ds.get_label_count(index=DC_TEST_INDEX)

    def test_lists_evaluation_sets(self, ds):
        """`get_evaluation_sets` returns the `data` entries of the listing unchanged."""
        response_evaluation_set = {
            "evaluation_set_id": str(uuid4()),
            "name": DC_TEST_INDEX,
            "created_at": "2022-03-22T13:40:27.535Z",
            "matched_labels": 2,
            "total_labels": 10,
        }

        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets",
            status=200,
            body=json.dumps({"data": [response_evaluation_set], "has_more": False, "total": 1}),
        )

        evaluation_sets = ds.get_evaluation_sets()
        assert evaluation_sets == [response_evaluation_set]

    def test_fetches_labels_for_evaluation_set(self, ds):
        """`get_all_labels` converts the evaluation set entries into `Label` objects."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
            status=200,
            body=json.dumps(
                [
                    {
                        "label_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                        "query": "What is berlin?",
                        "answer": "biggest city in germany",
                        "answer_start": 0,
                        "answer_end": 0,
                        "meta": {},
                        "context": "Berlin is the biggest city in germany.",
                        "external_file_name": "string",
                        "file_id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
                        "state": "Label matching status",
                        "candidates": "Candidates that were found in the label <-> file matching",
                    }
                ]
            ),
        )

        labels = ds.get_all_labels(index=DC_TEST_INDEX)
        assert labels == [
            Label(
                query="What is berlin?",
                document=Document(content="Berlin is the biggest city in germany."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="user-feedback",
                answer=Answer("biggest city in germany"),
                id="3fa85f64-5717-4562-b3fc-2c963f66afa6",
                pipeline_id=None,
                created_at=None,
                updated_at=None,
                meta={},
                filters={},
            )
        ]

    def test_fetches_labels_for_evaluation_set_raises_deepsetclouderror_when_nothing_found(self, ds):
        """A 404 for the evaluation set makes `get_all_labels` raise `DeepsetCloudError`."""
        responses.add(
            method=responses.GET,
            url=f"{DC_API_ENDPOINT}/workspaces/default/evaluation_sets/{DC_TEST_INDEX}",
            status=404,
        )

        with pytest.raises(DeepsetCloudError, match=f"No evaluation set found with the name {DC_TEST_INDEX}"):
            ds.get_all_labels(index=DC_TEST_INDEX)

    def test_query_by_embedding(self, ds):
        """`query_by_embedding` sends the embedding as a list and returns the (empty) result."""
        query_emb = np.random.randn(768)

        # The mock only answers when the request body matches these parameters exactly
        responses.add(
            method=responses.POST,
            url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
            match=[
                matchers.json_params_matcher(
                    {
                        "query_emb": query_emb.tolist(),
                        "top_k": 10,
                        "return_embedding": False,
                        "scale_score": True,
                        "use_prefiltering": False,
                    }
                )
            ],
            json=[],
            status=200,
        )

        emb_docs = ds.query_by_embedding(query_emb)
        assert len(emb_docs) == 0

    def test_get_all_docs_without_index(self, ds_without_index):
        assert ds_without_index.get_all_documents() == []

    def test_get_all_docs_generator_without_index(self, ds_without_index):
        assert list(ds_without_index.get_all_documents_generator()) == []

    def test_get_doc_by_id_without_index(self, ds_without_index):
        assert ds_without_index.get_document_by_id(id="some id") is None

    def test_get_docs_by_id_without_index(self, ds_without_index):
        assert ds_without_index.get_documents_by_id(ids=["some id"]) == []

    def test_get_doc_count_without_index(self, ds_without_index):
        assert ds_without_index.get_document_count() == 0

    def test_query_by_emb_without_index(self, ds_without_index):
        query_emb = np.random.randn(768)
        assert ds_without_index.query_by_embedding(query_emb=query_emb) == []

    def test_query_without_index(self, ds_without_index):
        assert ds_without_index.query(query="some query") == []