2021-11-15 12:16:27 +01:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import json
|
|
|
|
import pytest
|
|
|
|
|
2022-10-26 12:09:04 +02:00
|
|
|
from haystack.pipelines import Pipeline, RootNode, DocumentSearchPipeline
|
2022-07-07 15:10:13 +02:00
|
|
|
from haystack.nodes import FARMReader, BM25Retriever, JoinDocuments
|
2021-11-15 12:16:27 +01:00
|
|
|
|
2022-05-17 10:55:53 +02:00
|
|
|
from ..conftest import SAMPLES_PATH, MockRetriever as BaseMockRetriever, MockReader
|
2022-03-15 11:17:26 +01:00
|
|
|
|
|
|
|
|
|
|
|
class MockRetriever(BaseMockRetriever):
    """Mock retriever that raises a test error when top_k is not an integer."""

    def retrieve(self, *args, **kwargs):
        # top_k may arrive either as a keyword argument or as the last
        # positional argument; keyword takes precedence.
        top_k = kwargs.get("top_k", args[-1] if args else None)
        if top_k and not isinstance(top_k, int):
            raise ValueError("TEST ERROR!")
|
2021-11-15 12:16:27 +01:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_node_names_validation(document_store_with_docs, tmp_path):
    """Pipeline.run must reject params addressed to unknown nodes or unknown global keys,
    and the error must mention only the invalid entries."""
    pipeline = Pipeline()
    pipeline.add_node(
        component=BM25Retriever(document_store=document_store_with_docs), name="Retriever", inputs=["Query"]
    )
    pipeline.add_node(
        component=FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", num_processes=0),
        name="Reader",
        inputs=["Retriever"],
    )

    with pytest.raises(ValueError) as exc_info:
        pipeline.run(
            query="Who lives in Berlin?",
            params={
                "Reader": {"top_k": 3},
                "non-existing-node": {"top_k": 10},
                "top_k": 5,
                "non-existing-global_param": "wrong",
            },
            debug=True,
        )

    message = str(exc_info.value)
    # Only the invalid entries may be reported ...
    for invalid_key in ("non-existing-node", "non-existing-global_param"):
        assert invalid_key in message
    # ... while valid node names and global params must not be flagged.
    for valid_key in ("Reader", "top_k"):
        assert valid_key not in message
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_debug_attributes_global(document_store_with_docs, tmp_path):
    """Passing debug=True to run() must produce non-empty input/output debug
    records for every node in the pipeline."""
    retriever = BM25Retriever(document_store=document_store_with_docs)
    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", num_processes=0)

    pipeline = Pipeline()
    pipeline.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])

    prediction = pipeline.run(
        query="Who lives in Berlin?", params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}}, debug=True
    )
    assert "_debug" in prediction
    debug_info = prediction["_debug"]
    for node_name in ("ESRetriever", "Reader"):
        assert node_name in debug_info
        for key in ("input", "output"):
            assert key in debug_info[node_name]
            assert debug_info[node_name][key]

    # Avoid circular reference: easiest way to detect those is to use json.dumps
    json.dumps(prediction, default=str)
|
|
|
|
|
2022-02-03 13:43:18 +01:00
|
|
|
|
2021-11-15 12:16:27 +01:00
|
|
|
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_debug_attributes_per_node(document_store_with_docs, tmp_path):
    """A per-node debug=True param must enable debug records for that node only."""
    retriever = BM25Retriever(document_store=document_store_with_docs)
    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", num_processes=0)

    pipeline = Pipeline()
    pipeline.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])

    prediction = pipeline.run(
        query="Who lives in Berlin?", params={"ESRetriever": {"top_k": 10, "debug": True}, "Reader": {"top_k": 3}}
    )
    assert "_debug" in prediction
    debug_info = prediction["_debug"]
    # Only the retriever requested debug output; the reader must not appear.
    assert "Reader" not in debug_info
    assert "ESRetriever" in debug_info
    retriever_debug = debug_info["ESRetriever"]
    for key in ("input", "output"):
        assert key in retriever_debug
        assert retriever_debug[key]

    # Avoid circular reference: easiest way to detect those is to use json.dumps
    json.dumps(prediction, default=str)
|
2022-07-07 15:10:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_debug_attributes_for_join_nodes(document_store_with_docs, tmp_path):
    """Debug records must also be produced for join nodes fed by multiple branches."""
    pipeline = Pipeline()
    pipeline.add_node(
        component=BM25Retriever(document_store=document_store_with_docs), name="ESRetriever1", inputs=["Query"]
    )
    pipeline.add_node(
        component=BM25Retriever(document_store=document_store_with_docs), name="ESRetriever2", inputs=["Query"]
    )
    pipeline.add_node(component=JoinDocuments(), name="JoinDocuments", inputs=["ESRetriever1", "ESRetriever2"])

    prediction = pipeline.run(query="Who lives in Berlin?", debug=True)
    assert "_debug" in prediction
    debug_info = prediction["_debug"]
    # Every node, including the join node, must have non-empty input/output records.
    for node_name in ("ESRetriever1", "ESRetriever2", "JoinDocuments"):
        assert node_name in debug_info
        for key in ("input", "output"):
            assert key in debug_info[node_name]
            assert debug_info[node_name][key]

    # Avoid circular reference: easiest way to detect those is to use json.dumps
    json.dumps(prediction, default=str)
|
2021-11-15 12:16:27 +01:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_global_debug_attributes_override_node_ones(document_store_with_docs, tmp_path):
    """The global debug flag passed to run() must override per-node debug params
    in both directions (force off and force on)."""
    retriever = BM25Retriever(document_store=document_store_with_docs)
    reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", num_processes=0)

    pipeline = Pipeline()
    pipeline.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=reader, name="Reader", inputs=["ESRetriever"])

    # Global debug=False suppresses debug output even though both nodes request it.
    prediction = pipeline.run(
        query="Who lives in Berlin?",
        params={"ESRetriever": {"top_k": 10, "debug": True}, "Reader": {"top_k": 3, "debug": True}},
        debug=False,
    )
    assert "_debug" not in prediction

    # Global debug=True forces debug output even though both nodes opt out.
    prediction = pipeline.run(
        query="Who lives in Berlin?",
        params={"ESRetriever": {"top_k": 10, "debug": False}, "Reader": {"top_k": 3, "debug": False}},
        debug=True,
    )
    assert "_debug" in prediction
    debug_info = prediction["_debug"]
    for node_name in ("ESRetriever", "Reader"):
        assert node_name in debug_info
        for key in ("input", "output"):
            assert key in debug_info[node_name]
            assert debug_info[node_name][key]
|
|
|
|
|
|
|
|
|
2022-03-15 11:17:26 +01:00
|
|
|
def test_missing_top_level_arg():
    """Running a query pipeline without the mandatory 'query' argument must fail
    with an explanatory error."""
    pipeline = Pipeline()
    pipeline.add_node(component=MockRetriever(), name="Retriever", inputs=["Query"])
    pipeline.add_node(component=MockReader(), name="Reader", inputs=["Retriever"])

    with pytest.raises(Exception) as error_info:
        pipeline.run(params={"Retriever": {"top_k": 10}})
    assert "Must provide a 'query' parameter" in str(error_info.value)
|
|
|
|
|
|
|
|
|
|
|
|
def test_unexpected_top_level_arg():
    """Unknown keyword arguments to run() must be rejected, not silently ignored."""
    pipeline = Pipeline()
    pipeline.add_node(component=MockRetriever(), name="Retriever", inputs=["Query"])
    pipeline.add_node(component=MockReader(), name="Reader", inputs=["Retriever"])

    with pytest.raises(Exception) as error_info:
        pipeline.run(invalid_query="Who made the PDF specification?", params={"Retriever": {"top_k": 10}})
    assert "run() got an unexpected keyword argument 'invalid_query'" in str(error_info.value)
|
|
|
|
|
2022-03-15 11:17:26 +01:00
|
|
|
|
|
|
|
def test_unexpected_node_arg():
    """Params targeting a valid node but with an unknown parameter name must be rejected."""
    pipeline = Pipeline()
    pipeline.add_node(component=MockRetriever(), name="Retriever", inputs=["Query"])
    pipeline.add_node(component=MockReader(), name="Reader", inputs=["Retriever"])

    with pytest.raises(Exception) as error_info:
        pipeline.run(query="Who made the PDF specification?", params={"Retriever": {"invalid": 10}})
    assert "Invalid parameter 'invalid' for the node 'Retriever'" in str(error_info.value)
|
2021-11-15 12:16:27 +01:00
|
|
|
|
|
|
|
|
2022-10-26 12:09:04 +02:00
|
|
|
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
def test_pipeline_run_counters(retriever, document_store):
    """run_total and last_window_run_total must track run() invocations,
    rolling the window over once event_run_total_threshold is exceeded."""
    document_store.write_documents([{"content": "Sample text for document-1", "meta": {"source": "wiki1"}}])
    document_store.update_embeddings(retriever)

    search = DocumentSearchPipeline(retriever=retriever)
    search.run(query="Irrelevant", params={"top_k": 1})
    assert search.pipeline.run_total == 1

    # Run threshold + 1 more times so the event window rolls over once.
    for _ in range(search.pipeline.event_run_total_threshold + 1):
        search.run(query="Irrelevant", params={"top_k": 1})

    # 1 initial run + (threshold + 1) further runs; the expected totals below
    # imply a threshold of 100.
    assert search.pipeline.run_total == 102
    assert search.pipeline.last_window_run_total == 101
|
|
|
|
|
|
|
|
|
2021-11-15 12:16:27 +01:00
|
|
|
def test_debug_info_propagation():
    """Custom "_debug" entries returned by a node must propagate downstream
    (visible to later nodes via the _debug parameter) and appear in the final
    pipeline output under that node's "runtime" key."""

    class A(RootNode):
        def run(self):
            test = "A"
            # A node may return debug info as a dict under "_debug".
            return {"test": test, "_debug": {"debug_key_a": "debug_value_a"}}, "output_1"

    class B(RootNode):
        def run(self, test):
            test += "B"
            # Debug info may also be any plain value, not only a dict.
            return {"test": test, "_debug": "debug_value_b"}, "output_1"

    class C(RootNode):
        def run(self, test):
            test += "C"
            return {"test": test}, "output_1"

    class D(RootNode):
        def run(self, test, _debug):
            # Fixed copy-paste defect: this node previously appended "C" again.
            test += "D"
            # A downstream node can read the accumulated debug info of earlier nodes.
            assert _debug["B"]["runtime"] == "debug_value_b"
            return {"test": test}, "output_1"

    pipeline = Pipeline()
    pipeline.add_node(name="A", component=A(), inputs=["Query"])
    pipeline.add_node(name="B", component=B(), inputs=["A"])
    pipeline.add_node(name="C", component=C(), inputs=["B"])
    pipeline.add_node(name="D", component=D(), inputs=["C"])
    output = pipeline.run(query="test")
    assert output["_debug"]["A"]["runtime"]["debug_key_a"] == "debug_value_a"
    assert output["_debug"]["B"]["runtime"] == "debug_value_b"
|