2021-02-02 17:32:17 +01:00
|
|
|
from pathlib import Path
|
|
|
|
|
2021-11-11 11:02:22 +01:00
|
|
|
import os
|
2020-11-20 17:41:08 +01:00
|
|
|
import pytest
|
|
|
|
|
2021-11-15 12:16:27 +01:00
|
|
|
from haystack.pipelines import Pipeline, RootNode
|
2020-11-20 17:41:08 +01:00
|
|
|
|
2021-09-27 10:52:07 +02:00
|
|
|
@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
def test_load_and_save_yaml(document_store, tmp_path):
    """Load pipelines from YAML, run them, and verify the exported config.

    Covers four things in order: the indexing pipeline loads and runs on a
    sample PDF, the query pipeline loads and answers a question, an unknown
    pipeline name is rejected, and ``save_to_yaml`` writes the expected
    configuration for the query pipeline.
    """
    samples_dir = Path(__file__).parent / "samples"
    pipeline_yaml = samples_dir / "pipeline" / "test_pipeline.yaml"
    question = "Who made the PDF specification?"

    def without_layout(text):
        # The YAML comparison below ignores spaces and newlines entirely.
        return text.replace(" ", "").replace("\n", "")

    # Indexing pipeline: must load from YAML and ingest the sample PDF.
    indexing_pipeline = Pipeline.load_from_yaml(pipeline_yaml, pipeline_name="indexing_pipeline")
    indexing_pipeline.run(file_paths=samples_dir / "pdf" / "sample_pdf_1.pdf")

    # Query pipeline: must load from YAML and answer the question.
    query_pipeline = Pipeline.load_from_yaml(pipeline_yaml, pipeline_name="query_pipeline")
    prediction = query_pipeline.run(
        query=question,
        params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}},
    )
    assert prediction["query"] == question
    best_answer = prediction["answers"][0]
    assert best_answer.answer == "Adobe Systems"
    assert "_debug" not in prediction.keys()

    # A pipeline name that is not requested correctly must be rejected.
    with pytest.raises(Exception):
        Pipeline.load_from_yaml(path=pipeline_yaml, pipeline_name="invalid")

    # Config export: the query pipeline is written out and read back as text.
    export_path = tmp_path / "test.yaml"
    query_pipeline.save_to_yaml(export_path)
    saved_yaml = export_path.read_text(encoding="utf-8")
    expected_yaml = """
        components:
        - name: ESRetriever
          params:
            document_store: ElasticsearchDocumentStore
          type: ElasticsearchRetriever
        - name: ElasticsearchDocumentStore
          params:
            index: haystack_test
            label_index: haystack_test_label
          type: ElasticsearchDocumentStore
        - name: Reader
          params:
            model_name_or_path: deepset/roberta-base-squad2
            no_ans_boost: -10
          type: FARMReader
        pipelines:
        - name: query
          nodes:
          - inputs:
            - Query
            name: ESRetriever
          - inputs:
            - ESRetriever
            name: Reader
          type: Pipeline
        version: '0.8'
    """
    assert without_layout(saved_yaml) == without_layout(expected_yaml)
|
2021-04-30 12:23:29 +02:00
|
|
|
|
2021-02-02 17:32:17 +01:00
|
|
|
|
2021-10-28 16:48:06 +02:00
|
|
|
def test_load_tfidfretriever_yaml(tmp_path):
    """Run a YAML-defined TF-IDF query pipeline before and after writing documents.

    The first run happens before any document is written and must raise an
    error mentioning the missing dataframe / tf-idf matrix. After one document
    is written through the retriever's document store, the same query must
    return "haystack" as the top answer.
    """
    question = "What can be used to scale QA models to large document collections?"
    config_path = Path(__file__).parent / "samples" / "pipeline" / "test_pipeline_tfidfretriever.yaml"
    pipeline = Pipeline.load_from_yaml(config_path, pipeline_name="query_pipeline")

    # No documents have been written yet, so the run is expected to fail.
    with pytest.raises(Exception) as exc_info:
        pipeline.run(
            query=question,
            params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}},
        )
    assert "Retrieval requires dataframe df and tf-idf matrix" in str(exc_info.value)

    # Write a single document via the retriever's own store, then query again.
    corpus = [
        {
            "content": "A Doc specifically talking about haystack. Haystack can be used to scale QA models to large document collections."
        }
    ]
    retriever_store = pipeline.get_node(name="Retriever").document_store
    retriever_store.write_documents(documents=corpus)

    prediction = pipeline.run(
        query=question,
        params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}},
    )
    assert prediction["query"] == question
    top_answer = prediction["answers"][0]
    assert top_answer.answer == "haystack"
|
|
|
|
|
|
|
|
|
2021-09-27 10:52:07 +02:00
|
|
|
# @pytest.mark.slow
|
|
|
|
# @pytest.mark.elasticsearch
|
|
|
|
# @pytest.mark.parametrize(
|
|
|
|
# "retriever_with_docs, document_store_with_docs",
|
|
|
|
# [("elasticsearch", "elasticsearch")],
|
|
|
|
# indirect=True,
|
|
|
|
# )
|
2021-04-21 12:18:33 +02:00
|
|
|
@pytest.mark.parametrize(
    "retriever_with_docs,document_store_with_docs",
    [
        ("dpr", "elasticsearch"),
        ("dpr", "faiss"),
        ("dpr", "memory"),
        ("dpr", "milvus"),
        ("embedding", "elasticsearch"),
        ("embedding", "faiss"),
        ("embedding", "memory"),
        ("embedding", "milvus"),
        ("elasticsearch", "elasticsearch"),
        ("es_filter_only", "elasticsearch"),
        ("tfidf", "memory"),
    ],
    indirect=True,
)
def test_graph_creation(retriever_with_docs, document_store_with_docs):
    """Check that ``Pipeline.add_node`` rejects invalid ``inputs`` references.

    A pipeline with a single retriever node named "ES" is built first. Adding
    a further node must then fail when its input names an edge or node that
    cannot be used, and adding a first node to an empty pipeline must fail
    when its input is not a known node.
    """
    pipeline = Pipeline()
    pipeline.add_node(name="ES", component=retriever_with_docs, inputs=["Query"])

    # "ES.output_2" is rejected -- presumably because the retriever exposes
    # only one outgoing edge (NOTE(review): confirm against the node class).
    with pytest.raises(AssertionError):
        pipeline.add_node(name="Reader", component=retriever_with_docs, inputs=["ES.output_2"])

    # An edge label that does not follow the expected naming is rejected.
    with pytest.raises(AssertionError):
        pipeline.add_node(name="Reader", component=retriever_with_docs, inputs=["ES.wrong_edge_label"])

    # An input that names a node absent from the graph is rejected.
    with pytest.raises(Exception):
        pipeline.add_node(name="Reader", component=retriever_with_docs, inputs=["InvalidNode"])

    # Build the empty pipeline outside the ``raises`` block so that only
    # ``add_node`` can satisfy the expectation; a failing constructor must
    # surface as a real test error instead of a false pass.
    empty_pipeline = Pipeline()
    with pytest.raises(Exception):
        empty_pipeline.add_node(name="ES", component=retriever_with_docs, inputs=["InvalidNode"])
|
2021-04-21 12:18:33 +02:00
|
|
|
|
2020-11-20 17:41:08 +01:00
|
|
|
|
2021-03-10 18:17:23 +01:00
|
|
|
def test_parallel_paths_in_pipeline_graph():
    """Run two graphs with parallel branches and check the joined output.

    Each letter node appends its own letter to the ``test`` string it
    receives; the join node concatenates the ``test`` values of its first two
    inputs. The final string therefore records the path taken by each branch.
    """

    class A(RootNode):
        def run(self):
            # Start of the chain: emits the initial letter.
            return {"test": "A"}, "output_1"

    class B(RootNode):
        def run(self, test):
            return {"test": test + "B"}, "output_1"

    class C(RootNode):
        def run(self, test):
            return {"test": test + "C"}, "output_1"

    class D(RootNode):
        def run(self, test):
            return {"test": test + "D"}, "output_1"

    class E(RootNode):
        def run(self, test):
            return {"test": test + "E"}, "output_1"

    class JoinNode(RootNode):
        def run(self, inputs):
            # Only the first two incoming results are combined.
            combined = inputs[0]["test"] + inputs[1]["test"]
            return {"test": combined}, "output_1"

    # Branches of unequal length: B -> C -> E and B -> D, joined at F.
    uneven_graph = Pipeline()
    uneven_graph.add_node(name="A", component=A(), inputs=["Query"])
    uneven_graph.add_node(name="B", component=B(), inputs=["A"])
    uneven_graph.add_node(name="C", component=C(), inputs=["B"])
    uneven_graph.add_node(name="E", component=E(), inputs=["C"])
    uneven_graph.add_node(name="D", component=D(), inputs=["B"])
    uneven_graph.add_node(name="F", component=JoinNode(), inputs=["D", "E"])
    uneven_result = uneven_graph.run(query="test")
    assert uneven_result["test"] == "ABDABCE"

    # Branches of equal length: B -> C and B -> D, joined at E.
    even_graph = Pipeline()
    even_graph.add_node(name="A", component=A(), inputs=["Query"])
    even_graph.add_node(name="B", component=B(), inputs=["A"])
    even_graph.add_node(name="C", component=C(), inputs=["B"])
    even_graph.add_node(name="D", component=D(), inputs=["B"])
    even_graph.add_node(name="E", component=JoinNode(), inputs=["C", "D"])
    even_result = even_graph.run(query="test")
    assert even_result["test"] == "ABCABD"
|
2021-03-18 12:41:30 +01:00
|
|
|
|
|
|
|
|
|
|
|
def test_parallel_paths_in_pipeline_graph_with_branching():
    """Run one branching graph with three different root nodes.

    The graph is the same each time; only the root component changes, and it
    selects which outgoing edge is reported ("output_1", "output_2" or
    "output_all"). The joined ``output`` string shows which branches ran.
    """

    class AWithOutput1(RootNode):
        outgoing_edges = 2

        def run(self):
            return {"output": "A"}, "output_1"

    class AWithOutput2(RootNode):
        outgoing_edges = 2

        def run(self):
            return {"output": "A"}, "output_2"

    class AWithOutputAll(RootNode):
        outgoing_edges = 2

        def run(self):
            return {"output": "A"}, "output_all"

    class B(RootNode):
        def run(self, output):
            return {"output": output + "B"}, "output_1"

    class C(RootNode):
        def run(self, output):
            return {"output": output + "C"}, "output_1"

    class D(RootNode):
        def run(self, output):
            return {"output": output + "D"}, "output_1"

    class E(RootNode):
        def run(self, output):
            return {"output": output + "E"}, "output_1"

    class JoinNode(RootNode):
        def run(self, output=None, inputs=None):
            # With several incoming results, concatenate them in order;
            # otherwise pass the single ``output`` value through unchanged.
            if inputs:
                output = "".join(branch["output"] for branch in inputs)
            return {"output": output}, "output_1"

    def run_graph(root_component):
        # Same topology for every case; only the root component differs.
        graph = Pipeline()
        graph.add_node(name="A", component=root_component, inputs=["Query"])
        graph.add_node(name="B", component=B(), inputs=["A.output_1"])
        graph.add_node(name="C", component=C(), inputs=["A.output_2"])
        # Node "D" holds component E and node "E" holds component D; the
        # expected strings below depend on this pairing.
        graph.add_node(name="D", component=E(), inputs=["B"])
        graph.add_node(name="E", component=D(), inputs=["B"])
        graph.add_node(name="F", component=JoinNode(), inputs=["D", "E", "C"])
        return graph.run(query="test")

    assert run_graph(AWithOutput1())["output"] == "ABEABD"
    assert run_graph(AWithOutput2())["output"] == "AC"
    assert run_graph(AWithOutputAll())["output"] == "ACABEABD"
|
|
|
|
|
|
|
|
|
2021-11-11 11:02:22 +01:00
|
|
|
def test_existing_faiss_document_store():
    """Index into a FAISS store, save it, and query it from a second pipeline.

    An indexing pipeline loaded from YAML ingests the sample PDF and its
    document store is saved as ``existing_faiss_document_store``. A query
    pipeline loaded from a second YAML file must then expose a FAISS index
    with two entries and return two documents for the question.

    Files written to the working directory are removed before the test and,
    via ``finally``, afterwards as well -- including when an assertion or a
    pipeline call fails, so a failed run does not leave artifacts behind.
    """
    samples_dir = Path(__file__).parent / "samples"

    # Start from a clean slate in case an earlier run left files behind.
    clean_faiss_document_store()
    try:
        pipeline = Pipeline.load_from_yaml(
            samples_dir / "pipeline" / "test_pipeline_faiss_indexing.yaml",
            pipeline_name="indexing_pipeline",
        )
        pipeline.run(file_paths=samples_dir / "pdf" / "sample_pdf_1.pdf")

        new_document_store = pipeline.get_document_store()
        new_document_store.save("existing_faiss_document_store")

        # test correct load of query pipeline from yaml
        # NOTE(review): presumably this YAML points at the store saved above -- confirm.
        pipeline = Pipeline.load_from_yaml(
            samples_dir / "pipeline" / "test_pipeline_faiss_retrieval.yaml",
            pipeline_name="query_pipeline",
        )

        retriever = pipeline.get_node("DPRRetriever")
        existing_document_store = retriever.document_store
        faiss_index = existing_document_store.faiss_indexes["document"]
        assert faiss_index.ntotal == 2

        prediction = pipeline.run(
            query="Who made the PDF specification?",
            params={"DPRRetriever": {"top_k": 10}},
        )

        assert prediction["query"] == "Who made the PDF specification?"
        assert len(prediction["documents"]) == 2
    finally:
        clean_faiss_document_store()
|
|
|
|
|
|
|
|
|
|
|
|
def clean_faiss_document_store():
    """Delete the FAISS test artifacts from the current working directory.

    Removes ``existing_faiss_document_store``,
    ``existing_faiss_document_store.json`` and ``faiss_document_store.db``
    if they are present. Files that do not exist are skipped silently, so the
    function is safe to call both before and after a test.
    """
    artifacts = (
        "existing_faiss_document_store",
        "existing_faiss_document_store.json",
        "faiss_document_store.db",
    )
    for artifact in artifacts:
        # Try the removal directly instead of checking existence first: this
        # avoids the gap between the check and the delete.
        try:
            os.remove(artifact)
        except FileNotFoundError:
            # Nothing to clean up for this artifact.
            pass
|