mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-05 03:28:09 +00:00
350 lines
16 KiB
Python
350 lines
16 KiB
Python
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import os
|
|
from unittest.mock import ANY
|
|
|
|
import pytest
|
|
|
|
from haystack import AsyncPipeline, Document, Pipeline
|
|
from haystack.components.agents import Agent
|
|
from haystack.components.embedders.openai_document_embedder import OpenAIDocumentEmbedder
|
|
from haystack.components.embedders.openai_text_embedder import OpenAITextEmbedder
|
|
from haystack.components.generators.chat import OpenAIChatGenerator
|
|
from haystack.components.rankers.sentence_transformers_similarity import SentenceTransformersSimilarityRanker
|
|
from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
|
|
from haystack.dataclasses import ChatMessage
|
|
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
|
from haystack.tools import PipelineTool
|
|
|
|
|
|
@pytest.fixture
|
|
def sample_pipeline():
|
|
pipeline = Pipeline()
|
|
pipeline.add_component("bm25_retriever", InMemoryBM25Retriever(document_store=InMemoryDocumentStore()))
|
|
pipeline.add_component("ranker", SentenceTransformersSimilarityRanker(model="fake-model"))
|
|
pipeline.connect("bm25_retriever", "ranker")
|
|
return pipeline
|
|
|
|
|
|
@pytest.fixture
|
|
def sample_async_pipeline():
|
|
pipeline = AsyncPipeline()
|
|
pipeline.add_component("bm25_retriever", InMemoryBM25Retriever(document_store=InMemoryDocumentStore()))
|
|
pipeline.add_component("ranker", SentenceTransformersSimilarityRanker(model="fake-model"))
|
|
pipeline.connect("bm25_retriever", "ranker")
|
|
return pipeline
|
|
|
|
|
|
@pytest.fixture
|
|
def sample_pipeline_dict():
|
|
return {
|
|
"metadata": {},
|
|
"max_runs_per_component": 100,
|
|
"components": {
|
|
"bm25_retriever": {
|
|
"type": "haystack.components.retrievers.in_memory.bm25_retriever.InMemoryBM25Retriever",
|
|
"init_parameters": {
|
|
"document_store": {
|
|
"type": "haystack.document_stores.in_memory.document_store.InMemoryDocumentStore",
|
|
"init_parameters": {
|
|
"bm25_tokenization_regex": "(?u)\\b\\w\\w+\\b",
|
|
"bm25_algorithm": "BM25L",
|
|
"bm25_parameters": {},
|
|
"embedding_similarity_function": "dot_product",
|
|
"index": ANY,
|
|
"return_embedding": True,
|
|
},
|
|
},
|
|
"filters": None,
|
|
"top_k": 10,
|
|
"scale_score": False,
|
|
"filter_policy": "replace",
|
|
},
|
|
},
|
|
"ranker": {
|
|
"type": "haystack.components.rankers.sentence_transformers_similarity."
|
|
"SentenceTransformersSimilarityRanker",
|
|
"init_parameters": {
|
|
"device": {"type": "single", "device": ANY},
|
|
"model": "fake-model",
|
|
"token": {"type": "env_var", "env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False},
|
|
"top_k": 10,
|
|
"query_prefix": "",
|
|
"document_prefix": "",
|
|
"meta_fields_to_embed": [],
|
|
"embedding_separator": "\n",
|
|
"scale_score": True,
|
|
"score_threshold": None,
|
|
"trust_remote_code": False,
|
|
"model_kwargs": None,
|
|
"tokenizer_kwargs": None,
|
|
"config_kwargs": None,
|
|
"backend": "torch",
|
|
"batch_size": 16,
|
|
},
|
|
},
|
|
},
|
|
"connections": [{"sender": "bm25_retriever.documents", "receiver": "ranker.documents"}],
|
|
"connection_type_validation": True,
|
|
}
|
|
|
|
|
|
class TestPipelineTool:
|
|
def test_init_invalid_pipeline(self):
|
|
with pytest.raises(
|
|
ValueError, match="The 'pipeline' parameter must be an instance of Pipeline or AsyncPipeline."
|
|
):
|
|
PipelineTool(pipeline="invalid_pipeline", name="test_tool", description="A test tool")
|
|
|
|
def test_to_dict(self, sample_pipeline, sample_pipeline_dict):
|
|
tool = PipelineTool(
|
|
pipeline=sample_pipeline,
|
|
input_mapping={"query": ["bm25_retriever.query"]},
|
|
output_mapping={"ranker.documents": "documents"},
|
|
name="test_tool",
|
|
description="A test tool",
|
|
)
|
|
|
|
tool_dict = tool.to_dict()
|
|
assert tool_dict == {
|
|
"type": "haystack.tools.pipeline_tool.PipelineTool",
|
|
"data": {
|
|
"pipeline": sample_pipeline_dict,
|
|
"name": "test_tool",
|
|
"input_mapping": {"query": ["bm25_retriever.query"]},
|
|
"output_mapping": {"ranker.documents": "documents"},
|
|
"description": "A test tool",
|
|
"parameters": None,
|
|
"inputs_from_state": None,
|
|
"outputs_to_state": None,
|
|
"is_pipeline_async": False,
|
|
"outputs_to_string": None,
|
|
},
|
|
}
|
|
|
|
def test_to_dict_async_pipeline(self, sample_async_pipeline, sample_pipeline_dict):
|
|
tool = PipelineTool(
|
|
pipeline=sample_async_pipeline,
|
|
input_mapping={"query": ["bm25_retriever.query"]},
|
|
output_mapping={"ranker.documents": "documents"},
|
|
name="test_tool",
|
|
description="A test tool",
|
|
)
|
|
|
|
tool_dict = tool.to_dict()
|
|
assert tool_dict == {
|
|
"type": "haystack.tools.pipeline_tool.PipelineTool",
|
|
"data": {
|
|
"pipeline": sample_pipeline_dict,
|
|
"name": "test_tool",
|
|
"input_mapping": {"query": ["bm25_retriever.query"]},
|
|
"output_mapping": {"ranker.documents": "documents"},
|
|
"description": "A test tool",
|
|
"parameters": None,
|
|
"inputs_from_state": None,
|
|
"outputs_to_state": None,
|
|
"is_pipeline_async": True,
|
|
"outputs_to_string": None,
|
|
},
|
|
}
|
|
|
|
def test_from_dict(self, sample_pipeline):
|
|
tool = PipelineTool(
|
|
pipeline=sample_pipeline,
|
|
input_mapping={"query": ["bm25_retriever.query"]},
|
|
output_mapping={"ranker.documents": "documents"},
|
|
name="test_tool",
|
|
description="A test tool",
|
|
)
|
|
|
|
tool_dict = tool.to_dict()
|
|
recreated_tool = PipelineTool.from_dict(tool_dict)
|
|
|
|
assert tool.name == recreated_tool.name
|
|
assert tool.description == recreated_tool.description
|
|
assert tool._input_mapping == recreated_tool._input_mapping
|
|
assert tool._output_mapping == recreated_tool._output_mapping
|
|
assert tool.parameters == recreated_tool.parameters
|
|
assert isinstance(recreated_tool._pipeline, Pipeline)
|
|
|
|
def test_from_dict_async_pipeline(self, sample_async_pipeline):
|
|
tool = PipelineTool(
|
|
pipeline=sample_async_pipeline,
|
|
input_mapping={"query": ["bm25_retriever.query"]},
|
|
output_mapping={"ranker.documents": "documents"},
|
|
name="test_tool",
|
|
description="A test tool",
|
|
)
|
|
|
|
tool_dict = tool.to_dict()
|
|
recreated_tool = PipelineTool.from_dict(tool_dict)
|
|
|
|
assert tool.name == recreated_tool.name
|
|
assert tool.description == recreated_tool.description
|
|
assert tool._input_mapping == recreated_tool._input_mapping
|
|
assert tool._output_mapping == recreated_tool._output_mapping
|
|
assert tool.parameters == recreated_tool.parameters
|
|
assert isinstance(recreated_tool._pipeline, AsyncPipeline)
|
|
|
|
def test_auto_generated_tool_params(self, sample_pipeline):
|
|
tool = PipelineTool(
|
|
pipeline=sample_pipeline,
|
|
input_mapping={"query": ["bm25_retriever.query", "ranker.query"]},
|
|
output_mapping={"ranker.documents": "documents"},
|
|
name="test_tool",
|
|
description="A test tool",
|
|
)
|
|
|
|
assert tool.parameters == {
|
|
"description": "A component that combines: 'bm25_retriever': Run the InMemoryBM25Retriever on the "
|
|
"given input data., 'ranker': Returns a list of documents ranked by their similarity "
|
|
"to the given query.",
|
|
"properties": {
|
|
"query": {
|
|
"description": "Provided to the 'bm25_retriever' component as: 'The query string for the Retriever."
|
|
"', and Provided to the 'ranker' component as: 'The input query to compare the "
|
|
"documents to.'.",
|
|
"type": "string",
|
|
}
|
|
},
|
|
"required": ["query"],
|
|
"type": "object",
|
|
}
|
|
|
|
def test_auto_generated_tool_params_no_mappings(self, sample_pipeline):
|
|
tool = PipelineTool(pipeline=sample_pipeline, name="test_tool", description="A test tool")
|
|
assert tool.parameters == {
|
|
"description": "A component that combines: 'bm25_retriever': Run the InMemoryBM25Retriever on the given "
|
|
"input data., 'ranker': Returns a list of documents ranked by their similarity to the "
|
|
"given query.",
|
|
"properties": {
|
|
"query": {
|
|
"description": "Provided to the 'bm25_retriever' component as: 'The query string for the "
|
|
"Retriever.', and Provided to the 'ranker' component as: 'The input query to "
|
|
"compare the documents to.'.",
|
|
"type": "string",
|
|
},
|
|
"filters": {
|
|
"anyOf": [{"additionalProperties": True, "type": "object"}, {"type": "null"}],
|
|
"description": "Provided to the 'bm25_retriever' component as: 'A dictionary with filters to "
|
|
"narrow down the search space when retrieving documents.'.",
|
|
},
|
|
"top_k": {
|
|
"description": "Provided to the 'bm25_retriever' component as: 'The maximum number of documents "
|
|
"to return.', and Provided to the 'ranker' component as: 'The maximum number "
|
|
"of documents to return.'.",
|
|
"type": "integer",
|
|
},
|
|
"scale_score": {
|
|
"description": "Provided to the 'bm25_retriever' component as: 'When `True`, scales the score "
|
|
"of retrieved documents to a range of 0 to 1, where 1 means extremely relevant."
|
|
"\nWhen `False`, uses raw similarity scores.', and Provided to the 'ranker' "
|
|
"component as: 'If `True`, scales the raw logit predictions using a Sigmoid "
|
|
"activation function.\nIf `False`, disables scaling of the raw logit predictions."
|
|
"\nIf set, overrides the value set at initialization.'.",
|
|
"type": "boolean",
|
|
},
|
|
"score_threshold": {
|
|
"anyOf": [{"type": "number"}, {"type": "null"}],
|
|
"description": "Provided to the 'ranker' component as: 'Use it to return documents only with "
|
|
"a score above this threshold.\nIf set, overrides the value set at initialization.'"
|
|
".",
|
|
},
|
|
},
|
|
"required": ["query"],
|
|
"type": "object",
|
|
}
|
|
|
|
@pytest.mark.integration
|
|
@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
|
|
def test_live_pipeline_tool(self):
|
|
# Initialize a document store and add some documents
|
|
document_store = InMemoryDocumentStore()
|
|
document_embedder = OpenAIDocumentEmbedder()
|
|
documents = [
|
|
Document(content="Nikola Tesla was a Serbian-American inventor and electrical engineer."),
|
|
Document(
|
|
content="He is best known for his contributions to the design of the modern alternating current (AC) "
|
|
"electricity supply system."
|
|
),
|
|
]
|
|
docs_with_embeddings = document_embedder.run(documents=documents)["documents"]
|
|
document_store.write_documents(docs_with_embeddings)
|
|
|
|
# Build a simple retrieval pipeline
|
|
retrieval_pipeline = Pipeline()
|
|
retrieval_pipeline.add_component("embedder", OpenAITextEmbedder())
|
|
retrieval_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
|
|
|
|
retrieval_pipeline.connect("embedder.embedding", "retriever.query_embedding")
|
|
|
|
# Wrap the pipeline as a tool
|
|
retriever_tool = PipelineTool(
|
|
pipeline=retrieval_pipeline,
|
|
input_mapping={"query": ["embedder.text"]},
|
|
output_mapping={"retriever.documents": "documents"},
|
|
name="document_retriever",
|
|
description="This tool retrieves documents relevant to Nikola Tesla from the document store",
|
|
)
|
|
|
|
# Create an Agent with the tool
|
|
agent = Agent(
|
|
chat_generator=OpenAIChatGenerator(model="gpt-4.1-mini"),
|
|
system_prompt="For any questions about Nikola Tesla, always use the document_retriever.",
|
|
tools=[retriever_tool],
|
|
)
|
|
|
|
# Let the Agent handle a query
|
|
result = agent.run([ChatMessage.from_user("Who was Nikola Tesla?")])
|
|
|
|
assert len(result["messages"]) == 5 # System msg, User msg, Agent msg, Tool call result, Agent mgs
|
|
assert "nikola" in result["messages"][-1].text.lower()
|
|
|
|
@pytest.mark.asyncio
|
|
@pytest.mark.integration
|
|
@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")
|
|
async def test_live_async_pipeline_tool(self):
|
|
# Initialize a document store and add some documents
|
|
document_store = InMemoryDocumentStore()
|
|
document_embedder = OpenAIDocumentEmbedder()
|
|
documents = [
|
|
Document(content="Nikola Tesla was a Serbian-American inventor and electrical engineer."),
|
|
Document(
|
|
content="He is best known for his contributions to the design of the modern alternating current (AC) "
|
|
"electricity supply system."
|
|
),
|
|
]
|
|
docs_with_embeddings = document_embedder.run(documents=documents)["documents"]
|
|
document_store.write_documents(docs_with_embeddings)
|
|
|
|
# Build a simple retrieval pipeline
|
|
retrieval_pipeline = AsyncPipeline()
|
|
retrieval_pipeline.add_component("embedder", OpenAITextEmbedder())
|
|
retrieval_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
|
|
|
|
retrieval_pipeline.connect("embedder.embedding", "retriever.query_embedding")
|
|
|
|
# Wrap the pipeline as a tool
|
|
retriever_tool = PipelineTool(
|
|
pipeline=retrieval_pipeline,
|
|
input_mapping={"query": ["embedder.text"]},
|
|
output_mapping={"retriever.documents": "documents"},
|
|
name="document_retriever",
|
|
description="For any questions about Nikola Tesla, always use this tool",
|
|
)
|
|
|
|
# Create an Agent with the tool
|
|
agent = Agent(
|
|
chat_generator=OpenAIChatGenerator(model="gpt-4.1-mini"),
|
|
system_prompt="For any questions about Nikola Tesla, always use the document_retriever.",
|
|
tools=[retriever_tool],
|
|
)
|
|
|
|
# Let the Agent handle a query
|
|
result = await agent.run_async([ChatMessage.from_user("Who was Nikola Tesla?")])
|
|
|
|
assert len(result["messages"]) == 5 # System msg, User msg, Agent msg, Tool call result, Agent mgs
|
|
assert "nikola" in result["messages"][-1].text.lower()
|