mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-24 08:28:22 +00:00
refactor: Merge Pipeline
s definition in core
package (#6973)
* Move marshalling functions in core Pipeline * Move telemetry gathering in core Pipeline * Move run logic in core Pipeline * Update root Pipeline import * Add release notes * Update Pipeline docs path * Update releasenotes/notes/merge-pipeline-definitions-1da80e9803e2a8bb.yaml Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
This commit is contained in:
parent
549021d2fc
commit
f96eb3847f
@ -1,6 +1,6 @@
|
|||||||
loaders:
|
loaders:
|
||||||
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
|
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
|
||||||
search_path: [../../../haystack]
|
search_path: [../../../haystack/core/pipeline]
|
||||||
modules: ["pipeline"]
|
modules: ["pipeline"]
|
||||||
ignore_when_discovered: ["__init__"]
|
ignore_when_discovered: ["__init__"]
|
||||||
processors:
|
processors:
|
||||||
|
@ -1,9 +1,8 @@
|
|||||||
from haystack.core.component import component
|
from haystack.core.component import component
|
||||||
|
from haystack.core.errors import ComponentError, DeserializationError
|
||||||
|
from haystack.core.pipeline import Pipeline
|
||||||
from haystack.core.serialization import default_from_dict, default_to_dict
|
from haystack.core.serialization import default_from_dict, default_to_dict
|
||||||
from haystack.core.errors import DeserializationError, ComponentError
|
from haystack.dataclasses import Answer, Document, ExtractedAnswer, GeneratedAnswer
|
||||||
from haystack.pipeline import Pipeline
|
|
||||||
from haystack.dataclasses import Document, Answer, GeneratedAnswer, ExtractedAnswer
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"component",
|
"component",
|
||||||
|
@ -4,9 +4,11 @@
|
|||||||
import importlib
|
import importlib
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, Type, TypeVar, Union
|
from typing import Any, Dict, List, Mapping, Optional, Set, TextIO, Tuple, Type, TypeVar, Union
|
||||||
|
|
||||||
import networkx # type:ignore
|
import networkx # type:ignore
|
||||||
|
|
||||||
@ -20,11 +22,14 @@ from haystack.core.errors import (
|
|||||||
)
|
)
|
||||||
from haystack.core.serialization import component_from_dict, component_to_dict
|
from haystack.core.serialization import component_from_dict, component_to_dict
|
||||||
from haystack.core.type_utils import _type_name, _types_are_compatible
|
from haystack.core.type_utils import _type_name, _types_are_compatible
|
||||||
|
from haystack.marshal import Marshaller, YamlMarshaller
|
||||||
|
from haystack.telemetry import pipeline_running
|
||||||
from haystack.utils import is_in_jupyter
|
from haystack.utils import is_in_jupyter
|
||||||
|
|
||||||
from .descriptions import find_pipeline_inputs, find_pipeline_outputs
|
from .descriptions import find_pipeline_inputs, find_pipeline_outputs
|
||||||
from .draw import _to_mermaid_image
|
from .draw import _to_mermaid_image
|
||||||
|
|
||||||
|
DEFAULT_MARSHALLER = YamlMarshaller()
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# We use a generic type to annotate the return value of classmethods,
|
# We use a generic type to annotate the return value of classmethods,
|
||||||
@ -56,6 +61,8 @@ class Pipeline:
|
|||||||
max_loops_allowed: how many times the pipeline can run the same node before throwing an exception.
|
max_loops_allowed: how many times the pipeline can run the same node before throwing an exception.
|
||||||
debug_path: when debug is enabled in `run()`, where to save the debug data.
|
debug_path: when debug is enabled in `run()`, where to save the debug data.
|
||||||
"""
|
"""
|
||||||
|
self._telemetry_runs = 0
|
||||||
|
self._last_telemetry_sent: Optional[datetime] = None
|
||||||
self.metadata = metadata or {}
|
self.metadata = metadata or {}
|
||||||
self.max_loops_allowed = max_loops_allowed
|
self.max_loops_allowed = max_loops_allowed
|
||||||
self.graph = networkx.MultiDiGraph()
|
self.graph = networkx.MultiDiGraph()
|
||||||
@ -194,6 +201,57 @@ class Pipeline:
|
|||||||
|
|
||||||
return pipe
|
return pipe
|
||||||
|
|
||||||
|
def dumps(self, marshaller: Marshaller = DEFAULT_MARSHALLER) -> str:
|
||||||
|
"""
|
||||||
|
Returns the string representation of this pipeline according to the
|
||||||
|
format dictated by the `Marshaller` in use.
|
||||||
|
|
||||||
|
:params marshaller: The Marshaller used to create the string representation. Defaults to
|
||||||
|
`YamlMarshaller`
|
||||||
|
|
||||||
|
:returns: A string representing the pipeline.
|
||||||
|
"""
|
||||||
|
return marshaller.marshal(self.to_dict())
|
||||||
|
|
||||||
|
def dump(self, fp: TextIO, marshaller: Marshaller = DEFAULT_MARSHALLER):
|
||||||
|
"""
|
||||||
|
Writes the string representation of this pipeline to the file-like object
|
||||||
|
passed in the `fp` argument.
|
||||||
|
|
||||||
|
:params fp: A file-like object ready to be written to.
|
||||||
|
:params marshaller: The Marshaller used to create the string representation. Defaults to
|
||||||
|
`YamlMarshaller`.
|
||||||
|
"""
|
||||||
|
fp.write(marshaller.marshal(self.to_dict()))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def loads(cls, data: Union[str, bytes, bytearray], marshaller: Marshaller = DEFAULT_MARSHALLER) -> "Pipeline":
|
||||||
|
"""
|
||||||
|
Creates a `Pipeline` object from the string representation passed in the `data` argument.
|
||||||
|
|
||||||
|
:params data: The string representation of the pipeline, can be `str`, `bytes` or `bytearray`.
|
||||||
|
:params marshaller: the Marshaller used to create the string representation. Defaults to
|
||||||
|
`YamlMarshaller`
|
||||||
|
|
||||||
|
:returns: A `Pipeline` object.
|
||||||
|
"""
|
||||||
|
return cls.from_dict(marshaller.unmarshal(data))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(cls, fp: TextIO, marshaller: Marshaller = DEFAULT_MARSHALLER) -> "Pipeline":
|
||||||
|
"""
|
||||||
|
Creates a `Pipeline` object from the string representation read from the file-like
|
||||||
|
object passed in the `fp` argument.
|
||||||
|
|
||||||
|
:params data: The string representation of the pipeline, can be `str`, `bytes` or `bytearray`.
|
||||||
|
:params fp: A file-like object ready to be read from.
|
||||||
|
:params marshaller: the Marshaller used to create the string representation. Defaults to
|
||||||
|
`YamlMarshaller`
|
||||||
|
|
||||||
|
:returns: A `Pipeline` object.
|
||||||
|
"""
|
||||||
|
return cls.from_dict(marshaller.unmarshal(fp.read()))
|
||||||
|
|
||||||
def add_component(self, name: str, instance: Component) -> None:
|
def add_component(self, name: str, instance: Component) -> None:
|
||||||
"""
|
"""
|
||||||
Create a component for the given component. Components are not connected to anything by default:
|
Create a component for the given component. Components are not connected to anything by default:
|
||||||
@ -545,10 +603,54 @@ class Pipeline:
|
|||||||
f"Input {socket_name} for component {component_name} is already sent by {socket.senders}."
|
f"Input {socket_name} for component {component_name} is already sent by {socket.senders}."
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO: We're ignoring this linting rules for the time being, after we properly optimize this function we'll remove the noqa
|
# TODO: We're ignoring these linting rules for the time being, after we properly optimize this function we'll remove the noqa
|
||||||
def run( # noqa: C901, PLR0912 pylint: disable=too-many-branches
|
def run( # noqa: C901, PLR0912, PLR0915 pylint: disable=too-many-branches
|
||||||
self, data: Dict[str, Any], debug: bool = False
|
self, data: Dict[str, Any], debug: bool = False
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Runs the pipeline with given input data.
|
||||||
|
|
||||||
|
:param data: A dictionary of inputs for the pipeline's components. Each key is a component name
|
||||||
|
and its value is a dictionary of that component's input parameters.
|
||||||
|
:param debug: Set to True to collect and return debug information.
|
||||||
|
:return: A dictionary containing the pipeline's output.
|
||||||
|
:raises PipelineRuntimeError: If a component fails or returns unexpected output.
|
||||||
|
|
||||||
|
Example a - Using named components:
|
||||||
|
Consider a 'Hello' component that takes a 'word' input and outputs a greeting.
|
||||||
|
|
||||||
|
```python
|
||||||
|
@component
|
||||||
|
class Hello:
|
||||||
|
@component.output_types(output=str)
|
||||||
|
def run(self, word: str):
|
||||||
|
return {"output": f"Hello, {word}!"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Create a pipeline with two 'Hello' components connected together:
|
||||||
|
|
||||||
|
```python
|
||||||
|
pipeline = Pipeline()
|
||||||
|
pipeline.add_component("hello", Hello())
|
||||||
|
pipeline.add_component("hello2", Hello())
|
||||||
|
pipeline.connect("hello.output", "hello2.word")
|
||||||
|
result = pipeline.run(data={"hello": {"word": "world"}})
|
||||||
|
```
|
||||||
|
|
||||||
|
This runs the pipeline with the specified input for 'hello', yielding
|
||||||
|
{'hello2': {'output': 'Hello, Hello, world!!'}}.
|
||||||
|
|
||||||
|
Example b - Using flat inputs:
|
||||||
|
You can also pass inputs directly without specifying component names:
|
||||||
|
|
||||||
|
```python
|
||||||
|
result = pipeline.run(data={"word": "world"})
|
||||||
|
```
|
||||||
|
|
||||||
|
The pipeline resolves inputs to the correct components, returning
|
||||||
|
{'hello2': {'output': 'Hello, Hello, world!!'}}.
|
||||||
|
"""
|
||||||
|
pipeline_running(self)
|
||||||
# NOTE: We're assuming data is formatted like so as of now
|
# NOTE: We're assuming data is formatted like so as of now
|
||||||
# data = {
|
# data = {
|
||||||
# "comp1": {"input1": 1, "input2": 2},
|
# "comp1": {"input1": 1, "input2": 2},
|
||||||
@ -563,9 +665,22 @@ class Pipeline:
|
|||||||
# As of now it's here to make sure we don't have failing tests that assume warm_up() is called in run()
|
# As of now it's here to make sure we don't have failing tests that assume warm_up() is called in run()
|
||||||
self.warm_up()
|
self.warm_up()
|
||||||
|
|
||||||
|
# check whether the data is a nested dictionary of component inputs where each key is a component name
|
||||||
|
# and each value is a dictionary of input parameters for that component
|
||||||
|
is_nested_component_input = all(isinstance(value, dict) for value in data.values())
|
||||||
|
if not is_nested_component_input:
|
||||||
|
# flat input, a dict where keys are input names and values are the corresponding values
|
||||||
|
# we need to convert it to a nested dictionary of component inputs and then run the pipeline
|
||||||
|
# just like in the previous case
|
||||||
|
data, unresolved_inputs = self._prepare_component_input_data(data)
|
||||||
|
if unresolved_inputs:
|
||||||
|
logger.warning(
|
||||||
|
"Inputs %s were not matched to any component inputs, please check your run parameters.",
|
||||||
|
list(unresolved_inputs.keys()),
|
||||||
|
)
|
||||||
|
|
||||||
# Raise if input is malformed in some way
|
# Raise if input is malformed in some way
|
||||||
self._validate_input(data)
|
self._validate_input(data)
|
||||||
|
|
||||||
# NOTE: The above NOTE and TODO are technically not true.
|
# NOTE: The above NOTE and TODO are technically not true.
|
||||||
# This implementation of run supports only the first format, but the second format is actually
|
# This implementation of run supports only the first format, but the second format is actually
|
||||||
# never received by this method. It's handled by the `run()` method of the `Pipeline` class
|
# never received by this method. It's handled by the `run()` method of the `Pipeline` class
|
||||||
@ -774,6 +889,44 @@ class Pipeline:
|
|||||||
|
|
||||||
return final_outputs
|
return final_outputs
|
||||||
|
|
||||||
|
def _prepare_component_input_data(self, data: Dict[str, Any]) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Organizes input data for pipeline components and identifies any inputs that are not matched to any
|
||||||
|
component's input slots.
|
||||||
|
|
||||||
|
This method processes a flat dictionary of input data, where each key-value pair represents an input name
|
||||||
|
and its corresponding value. It distributes these inputs to the appropriate pipeline components based on
|
||||||
|
their input requirements. Inputs that don't match any component's input slots are classified as unresolved.
|
||||||
|
|
||||||
|
:param data: A dictionary with input names as keys and input values as values.
|
||||||
|
:type data: Dict[str, Any]
|
||||||
|
:return: A tuple containing two elements:
|
||||||
|
1. A dictionary mapping component names to their respective matched inputs.
|
||||||
|
2. A dictionary of inputs that were not matched to any component, termed as unresolved keyword arguments.
|
||||||
|
:rtype: Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]
|
||||||
|
"""
|
||||||
|
pipeline_input_data: Dict[str, Dict[str, Any]] = defaultdict(dict)
|
||||||
|
unresolved_kwargs = {}
|
||||||
|
|
||||||
|
# Retrieve the input slots for each component in the pipeline
|
||||||
|
available_inputs: Dict[str, Dict[str, Any]] = self.inputs()
|
||||||
|
|
||||||
|
# Go through all provided to distribute them to the appropriate component inputs
|
||||||
|
for input_name, input_value in data.items():
|
||||||
|
resolved_at_least_once = False
|
||||||
|
|
||||||
|
# Check each component to see if it has a slot for the current kwarg
|
||||||
|
for component_name, component_inputs in available_inputs.items():
|
||||||
|
if input_name in component_inputs:
|
||||||
|
# If a match is found, add the kwarg to the component's input data
|
||||||
|
pipeline_input_data[component_name][input_name] = input_value
|
||||||
|
resolved_at_least_once = True
|
||||||
|
|
||||||
|
if not resolved_at_least_once:
|
||||||
|
unresolved_kwargs[input_name] = input_value
|
||||||
|
|
||||||
|
return pipeline_input_data, unresolved_kwargs
|
||||||
|
|
||||||
|
|
||||||
def _connections_status(
|
def _connections_status(
|
||||||
sender_node: str, receiver_node: str, sender_sockets: List[OutputSocket], receiver_sockets: List[InputSocket]
|
sender_node: str, receiver_node: str, sender_sockets: List[OutputSocket], receiver_sockets: List[InputSocket]
|
||||||
|
@ -1,200 +0,0 @@
|
|||||||
from collections import defaultdict
|
|
||||||
from typing import Any, Dict, Optional, Union, TextIO, Tuple
|
|
||||||
from pathlib import Path
|
|
||||||
import datetime
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from haystack.core.pipeline import Pipeline as _pipeline
|
|
||||||
from haystack.telemetry import pipeline_running
|
|
||||||
from haystack.marshal import Marshaller, YamlMarshaller
|
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_MARSHALLER = YamlMarshaller()
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class Pipeline(_pipeline):
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
metadata: Optional[Dict[str, Any]] = None,
|
|
||||||
max_loops_allowed: int = 100,
|
|
||||||
debug_path: Union[Path, str] = Path(".haystack_debug/"),
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Creates the Pipeline.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
metadata: arbitrary dictionary to store metadata about this pipeline. Make sure all the values contained in
|
|
||||||
this dictionary can be serialized and deserialized if you wish to save this pipeline to file with
|
|
||||||
`save_pipelines()/load_pipelines()`.
|
|
||||||
max_loops_allowed: how many times the pipeline can run the same node before throwing an exception.
|
|
||||||
debug_path: when debug is enabled in `run()`, where to save the debug data.
|
|
||||||
"""
|
|
||||||
self._telemetry_runs = 0
|
|
||||||
self._last_telemetry_sent: Optional[datetime.datetime] = None
|
|
||||||
super().__init__(metadata=metadata, max_loops_allowed=max_loops_allowed, debug_path=debug_path)
|
|
||||||
|
|
||||||
def run(self, data: Dict[str, Any], debug: bool = False) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Runs the pipeline with given input data.
|
|
||||||
|
|
||||||
:param data: A dictionary of inputs for the pipeline's components. Each key is a component name
|
|
||||||
and its value is a dictionary of that component's input parameters.
|
|
||||||
:param debug: Set to True to collect and return debug information.
|
|
||||||
:return: A dictionary containing the pipeline's output.
|
|
||||||
:raises PipelineRuntimeError: If a component fails or returns unexpected output.
|
|
||||||
|
|
||||||
Example a - Using named components:
|
|
||||||
Consider a 'Hello' component that takes a 'word' input and outputs a greeting.
|
|
||||||
|
|
||||||
```python
|
|
||||||
@component
|
|
||||||
class Hello:
|
|
||||||
@component.output_types(output=str)
|
|
||||||
def run(self, word: str):
|
|
||||||
return {"output": f"Hello, {word}!"}
|
|
||||||
```
|
|
||||||
|
|
||||||
Create a pipeline with two 'Hello' components connected together:
|
|
||||||
|
|
||||||
```python
|
|
||||||
pipeline = Pipeline()
|
|
||||||
pipeline.add_component("hello", Hello())
|
|
||||||
pipeline.add_component("hello2", Hello())
|
|
||||||
pipeline.connect("hello.output", "hello2.word")
|
|
||||||
result = pipeline.run(data={"hello": {"word": "world"}})
|
|
||||||
```
|
|
||||||
|
|
||||||
This runs the pipeline with the specified input for 'hello', yielding
|
|
||||||
{'hello2': {'output': 'Hello, Hello, world!!'}}.
|
|
||||||
|
|
||||||
Example b - Using flat inputs:
|
|
||||||
You can also pass inputs directly without specifying component names:
|
|
||||||
|
|
||||||
```python
|
|
||||||
result = pipeline.run(data={"word": "world"})
|
|
||||||
```
|
|
||||||
|
|
||||||
The pipeline resolves inputs to the correct components, returning
|
|
||||||
{'hello2': {'output': 'Hello, Hello, world!!'}}.
|
|
||||||
"""
|
|
||||||
# check whether the data is a nested dictionary of component inputs where each key is a component name
|
|
||||||
# and each value is a dictionary of input parameters for that component
|
|
||||||
is_nested_component_input = all(isinstance(value, dict) for value in data.values())
|
|
||||||
if is_nested_component_input:
|
|
||||||
return self._run_internal(data=data, debug=debug)
|
|
||||||
else:
|
|
||||||
# flat input, a dict where keys are input names and values are the corresponding values
|
|
||||||
# we need to convert it to a nested dictionary of component inputs and then run the pipeline
|
|
||||||
# just like in the previous case
|
|
||||||
pipeline_inputs, unresolved_inputs = self._prepare_component_input_data(data)
|
|
||||||
if unresolved_inputs:
|
|
||||||
logger.warning(
|
|
||||||
"Inputs %s were not matched to any component inputs, please check your run parameters.",
|
|
||||||
list(unresolved_inputs.keys()),
|
|
||||||
)
|
|
||||||
|
|
||||||
return self._run_internal(data=pipeline_inputs, debug=debug)
|
|
||||||
|
|
||||||
def _run_internal(self, data: Dict[str, Any], debug: bool = False) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Runs the pipeline by invoking the underlying run to initiate the pipeline execution.
|
|
||||||
|
|
||||||
:params data: the inputs to give to the input components of the Pipeline.
|
|
||||||
:params debug: whether to collect and return debug information.
|
|
||||||
|
|
||||||
:returns: A dictionary with the outputs of the output components of the Pipeline.
|
|
||||||
|
|
||||||
:raises PipelineRuntimeError: if any of the components fail or return unexpected output.
|
|
||||||
"""
|
|
||||||
pipeline_running(self)
|
|
||||||
return super().run(data=data, debug=debug)
|
|
||||||
|
|
||||||
def dumps(self, marshaller: Marshaller = DEFAULT_MARSHALLER) -> str:
|
|
||||||
"""
|
|
||||||
Returns the string representation of this pipeline according to the
|
|
||||||
format dictated by the `Marshaller` in use.
|
|
||||||
|
|
||||||
:params marshaller: The Marshaller used to create the string representation. Defaults to
|
|
||||||
`YamlMarshaller`
|
|
||||||
|
|
||||||
:returns: A string representing the pipeline.
|
|
||||||
"""
|
|
||||||
return marshaller.marshal(self.to_dict())
|
|
||||||
|
|
||||||
def dump(self, fp: TextIO, marshaller: Marshaller = DEFAULT_MARSHALLER):
|
|
||||||
"""
|
|
||||||
Writes the string representation of this pipeline to the file-like object
|
|
||||||
passed in the `fp` argument.
|
|
||||||
|
|
||||||
:params fp: A file-like object ready to be written to.
|
|
||||||
:params marshaller: The Marshaller used to create the string representation. Defaults to
|
|
||||||
`YamlMarshaller`.
|
|
||||||
"""
|
|
||||||
fp.write(marshaller.marshal(self.to_dict()))
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def loads(cls, data: Union[str, bytes, bytearray], marshaller: Marshaller = DEFAULT_MARSHALLER) -> "Pipeline":
|
|
||||||
"""
|
|
||||||
Creates a `Pipeline` object from the string representation passed in the `data` argument.
|
|
||||||
|
|
||||||
:params data: The string representation of the pipeline, can be `str`, `bytes` or `bytearray`.
|
|
||||||
:params marshaller: the Marshaller used to create the string representation. Defaults to
|
|
||||||
`YamlMarshaller`
|
|
||||||
|
|
||||||
:returns: A `Pipeline` object.
|
|
||||||
"""
|
|
||||||
return cls.from_dict(marshaller.unmarshal(data))
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def load(cls, fp: TextIO, marshaller: Marshaller = DEFAULT_MARSHALLER) -> "Pipeline":
|
|
||||||
"""
|
|
||||||
Creates a `Pipeline` object from the string representation read from the file-like
|
|
||||||
object passed in the `fp` argument.
|
|
||||||
|
|
||||||
:params data: The string representation of the pipeline, can be `str`, `bytes` or `bytearray`.
|
|
||||||
:params fp: A file-like object ready to be read from.
|
|
||||||
:params marshaller: the Marshaller used to create the string representation. Defaults to
|
|
||||||
`YamlMarshaller`
|
|
||||||
|
|
||||||
:returns: A `Pipeline` object.
|
|
||||||
"""
|
|
||||||
return cls.from_dict(marshaller.unmarshal(fp.read()))
|
|
||||||
|
|
||||||
def _prepare_component_input_data(self, data: Dict[str, Any]) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]:
|
|
||||||
"""
|
|
||||||
Organizes input data for pipeline components and identifies any inputs that are not matched to any
|
|
||||||
component's input slots.
|
|
||||||
|
|
||||||
This method processes a flat dictionary of input data, where each key-value pair represents an input name
|
|
||||||
and its corresponding value. It distributes these inputs to the appropriate pipeline components based on
|
|
||||||
their input requirements. Inputs that don't match any component's input slots are classified as unresolved.
|
|
||||||
|
|
||||||
:param data: A dictionary with input names as keys and input values as values.
|
|
||||||
:type data: Dict[str, Any]
|
|
||||||
:return: A tuple containing two elements:
|
|
||||||
1. A dictionary mapping component names to their respective matched inputs.
|
|
||||||
2. A dictionary of inputs that were not matched to any component, termed as unresolved keyword arguments.
|
|
||||||
:rtype: Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]
|
|
||||||
"""
|
|
||||||
pipeline_input_data: Dict[str, Dict[str, Any]] = defaultdict(dict)
|
|
||||||
unresolved_kwargs = {}
|
|
||||||
|
|
||||||
# Retrieve the input slots for each component in the pipeline
|
|
||||||
available_inputs: Dict[str, Dict[str, Any]] = self.inputs()
|
|
||||||
|
|
||||||
# Go through all provided to distribute them to the appropriate component inputs
|
|
||||||
for input_name, input_value in data.items():
|
|
||||||
resolved_at_least_once = False
|
|
||||||
|
|
||||||
# Check each component to see if it has a slot for the current kwarg
|
|
||||||
for component_name, component_inputs in available_inputs.items():
|
|
||||||
if input_name in component_inputs:
|
|
||||||
# If a match is found, add the kwarg to the component's input data
|
|
||||||
pipeline_input_data[component_name][input_name] = input_value
|
|
||||||
resolved_at_least_once = True
|
|
||||||
|
|
||||||
if not resolved_at_least_once:
|
|
||||||
unresolved_kwargs[input_name] = input_value
|
|
||||||
|
|
||||||
return pipeline_input_data, unresolved_kwargs
|
|
@ -1,17 +1,18 @@
|
|||||||
from typing import Any, Dict, Optional, TYPE_CHECKING, List, Tuple
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from collections import defaultdict
|
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
import yaml
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import posthog
|
import posthog
|
||||||
|
import yaml
|
||||||
|
|
||||||
from haystack.telemetry._environment import collect_system_specs
|
from haystack.telemetry._environment import collect_system_specs
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from haystack.pipeline import Pipeline
|
from haystack.core.pipeline import Pipeline
|
||||||
|
|
||||||
|
|
||||||
HAYSTACK_TELEMETRY_ENABLED = "HAYSTACK_TELEMETRY_ENABLED"
|
HAYSTACK_TELEMETRY_ENABLED = "HAYSTACK_TELEMETRY_ENABLED"
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
enhancements:
|
||||||
|
- |
|
||||||
|
Merge `Pipeline`s definitions into a single `Pipeline` class.
|
||||||
|
The class in the `haystack.pipeline` package has been deleted and only `haystack.core.pipeline` exists now.
|
@ -7,6 +7,7 @@ from unittest.mock import patch
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from haystack.core.component import component
|
||||||
from haystack.core.component.types import InputSocket, OutputSocket
|
from haystack.core.component.types import InputSocket, OutputSocket
|
||||||
from haystack.core.errors import PipelineDrawingError, PipelineError, PipelineRuntimeError
|
from haystack.core.errors import PipelineDrawingError, PipelineError, PipelineRuntimeError
|
||||||
from haystack.core.pipeline import Pipeline
|
from haystack.core.pipeline import Pipeline
|
||||||
@ -16,6 +17,163 @@ from haystack.testing.sample_components import AddFixedValue, Double
|
|||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
@component
|
||||||
|
class FakeComponent:
|
||||||
|
def __init__(self, an_init_param: Optional[str] = None):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@component.output_types(value=str)
|
||||||
|
def run(self, input_: str):
|
||||||
|
return {"value": input_}
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_resolution_simple_input():
|
||||||
|
@component
|
||||||
|
class Hello:
|
||||||
|
@component.output_types(output=str)
|
||||||
|
def run(self, word: str):
|
||||||
|
"""
|
||||||
|
Takes a string in input and returns "Hello, <string>!"
|
||||||
|
in output.
|
||||||
|
"""
|
||||||
|
return {"output": f"Hello, {word}!"}
|
||||||
|
|
||||||
|
pipeline = Pipeline()
|
||||||
|
pipeline.add_component("hello", Hello())
|
||||||
|
pipeline.add_component("hello2", Hello())
|
||||||
|
|
||||||
|
pipeline.connect("hello.output", "hello2.word")
|
||||||
|
result = pipeline.run(data={"hello": {"word": "world"}})
|
||||||
|
assert result == {"hello2": {"output": "Hello, Hello, world!!"}}
|
||||||
|
|
||||||
|
result = pipeline.run(data={"word": "world"})
|
||||||
|
assert result == {"hello2": {"output": "Hello, Hello, world!!"}}
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_resolution_wrong_input_name(caplog):
|
||||||
|
@component
|
||||||
|
class Hello:
|
||||||
|
@component.output_types(output=str)
|
||||||
|
def run(self, who: str):
|
||||||
|
"""
|
||||||
|
Takes a string in input and returns "Hello, <string>!"
|
||||||
|
in output.
|
||||||
|
"""
|
||||||
|
return {"output": f"Hello, {who}!"}
|
||||||
|
|
||||||
|
pipeline = Pipeline()
|
||||||
|
pipeline.add_component("hello", Hello())
|
||||||
|
pipeline.add_component("hello2", Hello())
|
||||||
|
|
||||||
|
pipeline.connect("hello.output", "hello2.who")
|
||||||
|
|
||||||
|
# test case with nested component inputs
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
pipeline.run(data={"hello": {"non_existing_input": "world"}})
|
||||||
|
|
||||||
|
# test case with flat component inputs
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
pipeline.run(data={"non_existing_input": "world"})
|
||||||
|
|
||||||
|
# important to check that the warning is logged for UX purposes, leave it here
|
||||||
|
assert "were not matched to any component" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_resolution_with_mixed_correct_and_incorrect_input_names(caplog):
|
||||||
|
@component
|
||||||
|
class Hello:
|
||||||
|
@component.output_types(output=str)
|
||||||
|
def run(self, who: str):
|
||||||
|
"""
|
||||||
|
Takes a string in input and returns "Hello, <string>!"
|
||||||
|
in output.
|
||||||
|
"""
|
||||||
|
return {"output": f"Hello, {who}!"}
|
||||||
|
|
||||||
|
pipeline = Pipeline()
|
||||||
|
pipeline.add_component("hello", Hello())
|
||||||
|
pipeline.add_component("hello2", Hello())
|
||||||
|
|
||||||
|
pipeline.connect("hello.output", "hello2.who")
|
||||||
|
|
||||||
|
# test case with nested component inputs
|
||||||
|
# this will raise ValueError because hello component does not have an input named "non_existing_input"
|
||||||
|
# even though it has an input named "who"
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
pipeline.run(data={"hello": {"non_existing_input": "world", "who": "world"}})
|
||||||
|
|
||||||
|
# test case with flat component inputs
|
||||||
|
# this will not raise ValueError because the input "who" will be resolved to the correct component
|
||||||
|
# and we'll log a warning for the input "non_existing_input" which was not resolved
|
||||||
|
result = pipeline.run(data={"non_existing_input": "world", "who": "world"})
|
||||||
|
assert result == {"hello2": {"output": "Hello, Hello, world!!"}}
|
||||||
|
|
||||||
|
# important to check that the warning is logged for UX purposes, leave it here
|
||||||
|
assert "were not matched to any component" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_resolution_duplicate_input_names_across_components():
|
||||||
|
@component
|
||||||
|
class Hello:
|
||||||
|
@component.output_types(output=str)
|
||||||
|
def run(self, who: str, what: str):
|
||||||
|
return {"output": f"Hello {who} {what}!"}
|
||||||
|
|
||||||
|
pipe = Pipeline()
|
||||||
|
pipe.add_component("hello", Hello())
|
||||||
|
pipe.add_component("hello2", Hello())
|
||||||
|
|
||||||
|
pipe.connect("hello.output", "hello2.who")
|
||||||
|
|
||||||
|
result = pipe.run(data={"what": "Haystack", "who": "world"})
|
||||||
|
assert result == {"hello2": {"output": "Hello Hello world Haystack! Haystack!"}}
|
||||||
|
|
||||||
|
resolved, _ = pipe._prepare_component_input_data(data={"what": "Haystack", "who": "world"})
|
||||||
|
|
||||||
|
# why does hello2 have only one input? Because who of hello2 is inserted from hello.output
|
||||||
|
assert resolved == {"hello": {"what": "Haystack", "who": "world"}, "hello2": {"what": "Haystack"}}
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_dumps(test_files_path):
|
||||||
|
pipeline = Pipeline()
|
||||||
|
pipeline.add_component("Comp1", FakeComponent("Foo"))
|
||||||
|
pipeline.add_component("Comp2", FakeComponent())
|
||||||
|
pipeline.connect("Comp1.value", "Comp2.input_")
|
||||||
|
pipeline.max_loops_allowed = 99
|
||||||
|
result = pipeline.dumps()
|
||||||
|
with open(f"{test_files_path}/yaml/test_pipeline.yaml", "r") as f:
|
||||||
|
assert f.read() == result
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_loads(test_files_path):
|
||||||
|
with open(f"{test_files_path}/yaml/test_pipeline.yaml", "r") as f:
|
||||||
|
pipeline = Pipeline.loads(f.read())
|
||||||
|
assert pipeline.max_loops_allowed == 99
|
||||||
|
assert isinstance(pipeline.get_component("Comp1"), FakeComponent)
|
||||||
|
assert isinstance(pipeline.get_component("Comp2"), FakeComponent)
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_dump(test_files_path, tmp_path):
|
||||||
|
pipeline = Pipeline()
|
||||||
|
pipeline.add_component("Comp1", FakeComponent("Foo"))
|
||||||
|
pipeline.add_component("Comp2", FakeComponent())
|
||||||
|
pipeline.connect("Comp1.value", "Comp2.input_")
|
||||||
|
pipeline.max_loops_allowed = 99
|
||||||
|
with open(tmp_path / "out.yaml", "w") as f:
|
||||||
|
pipeline.dump(f)
|
||||||
|
# re-open and ensure it's the same data as the test file
|
||||||
|
with open(f"{test_files_path}/yaml/test_pipeline.yaml", "r") as test_f, open(tmp_path / "out.yaml", "r") as f:
|
||||||
|
assert f.read() == test_f.read()
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_load(test_files_path):
|
||||||
|
with open(f"{test_files_path}/yaml/test_pipeline.yaml", "r") as f:
|
||||||
|
pipeline = Pipeline.load(f)
|
||||||
|
assert pipeline.max_loops_allowed == 99
|
||||||
|
assert isinstance(pipeline.get_component("Comp1"), FakeComponent)
|
||||||
|
assert isinstance(pipeline.get_component("Comp2"), FakeComponent)
|
||||||
|
|
||||||
|
|
||||||
@patch("haystack.core.pipeline.pipeline._to_mermaid_image")
|
@patch("haystack.core.pipeline.pipeline._to_mermaid_image")
|
||||||
@patch("haystack.core.pipeline.pipeline.is_in_jupyter")
|
@patch("haystack.core.pipeline.pipeline.is_in_jupyter")
|
||||||
@patch("IPython.display.Image")
|
@patch("IPython.display.Image")
|
||||||
|
@ -2,11 +2,11 @@ components:
|
|||||||
Comp1:
|
Comp1:
|
||||||
init_parameters:
|
init_parameters:
|
||||||
an_init_param: null
|
an_init_param: null
|
||||||
type: test.test_pipeline.TestComponent
|
type: test.core.pipeline.test_pipeline.FakeComponent
|
||||||
Comp2:
|
Comp2:
|
||||||
init_parameters:
|
init_parameters:
|
||||||
an_init_param: null
|
an_init_param: null
|
||||||
type: test.test_pipeline.TestComponent
|
type: test.core.pipeline.test_pipeline.FakeComponent
|
||||||
connections:
|
connections:
|
||||||
- receiver: Comp2.input_
|
- receiver: Comp2.input_
|
||||||
sender: Comp1.value
|
sender: Comp1.value
|
||||||
|
@ -1,166 +0,0 @@
|
|||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from haystack import Pipeline, component
|
|
||||||
|
|
||||||
|
|
||||||
@component
|
|
||||||
class TestComponent:
|
|
||||||
def __init__(self, an_init_param: Optional[str] = None):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@component.output_types(value=str)
|
|
||||||
def run(self, input_: str):
|
|
||||||
return {"value": input_}
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def pipeline():
|
|
||||||
return Pipeline()
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_dumps(pipeline, test_files_path):
|
|
||||||
pipeline.add_component("Comp1", TestComponent("Foo"))
|
|
||||||
pipeline.add_component("Comp2", TestComponent())
|
|
||||||
pipeline.connect("Comp1.value", "Comp2.input_")
|
|
||||||
pipeline.max_loops_allowed = 99
|
|
||||||
result = pipeline.dumps()
|
|
||||||
with open(f"{test_files_path}/yaml/test_pipeline.yaml", "r") as f:
|
|
||||||
assert f.read() == result
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_loads(test_files_path):
|
|
||||||
with open(f"{test_files_path}/yaml/test_pipeline.yaml", "r") as f:
|
|
||||||
pipeline = Pipeline.loads(f.read())
|
|
||||||
assert pipeline.max_loops_allowed == 99
|
|
||||||
assert isinstance(pipeline.get_component("Comp1"), TestComponent)
|
|
||||||
assert isinstance(pipeline.get_component("Comp2"), TestComponent)
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_dump(pipeline, test_files_path, tmp_path):
|
|
||||||
pipeline.add_component("Comp1", TestComponent("Foo"))
|
|
||||||
pipeline.add_component("Comp2", TestComponent())
|
|
||||||
pipeline.connect("Comp1.value", "Comp2.input_")
|
|
||||||
pipeline.max_loops_allowed = 99
|
|
||||||
with open(tmp_path / "out.yaml", "w") as f:
|
|
||||||
pipeline.dump(f)
|
|
||||||
# re-open and ensure it's the same data as the test file
|
|
||||||
with open(f"{test_files_path}/yaml/test_pipeline.yaml", "r") as test_f, open(tmp_path / "out.yaml", "r") as f:
|
|
||||||
assert f.read() == test_f.read()
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_load(test_files_path):
|
|
||||||
with open(f"{test_files_path}/yaml/test_pipeline.yaml", "r") as f:
|
|
||||||
pipeline = Pipeline.load(f)
|
|
||||||
assert pipeline.max_loops_allowed == 99
|
|
||||||
assert isinstance(pipeline.get_component("Comp1"), TestComponent)
|
|
||||||
assert isinstance(pipeline.get_component("Comp2"), TestComponent)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.unit
|
|
||||||
def test_pipeline_resolution_simple_input():
|
|
||||||
@component
|
|
||||||
class Hello:
|
|
||||||
@component.output_types(output=str)
|
|
||||||
def run(self, word: str):
|
|
||||||
"""
|
|
||||||
Takes a string in input and returns "Hello, <string>!"
|
|
||||||
in output.
|
|
||||||
"""
|
|
||||||
return {"output": f"Hello, {word}!"}
|
|
||||||
|
|
||||||
pipeline = Pipeline()
|
|
||||||
pipeline.add_component("hello", Hello())
|
|
||||||
pipeline.add_component("hello2", Hello())
|
|
||||||
|
|
||||||
pipeline.connect("hello.output", "hello2.word")
|
|
||||||
result = pipeline.run(data={"hello": {"word": "world"}})
|
|
||||||
assert result == {"hello2": {"output": "Hello, Hello, world!!"}}
|
|
||||||
|
|
||||||
result = pipeline.run(data={"word": "world"})
|
|
||||||
assert result == {"hello2": {"output": "Hello, Hello, world!!"}}
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_resolution_wrong_input_name(caplog):
|
|
||||||
@component
|
|
||||||
class Hello:
|
|
||||||
@component.output_types(output=str)
|
|
||||||
def run(self, who: str):
|
|
||||||
"""
|
|
||||||
Takes a string in input and returns "Hello, <string>!"
|
|
||||||
in output.
|
|
||||||
"""
|
|
||||||
return {"output": f"Hello, {who}!"}
|
|
||||||
|
|
||||||
pipeline = Pipeline()
|
|
||||||
pipeline.add_component("hello", Hello())
|
|
||||||
pipeline.add_component("hello2", Hello())
|
|
||||||
|
|
||||||
pipeline.connect("hello.output", "hello2.who")
|
|
||||||
|
|
||||||
# test case with nested component inputs
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
pipeline.run(data={"hello": {"non_existing_input": "world"}})
|
|
||||||
|
|
||||||
# test case with flat component inputs
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
pipeline.run(data={"non_existing_input": "world"})
|
|
||||||
|
|
||||||
# important to check that the warning is logged for UX purposes, leave it here
|
|
||||||
assert "were not matched to any component" in caplog.text
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_resolution_with_mixed_correct_and_incorrect_input_names(caplog):
|
|
||||||
@component
|
|
||||||
class Hello:
|
|
||||||
@component.output_types(output=str)
|
|
||||||
def run(self, who: str):
|
|
||||||
"""
|
|
||||||
Takes a string in input and returns "Hello, <string>!"
|
|
||||||
in output.
|
|
||||||
"""
|
|
||||||
return {"output": f"Hello, {who}!"}
|
|
||||||
|
|
||||||
pipeline = Pipeline()
|
|
||||||
pipeline.add_component("hello", Hello())
|
|
||||||
pipeline.add_component("hello2", Hello())
|
|
||||||
|
|
||||||
pipeline.connect("hello.output", "hello2.who")
|
|
||||||
|
|
||||||
# test case with nested component inputs
|
|
||||||
# this will raise ValueError because hello component does not have an input named "non_existing_input"
|
|
||||||
# even though it has an input named "who"
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
pipeline.run(data={"hello": {"non_existing_input": "world", "who": "world"}})
|
|
||||||
|
|
||||||
# test case with flat component inputs
|
|
||||||
# this will not raise ValueError because the input "who" will be resolved to the correct component
|
|
||||||
# and we'll log a warning for the input "non_existing_input" which was not resolved
|
|
||||||
result = pipeline.run(data={"non_existing_input": "world", "who": "world"})
|
|
||||||
assert result == {"hello2": {"output": "Hello, Hello, world!!"}}
|
|
||||||
|
|
||||||
# important to check that the warning is logged for UX purposes, leave it here
|
|
||||||
assert "were not matched to any component" in caplog.text
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_resolution_duplicate_input_names_across_components():
|
|
||||||
@component
|
|
||||||
class Hello:
|
|
||||||
@component.output_types(output=str)
|
|
||||||
def run(self, who: str, what: str):
|
|
||||||
return {"output": f"Hello {who} {what}!"}
|
|
||||||
|
|
||||||
pipe = Pipeline()
|
|
||||||
pipe.add_component("hello", Hello())
|
|
||||||
pipe.add_component("hello2", Hello())
|
|
||||||
|
|
||||||
pipe.connect("hello.output", "hello2.who")
|
|
||||||
|
|
||||||
result = pipe.run(data={"what": "Haystack", "who": "world"})
|
|
||||||
assert result == {"hello2": {"output": "Hello Hello world Haystack! Haystack!"}}
|
|
||||||
|
|
||||||
resolved, _ = pipe._prepare_component_input_data(data={"what": "Haystack", "who": "world"})
|
|
||||||
|
|
||||||
# why does hello2 have only one input? Because who of hello2 is inserted from hello.output
|
|
||||||
assert resolved == {"hello": {"what": "Haystack", "who": "world"}, "hello2": {"what": "Haystack"}}
|
|
Loading…
x
Reference in New Issue
Block a user