feat: Add from_template class method to Pipeline (#7240)

* move templating code under the core package

* make from_predefined part of the Pipeline API

* add tests

* amend release notes

* import under haystack package

* Apply suggestions from code review

Co-authored-by: David S. Batista <dsbatista@gmail.com>

* from_predefined -> from_template

* remove template inheritance for more readability

---------

Co-authored-by: David S. Batista <dsbatista@gmail.com>
This commit is contained in:
Massimiliano Pippi 2024-02-29 12:23:32 +01:00 committed by GitHub
parent 4766efbf19
commit e7809b6fea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 78 additions and 85 deletions

View File

@ -1,6 +1,6 @@
from haystack.core.component import component
from haystack.core.errors import ComponentError, DeserializationError
from haystack.core.pipeline import Pipeline
from haystack.core.pipeline import Pipeline, PredefinedPipeline
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.dataclasses import Answer, Document, ExtractedAnswer, GeneratedAnswer
import haystack.logging
@ -16,6 +16,7 @@ __all__ = [
"DeserializationError",
"ComponentError",
"Pipeline",
"PredefinedPipeline",
"Document",
"Answer",
"GeneratedAnswer",

View File

@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from haystack.core.pipeline.pipeline import Pipeline
from .pipeline import Pipeline
from .template import PredefinedPipeline
__all__ = ["Pipeline"]
__all__ = ["Pipeline", "PredefinedPipeline"]

View File

@ -20,6 +20,7 @@ from haystack.core.errors import (
PipelineMaxLoops,
PipelineRuntimeError,
PipelineValidationError,
PipelineUnmarshalError,
)
from haystack.core.serialization import component_from_dict, component_to_dict
from haystack.core.type_utils import _type_name, _types_are_compatible
@ -30,6 +31,7 @@ from haystack import tracing
from .descriptions import find_pipeline_inputs, find_pipeline_outputs
from .draw import _to_mermaid_image
from .template import PipelineTemplate, PredefinedPipeline
DEFAULT_MARSHALLER = YamlMarshaller()
logger = logging.getLogger(__name__)
@ -983,6 +985,30 @@ class Pipeline:
return pipeline_input_data, unresolved_kwargs
@classmethod
def from_template(
    cls, predefined_pipeline: PredefinedPipeline, template_params: Optional[Dict[str, Any]] = None
) -> "Pipeline":
    """
    Create a Pipeline from a predefined template. See `PredefinedPipeline` for available options.

    :param predefined_pipeline: The predefined pipeline to use.
    :param template_params: An optional dictionary of parameters to use when rendering the pipeline template.
    :returns: An instance of `Pipeline`.
    :raises PipelineUnmarshalError: If the rendered template cannot be loaded as a pipeline. The error
        message includes the rendered source, and the original exception is attached as its cause.
    """
    tpl = PipelineTemplate.from_predefined(predefined_pipeline)
    # If tpl.render() fails, we let the original error bubble up unchanged
    rendered = tpl.render(template_params)

    # If there was a problem with the rendered version of the template,
    # we include the rendered source in the error message for debugging
    try:
        return cls.loads(rendered)
    except Exception as e:
        msg = f"Error unmarshalling pipeline: {e}\n"
        msg += f"Source:\n{rendered}"
        # Chain explicitly so the traceback shows the loader failure as the direct cause
        raise PipelineUnmarshalError(msg) from e
def _connections_status(
sender_node: str, receiver_node: str, sender_sockets: List[OutputSocket], receiver_sockets: List[InputSocket]

View File

@ -1,6 +1,6 @@
{% extends "base.yaml.jinja2" %}
---
{% block components %}
components:
generator:
init_parameters:
api_key:
@ -12,13 +12,13 @@
prompt_builder:
init_parameters:
template: {% raw %}"Answer the question {{question}}.\n\nAnswer:"{% endraw +%}
template: {% raw %}"Answer the question {{question}}.\n\nAnswer:\n"{% endraw %}
type: "haystack.components.builders.prompt_builder.PromptBuilder"
{% endblock %}
{% block connections %}
connections:
- receiver: generator.prompt
sender: prompt_builder.prompt
{% endblock %}
metadata:
{}

View File

@ -1,6 +1,6 @@
{% extends "base.yaml.jinja2" %}
---
{% block components %}
components:
cleaner:
init_parameters:
remove_empty_lines: true
@ -73,9 +73,8 @@
type: "haystack.document_stores.in_memory.document_store.InMemoryDocumentStore"
policy: "FAIL"
type: "haystack.components.writers.document_writer.DocumentWriter"
{% endblock %}
{% block connections %}
connections:
- receiver: text_file_converter.sources
sender: file_type_router.text/plain
- receiver: doc_joiner.documents
@ -94,4 +93,6 @@
sender: splitter.documents
- receiver: writer.documents
sender: embedder.documents
{% endblock %}
metadata:
{}

View File

@ -1,6 +1,6 @@
{% extends "base.yaml.jinja2" %}
---
{% block components %}
components:
answer_builder:
init_parameters: {}
type: "haystack.components.builders.answer_builder.AnswerBuilder"
@ -47,9 +47,8 @@
template: |
{% raw %}"\nGiven these documents, answer the question.\n\nDocuments:\n{% for doc in documents %}\n{{ doc.content }}\n {% endfor %}\n\nQuestion: {{question}}\n\nAnswer:\n"{% endraw %}
type: "haystack.components.builders.prompt_builder.PromptBuilder"
{% endblock %}
{% block connections %}
connections:
- receiver: retriever.query_embedding
sender: text_embedder.embedding
- receiver: prompt_builder.documents
@ -62,4 +61,6 @@
sender: generator.replies
- receiver: answer_builder.meta
sender: generator.meta
{% endblock %}
metadata:
{}

View File

@ -4,9 +4,6 @@ from typing import Dict, Any, Optional, Union
from jinja2 import meta, TemplateSyntaxError, Environment, PackageLoader
from haystack import Pipeline
from haystack.core.errors import PipelineUnmarshalError
TEMPLATE_FILE_EXTENSION = ".yaml.jinja2"
TEMPLATE_HOME_DIR = Path(__file__).resolve().parent / "predefined"
@ -17,9 +14,6 @@ class PredefinedPipeline(Enum):
Enumeration of predefined pipeline templates that can be used to create a `PipelineTemplate`.
"""
# When type is empty, the template source must be provided to the PipelineTemplate before calling build()
EMPTY = "empty"
# Maintain a 1-to-1 mapping between the enum value and the template file name in the "predefined" directory
GENERATIVE_QA = "generative_qa"
RAG = "rag"
@ -73,7 +67,7 @@ class PipelineTemplate:
:param template_content: The raw template source to use in the template.
"""
env = Environment(
loader=PackageLoader("haystack.templates", "predefined"), trim_blocks=True, lstrip_blocks=True
loader=PackageLoader("haystack.core.pipeline", "predefined"), trim_blocks=True, lstrip_blocks=True
)
try:
self._template = env.from_string(template_content)
@ -84,7 +78,7 @@ class PipelineTemplate:
self.template_variables = meta.find_undeclared_variables(env.parse(template_content))
self._template_content = template_content
def build(self, template_params: Optional[Dict[str, Any]] = None) -> Pipeline:
def render(self, template_params: Optional[Dict[str, Any]] = None) -> str:
"""
Renders the template with the given parameters and returns the resulting pipeline definition as a string.
@ -93,13 +87,7 @@ class PipelineTemplate:
:return: The rendered template as a string, suitable for passing to `Pipeline.loads()`.
"""
template_params = template_params or {}
rendered = self._template.render(**template_params)
try:
return Pipeline.loads(rendered)
except Exception as e:
msg = f"Error unmarshalling pipeline: {e}\n"
msg += f"Source:\n{rendered}"
raise PipelineUnmarshalError(msg)
return self._template.render(**template_params)
@classmethod
def from_file(cls, file_path: Union[Path, str]) -> "PipelineTemplate":
@ -118,10 +106,6 @@ class PipelineTemplate:
:param predefined_pipeline: The predefined pipeline to use.
:return: An instance of `PipelineTemplate`.
"""
if predefined_pipeline == PredefinedPipeline.EMPTY:
# This is temporary, to ease the refactoring
raise ValueError("Please provide a PipelineType value")
template_path = f"{TEMPLATE_HOME_DIR}/{predefined_pipeline.value}{TEMPLATE_FILE_EXTENSION}"
return cls.from_file(template_path)

View File

@ -1,3 +0,0 @@
from haystack.templates.pipeline import PipelineTemplate, PredefinedPipeline
__all__ = ["PipelineTemplate", "PredefinedPipeline"]

View File

@ -1,17 +0,0 @@
---
components:
{% block components %}
{}
{% endblock %}
connections:
{% block connections %}
[]
{% endblock %}
metadata:
{% block metadata %}
{}
{% endblock %}

View File

@ -1,35 +1,23 @@
---
highlights: >
Introducing a flexible and dynamic approach to creating NLP pipelines with Haystack's new PipelineTemplate class!
This innovative feature utilizes Jinja templated YAML files, allowing users to effortlessly construct and customize
complex data processing pipelines for various NLP tasks. From question answering and document indexing to custom
pipeline requirements, the PipelineTemplate simplifies configuration and enhances adaptability. Users can now easily
override default components or integrate custom settings with simple, straightforward code.
Introducing a flexible and dynamic approach to creating NLP pipelines with predefined templates.
This innovative feature allows users to easily construct and customize complex data processing pipelines for various
NLP tasks.
For example, the following pipeline template can be used to create an indexing pipeline:
```python
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.templates import PipelineTemplate, PredefinedPipeline
from haystack import Pipeline, PredefinedPipeline
pt = PipelineTemplate(PredefinedPipeline.INDEXING)
pipe = pt.build(template_params={"use_pdf_file_converter": True})
result = pipe.run(data={"sources": ["some_local_dir/and_text_file.txt", "some_other_local_dir/and_pdf_file.pdf"]})
print(result)
pipe = Pipeline.from_template(PredefinedPipeline.INDEXING)
result = pipe.run(data={"sources": ["some_local_dir/and_text_file.txt"]})
```
In the above example, a PredefinedPipeline.INDEXING enum is used to create a pipeline with the PDF file converter
enabled. The pipeline is then run on a list of local files and the result is printed (number of indexed documents).
The above example creates a PredefinedPipeline.INDEXING pipeline ready to be used. We can use the same template
to create a slightly different indexing pipeline, adding a PDF to text converter:
We could have of course used the same PipelineTemplate class to create any other pre-defined pipeline or even a
custom pipeline with custom components and settings.
On the other hand, the following pipeline template can be used to create a pre-defined RAG pipeline:
```python
from haystack.templates import PipelineTemplate, PredefinedPipeline
from haystack import Pipeline, PredefinedPipeline
pipe = PipelineTemplate(PredefinedPipeline.RAG).build()
pipe = Pipeline.from_template(PredefinedPipeline.INDEXING, template_params={"use_pdf_file_converter": True})
result = pipe.run(data={"sources": ["some_local_dir/and_text_file.txt", "some_other_local_dir/and_pdf_file.pdf"]})
print(result)
```
PipelineTemplate can load templates from various inputs, including strings, files, and predefined templates.

View File

@ -10,7 +10,7 @@ import pytest
from haystack.core.component import component
from haystack.core.component.types import InputSocket, OutputSocket
from haystack.core.errors import PipelineDrawingError, PipelineError, PipelineMaxLoops, PipelineRuntimeError
from haystack.core.pipeline import Pipeline
from haystack.core.pipeline import Pipeline, PredefinedPipeline
from haystack.testing.factory import component_class
from haystack.testing.sample_components import AddFixedValue, Double
@ -654,3 +654,14 @@ def test_describe_no_outputs():
p.connect("a.x", "c.x")
p.connect("b.y", "c.y")
assert p.outputs() == {}
def test_from_template():
    """Check that `Pipeline.from_template` builds the indexing pipeline and honours template parameters."""
    # Rendered without parameters: the cleaner is present, the PDF converter is not.
    default_pipeline = Pipeline.from_template(PredefinedPipeline.INDEXING)
    assert default_pipeline.get_component("cleaner")
    with pytest.raises(ValueError):
        default_pipeline.get_component("pdf_file_converter")

    # Rendered with the PDF converter switched on: both components are present.
    pdf_params = {"use_pdf_file_converter": True}
    pdf_pipeline = Pipeline.from_template(PredefinedPipeline.INDEXING, template_params=pdf_params)
    assert pdf_pipeline.get_component("cleaner")
    assert pdf_pipeline.get_component("pdf_file_converter")

View File

@ -3,7 +3,7 @@ import tempfile
import pytest
from haystack import Pipeline
from haystack.templates.pipeline import PipelineTemplate, PredefinedPipeline
from haystack.core.pipeline.template import PipelineTemplate, PredefinedPipeline
@pytest.fixture
@ -45,8 +45,8 @@ class TestPipelineTemplate:
# Building a pipeline directly using all default components specified in a predefined or custom template.
def test_build_pipeline_with_default_components(self):
pipeline = PipelineTemplate.from_predefined(PredefinedPipeline.INDEXING).build()
assert isinstance(pipeline, Pipeline)
rendered = PipelineTemplate.from_predefined(PredefinedPipeline.INDEXING).render()
pipeline = Pipeline.loads(rendered)
# pipeline has components
assert pipeline.get_component("cleaner")