mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-19 11:58:44 +00:00
feat: Add PipelineTemplate for ready made pipelines (#7001)
* Initial working version * More experiments * Use jinja2 template extensions * Remove for_type static method * Add ternary filter, with_kwargs * Add indexing pipeline * Rename with_kwargs -> with_parameters, allow cumulative invocation * Remove ternary filter, use another approach to conditionals in default component configuration * Massi feedback, add pydocs * PipelineTemplateBuilder -> PipelineTemplate * Add unit tests * Add reno note * Fix pylint, mypy * PR review - Madeesh * Add list_variables * PR review feedback - Madeesh * Update haystack/templates/pipelines.py Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> * Add timeout, fix pylint * Use full import path * Update release note * Fix failing Windows test * Update haystack/templates/source.py Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> * Update haystack/templates/source.py Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> * PR feedback Madeesh * Fix pylint --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
This commit is contained in:
parent
29acffca1d
commit
cb01cb4207
4
haystack/templates/__init__.py
Normal file
4
haystack/templates/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
from haystack.templates.pipelines import PipelineTemplate
|
||||||
|
from haystack.templates.source import PredefinedTemplate, TemplateSource
|
||||||
|
|
||||||
|
__all__ = ["PipelineTemplate", "TemplateSource", "PredefinedTemplate"]
|
104
haystack/templates/indexing.yaml.jinja2
Normal file
104
haystack/templates/indexing.yaml.jinja2
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
components:
|
||||||
|
cleaner: {{ cleaner | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"remove_empty_lines": true,
|
||||||
|
"remove_extra_whitespaces": true,
|
||||||
|
"remove_regex": None,
|
||||||
|
"remove_repeated_substrings": false,
|
||||||
|
"remove_substrings": None
|
||||||
|
},
|
||||||
|
"type": "haystack.components.preprocessors.document_cleaner.DocumentCleaner"
|
||||||
|
}, true) | tojson }}
|
||||||
|
embedder: {{ embedder | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"batch_size": 32,
|
||||||
|
"device": "cpu",
|
||||||
|
"embedding_separator": "\\n",
|
||||||
|
"model": "sentence-transformers/all-MiniLM-L6-v2",
|
||||||
|
"normalize_embeddings": false,
|
||||||
|
"prefix": "",
|
||||||
|
"progress_bar": true,
|
||||||
|
"suffix": "",
|
||||||
|
"token": None
|
||||||
|
},
|
||||||
|
"type": "haystack.components.embedders.sentence_transformers_document_embedder.SentenceTransformersDocumentEmbedder"
|
||||||
|
}, true) | tojson }}
|
||||||
|
|
||||||
|
# FileTypeRouter is used to route different file types to different file converters
|
||||||
|
# The default mime types are set to text/plain. If we'll handle PDF files, we'll add application/pdf to mime types
|
||||||
|
# Let's configure that part dynamically based on the use_pdf_file_converter flag
|
||||||
|
|
||||||
|
{% set default_mime_types = ["text/plain"] %}
|
||||||
|
{% set additional_mime_types = ["application/pdf"] if use_pdf_file_converter | default(false) else [] %}
|
||||||
|
{% set file_type_router_mime_types = default_mime_types + additional_mime_types %}
|
||||||
|
|
||||||
|
file_type_router: {{ file_type_router | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"mime_types": file_type_router_mime_types,
|
||||||
|
},
|
||||||
|
"type": "haystack.components.routers.file_type_router.FileTypeRouter"
|
||||||
|
}, true) | tojson }}
|
||||||
|
doc_joiner: {{ doc_joiner | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"join_mode": "concatenate"
|
||||||
|
},
|
||||||
|
"type": "haystack.components.joiners.document_joiner.DocumentJoiner"
|
||||||
|
}, true) | tojson }}
|
||||||
|
splitter: {{ splitter | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"split_by": "sentence",
|
||||||
|
"split_length": 250,
|
||||||
|
"split_overlap": 30
|
||||||
|
},
|
||||||
|
"type": "haystack.components.preprocessors.document_splitter.DocumentSplitter"
|
||||||
|
}, true) | tojson }}
|
||||||
|
{% if use_pdf_file_converter %}
|
||||||
|
pdf_file_converter: {{ pdf_file_converter | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"converter_name": "default"
|
||||||
|
},
|
||||||
|
"type": "haystack.components.converters.pypdf.PyPDFToDocument"
|
||||||
|
}, true) | tojson }}
|
||||||
|
{% endif %}
|
||||||
|
text_file_converter: {{ text_file_converter | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"encoding": "utf-8"
|
||||||
|
},
|
||||||
|
"type": "haystack.components.converters.txt.TextFileToDocument"
|
||||||
|
}, true) | tojson }}
|
||||||
|
writer: {{ writer | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"document_store": {
|
||||||
|
"init_parameters": {
|
||||||
|
"bm25_algorithm": "BM25Okapi",
|
||||||
|
"bm25_parameters": {},
|
||||||
|
"bm25_tokenization_regex": "(?u)\\b\\w\\w+\\b",
|
||||||
|
"embedding_similarity_function": "dot_product"
|
||||||
|
},
|
||||||
|
"type": "haystack.document_stores.in_memory.document_store.InMemoryDocumentStore"
|
||||||
|
},
|
||||||
|
"policy": "FAIL"
|
||||||
|
},
|
||||||
|
"type": "haystack.components.writers.document_writer.DocumentWriter"
|
||||||
|
}, true) | tojson }}
|
||||||
|
connections:
|
||||||
|
- receiver: text_file_converter.sources
|
||||||
|
sender: file_type_router.text/plain
|
||||||
|
- receiver: doc_joiner.documents
|
||||||
|
sender: text_file_converter.documents
|
||||||
|
{% if use_pdf_file_converter %}
|
||||||
|
- receiver: pdf_file_converter.sources
|
||||||
|
sender: file_type_router.application/pdf
|
||||||
|
- receiver: doc_joiner.documents
|
||||||
|
sender: pdf_file_converter.documents
|
||||||
|
{% endif %}
|
||||||
|
- receiver: cleaner.documents
|
||||||
|
sender: doc_joiner.documents
|
||||||
|
- receiver: splitter.documents
|
||||||
|
sender: cleaner.documents
|
||||||
|
- receiver: embedder.documents
|
||||||
|
sender: splitter.documents
|
||||||
|
- receiver: writer.documents
|
||||||
|
sender: embedder.documents
|
||||||
|
max_loops_allowed: 5
|
||||||
|
metadata: {}
|
149
haystack/templates/pipelines.py
Normal file
149
haystack/templates/pipelines.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
from typing import Dict, Any, Set, Optional
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from jinja2 import meta, TemplateSyntaxError
|
||||||
|
from jinja2.nativetypes import NativeEnvironment
|
||||||
|
|
||||||
|
from haystack import Pipeline
|
||||||
|
from haystack.core.component import Component
|
||||||
|
from haystack.core.errors import PipelineValidationError
|
||||||
|
from haystack.core.serialization import component_to_dict
|
||||||
|
from haystack.templates.source import TemplateSource
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineTemplate:
    """
    The PipelineTemplate class enables the straightforward creation of flexible and configurable pipelines using
    Jinja2 templated YAML files. Specifically designed to simplify the setup of complex data processing pipelines for
    a range of NLP tasks - including question answering, retrieval-augmented generation (RAG), document indexing,
    among others - PipelineTemplate empowers users to dynamically generate pipeline configurations from templates and
    customize components as necessary. Its design philosophy centers on providing an accessible, yet powerful, tool
    for constructing pipelines that accommodate both common use cases and specialized requirements with ease.

    The class enables two primary use cases:

    1. Building a pipeline directly using all default components specified in a predefined or custom template.
    2. Customizing pipelines by overriding default components with custom component settings, integrating
       user-provided component instances, and adjusting component parameters conditionally.

    Examples of usage:

    - **Default Build**: Instantiating a pipeline with default settings for a "question answering" (qa) task.
      ```python
      from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate

      # Create a pipeline with default components for a QA task
      ts = TemplateSource.from_predefined(PredefinedTemplate.QA)
      pipe = PipelineTemplate(ts).build()
      print(pipe.run(data={"question": "What's the capital of Bosnia and Herzegovina? Be brief"}))
      ```

    - **Custom Component Settings**: Customizing a pipeline by overriding a component, such as integrating a
      streaming-capable generator for real-time feedback.
      ```python
      from haystack.components.generators import OpenAIGenerator
      from haystack.components.generators.utils import print_streaming_chunk
      from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate

      # Customize the pipeline with a streaming-capable generator
      ts = TemplateSource.from_predefined(PredefinedTemplate.QA)
      streaming_pipe = PipelineTemplate(ts).override("generator",
                                                     OpenAIGenerator(
                                                     streaming_callback=print_streaming_chunk)).build()
      streaming_pipe.run(data={"question": "What's the capital of Germany? Tell me about it"})
      ```

    - **Customizing for Specific Tasks**: Building a pipeline for document indexing with specific components tailored
      to the task.
      ```python
      from haystack.components.embedders import SentenceTransformersDocumentEmbedder
      from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate

      # Customize the pipeline for document indexing with specific components, include PDF file converter
      ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
      ptb = PipelineTemplate(ts, template_params={"use_pdf_file_converter": True})
      ptb.override("embedder", SentenceTransformersDocumentEmbedder(progress_bar=True))
      pipe = ptb.build()

      result = pipe.run(data={
          "sources": ["some_text_file.txt", "another_pdf_file.pdf"]})
      print(result)
      ```

    The `PipelineTemplate` is designed to offer both ease of use for common pipeline configurations and the
    flexibility to customize and extend pipelines as required by advanced users and specific use cases.
    """

    template_file_extension = ".yaml.jinja2"

    def __init__(self, pipeline_template: TemplateSource, template_params: Optional[Dict[str, Any]] = None):
        """
        Initialize a PipelineTemplate.

        :param pipeline_template: The template source to use. See `TemplateSource` for available methods to load
            templates.
        :param template_params: An optional dictionary of parameters to use when rendering the pipeline template.

        :raises ValueError: If the template text is not syntactically valid Jinja2.
        """
        self.template_text = pipeline_template.template
        env = NativeEnvironment()
        try:
            self.template = env.from_string(self.template_text)
        except TemplateSyntaxError as e:
            raise ValueError(f"Invalid pipeline template, template syntax error: {e.message}") from e
        # Names the template reads but never assigns itself; `override` validates component names against this set.
        self.templated_variables = self._extract_variables(env)
        # Serialized (dict) form of every component registered through `override`, keyed by template variable name.
        self.components: Dict[str, Any] = {}
        self.template_params = template_params or {}

    def override(self, component_name: str, component_instance: Component) -> "PipelineTemplate":
        """
        Overrides a component specified in the pipeline template with a custom component instance.

        The instance is converted to its dictionary form right away and is passed to the template under
        `component_name` when `build` renders it.

        :param component_name: The name of the component within the template to override. It must be one of the
            variables returned by `list_variables`.
        :param component_instance: The instance of the component to use as an override. Must be an instance
            of a class annotated with `@component`.

        :return: The instance of `PipelineTemplate` to allow for method chaining.

        :raises PipelineValidationError: If the `component_name` does not exist in the template or if
            `component_instance` is not a valid component.
        """
        # check if the component_name is allowed in the template
        if component_name not in self.templated_variables:
            raise PipelineValidationError(f"Component '{component_name}' is not defined in the pipeline template")
        if not isinstance(component_instance, Component):
            raise PipelineValidationError(
                f"'{type(component_instance)}' doesn't seem to be a component. Is this class decorated with @component?"
            )
        self.components[component_name] = component_to_dict(component_instance)
        return self

    def list_variables(self) -> Set[str]:
        """
        Lists all templated variables in the pipeline template.

        :return: A set with the names of the templated variables in the pipeline template. The set is a copy, so
            modifying it does not change which names `override` accepts.
        """
        return set(self.templated_variables)

    def build(self) -> Pipeline:
        """
        Constructs a `Pipeline` instance based on the template and any overridden components.

        The template is rendered with the template parameters given at construction time plus every component
        registered through `override`. If the same name appears in both, the overridden component is used.

        :return: An instance of `Pipeline` constructed from the rendered template and custom component configurations.
        """
        # Merge first: expanding the two dictionaries as separate `**` groups raises TypeError when they share a key.
        render_context = {**self.template_params, **self.components}
        rendered_yaml = self.template.render(**render_context)
        pipeline_yaml = yaml.safe_load(rendered_yaml)
        return Pipeline.from_dict(pipeline_yaml)

    def _extract_variables(self, env: NativeEnvironment) -> Set[str]:
        """
        Extracts the undeclared variables of this instance's template text.

        :param env: The Jinja native environment used to parse the template text.
        :return: The set of variable names the template references without defining them itself.
        """
        ast = env.parse(self.template_text)
        return set(meta.find_undeclared_variables(ast))
|
23
haystack/templates/qa.yaml.jinja2
Normal file
23
haystack/templates/qa.yaml.jinja2
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
components:
|
||||||
|
generator: {{ generator | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"api_key": {
|
||||||
|
"env_vars": [ "OPENAI_API_KEY" ],
|
||||||
|
"strict": true,
|
||||||
|
"type": "env_var"
|
||||||
|
},
|
||||||
|
"model": "gpt-3.5-turbo"
|
||||||
|
},
|
||||||
|
"type": "haystack.components.generators.openai.OpenAIGenerator"
|
||||||
|
}, true) | tojson }}
|
||||||
|
prompt_builder: {{ prompt_builder | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"template": "\nGiven the question {{question}}\nAnswer it in German only.\n\nAntwort:",
|
||||||
|
},
|
||||||
|
"type": "haystack.components.builders.prompt_builder.PromptBuilder"
|
||||||
|
}, true) | tojson }}
|
||||||
|
connections:
|
||||||
|
- receiver: generator.prompt
|
||||||
|
sender: prompt_builder.prompt
|
||||||
|
max_loops_allowed: 2
|
||||||
|
metadata: {}
|
72
haystack/templates/rag.yaml.jinja2
Normal file
72
haystack/templates/rag.yaml.jinja2
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
components:
|
||||||
|
answer_builder: {{ answer_builder | default({
|
||||||
|
"init_parameters": {},
|
||||||
|
"type": "haystack.components.builders.answer_builder.AnswerBuilder"
|
||||||
|
}, true) | tojson }}
|
||||||
|
generator: {{ generator | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"api_key": {
|
||||||
|
"env_vars": [ "OPENAI_API_KEY" ],
|
||||||
|
"strict": true,
|
||||||
|
"type": "env_var"
|
||||||
|
},
|
||||||
|
"model": "gpt-3.5-turbo"
|
||||||
|
},
|
||||||
|
"type": "haystack.components.generators.openai.OpenAIGenerator"
|
||||||
|
}, true) | tojson }}
|
||||||
|
retriever: {{ retriever | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"document_store": {
|
||||||
|
"init_parameters": {
|
||||||
|
"bm25_algorithm": "BM25L",
|
||||||
|
"bm25_parameters": {},
|
||||||
|
"bm25_tokenization_regex": "(?u)\\b\\w\\w+\\b",
|
||||||
|
"embedding_similarity_function": "dot_product"
|
||||||
|
},
|
||||||
|
"type": "haystack.document_stores.in_memory.document_store.InMemoryDocumentStore"
|
||||||
|
},
|
||||||
|
"filters": None,
|
||||||
|
"return_embedding": false,
|
||||||
|
"scale_score": false,
|
||||||
|
"top_k": 10
|
||||||
|
},
|
||||||
|
"type": "haystack.components.retrievers.in_memory.embedding_retriever.InMemoryEmbeddingRetriever"
|
||||||
|
}, true) | tojson }}
|
||||||
|
text_embedder: {{ text_embedder | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"batch_size": 32,
|
||||||
|
"device": "cpu",
|
||||||
|
"model": "sentence-transformers/all-mpnet-base-v2",
|
||||||
|
"normalize_embeddings": false,
|
||||||
|
"prefix": "",
|
||||||
|
"progress_bar": true,
|
||||||
|
"suffix": "",
|
||||||
|
"token": {
|
||||||
|
"env_vars": [ "HF_API_TOKEN" ],
|
||||||
|
"strict": false,
|
||||||
|
"type": "env_var"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"type": "haystack.components.embedders.sentence_transformers_text_embedder.SentenceTransformersTextEmbedder"
|
||||||
|
}, true) | tojson }}
|
||||||
|
prompt_builder: {{ prompt_builder | default({
|
||||||
|
"init_parameters": {
|
||||||
|
"template": "\nGiven these documents, answer the question.\n\nDocuments:\n{% for doc in documents %}\n{{ doc.content }}\n {% endfor %}\n\nQuestion: {{question}}\n\nAnswer:\n",
|
||||||
|
},
|
||||||
|
"type": "haystack.components.builders.prompt_builder.PromptBuilder"
|
||||||
|
}, true) | tojson }}
|
||||||
|
connections:
|
||||||
|
- receiver: retriever.query_embedding
|
||||||
|
sender: text_embedder.embedding
|
||||||
|
- receiver: prompt_builder.documents
|
||||||
|
sender: retriever.documents
|
||||||
|
- receiver: answer_builder.documents
|
||||||
|
sender: retriever.documents
|
||||||
|
- receiver: generator.prompt
|
||||||
|
sender: prompt_builder.prompt
|
||||||
|
- receiver: answer_builder.replies
|
||||||
|
sender: generator.replies
|
||||||
|
- receiver: answer_builder.meta
|
||||||
|
sender: generator.meta
|
||||||
|
max_loops_allowed: 100
|
||||||
|
metadata: {}
|
112
haystack/templates/source.py
Normal file
112
haystack/templates/source.py
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
import re
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Suffix appended to a predefined template's name to obtain its file name.
TEMPLATE_FILE_EXTENSION = ".yaml.jinja2"
# Directory that contains this module; predefined template files are looked up inside it.
TEMPLATE_HOME_DIR = Path(__file__).resolve().parent
|
||||||
|
|
||||||
|
|
||||||
|
class PredefinedTemplate(Enum):
    """
    Enumeration of predefined pipeline templates that can be used to create a `PipelineTemplate` using
    `TemplateSource`.

    Pass a member to `TemplateSource.from_predefined` to load the corresponding template.
    """

    # Maintain a 1-to-1 mapping between each member's value and the base name of a template file in the templates
    # directory: the value plus the template file extension is the file that gets loaded.
    QA = "qa"
    RAG = "rag"
    INDEXING = "indexing"
|
||||||
|
|
||||||
|
|
||||||
|
class TemplateSource:
    """
    TemplateSource loads template content from various inputs, including strings, files, predefined templates, and URLs.
    Every loader checks that the loaded text contains at least one Jinja2 construct; it does not verify that the
    template is syntactically valid.

    TemplateSource is used by `PipelineTemplate` to load pipeline templates from various sources.
    For example:
    ```python
    # Load a predefined indexing pipeline template
    ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
    pipeline = PipelineTemplate(ts)

    # Load a custom pipeline template from a file
    ts = TemplateSource.from_file("path/to/custom_template.yaml.jinja2")
    pipeline = PipelineTemplate(ts)
    ```

    Similar methods are available to load templates from strings and URLs.
    """

    # Patterns to look for: {{ var }}, {% block %}, {# comment #}.
    # DOTALL is required because a single construct may span several lines, for example a
    # `{{ component | default({...}) | tojson }}` expression whose dictionary literal is written one key per line.
    _JINJA2_SYNTAX_PATTERN = re.compile(r"\{\{.*?\}\}|\{%.*?%\}|\{#.*?#\}", re.DOTALL)

    def __init__(self, template: str):
        """
        Initialize a TemplateSource.

        The text is stored as given; no check is performed here (the `from_*` constructors perform it).

        :param template: The template string to use.
        """
        self._template = template

    @classmethod
    def from_str(cls, template_str: str) -> "TemplateSource":
        """
        Create a TemplateSource from a string.

        :param template_str: The template string to use. Must contain Jinja2 syntax.
        :return: An instance of `TemplateSource`.
        :raises ValueError: If `template_str` contains no Jinja2 expression, statement or comment.
        """
        if not cls._contains_jinja2_syntax(template_str):
            raise ValueError("The provided template does not contain Jinja2 syntax.")
        return cls(template_str)

    @classmethod
    def from_file(cls, file_path: Union[Path, str]) -> "TemplateSource":
        """
        Create a TemplateSource from a file.

        The file is read as UTF-8 so that the result does not depend on the platform's default encoding.

        :param file_path: The path to the file containing the template. Must contain Jinja2 syntax.
        :return: An instance of `TemplateSource`.
        :raises ValueError: If the file content contains no Jinja2 syntax.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            return cls.from_str(file.read())

    @classmethod
    def from_predefined(cls, predefined_template: "PredefinedTemplate") -> "TemplateSource":
        """
        Create a TemplateSource from a predefined template. See `PredefinedTemplate` for available options.

        The template file is looked up in the directory of this module; its name is the enum member's value followed
        by the template file extension.

        :param predefined_template: The name of the predefined template to use.
        :return: An instance of `TemplateSource`.
        """
        template_path = TEMPLATE_HOME_DIR / f"{predefined_template.value}{TEMPLATE_FILE_EXTENSION}"
        return cls.from_file(template_path)

    @classmethod
    def from_url(cls, url: str) -> "TemplateSource":
        """
        Create a TemplateSource from a URL.

        The URL is fetched with an HTTP GET request using a 10 second timeout, and an unsuccessful HTTP status is
        turned into an exception by `raise_for_status()`.

        :param url: The URL to fetch the template from. The response body must contain Jinja2 syntax.
        :return: An instance of `TemplateSource`.
        :raises ValueError: If the response body contains no Jinja2 syntax.
        """
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return cls.from_str(response.text)

    @property
    def template(self) -> str:
        """
        Returns the raw template string as a read-only property.
        """
        return self._template

    @staticmethod
    def _contains_jinja2_syntax(potential_jinja_template: str) -> bool:
        """
        Determines if a given string contains Jinja2 templating syntax.

        Only the presence of a delimiter pair (`{{ }}`, `{% %}` or `{# #}`) is checked, and the pair may span
        multiple lines. The text between the delimiters is not validated.

        :param potential_jinja_template: The string to check for Jinja2 syntax.

        :return: `True` if Jinja2 syntax is found, otherwise `False`.
        """
        return bool(TemplateSource._JINJA2_SYNTAX_PATTERN.search(potential_jinja_template))
|
@ -0,0 +1,42 @@
|
|||||||
|
---
|
||||||
|
highlights:
|
||||||
|
- |
|
||||||
|
Introducing a flexible and dynamic approach to creating NLP pipelines with Haystack's new PipelineTemplate class!
|
||||||
|
This innovative feature utilizes Jinja2 templated YAML files, allowing users to effortlessly construct and customize
|
||||||
|
complex data processing pipelines for various NLP tasks. From question answering and document indexing to custom
|
||||||
|
pipeline requirements, the PipelineTemplate simplifies configuration and enhances adaptability. Users can now easily
|
||||||
|
override default components or integrate custom settings with simple, straightforward code.
|
||||||
|
|
||||||
|
For example, the following pipeline template can be used to create an indexing pipeline:
|
||||||
|
```python
|
||||||
|
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
|
||||||
|
from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate
|
||||||
|
|
||||||
|
ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
|
||||||
|
pt = PipelineTemplate(ts, template_params={"use_pdf_file_converter": True})
|
||||||
|
pt.override("embedder", SentenceTransformersDocumentEmbedder(progress_bar=True))
|
||||||
|
pipe = pt.build()
|
||||||
|
|
||||||
|
result = pipe.run(data={"sources": ["some_local_dir/and_text_file.txt", "some_other_local_dir/and_pdf_file.pdf"]})
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
In the above example, a PredefinedTemplate.INDEXING enum is used to create a pipeline with a custom instance of
|
||||||
|
SentenceTransformersDocumentEmbedder and the PDF file converter enabled. The pipeline is then run on a list of
|
||||||
|
local files and the result is printed (number of indexed documents).
|
||||||
|
|
||||||
|
We could have of course used the same PipelineTemplate class to create any other pre-defined pipeline or even a
|
||||||
|
custom pipeline with custom components and settings.
|
||||||
|
|
||||||
|
On the other hand, the following pipeline template can be used to create a pre-defined RAG pipeline:
|
||||||
|
```python
|
||||||
|
from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate
|
||||||
|
|
||||||
|
ts = TemplateSource.from_predefined(PredefinedTemplate.RAG)
|
||||||
|
pipe = PipelineTemplate(ts).build()
|
||||||
|
result = pipe.run(query="What's the meaning of life?")
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
TemplateSource loads template content from various inputs, including strings, files, predefined templates, and URLs.
|
||||||
|
The class provides mechanisms to load templates dynamically and ensure they contain valid Jinja2 syntax.
|
122
test/templates/test_templates.py
Normal file
122
test/templates/test_templates.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from haystack import Pipeline
|
||||||
|
from haystack.components.builders import PromptBuilder
|
||||||
|
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
|
||||||
|
from haystack.components.generators import HuggingFaceTGIGenerator
|
||||||
|
from haystack.core.errors import PipelineValidationError
|
||||||
|
from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def random_valid_template():
|
||||||
|
template = """components:
|
||||||
|
generator: {{ generator | tojson }}
|
||||||
|
prompt_builder: {{prompt_builder}}
|
||||||
|
|
||||||
|
connections:
|
||||||
|
- receiver: generator.prompt
|
||||||
|
sender: prompt_builder.prompt
|
||||||
|
max_loops_allowed: 2
|
||||||
|
metadata: {}
|
||||||
|
"""
|
||||||
|
return template
|
||||||
|
|
||||||
|
|
||||||
|
class TestPipelineTemplate:
    """Unit and integration tests for `TemplateSource` and `PipelineTemplate`."""

    # TemplateSource.from_str raises ValueError when the text does not contain Jinja2 syntax.
    def test_from_str(self):
        with pytest.raises(ValueError):
            TemplateSource.from_str("invalid_template")

    # TemplateSource.from_str keeps the text unchanged when it contains Jinja2 syntax.
    def test_from_str_valid(self):
        ts = TemplateSource.from_str("{{ valid_template }}")
        assert ts.template == "{{ valid_template }}"

    # TemplateSource.from_file raises FileNotFoundError when the path does not exist.
    def test_from_file_invalid_path(self):
        with pytest.raises(FileNotFoundError):
            TemplateSource.from_file("invalid_path")

    # TemplateSource.from_file returns the file content when the path exists.
    @pytest.mark.skipif(sys.platform == "win32", reason="Fails on Windows CI with permission denied")
    def test_from_file_valid_path(self, random_valid_template):
        # The context manager closes (and thereby deletes) the temporary file even if loading fails.
        with tempfile.NamedTemporaryFile(mode="w") as temp_file:
            temp_file.write(random_valid_template)
            temp_file.flush()
            ts = TemplateSource.from_file(temp_file.name)
        assert ts.template == random_valid_template

    # Loading a predefined template yields non-empty template text
    # (despite the test name, this is the successful path).
    def test_from_predefined_invalid_template(self):
        ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
        assert len(ts.template) > 0

    # Raises PipelineValidationError when attempting to override a non-existent component
    def test_override_nonexistent_component(self):
        ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)

        with pytest.raises(PipelineValidationError):
            PipelineTemplate(ts).override("nonexistent_component", SentenceTransformersDocumentEmbedder())

    # Building a pipeline directly using all default components specified in a predefined or custom template.
    def test_build_pipeline_with_default_components(self):
        ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
        pipeline = PipelineTemplate(ts).build()
        assert isinstance(pipeline, Pipeline)

        # pipeline has components
        assert pipeline.get_component("cleaner")
        assert pipeline.get_component("writer")
        assert pipeline.get_component("embedder")

        # pipeline should have inputs and outputs
        assert len(pipeline.inputs()) > 0
        assert len(pipeline.outputs()) > 0

    # Customizing pipelines by overriding default components with custom component settings
    def test_customize_pipeline_with_overrides(self):
        ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
        pt = PipelineTemplate(ts)

        pt.override("embedder", SentenceTransformersDocumentEmbedder(progress_bar=True, batch_size=64))
        pipe = pt.build()

        assert isinstance(pipe, Pipeline)
        assert pipe.get_component("embedder")
        embedder: SentenceTransformersDocumentEmbedder = pipe.get_component("embedder")
        # The overridden init parameters must survive the render/deserialize round trip.
        embedder_dict = embedder.to_dict()
        assert embedder_dict["init_parameters"]["progress_bar"]
        assert embedder_dict["init_parameters"]["batch_size"] == 64

    # Overrides a generator component specified in the pipeline template with a completely different generator
    @pytest.mark.integration
    def test_override_component(self):
        # integration because we'll fetch the tokenizer
        pipe = (
            PipelineTemplate(TemplateSource.from_predefined(PredefinedTemplate.QA))
            .override("generator", HuggingFaceTGIGenerator())
            .build()
        )
        assert isinstance(pipe, Pipeline)
        assert pipe.get_component("generator")
        assert isinstance(pipe.get_component("generator"), HuggingFaceTGIGenerator)

    # Building a pipeline with a custom template that uses Jinja2 syntax to specify components and their connections
    @pytest.mark.integration
    def test_building_pipeline_with_direct_template(self, random_valid_template):
        pt = PipelineTemplate(TemplateSource.from_str(random_valid_template))
        pt.override("generator", HuggingFaceTGIGenerator())
        pt.override("prompt_builder", PromptBuilder("Some fake prompt"))
        pipe = pt.build()

        assert isinstance(pipe, Pipeline)
        assert pipe.get_component("generator")
        assert isinstance(pipe.get_component("generator"), HuggingFaceTGIGenerator)
        assert pipe.get_component("prompt_builder")
        assert isinstance(pipe.get_component("prompt_builder"), PromptBuilder)
|
Loading…
x
Reference in New Issue
Block a user