From cb01cb4207cb0d08be149e8dc063effdc0bf4634 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Mon, 19 Feb 2024 13:59:48 +0100 Subject: [PATCH] feat: Add PipelineTemplate for ready made pipelines (#7001) * Initial working version * More experiments * Use jinja2 template extensions * Remove for_type static method * Add ternary filter, with_kwargs * Add indexing pipeline * Rename with_kwargs -> with_parameters, allow cumulative invocation * Remove ternary filter, use another approach to conditionals in default component configuration * Massi feedback, add pydocs * PipelineTemplateBuilder -> PipelineTemplate * Add unit tests * Add reno note * Fix pylint, mypy * PR review - Madeesh * Add list_variables * PR review feedback - Madeesh * Update haystack/templates/pipelines.py Co-authored-by: Madeesh Kannan * Add timeout, fix pylint * Use full import path * Update release note * Fix failing windows test * Update haystack/templates/source.py Co-authored-by: Madeesh Kannan * Update haystack/templates/source.py Co-authored-by: Madeesh Kannan * PR feedback Madeesh * Fix pylint --------- Co-authored-by: Madeesh Kannan --- haystack/templates/__init__.py | 4 + haystack/templates/indexing.yaml.jinja2 | 104 ++++++++++++ haystack/templates/pipelines.py | 149 ++++++++++++++++++ haystack/templates/qa.yaml.jinja2 | 23 +++ haystack/templates/rag.yaml.jinja2 | 72 +++++++++ haystack/templates/source.py | 112 +++++++++++++ ...d-pipeline-templates-831f857c6387f8c3.yaml | 42 +++++ test/templates/test_templates.py | 122 ++++++++++++++ 8 files changed, 628 insertions(+) create mode 100644 haystack/templates/__init__.py create mode 100644 haystack/templates/indexing.yaml.jinja2 create mode 100644 haystack/templates/pipelines.py create mode 100644 haystack/templates/qa.yaml.jinja2 create mode 100644 haystack/templates/rag.yaml.jinja2 create mode 100644 haystack/templates/source.py create mode 100644 releasenotes/notes/add-pipeline-templates-831f857c6387f8c3.yaml create mode 100644 
test/templates/test_templates.py diff --git a/haystack/templates/__init__.py b/haystack/templates/__init__.py new file mode 100644 index 000000000..5962bb0f6 --- /dev/null +++ b/haystack/templates/__init__.py @@ -0,0 +1,4 @@ +from haystack.templates.pipelines import PipelineTemplate +from haystack.templates.source import PredefinedTemplate, TemplateSource + +__all__ = ["PipelineTemplate", "TemplateSource", "PredefinedTemplate"] diff --git a/haystack/templates/indexing.yaml.jinja2 b/haystack/templates/indexing.yaml.jinja2 new file mode 100644 index 000000000..2bac8a9b3 --- /dev/null +++ b/haystack/templates/indexing.yaml.jinja2 @@ -0,0 +1,104 @@ +components: + cleaner: {{ cleaner | default({ + "init_parameters": { + "remove_empty_lines": true, + "remove_extra_whitespaces": true, + "remove_regex": None, + "remove_repeated_substrings": false, + "remove_substrings": None + }, + "type": "haystack.components.preprocessors.document_cleaner.DocumentCleaner" + }, true) | tojson }} + embedder: {{ embedder | default({ + "init_parameters": { + "batch_size": 32, + "device": "cpu", + "embedding_separator": "\\n", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "normalize_embeddings": false, + "prefix": "", + "progress_bar": true, + "suffix": "", + "token": None + }, + "type": "haystack.components.embedders.sentence_transformers_document_embedder.SentenceTransformersDocumentEmbedder" + }, true) | tojson }} + + # FileTypeRouter is used to route different file types to different file converters + # The default mime types are set to text/plain. 
If we'll handle PDF files, we'll add application/pdf to mime types + # Let's configure that part dynamically based on the use_pdf_file_converter flag + + {% set default_mime_types = ["text/plain"] %} + {% set additional_mime_types = ["application/pdf"] if use_pdf_file_converter | default(false) else [] %} + {% set file_type_router_mime_types = default_mime_types + additional_mime_types %} + + file_type_router: {{ file_type_router | default({ + "init_parameters": { + "mime_types": file_type_router_mime_types, + }, + "type": "haystack.components.routers.file_type_router.FileTypeRouter" + }, true) | tojson }} + doc_joiner: {{ doc_joiner | default({ + "init_parameters": { + "join_mode": "concatenate" + }, + "type": "haystack.components.joiners.document_joiner.DocumentJoiner" + }, true) | tojson }} + splitter: {{ splitter | default({ + "init_parameters": { + "split_by": "sentence", + "split_length": 250, + "split_overlap": 30 + }, + "type": "haystack.components.preprocessors.document_splitter.DocumentSplitter" + }, true) | tojson }} + {% if use_pdf_file_converter %} + pdf_file_converter: {{ pdf_file_converter | default({ + "init_parameters": { + "converter_name": "default" + }, + "type": "haystack.components.converters.pypdf.PyPDFToDocument" + }, true) | tojson }} + {% endif %} + text_file_converter: {{ text_file_converter | default({ + "init_parameters": { + "encoding": "utf-8" + }, + "type": "haystack.components.converters.txt.TextFileToDocument" + }, true) | tojson }} + writer: {{ writer | default({ + "init_parameters": { + "document_store": { + "init_parameters": { + "bm25_algorithm": "BM25Okapi", + "bm25_parameters": {}, + "bm25_tokenization_regex": "(?u)\\b\\w\\w+\\b", + "embedding_similarity_function": "dot_product" + }, + "type": "haystack.document_stores.in_memory.document_store.InMemoryDocumentStore" + }, + "policy": "FAIL" + }, + "type": "haystack.components.writers.document_writer.DocumentWriter" + }, true) | tojson }} +connections: +- receiver: 
class PipelineTemplate:
    """
    Builds `Pipeline` instances from Jinja2-templated YAML pipeline definitions.

    A template is rendered with two kinds of values: serialized component overrides registered through
    `override()` and the free-form `template_params` passed to the constructor. The rendered YAML is parsed
    and handed to `Pipeline.from_dict`.

    The class enables two primary use cases:

    1. Building a pipeline directly using all default components specified in a predefined or custom template.
    2. Customizing pipelines by overriding default components with user-provided component instances, and by
       passing template parameters that the template uses to adjust its content conditionally.

    Examples of usage:

    - **Default Build**: Instantiating a pipeline with default settings for a "question answering" (qa) task.
      ```python
      from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate

      # Create a pipeline with default components for a QA task
      ts = TemplateSource.from_predefined(PredefinedTemplate.QA)
      pipe = PipelineTemplate(ts).build()
      print(pipe.run(data={"question": "What's the capital of Bosnia and Herzegovina? Be brief"}))
      ```

    - **Custom Component Settings**: Customizing a pipeline by overriding a component, such as integrating a
      streaming-capable generator for real-time feedback.
      ```python
      from haystack.components.generators import OpenAIGenerator
      from haystack.components.generators.utils import print_streaming_chunk
      from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate

      # Customize the pipeline with a streaming-capable generator
      ts = TemplateSource.from_predefined(PredefinedTemplate.QA)
      streaming_pipe = PipelineTemplate(ts).override(
          "generator", OpenAIGenerator(streaming_callback=print_streaming_chunk)
      ).build()
      streaming_pipe.run(data={"question": "What's the capital of Germany? Tell me about it"})
      ```

    - **Customizing for Specific Tasks**: Building a pipeline for document indexing with specific components
      tailored to the task.
      ```python
      from haystack.components.embedders import SentenceTransformersDocumentEmbedder
      from haystack.templates import PipelineTemplate, TemplateSource, PredefinedTemplate

      # Customize the pipeline for document indexing with specific components, include PDF file converter
      ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
      pt = PipelineTemplate(ts, template_params={"use_pdf_file_converter": True})
      pt.override("embedder", SentenceTransformersDocumentEmbedder(progress_bar=True))
      pipe = pt.build()

      result = pipe.run(data={
          "sources": ["some_text_file.txt", "another_pdf_file.pdf"]})
      print(result)
      ```
    """

    template_file_extension = ".yaml.jinja2"

    def __init__(self, pipeline_template: TemplateSource, template_params: Optional[Dict[str, Any]] = None):
        """
        Initialize a PipelineTemplate.

        :param pipeline_template: The template source to use. See `TemplateSource` for available methods to load
            templates.
        :param template_params: An optional dictionary of parameters to use when rendering the pipeline template.

        :raises ValueError: If the template text is not syntactically valid Jinja2.
        """
        self.template_text = pipeline_template.template
        env = NativeEnvironment()
        try:
            self.template = env.from_string(self.template_text)
        except TemplateSyntaxError as e:
            raise ValueError(f"Invalid pipeline template, template syntax error: {e.message}") from e
        # Names the template leaves undeclared; `override` only accepts names from this set.
        self.templated_variables: Set[str] = self._extract_variables(env)
        # Maps an overridden component name to its serialized (dict) form.
        self.components: Dict[str, Any] = {}
        self.template_params = template_params or {}

    def override(self, component_name: str, component_instance: Component) -> "PipelineTemplate":
        """
        Overrides a component specified in the pipeline template with a custom component instance.

        The instance is serialized with `component_to_dict` and stored; it is injected into the template
        when `build()` renders it.

        :param component_name: The name of the component within the template to override.
        :param component_instance: The instance of the component to use as an override. Must be an instance
            of a class annotated with `@component`.

        :return: The instance of `PipelineTemplate` to allow for method chaining.

        :raises PipelineValidationError: If the `component_name` does not exist in the template or if
            `component_instance` is not a valid component.
        """
        # check if the component_name is allowed in the template
        if component_name not in self.templated_variables:
            raise PipelineValidationError(f"Component '{component_name}' is not defined in the pipeline template")
        if not isinstance(component_instance, Component):
            raise PipelineValidationError(
                f"'{type(component_instance)}' doesn't seem to be a component. Is this class decorated with @component?"
            )
        self.components[component_name] = component_to_dict(component_instance)
        return self

    def list_variables(self) -> Set[str]:
        """
        Lists all templated variables in the pipeline template.

        :return: A set with the names of the templated variables in the pipeline template. A copy is returned
            so that callers cannot alter the set `override` validates against.
        """
        return set(self.templated_variables)

    def build(self) -> Pipeline:
        """
        Constructs a `Pipeline` instance based on the template and any overridden components.

        :return: An instance of `Pipeline` constructed from the rendered template and custom component configurations.

        :raises TypeError: If a key of `template_params` has the same name as an overridden component, because
            both dictionaries are expanded as keyword arguments of the same `render` call.
        """
        rendered_yaml = self.template.render(**self.components, **self.template_params)
        pipeline_yaml = yaml.safe_load(rendered_yaml)
        return Pipeline.from_dict(pipeline_yaml)

    def _extract_variables(self, env: NativeEnvironment) -> Set[str]:
        """
        Extracts the undeclared variables of the template text.

        :param env: A Jinja native environment used to parse `self.template_text`.
        :return: A set of variable names the template references without declaring them.
        """
        ast = env.parse(self.template_text)
        return set(meta.find_undeclared_variables(ast))
"haystack.document_stores.in_memory.document_store.InMemoryDocumentStore" + }, + "filters": None, + "return_embedding": false, + "scale_score": false, + "top_k": 10 + }, + "type": "haystack.components.retrievers.in_memory.embedding_retriever.InMemoryEmbeddingRetriever" + }, true) | tojson }} + text_embedder: {{ text_embedder | default({ + "init_parameters": { + "batch_size": 32, + "device": "cpu", + "model": "sentence-transformers/all-mpnet-base-v2", + "normalize_embeddings": false, + "prefix": "", + "progress_bar": true, + "suffix": "", + "token": { + "env_vars": [ "HF_API_TOKEN" ], + "strict": false, + "type": "env_var" + } + }, + "type": "haystack.components.embedders.sentence_transformers_text_embedder.SentenceTransformersTextEmbedder" + }, true) | tojson }} + prompt_builder: {{ prompt_builder | default({ + "init_parameters": { + "template": "\nGiven these documents, answer the question.\n\nDocuments:\n{% for doc in documents %}\n{{ doc.content }}\n {% endfor %}\n\nQuestion: {{question}}\n\nAnswer:\n", + }, + "type": "haystack.components.builders.prompt_builder.PromptBuilder" + }, true) | tojson }} +connections: +- receiver: retriever.query_embedding + sender: text_embedder.embedding +- receiver: prompt_builder.documents + sender: retriever.documents +- receiver: answer_builder.documents + sender: retriever.documents +- receiver: generator.prompt + sender: prompt_builder.prompt +- receiver: answer_builder.replies + sender: generator.replies +- receiver: answer_builder.meta + sender: generator.meta +max_loops_allowed: 100 +metadata: {} diff --git a/haystack/templates/source.py b/haystack/templates/source.py new file mode 100644 index 000000000..449994b85 --- /dev/null +++ b/haystack/templates/source.py @@ -0,0 +1,112 @@ +import re +from enum import Enum +from pathlib import Path +from typing import Union + +import requests + +TEMPLATE_FILE_EXTENSION = ".yaml.jinja2" +TEMPLATE_HOME_DIR = Path(__file__).resolve().parent + + +class PredefinedTemplate(Enum): + """ + 
class TemplateSource:
    """
    TemplateSource loads template content from various inputs, including strings, files, predefined templates,
    and URLs. Every loader funnels through `from_str`, which rejects text that contains no Jinja2 construct
    at all. Whether the template is syntactically valid Jinja2 is not checked here.

    TemplateSource is used by `PipelineTemplate` to load pipeline templates from various sources.
    For example:
    ```python
    # Load a predefined indexing pipeline template
    ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
    pipeline = PipelineTemplate(ts)

    # Load a custom pipeline template from a file
    ts = TemplateSource.from_file("path/to/custom_template.yaml.jinja2")
    pipeline = PipelineTemplate(ts)
    ```

    Similar methods are available to load templates from strings and URLs.
    """

    # Patterns to look for: {{ var }}, {% block %}, {# comment #}.
    # DOTALL lets a construct span several lines, e.g. `{{ x | default({ ... }) }}` written over multiple lines.
    _JINJA2_SYNTAX_PATTERN = re.compile(r"\{\{.*?\}\}|\{%.*?%\}|\{#.*?#\}", re.DOTALL)

    def __init__(self, template: str):
        """
        Initialize a TemplateSource.

        :param template: The template string to use. It is stored as-is, without any check.
        """
        self._template = template

    @classmethod
    def from_str(cls, template_str: str) -> "TemplateSource":
        """
        Create a TemplateSource from a string.

        :param template_str: The template string to use. Must contain at least one Jinja2 construct.
        :return: An instance of `TemplateSource`.
        :raises ValueError: If no Jinja2 construct is found in `template_str`.
        """
        if not cls._contains_jinja2_syntax(template_str):
            raise ValueError("The provided template does not contain Jinja2 syntax.")
        return cls(template_str)

    @classmethod
    def from_file(cls, file_path: Union[Path, str]) -> "TemplateSource":
        """
        Create a TemplateSource from a file.

        :param file_path: The path to the file containing the template. The content must contain at least one
            Jinja2 construct.
        :return: An instance of `TemplateSource`.
        :raises ValueError: If no Jinja2 construct is found in the file content.
        """
        # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            return cls.from_str(file.read())

    @classmethod
    def from_predefined(cls, predefined_template: "PredefinedTemplate") -> "TemplateSource":
        """
        Create a TemplateSource from a predefined template. See `PredefinedTemplate` for available options.

        :param predefined_template: The predefined template to use; its enum value is the template file name
            (without extension) inside the templates directory.
        :return: An instance of `TemplateSource`.
        """
        template_path = TEMPLATE_HOME_DIR / f"{predefined_template.value}{TEMPLATE_FILE_EXTENSION}"
        return cls.from_file(template_path)

    @classmethod
    def from_url(cls, url: str) -> "TemplateSource":
        """
        Create a TemplateSource from a URL.

        :param url: The URL to fetch the template from. The response body must contain at least one
            Jinja2 construct.
        :return: An instance of `TemplateSource`.
        :raises ValueError: If no Jinja2 construct is found in the response body.
        """
        response = requests.get(url, timeout=10)
        # Surface HTTP error statuses instead of treating an error page as a template.
        response.raise_for_status()
        return cls.from_str(response.text)

    @property
    def template(self) -> str:
        """
        Returns the raw template string as a read-only property.
        """
        return self._template

    @staticmethod
    def _contains_jinja2_syntax(potential_jinja_template: str) -> bool:
        """
        Determines if a given string contains Jinja2 templating syntax.

        :param potential_jinja_template: The string to check for Jinja2 syntax.

        :return: `True` if a `{{ ... }}`, `{% ... %}` or `{# ... #}` construct is found, otherwise `False`.
        """
        return bool(TemplateSource._JINJA2_SYNTAX_PATTERN.search(potential_jinja_template))
@pytest.fixture
def random_valid_template():
    """
    Provide a minimal pipeline template with two templated components.

    `generator` is rendered through the `tojson` filter and `prompt_builder` is rendered as-is; the single
    connection feeds the prompt builder's output into the generator.
    """
    return """components:
  generator: {{ generator | tojson }}
  prompt_builder: {{prompt_builder}}

connections:
- receiver: generator.prompt
  sender: prompt_builder.prompt
max_loops_allowed: 2
metadata: {}
"""
class TestPipelineTemplate:
    """Tests for loading templates with `TemplateSource` and building pipelines with `PipelineTemplate`."""

    # A string without any Jinja2 construct is rejected.
    def test_from_str(self):
        with pytest.raises(ValueError):
            TemplateSource.from_str("invalid_template")

    # A string containing Jinja2 syntax is accepted and stored unchanged.
    def test_from_str_valid(self):
        ts = TemplateSource.from_str("{{ valid_template }}")
        assert ts.template == "{{ valid_template }}"

    # A file path that does not exist propagates FileNotFoundError.
    def test_from_file_invalid_path(self):
        with pytest.raises(FileNotFoundError):
            TemplateSource.from_file("invalid_path")

    # An existing file is read back verbatim.
    @pytest.mark.skipif(sys.platform == "win32", reason="Fails on Windows CI with permission denied")
    def test_from_file_valid_path(self, random_valid_template):
        # The context manager guarantees the temporary file is closed and removed even if the test fails.
        with tempfile.NamedTemporaryFile(mode="w") as temp_file:
            temp_file.write(random_valid_template)
            temp_file.flush()
            ts = TemplateSource.from_file(temp_file.name)
        assert ts.template == random_valid_template

    # Despite its name, this loads a valid predefined template and checks that it is not empty.
    def test_from_predefined_invalid_template(self):
        ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
        assert len(ts.template) > 0

    # Raises PipelineValidationError when attempting to override a non-existent component
    def test_override_nonexistent_component(self):
        ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)

        with pytest.raises(PipelineValidationError):
            PipelineTemplate(ts).override("nonexistent_component", SentenceTransformersDocumentEmbedder())

    # Building a pipeline directly using all default components specified in a predefined or custom template.
    def test_build_pipeline_with_default_components(self):
        ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
        pipeline = PipelineTemplate(ts).build()
        assert isinstance(pipeline, Pipeline)

        # pipeline has components
        assert pipeline.get_component("cleaner")
        assert pipeline.get_component("writer")
        assert pipeline.get_component("embedder")

        # pipeline should have inputs and outputs
        assert len(pipeline.inputs()) > 0
        assert len(pipeline.outputs()) > 0

    # Customizing pipelines by overriding default components with custom component settings
    def test_customize_pipeline_with_overrides(self):
        ts = TemplateSource.from_predefined(PredefinedTemplate.INDEXING)
        pt = PipelineTemplate(ts)

        pt.override("embedder", SentenceTransformersDocumentEmbedder(progress_bar=True, batch_size=64))
        pipe = pt.build()

        assert isinstance(pipe, Pipeline)
        assert pipe.get_component("embedder")
        embedder: SentenceTransformersDocumentEmbedder = pipe.get_component("embedder")
        embedder_dict = embedder.to_dict()
        assert embedder_dict["init_parameters"]["progress_bar"]
        assert embedder_dict["init_parameters"]["batch_size"] == 64

    # Overrides a generator component specified in the pipeline template with a completely different generator
    @pytest.mark.integration
    def test_override_component(self):
        # integration because we'll fetch the tokenizer
        pipe = (
            PipelineTemplate(TemplateSource.from_predefined(PredefinedTemplate.QA))
            .override("generator", HuggingFaceTGIGenerator())
            .build()
        )
        assert isinstance(pipe, Pipeline)
        assert pipe.get_component("generator")
        assert isinstance(pipe.get_component("generator"), HuggingFaceTGIGenerator)

    # Building a pipeline with a custom template that uses Jinja2 syntax to specify components and their connections
    @pytest.mark.integration
    def test_building_pipeline_with_direct_template(self, random_valid_template):
        pt = PipelineTemplate(TemplateSource.from_str(random_valid_template))
        pt.override("generator", HuggingFaceTGIGenerator())
        pt.override("prompt_builder", PromptBuilder("Some fake prompt"))
        pipe = pt.build()

        assert isinstance(pipe, Pipeline)
        assert pipe.get_component("generator")
        assert isinstance(pipe.get_component("generator"), HuggingFaceTGIGenerator)
        assert pipe.get_component("prompt_builder")
        assert isinstance(pipe.get_component("prompt_builder"), PromptBuilder)