From 60ff46e4e1de1fc7694a4b9fafbe65439b98f7e3 Mon Sep 17 00:00:00 2001
From: tstadel <60758086+tstadel@users.noreply.github.com>
Date: Mon, 25 Apr 2022 20:14:48 +0200
Subject: [PATCH] Log evaluation results to MLflow (#2337)
* track eval results in mlflow
* Update Documentation & Code Style
* add pipeline.yaml and environment info
* improve logging to mlflow
* Update Documentation & Code Style
* introduce ExperimentTracker
* Update Documentation & Code Style
* move modeling.utils.logger to utils.experiment_tracking
* renaming: tracker and TrackingHead
* Update Documentation & Code Style
* refactor env tracking
* fix pylint findings
* Update Documentation & Code Style
* rename MLFlowTrackingHead to MLflowTrackingHead
* implement dataset hash
* Update Documentation & Code Style
* set docstrings
* Update Documentation & Code Style
* introduce PipelineBundle and Corpus
* Update Documentation & Code Style
* support reusing index
* Update Documentation & Code Style
* rename Corpus to FileCorpus
* fix Corpus -> FileCorpus
* Update Documentation & Code Style
* resolve cyclic dependencies
* fix linter issues
* Update Documentation & Code Style
* remove helper classes
* Update Documentation & Code Style
* fix imports
* fix another unused import
* update docstrings
* Update Documentation & Code Style
* simplify usage of experiment tracking tools
* fix Literal import
* revert schema changes
* Update Documentation & Code Style
* always end run
* Update Documentation & Code Style
* fix mypy issue
* rename to execute_eval_run
* Update Documentation & Code Style
* fix merge of get_or_create_env_meta_data
* improve docstrings
* Update Documentation & Code Style
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
docs/_src/api/api/pipelines.md | 90 +
haystack/__init__.py | 6 +-
haystack/environment.py | 62 +
.../haystack-pipeline-1.1.0.schema.json | 16 +-
.../haystack-pipeline-1.3.1rc0.schema.json | 3730 +++++++++++++++++
haystack/modeling/data_handler/data_silo.py | 4 +-
haystack/modeling/data_handler/processor.py | 4 +-
haystack/modeling/evaluation/eval.py | 6 +-
haystack/modeling/infer.py | 3 -
haystack/modeling/logger.py | 145 -
haystack/modeling/model/adaptive_model.py | 4 +-
haystack/modeling/model/biadaptive_model.py | 4 +-
haystack/modeling/model/optimization.py | 14 +-
haystack/modeling/model/triadaptive_model.py | 4 +-
haystack/modeling/training/base.py | 14 +-
haystack/nodes/retriever/dense.py | 4 +-
haystack/pipelines/base.py | 202 +
haystack/telemetry.py | 63 +-
haystack/utils/__init__.py | 7 +
haystack/utils/experiment_tracking.py | 188 +
setup.cfg | 2 +-
21 files changed, 4322 insertions(+), 250 deletions(-)
create mode 100644 haystack/environment.py
create mode 100644 haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json
delete mode 100644 haystack/modeling/logger.py
create mode 100644 haystack/utils/experiment_tracking.py
diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md
index 1e44e1648..8d005ec4b 100644
--- a/docs/_src/api/api/pipelines.md
+++ b/docs/_src/api/api/pipelines.md
@@ -464,6 +464,96 @@ If True the index will be kept after beir evaluation. Otherwise it will be delet
Returns a tuple containing the ndcg, map, recall and precision scores.
Each metric is represented by a dictionary containing the scores for each top_k value.
+
+
+#### execute\_eval\_run
+
+```python
+@classmethod
+def execute_eval_run(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, evaluation_set_labels: List[MultiLabel], corpus_file_paths: List[str], experiment_name: str, experiment_run_name: str, experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: List[Dict[str, Any]] = None, corpus_meta: Dict[str, Any] = {}, evaluation_set_meta: Dict[str, Any] = {}, pipeline_meta: Dict[str, Any] = {}, index_params: dict = {}, query_params: dict = {}, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, reuse_index: bool = False) -> EvaluationResult
+```
+
+Starts an experiment run that first indexes the specified files (forming a corpus) using the index pipeline
+
+and subsequently evaluates the query pipeline on the provided labels (forming an evaluation set) using pipeline.eval().
+Parameters and results (metrics and predictions) of the run are tracked by an experiment tracking tool for further analysis.
+You can specify the experiment tracking tool by setting the params `experiment_tracking_tool` and `experiment_tracking_uri`
+or by passing a (custom) tracking head to Tracker.set_tracking_head().
+Note that `experiment_tracking_tool` currently only supports `mlflow`.
+
+For easier comparison, you can pass additional metadata about the corpus (corpus_meta), the evaluation set (evaluation_set_meta), and the pipelines (pipeline_meta).
+For example, you can give them names or ids to identify them across experiment runs.
+
+This method executes an experiment run. Each experiment run is part of at least one experiment.
+An experiment typically consists of multiple runs to be compared (e.g. using different retrievers in the query pipeline).
+Experiment tracking tools usually share the same concepts of experiments and provide additional functionality to easily compare runs across experiments.
+
+For example, you can call execute_eval_run() multiple times with different retrievers in your query pipeline and compare the runs in mlflow:
+
+```python
+ for retriever_type, query_pipeline in zip(["sparse", "dpr", "embedding"], [sparse_pipe, dpr_pipe, embedding_pipe]):
+     eval_result = Pipeline.execute_eval_run(
+         index_pipeline=index_pipeline,
+         query_pipeline=query_pipeline,
+         evaluation_set_labels=labels,
+         corpus_file_paths=file_paths,
+         corpus_file_metas=file_metas,
+         experiment_tracking_tool="mlflow",
+         experiment_tracking_uri="http://localhost:5000",
+         experiment_name="my-retriever-experiment",
+         experiment_run_name=f"run_{retriever_type}",
+         pipeline_meta={"name": f"my-pipeline-{retriever_type}"},
+         evaluation_set_meta={"name": "my-evalset"},
+         corpus_meta={"name": "my-corpus"},
+         reuse_index=False,
+     )
+```
+
+**Arguments**:
+
+- `index_pipeline`: The indexing pipeline to use.
+- `query_pipeline`: The query pipeline to evaluate.
+- `evaluation_set_labels`: The labels to evaluate on, forming an evaluation set.
+- `corpus_file_paths`: The files to be indexed and searched during evaluation, forming a corpus.
+- `experiment_name`: The name of the experiment.
+- `experiment_run_name`: The name of the experiment run.
+- `experiment_tracking_tool`: The experiment tracking tool to be used. Currently, only "mlflow" is supported.
+If left unset, the current TrackingHead specified by Tracker.set_tracking_head() will be used.
+- `experiment_tracking_uri`: The uri of the experiment tracking server to be used. Must be specified if experiment_tracking_tool is set.
+You can use deepset's public mlflow server via https://public-mlflow.deepset.ai/.
+Note that artifact logging (e.g. Pipeline YAML or evaluation result CSVs) is currently not allowed on deepset's public mlflow server as this might expose sensitive data.
+- `corpus_file_metas`: The optional metadata to be stored for each corpus file (e.g. title).
+- `corpus_meta`: Metadata about the corpus to track (e.g. name, date, author, version).
+- `evaluation_set_meta`: Metadata about the evalset to track (e.g. name, date, author, version).
+- `pipeline_meta`: Metadata about the pipelines to track (e.g. name, author, version).
+- `index_params`: The params to use during indexing (see pipeline.run's params).
+- `query_params`: The params to use during querying (see pipeline.run's params).
+- `sas_model_name_or_path`: Name or path of a "Semantic Answer Similarity (SAS)" model. When set, the model will be used to calculate similarity between predictions and labels and to generate the SAS metric.
+The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
+Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
+More info in the paper: https://arxiv.org/abs/2108.06130
+Models:
+- You can use Bi-Encoders (sentence transformers) or Cross-Encoders trained on Semantic Textual Similarity (STS) data.
+Not all Cross-Encoders can be used because of different return types.
+If you use custom Cross-Encoders, please make sure they work with the sentence_transformers.CrossEncoder class.
+- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
+- Large model for German only: "deepset/gbert-large-sts"
+- `sas_batch_size`: The number of prediction-label pairs to encode at once by the CrossEncoder or SentenceTransformer while calculating SAS.
+- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+Falls back to CPU if no GPU is available.
+- `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
+This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
+If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance.
+If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance.
+The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node.
+To this end, labels are used as input to the node instead of the output of the previous node in the pipeline.
+The generated dataframes in the EvaluationResult then contain additional rows, which can be distinguished from the integrated evaluation results by the
+values "integrated" or "isolated" in the column "eval_mode". The evaluation report then additionally lists the upper bound of each node's evaluation metrics.
+- `reuse_index`: Whether to reuse an existing non-empty index and to keep it after evaluation.
+If True, the index is kept after evaluation and no indexing takes place if the index already contains documents. Otherwise, the index is deleted immediately after evaluation.
+Defaults to False.
+
#### eval
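
To make the `add_isolated_node_eval` behaviour documented above concrete, here is a hedged sketch of separating the two row types in the returned result. It assumes the `EvaluationResult` from `execute_eval_run(..., add_isolated_node_eval=True)` can be indexed by node name and yields a pandas DataFrame containing the `eval_mode` column mentioned in the docstring; the node name "Reader" is a placeholder.

```python
# Sketch only: eval_result is assumed to come from a prior
# Pipeline.execute_eval_run(..., add_isolated_node_eval=True) call, and indexing
# by node name ("Reader" is a placeholder) is assumed to return a pandas DataFrame.
reader_df = eval_result["Reader"]

# Per the docstring above, rows are marked "integrated" or "isolated" in "eval_mode".
integrated_rows = reader_df[reader_df["eval_mode"] == "integrated"]
isolated_rows = reader_df[reader_df["eval_mode"] == "isolated"]

print(f"integrated: {len(integrated_rows)}, isolated (upper bound): {len(isolated_rows)}")
```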
diff --git a/haystack/__init__.py b/haystack/__init__.py
index 27a987a34..57ada3489 100644
--- a/haystack/__init__.py
+++ b/haystack/__init__.py
@@ -22,7 +22,7 @@ logging.getLogger("haystack").setLevel(logging.INFO)
import pandas as pd
-from haystack.schema import Document, Answer, Label, MultiLabel, Span
+from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult
from haystack.nodes.base import BaseComponent
from haystack.pipelines.base import Pipeline
@@ -104,7 +104,6 @@ except ImportError:
pass
from haystack.modeling.evaluation import eval
-from haystack.modeling.logger import MLFlowLogger, StdoutLogger, TensorBoardLogger
from haystack.nodes.other import JoinDocuments, Docs2Answers, JoinAnswers, RouteDocuments
from haystack.nodes.query_classifier import SklearnQueryClassifier, TransformersQueryClassifier
from haystack.nodes.file_classifier import FileTypeClassifier
@@ -178,9 +177,6 @@ if graph_retriever:
# Adding them to sys.modules would enable `import haystack.pipelines.JoinDocuments`,
# which I believe it's a very rare import style.
setattr(file_converter, "FileTypeClassifier", FileTypeClassifier)
-setattr(modeling_utils, "MLFlowLogger", MLFlowLogger)
-setattr(modeling_utils, "StdoutLogger", StdoutLogger)
-setattr(modeling_utils, "TensorBoardLogger", TensorBoardLogger)
setattr(pipelines, "JoinDocuments", JoinDocuments)
setattr(pipelines, "Docs2Answers", Docs2Answers)
setattr(pipelines, "SklearnQueryClassifier", SklearnQueryClassifier)
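
The removals above (MLFlowLogger, StdoutLogger, TensorBoardLogger) are superseded by the tracking-head mechanism that the `execute_eval_run` docstring refers to via `Tracker.set_tracking_head()`. Below is a minimal sketch of registering an MLflow head once instead of passing `experiment_tracking_tool`/`experiment_tracking_uri` on every call; the import path matches the new `haystack/utils/experiment_tracking.py` module, but the exact `MLflowTrackingHead` constructor signature shown here is an assumption.

```python
# Sketch only: Tracker and MLflowTrackingHead are the names used by the docstring
# and commit messages above; tracking_uri as the constructor argument is an assumption.
from haystack.utils.experiment_tracking import MLflowTrackingHead, Tracker

Tracker.set_tracking_head(MLflowTrackingHead(tracking_uri="http://localhost:5000"))

# Subsequent Pipeline.execute_eval_run(...) calls can then omit
# experiment_tracking_tool and experiment_tracking_uri and will log to this head.
```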
diff --git a/haystack/environment.py b/haystack/environment.py
new file mode 100644
index 000000000..60eecc10f
--- /dev/null
+++ b/haystack/environment.py
@@ -0,0 +1,62 @@
+import os
+import platform
+import sys
+from typing import Any, Dict
+import torch
+import transformers
+
+from haystack import __version__
+
+
+HAYSTACK_EXECUTION_CONTEXT = "HAYSTACK_EXECUTION_CONTEXT"
+HAYSTACK_DOCKER_CONTAINER = "HAYSTACK_DOCKER_CONTAINER"
+
+
+env_meta_data: Dict[str, Any] = {}
+
+
+def get_or_create_env_meta_data() -> Dict[str, Any]:
+ """
+    Collects metadata about the setup that is used with Haystack, such as: operating system, Python version, Haystack version, transformers version, PyTorch version, number of GPUs, execution environment, and the value stored in the env variable HAYSTACK_EXECUTION_CONTEXT.
+ """
+ global env_meta_data # pylint: disable=global-statement
+ if not env_meta_data:
+ env_meta_data = {
+ "os_version": platform.release(),
+ "os_family": platform.system(),
+ "os_machine": platform.machine(),
+ "python_version": platform.python_version(),
+ "haystack_version": __version__,
+ "transformers_version": transformers.__version__,
+ "torch_version": torch.__version__,
+ "torch_cuda_version": torch.version.cuda if torch.cuda.is_available() else 0,
+ "n_gpu": torch.cuda.device_count() if torch.cuda.is_available() else 0,
+ "n_cpu": os.cpu_count(),
+ "context": os.environ.get(HAYSTACK_EXECUTION_CONTEXT),
+ "execution_env": _get_execution_environment(),
+ }
+ return env_meta_data
+
+
+def _get_execution_environment():
+ """
+ Identifies the execution environment that Haystack is running in.
+ Options are: colab notebook, kubernetes, CPU/GPU docker container, test environment, jupyter notebook, python script
+ """
+ if os.environ.get("CI", "False").lower() == "true":
+ execution_env = "ci"
+ elif "google.colab" in sys.modules:
+ execution_env = "colab"
+ elif "KUBERNETES_SERVICE_HOST" in os.environ:
+ execution_env = "kubernetes"
+ elif HAYSTACK_DOCKER_CONTAINER in os.environ:
+ execution_env = os.environ.get(HAYSTACK_DOCKER_CONTAINER)
+ # check if pytest is imported
+ elif "pytest" in sys.modules:
+ execution_env = "test"
+ else:
+ try:
+ execution_env = get_ipython().__class__.__name__ # pylint: disable=undefined-variable
+ except NameError:
+ execution_env = "script"
+ return execution_env
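
A quick way to see what the new `haystack/environment.py` module collects (and, per the "add pipeline.yaml and environment info" commit message above, presumably attaches to tracked runs) is to call the function directly. The sketch below uses only the function and dictionary keys defined in the file above.

```python
# Sketch: inspect the environment metadata collected by haystack/environment.py.
# The result is cached in the module-level env_meta_data dict, so repeated calls
# return the same dictionary.
from haystack.environment import get_or_create_env_meta_data

meta = get_or_create_env_meta_data()
print(meta["haystack_version"], meta["execution_env"], meta["n_gpu"])
```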
diff --git a/haystack/json-schemas/haystack-pipeline-1.1.0.schema.json b/haystack/json-schemas/haystack-pipeline-1.1.0.schema.json
index af5092631..f67b85559 100644
--- a/haystack/json-schemas/haystack-pipeline-1.1.0.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-1.1.0.schema.json
@@ -1411,10 +1411,10 @@
"title": "Use Auth Token",
"anyOf": [
{
- "type": "boolean"
+ "type": "string"
},
{
- "type": "string"
+ "type": "boolean"
}
]
}
@@ -1682,10 +1682,10 @@
"title": "Use Auth Token",
"anyOf": [
{
- "type": "boolean"
+ "type": "string"
},
{
- "type": "string"
+ "type": "boolean"
}
]
}
@@ -1949,10 +1949,10 @@
"title": "Use Auth Token",
"anyOf": [
{
- "type": "boolean"
+ "type": "string"
},
{
- "type": "string"
+ "type": "boolean"
}
]
}
@@ -3124,10 +3124,10 @@
"title": "Use Auth Token",
"anyOf": [
{
- "type": "boolean"
+ "type": "string"
},
{
- "type": "string"
+ "type": "boolean"
}
]
}
diff --git a/haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json b/haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json
new file mode 100644
index 000000000..85b9ab85b
--- /dev/null
+++ b/haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json
@@ -0,0 +1,3730 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema",
+ "$id": "https://haystack.deepset.ai/haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json",
+ "title": "Haystack Pipeline",
+ "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions",
+ "type": "object",
+ "properties": {
+ "version": {
+ "title": "Version",
+ "description": "Version of the Haystack Pipeline file.",
+ "type": "string",
+ "oneOf": [
+ {
+ "const": "1.3.1rc0"
+ }
+ ]
+ },
+ "components": {
+ "title": "Components",
+ "description": "Component nodes and their configurations, to later be used in the pipelines section. Define here all the building blocks for the pipelines.",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "$ref": "#/definitions/DeepsetCloudDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/ElasticsearchDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/FAISSDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/GraphDBKnowledgeGraphComponent"
+ },
+ {
+ "$ref": "#/definitions/InMemoryDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/Milvus2DocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/OpenDistroElasticsearchDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/OpenSearchDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/PineconeDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/SQLDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/WeaviateDocumentStoreComponent"
+ },
+ {
+ "$ref": "#/definitions/AzureConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/CrawlerComponent"
+ },
+ {
+ "$ref": "#/definitions/DensePassageRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/Docs2AnswersComponent"
+ },
+ {
+ "$ref": "#/definitions/DocxToTextConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/ElasticsearchFilterOnlyRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/ElasticsearchRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/EmbeddingRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/EntityExtractorComponent"
+ },
+ {
+ "$ref": "#/definitions/EvalAnswersComponent"
+ },
+ {
+ "$ref": "#/definitions/EvalDocumentsComponent"
+ },
+ {
+ "$ref": "#/definitions/FARMReaderComponent"
+ },
+ {
+ "$ref": "#/definitions/FileTypeClassifierComponent"
+ },
+ {
+ "$ref": "#/definitions/ImageToTextConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/JoinAnswersComponent"
+ },
+ {
+ "$ref": "#/definitions/JoinDocumentsComponent"
+ },
+ {
+ "$ref": "#/definitions/MarkdownConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/PDFToTextConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/PDFToTextOCRConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/ParsrConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/PreProcessorComponent"
+ },
+ {
+ "$ref": "#/definitions/QuestionGeneratorComponent"
+ },
+ {
+ "$ref": "#/definitions/RAGeneratorComponent"
+ },
+ {
+ "$ref": "#/definitions/RCIReaderComponent"
+ },
+ {
+ "$ref": "#/definitions/RouteDocumentsComponent"
+ },
+ {
+ "$ref": "#/definitions/SentenceTransformersRankerComponent"
+ },
+ {
+ "$ref": "#/definitions/Seq2SeqGeneratorComponent"
+ },
+ {
+ "$ref": "#/definitions/SklearnQueryClassifierComponent"
+ },
+ {
+ "$ref": "#/definitions/TableReaderComponent"
+ },
+ {
+ "$ref": "#/definitions/TableTextRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/Text2SparqlRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/TextConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/TfidfRetrieverComponent"
+ },
+ {
+ "$ref": "#/definitions/TikaConverterComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersDocumentClassifierComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersQueryClassifierComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersReaderComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersSummarizerComponent"
+ },
+ {
+ "$ref": "#/definitions/TransformersTranslatorComponent"
+ }
+ ]
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": true
+ },
+ "pipelines": {
+ "title": "Pipelines",
+ "description": "Multiple pipelines can be defined using the components from the same YAML file.",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Name of the pipeline.",
+ "type": "string"
+ },
+ "nodes": {
+ "title": "Nodes",
+ "description": "Nodes to be used by this particular pipeline",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.",
+ "type": "string"
+ },
+ "inputs": {
+ "title": "Inputs",
+ "description": "Input parameters for this node.",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "required": [
+ "name",
+ "inputs"
+ ],
+ "additionalProperties": false
+ },
+ "required": [
+ "name",
+ "nodes"
+ ],
+ "additionalProperties": false
+ },
+ "additionalProperties": false
+ },
+ "additionalProperties": false
+ }
+ }
+ },
+ "required": [
+ "version",
+ "components",
+ "pipelines"
+ ],
+ "additionalProperties": false,
+ "definitions": {
+ "DeepsetCloudDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "DeepsetCloudDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "Api Key",
+ "type": "string"
+ },
+ "workspace": {
+ "title": "Workspace",
+ "default": "default",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "default": "default",
+ "type": "string"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "api_endpoint": {
+ "title": "Api Endpoint",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ElasticsearchDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ElasticsearchDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "host": {
+ "title": "Host",
+ "default": "localhost",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "port": {
+ "title": "Port",
+ "default": 9200,
+ "anyOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
+ ]
+ },
+ "username": {
+ "title": "Username",
+ "default": "",
+ "type": "string"
+ },
+ "password": {
+ "title": "Password",
+ "default": "",
+ "type": "string"
+ },
+ "api_key_id": {
+ "title": "Api Key Id",
+ "type": "string"
+ },
+ "api_key": {
+ "title": "Api Key",
+ "type": "string"
+ },
+ "aws4auth": {
+ "title": "Aws4Auth"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "label",
+ "type": "string"
+ },
+ "search_fields": {
+ "title": "Search Fields",
+ "default": "content",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {}
+ }
+ ]
+ },
+ "content_field": {
+ "title": "Content Field",
+ "default": "content",
+ "type": "string"
+ },
+ "name_field": {
+ "title": "Name Field",
+ "default": "name",
+ "type": "string"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "custom_mapping": {
+ "title": "Custom Mapping",
+ "type": "object"
+ },
+ "excluded_meta_data": {
+ "title": "Excluded Meta Data",
+ "type": "array",
+ "items": {}
+ },
+ "analyzer": {
+ "title": "Analyzer",
+ "default": "standard",
+ "type": "string"
+ },
+ "scheme": {
+ "title": "Scheme",
+ "default": "http",
+ "type": "string"
+ },
+ "ca_certs": {
+ "title": "Ca Certs",
+ "type": "string"
+ },
+ "verify_certs": {
+ "title": "Verify Certs",
+ "default": true,
+ "type": "boolean"
+ },
+ "recreate_index": {
+ "title": "Recreate Index",
+ "default": false,
+ "type": "boolean"
+ },
+ "create_index": {
+ "title": "Create Index",
+ "default": true,
+ "type": "boolean"
+ },
+ "refresh_type": {
+ "title": "Refresh Type",
+ "default": "wait_for",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product"
+ },
+ "timeout": {
+ "title": "Timeout",
+ "default": 30
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "index_type": {
+ "title": "Index Type",
+ "default": "flat",
+ "type": "string"
+ },
+ "scroll": {
+ "title": "Scroll",
+ "default": "1d",
+ "type": "string"
+ },
+ "skip_missing_embeddings": {
+ "title": "Skip Missing Embeddings",
+ "default": true,
+ "type": "boolean"
+ },
+ "synonyms": {
+ "title": "Synonyms",
+ "type": "array",
+ "items": {}
+ },
+ "synonym_type": {
+ "title": "Synonym Type",
+ "default": "synonym",
+ "type": "string"
+ },
+ "use_system_proxy": {
+ "title": "Use System Proxy",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "FAISSDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "FAISSDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "sql_url": {
+ "title": "Sql Url",
+ "default": "sqlite:///faiss_document_store.db",
+ "type": "string"
+ },
+ "vector_dim": {
+ "title": "Vector Dim",
+ "type": "integer"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "faiss_index_factory_str": {
+ "title": "Faiss Index Factory Str",
+ "default": "Flat",
+ "type": "string"
+ },
+ "faiss_index": {
+ "title": "Faiss Index",
+ "type": "string",
+ "default": null
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "faiss_index_path": {
+ "title": "Faiss Index Path",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string",
+ "format": "path"
+ }
+ ]
+ },
+ "faiss_config_path": {
+ "title": "Faiss Config Path",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string",
+ "format": "path"
+ }
+ ]
+ },
+ "isolation_level": {
+ "title": "Isolation Level",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "GraphDBKnowledgeGraphComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "GraphDBKnowledgeGraph"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "host": {
+ "title": "Host",
+ "default": "localhost",
+ "type": "string"
+ },
+ "port": {
+ "title": "Port",
+ "default": 7200,
+ "type": "integer"
+ },
+ "username": {
+ "title": "Username",
+ "default": "",
+ "type": "string"
+ },
+ "password": {
+ "title": "Password",
+ "default": "",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "type": "string"
+ },
+ "prefixes": {
+ "title": "Prefixes",
+ "default": "",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "InMemoryDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "InMemoryDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "label",
+ "type": "string"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "scoring_batch_size": {
+ "title": "Scoring Batch Size",
+ "default": 500000,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "Milvus2DocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Milvus2DocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "sql_url": {
+ "title": "Sql Url",
+ "default": "sqlite:///",
+ "type": "string"
+ },
+ "host": {
+ "title": "Host",
+ "default": "localhost",
+ "type": "string"
+ },
+ "port": {
+ "title": "Port",
+ "default": "19530",
+ "type": "string"
+ },
+ "connection_pool": {
+ "title": "Connection Pool",
+ "default": "SingletonThread",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "vector_dim": {
+ "title": "Vector Dim",
+ "type": "integer"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "index_file_size": {
+ "title": "Index File Size",
+ "default": 1024,
+ "type": "integer"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "index_type": {
+ "title": "Index Type",
+ "default": "IVF_FLAT",
+ "type": "string"
+ },
+ "index_param": {
+ "title": "Index Param",
+ "type": "object"
+ },
+ "search_param": {
+ "title": "Search Param",
+ "type": "object"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "id_field": {
+ "title": "Id Field",
+ "default": "id",
+ "type": "string"
+ },
+ "custom_fields": {
+ "title": "Custom Fields",
+ "type": "array",
+ "items": {}
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "isolation_level": {
+ "title": "Isolation Level",
+ "type": "string"
+ },
+ "consistency_level": {
+ "title": "Consistency Level",
+ "default": 0,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "OpenDistroElasticsearchDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "OpenDistroElasticsearchDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "similarity": {
+ "title": "Similarity",
+ "default": "cosine"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "OpenSearchDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "OpenSearchDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "verify_certs": {
+ "title": "Verify Certs",
+ "default": false
+ },
+ "scheme": {
+ "title": "Scheme",
+ "default": "https"
+ },
+ "username": {
+ "title": "Username",
+ "default": "admin"
+ },
+ "password": {
+ "title": "Password",
+ "default": "admin"
+ },
+ "port": {
+ "title": "Port",
+ "default": 9200
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PineconeDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PineconeDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "api_key": {
+ "title": "Api Key",
+ "type": "string"
+ },
+ "environment": {
+ "title": "Environment",
+ "default": "us-west1-gcp",
+ "type": "string"
+ },
+ "sql_url": {
+ "title": "Sql Url",
+ "default": "sqlite:///pinecone_document_store.db",
+ "type": "string"
+ },
+ "pinecone_index": {
+ "title": "Pinecone Index",
+ "type": "string",
+ "default": null
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "cosine",
+ "type": "string"
+ },
+ "replicas": {
+ "title": "Replicas",
+ "default": 1,
+ "type": "integer"
+ },
+ "shards": {
+ "title": "Shards",
+ "default": 1,
+ "type": "integer"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ }
+ },
+ "required": [
+ "api_key"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "SQLDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "SQLDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "url": {
+ "title": "Url",
+ "default": "sqlite://",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "default": "document",
+ "type": "string"
+ },
+ "label_index": {
+ "title": "Label Index",
+ "default": "label",
+ "type": "string"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ },
+ "check_same_thread": {
+ "title": "Check Same Thread",
+ "default": false,
+ "type": "boolean"
+ },
+ "isolation_level": {
+ "title": "Isolation Level",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "WeaviateDocumentStoreComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "WeaviateDocumentStore"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "host": {
+ "title": "Host",
+ "default": "http://localhost",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "port": {
+ "title": "Port",
+ "default": 8080,
+ "anyOf": [
+ {
+ "type": "integer"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
+ ]
+ },
+ "timeout_config": {
+ "title": "Timeout Config",
+ "default": [
+ 5,
+ 15
+ ],
+ "type": "array",
+ "items": {}
+ },
+ "username": {
+ "title": "Username",
+ "type": "string"
+ },
+ "password": {
+ "title": "Password",
+ "type": "string"
+ },
+ "index": {
+ "title": "Index",
+ "default": "Document",
+ "type": "string"
+ },
+ "embedding_dim": {
+ "title": "Embedding Dim",
+ "default": 768,
+ "type": "integer"
+ },
+ "content_field": {
+ "title": "Content Field",
+ "default": "content",
+ "type": "string"
+ },
+ "name_field": {
+ "title": "Name Field",
+ "default": "name",
+ "type": "string"
+ },
+ "similarity": {
+ "title": "Similarity",
+ "default": "cosine",
+ "type": "string"
+ },
+ "index_type": {
+ "title": "Index Type",
+ "default": "hnsw",
+ "type": "string"
+ },
+ "custom_schema": {
+ "title": "Custom Schema",
+ "type": "object"
+ },
+ "return_embedding": {
+ "title": "Return Embedding",
+ "default": false,
+ "type": "boolean"
+ },
+ "embedding_field": {
+ "title": "Embedding Field",
+ "default": "embedding",
+ "type": "string"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_documents": {
+ "title": "Duplicate Documents",
+ "default": "overwrite",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "AzureConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "AzureConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "endpoint": {
+ "title": "Endpoint",
+ "type": "string"
+ },
+ "credential_key": {
+ "title": "Credential Key",
+ "type": "string"
+ },
+ "model_id": {
+ "title": "Model Id",
+ "default": "prebuilt-document",
+ "type": "string"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "save_json": {
+ "title": "Save Json",
+ "default": false,
+ "type": "boolean"
+ },
+ "preceding_context_len": {
+ "title": "Preceding Context Len",
+ "default": 3,
+ "type": "integer"
+ },
+ "following_context_len": {
+ "title": "Following Context Len",
+ "default": 3,
+ "type": "integer"
+ },
+ "merge_multiple_column_headers": {
+ "title": "Merge Multiple Column Headers",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "endpoint",
+ "credential_key"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "CrawlerComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Crawler"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "output_dir": {
+ "title": "Output Dir",
+ "type": "string"
+ },
+ "urls": {
+ "title": "Urls",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "crawler_depth": {
+ "title": "Crawler Depth",
+ "default": 1,
+ "type": "integer"
+ },
+ "filter_urls": {
+ "title": "Filter Urls",
+ "type": "array",
+ "items": {}
+ },
+ "overwrite_existing_files": {
+ "title": "Overwrite Existing Files",
+ "default": true
+ }
+ },
+ "required": [
+ "output_dir"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "DensePassageRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "DensePassageRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "query_embedding_model": {
+ "title": "Query Embedding Model",
+ "default": "facebook/dpr-question_encoder-single-nq-base",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "passage_embedding_model": {
+ "title": "Passage Embedding Model",
+ "default": "facebook/dpr-ctx_encoder-single-nq-base",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "max_seq_len_query": {
+ "title": "Max Seq Len Query",
+ "default": 64,
+ "type": "integer"
+ },
+ "max_seq_len_passage": {
+ "title": "Max Seq Len Passage",
+ "default": 256,
+ "type": "integer"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 16,
+ "type": "integer"
+ },
+ "embed_title": {
+ "title": "Embed Title",
+ "default": true,
+ "type": "boolean"
+ },
+ "use_fast_tokenizers": {
+ "title": "Use Fast Tokenizers",
+ "default": true,
+ "type": "boolean"
+ },
+ "infer_tokenizer_classes": {
+ "title": "Infer Tokenizer Classes",
+ "default": false,
+ "type": "boolean"
+ },
+ "similarity_function": {
+ "title": "Similarity Function",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "global_loss_buffer_size": {
+ "title": "Global Loss Buffer Size",
+ "default": 150000,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ "use_auth_token": {
+ "title": "Use Auth Token",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "Docs2AnswersComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Docs2Answers"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {},
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "DocxToTextConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "DocxToTextConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ElasticsearchFilterOnlyRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ElasticsearchFilterOnlyRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
+ "custom_query": {
+ "title": "Custom Query",
+ "type": "string"
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ElasticsearchRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ElasticsearchRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
+ "custom_query": {
+ "title": "Custom Query",
+ "type": "string"
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "EmbeddingRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "EmbeddingRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "embedding_model": {
+ "title": "Embedding Model",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 32,
+ "type": "integer"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 512,
+ "type": "integer"
+ },
+ "model_format": {
+ "title": "Model Format",
+ "default": "farm",
+ "type": "string"
+ },
+ "pooling_strategy": {
+ "title": "Pooling Strategy",
+ "default": "reduce_mean",
+ "type": "string"
+ },
+ "emb_extraction_layer": {
+ "title": "Emb Extraction Layer",
+ "default": -1,
+ "type": "integer"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ "use_auth_token": {
+ "title": "Use Auth Token",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ "required": [
+ "document_store",
+ "embedding_model"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "EntityExtractorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "EntityExtractor"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "dslim/bert-base-NER",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "EvalAnswersComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "EvalAnswers"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "skip_incorrect_retrieval": {
+ "title": "Skip Incorrect Retrieval",
+ "default": true,
+ "type": "boolean"
+ },
+ "open_domain": {
+ "title": "Open Domain",
+ "default": true,
+ "type": "boolean"
+ },
+ "sas_model": {
+ "title": "Sas Model",
+ "type": "string"
+ },
+ "debug": {
+ "title": "Debug",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "EvalDocumentsComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "EvalDocuments"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "debug": {
+ "title": "Debug",
+ "default": false,
+ "type": "boolean"
+ },
+ "open_domain": {
+ "title": "Open Domain",
+ "default": true,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "FARMReaderComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "FARMReader"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "context_window_size": {
+ "title": "Context Window Size",
+ "default": 150,
+ "type": "integer"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 50,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "no_ans_boost": {
+ "title": "No Ans Boost",
+ "default": 0.0,
+ "type": "number"
+ },
+ "return_no_answer": {
+ "title": "Return No Answer",
+ "default": false,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "top_k_per_candidate": {
+ "title": "Top K Per Candidate",
+ "default": 3,
+ "type": "integer"
+ },
+ "top_k_per_sample": {
+ "title": "Top K Per Sample",
+ "default": 1,
+ "type": "integer"
+ },
+ "num_processes": {
+ "title": "Num Processes",
+ "type": "integer"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 256,
+ "type": "integer"
+ },
+ "doc_stride": {
+ "title": "Doc Stride",
+ "default": 128,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "duplicate_filtering": {
+ "title": "Duplicate Filtering",
+ "default": 0,
+ "type": "integer"
+ },
+ "use_confidence_scores": {
+ "title": "Use Confidence Scores",
+ "default": true,
+ "type": "boolean"
+ },
+ "proxies": {
+ "title": "Proxies",
+ "type": "object",
+ "additionalProperties": {
+ "type": "string"
+ }
+ },
+ "local_files_only": {
+ "title": "Local Files Only",
+ "default": false
+ },
+ "force_download": {
+ "title": "Force Download",
+ "default": false
+ },
+ "use_auth_token": {
+ "title": "Use Auth Token",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ "required": [
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "FileTypeClassifierComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "FileTypeClassifier"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "supported_types": {
+ "title": "Supported Types",
+ "default": [
+ "txt",
+ "pdf",
+ "md",
+ "docx",
+ "html"
+ ],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ImageToTextConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ImageToTextConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "default": [
+ "eng"
+ ],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "JoinAnswersComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "JoinAnswers"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "join_mode": {
+ "title": "Join Mode",
+ "default": "concatenate",
+ "type": "string"
+ },
+ "weights": {
+ "title": "Weights",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ "top_k_join": {
+ "title": "Top K Join",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "JoinDocumentsComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "JoinDocuments"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "join_mode": {
+ "title": "Join Mode",
+ "default": "concatenate",
+ "type": "string"
+ },
+ "weights": {
+ "title": "Weights",
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ },
+ "top_k_join": {
+ "title": "Top K Join",
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "MarkdownConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "MarkdownConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PDFToTextConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PDFToTextConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PDFToTextOCRConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PDFToTextOCRConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "default": [
+ "eng"
+ ],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "ParsrConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "ParsrConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "parsr_url": {
+ "title": "Parsr Url",
+ "default": "http://localhost:3001",
+ "type": "string"
+ },
+ "extractor": {
+ "title": "Extractor",
+ "default": "pdfminer",
+ "enum": [
+ "pdfminer",
+ "pdfjs"
+ ],
+ "type": "string"
+ },
+ "table_detection_mode": {
+ "title": "Table Detection Mode",
+ "default": "lattice",
+ "enum": [
+ "lattice",
+ "stream"
+ ],
+ "type": "string"
+ },
+ "preceding_context_len": {
+ "title": "Preceding Context Len",
+ "default": 3,
+ "type": "integer"
+ },
+ "following_context_len": {
+ "title": "Following Context Len",
+ "default": 3,
+ "type": "integer"
+ },
+ "remove_page_headers": {
+ "title": "Remove Page Headers",
+ "default": false,
+ "type": "boolean"
+ },
+ "remove_page_footers": {
+ "title": "Remove Page Footers",
+ "default": false,
+ "type": "boolean"
+ },
+ "remove_table_of_contents": {
+ "title": "Remove Table Of Contents",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "PreProcessorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "PreProcessor"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "clean_whitespace": {
+ "title": "Clean Whitespace",
+ "default": true,
+ "type": "boolean"
+ },
+ "clean_header_footer": {
+ "title": "Clean Header Footer",
+ "default": false,
+ "type": "boolean"
+ },
+ "clean_empty_lines": {
+ "title": "Clean Empty Lines",
+ "default": true,
+ "type": "boolean"
+ },
+ "remove_substrings": {
+ "title": "Remove Substrings",
+ "default": [],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "split_by": {
+ "title": "Split By",
+ "default": "word",
+ "type": "string"
+ },
+ "split_length": {
+ "title": "Split Length",
+ "default": 200,
+ "type": "integer"
+ },
+ "split_overlap": {
+ "title": "Split Overlap",
+ "default": 0,
+ "type": "integer"
+ },
+ "split_respect_sentence_boundary": {
+ "title": "Split Respect Sentence Boundary",
+ "default": true,
+ "type": "boolean"
+ },
+ "language": {
+ "title": "Language",
+ "default": "en",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "QuestionGeneratorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "QuestionGenerator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "valhalla/t5-base-e2e-qg"
+ },
+ "model_version": {
+ "title": "Model Version"
+ },
+ "num_beams": {
+ "title": "Num Beams",
+ "default": 4
+ },
+ "max_length": {
+ "title": "Max Length",
+ "default": 256
+ },
+ "no_repeat_ngram_size": {
+ "title": "No Repeat Ngram Size",
+ "default": 3
+ },
+ "length_penalty": {
+ "title": "Length Penalty",
+ "default": 1.5
+ },
+ "early_stopping": {
+ "title": "Early Stopping",
+ "default": true
+ },
+ "split_length": {
+ "title": "Split Length",
+ "default": 50
+ },
+ "split_overlap": {
+ "title": "Split Overlap",
+ "default": 10
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true
+ },
+ "prompt": {
+ "title": "Prompt",
+ "default": "generate questions:"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "RAGeneratorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "RAGenerator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "facebook/rag-token-nq",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "retriever": {
+ "title": "Retriever",
+ "type": "string",
+ "default": null
+ },
+ "generator_type": {
+ "title": "Generator Type",
+ "default": "token",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 2,
+ "type": "integer"
+ },
+ "max_length": {
+ "title": "Max Length",
+ "default": 200,
+ "type": "integer"
+ },
+ "min_length": {
+ "title": "Min Length",
+ "default": 2,
+ "type": "integer"
+ },
+ "num_beams": {
+ "title": "Num Beams",
+ "default": 2,
+ "type": "integer"
+ },
+ "embed_title": {
+ "title": "Embed Title",
+ "default": true,
+ "type": "boolean"
+ },
+ "prefix": {
+ "title": "Prefix",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "RCIReaderComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "RCIReader"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "row_model_name_or_path": {
+ "title": "Row Model Name Or Path",
+ "default": "michaelrglass/albert-base-rci-wikisql-row",
+ "type": "string"
+ },
+ "column_model_name_or_path": {
+ "title": "Column Model Name Or Path",
+ "default": "michaelrglass/albert-base-rci-wikisql-col",
+ "type": "string"
+ },
+ "row_model_version": {
+ "title": "Row Model Version",
+ "type": "string"
+ },
+ "column_model_version": {
+ "title": "Column Model Version",
+ "type": "string"
+ },
+ "row_tokenizer": {
+ "title": "Row Tokenizer",
+ "type": "string"
+ },
+ "column_tokenizer": {
+ "title": "Column Tokenizer",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 256,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "RouteDocumentsComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "RouteDocuments"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "split_by": {
+ "title": "Split By",
+ "default": "content_type",
+ "type": "string"
+ },
+ "metadata_values": {
+ "title": "Metadata Values",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "SentenceTransformersRankerComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "SentenceTransformersRanker"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string",
+ "format": "path"
+ }
+ ]
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ }
+ },
+ "required": [
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "Seq2SeqGeneratorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Seq2SeqGenerator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "type": "string"
+ },
+ "input_converter": {
+ "title": "Input Converter",
+ "type": "string",
+ "default": null
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 1,
+ "type": "integer"
+ },
+ "max_length": {
+ "title": "Max Length",
+ "default": 200,
+ "type": "integer"
+ },
+ "min_length": {
+ "title": "Min Length",
+ "default": 2,
+ "type": "integer"
+ },
+ "num_beams": {
+ "title": "Num Beams",
+ "default": 8,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "SklearnQueryClassifierComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "SklearnQueryClassifier"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {}
+ ]
+ },
+ "vectorizer_name_or_path": {
+ "title": "Vectorizer Name Or Path",
+ "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {}
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TableReaderComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TableReader"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "google/tapas-base-finetuned-wtq",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "tokenizer": {
+ "title": "Tokenizer",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "top_k_per_candidate": {
+ "title": "Top K Per Candidate",
+ "default": 3,
+ "type": "integer"
+ },
+ "return_no_answer": {
+ "title": "Return No Answer",
+ "default": false,
+ "type": "boolean"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 256,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TableTextRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TableTextRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "query_embedding_model": {
+ "title": "Query Embedding Model",
+ "default": "deepset/bert-small-mm_retrieval-question_encoder",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "passage_embedding_model": {
+ "title": "Passage Embedding Model",
+ "default": "deepset/bert-small-mm_retrieval-passage_encoder",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "table_embedding_model": {
+ "title": "Table Embedding Model",
+ "default": "deepset/bert-small-mm_retrieval-table_encoder",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "max_seq_len_query": {
+ "title": "Max Seq Len Query",
+ "default": 64,
+ "type": "integer"
+ },
+ "max_seq_len_passage": {
+ "title": "Max Seq Len Passage",
+ "default": 256,
+ "type": "integer"
+ },
+ "max_seq_len_table": {
+ "title": "Max Seq Len Table",
+ "default": 256,
+ "type": "integer"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": 16,
+ "type": "integer"
+ },
+ "embed_meta_fields": {
+ "title": "Embed Meta Fields",
+ "default": [
+ "name",
+ "section_title",
+ "caption"
+ ],
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "use_fast_tokenizers": {
+ "title": "Use Fast Tokenizers",
+ "default": true,
+ "type": "boolean"
+ },
+ "infer_tokenizer_classes": {
+ "title": "Infer Tokenizer Classes",
+ "default": false,
+ "type": "boolean"
+ },
+ "similarity_function": {
+ "title": "Similarity Function",
+ "default": "dot_product",
+ "type": "string"
+ },
+ "global_loss_buffer_size": {
+ "title": "Global Loss Buffer Size",
+ "default": 150000,
+ "type": "integer"
+ },
+ "progress_bar": {
+ "title": "Progress Bar",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ "use_auth_token": {
+ "title": "Use Auth Token",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "boolean"
+ }
+ ]
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "Text2SparqlRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "Text2SparqlRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "knowledge_graph": {
+ "title": "Knowledge Graph"
+ },
+ "model_name_or_path": {
+ "title": "Model Name Or Path"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 1,
+ "type": "integer"
+ }
+ },
+ "required": [
+ "knowledge_graph",
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TextConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TextConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TfidfRetrieverComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TfidfRetriever"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "document_store": {
+ "title": "Document Store",
+ "type": "string"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "auto_fit": {
+ "title": "Auto Fit",
+ "default": true
+ }
+ },
+ "required": [
+ "document_store"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TikaConverterComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TikaConverter"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "tika_url": {
+ "title": "Tika Url",
+ "default": "http://localhost:9998/tika",
+ "type": "string"
+ },
+ "remove_numeric_tables": {
+ "title": "Remove Numeric Tables",
+ "default": false,
+ "type": "boolean"
+ },
+ "valid_languages": {
+ "title": "Valid Languages",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersDocumentClassifierComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersDocumentClassifier"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "bhadresh-savani/distilbert-base-uncased-emotion",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "tokenizer": {
+ "title": "Tokenizer",
+ "type": "string"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "return_all_scores": {
+ "title": "Return All Scores",
+ "default": false,
+ "type": "boolean"
+ },
+ "task": {
+ "title": "Task",
+ "default": "text-classification",
+ "type": "string"
+ },
+ "labels": {
+ "title": "Labels",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "batch_size": {
+ "title": "Batch Size",
+ "default": -1,
+ "type": "integer"
+ },
+ "classification_field": {
+ "title": "Classification Field",
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersQueryClassifierComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersQueryClassifier"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "shahrukhx01/bert-mini-finetune-question-detection",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "path"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersReaderComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersReader"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "distilbert-base-uncased-distilled-squad",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "tokenizer": {
+ "title": "Tokenizer",
+ "type": "string"
+ },
+ "context_window_size": {
+ "title": "Context Window Size",
+ "default": 70,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "top_k": {
+ "title": "Top K",
+ "default": 10,
+ "type": "integer"
+ },
+ "top_k_per_candidate": {
+ "title": "Top K Per Candidate",
+ "default": 4,
+ "type": "integer"
+ },
+ "return_no_answers": {
+ "title": "Return No Answers",
+ "default": true,
+ "type": "boolean"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "default": 256,
+ "type": "integer"
+ },
+ "doc_stride": {
+ "title": "Doc Stride",
+ "default": 128,
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersSummarizerComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersSummarizer"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "default": "google/pegasus-xsum",
+ "type": "string"
+ },
+ "model_version": {
+ "title": "Model Version",
+ "type": "string"
+ },
+ "tokenizer": {
+ "title": "Tokenizer",
+ "type": "string"
+ },
+ "max_length": {
+ "title": "Max Length",
+ "default": 200,
+ "type": "integer"
+ },
+ "min_length": {
+ "title": "Min Length",
+ "default": 5,
+ "type": "integer"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "clean_up_tokenization_spaces": {
+ "title": "Clean Up Tokenization Spaces",
+ "default": true,
+ "type": "boolean"
+ },
+ "separator_for_single_summary": {
+ "title": "Separator For Single Summary",
+ "default": " ",
+ "type": "string"
+ },
+ "generate_single_summary": {
+ "title": "Generate Single Summary",
+ "default": false,
+ "type": "boolean"
+ }
+ },
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ },
+ "TransformersTranslatorComponent": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "title": "Name",
+ "description": "Custom name for the component. Helpful for visualization and debugging.",
+ "type": "string"
+ },
+ "type": {
+ "title": "Type",
+ "description": "Haystack Class name for the component.",
+ "type": "string",
+ "const": "TransformersTranslator"
+ },
+ "params": {
+ "title": "Parameters",
+ "type": "object",
+ "properties": {
+ "model_name_or_path": {
+ "title": "Model Name Or Path",
+ "type": "string"
+ },
+ "tokenizer_name": {
+ "title": "Tokenizer Name",
+ "type": "string"
+ },
+ "max_seq_len": {
+ "title": "Max Seq Len",
+ "type": "integer"
+ },
+ "clean_up_tokenization_spaces": {
+ "title": "Clean Up Tokenization Spaces",
+ "default": true,
+ "type": "boolean"
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "model_name_or_path"
+ ],
+ "additionalProperties": false,
+ "description": "Each parameter can reference other components defined in the same YAML file."
+ }
+ },
+ "required": [
+ "type",
+ "name"
+ ],
+ "additionalProperties": false
+ }
+ }
+}
\ No newline at end of file
diff --git a/haystack/modeling/data_handler/data_silo.py b/haystack/modeling/data_handler/data_silo.py
index 1b4aa1fe7..799b702f0 100644
--- a/haystack/modeling/data_handler/data_silo.py
+++ b/haystack/modeling/data_handler/data_silo.py
@@ -18,7 +18,7 @@ from torch.utils.data.sampler import RandomSampler, SequentialSampler
from haystack.modeling.data_handler.dataloader import NamedDataLoader
from haystack.modeling.data_handler.processor import Processor
-from haystack.modeling.logger import MLFlowLogger as MlLogger
+from haystack.utils.experiment_tracking import Tracker as tracker
from haystack.modeling.utils import log_ascii_workers, grouper, calc_chunksize
from haystack.modeling.visual import TRACTOR_SMALL
@@ -497,7 +497,7 @@ class DataSilo:
logger.info("Average passage length after clipping: {}".format(ave_len[1]))
logger.info("Proportion passages clipped: {}".format(clipped[1]))
- MlLogger.log_params(
+ tracker.track_params(
{
"n_samples_train": self.counts["train"],
"n_samples_dev": self.counts["dev"],
diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py
index 8c4ef2979..b44f2e432 100644
--- a/haystack/modeling/data_handler/processor.py
+++ b/haystack/modeling/data_handler/processor.py
@@ -31,7 +31,7 @@ from haystack.modeling.data_handler.samples import (
offset_to_token_idx_vecorized,
)
from haystack.modeling.data_handler.input_features import sample_to_features_text
-from haystack.modeling.logger import MLFlowLogger as MlLogger
+from haystack.utils.experiment_tracking import Tracker as tracker
DOWNSTREAM_TASK_MAP = {
@@ -359,7 +359,7 @@ class Processor(ABC):
for name in names:
value = getattr(self, name)
params.update({name: str(value)})
- MlLogger.log_params(params)
+ tracker.track_params(params)
class SquadProcessor(Processor):
diff --git a/haystack/modeling/evaluation/eval.py b/haystack/modeling/evaluation/eval.py
index 0bd17a9c4..4cdba7409 100644
--- a/haystack/modeling/evaluation/eval.py
+++ b/haystack/modeling/evaluation/eval.py
@@ -8,7 +8,7 @@ from tqdm import tqdm
from haystack.modeling.evaluation.metrics import compute_metrics, compute_report_metrics
from haystack.modeling.model.adaptive_model import AdaptiveModel
-from haystack.modeling.logger import MLFlowLogger as MlLogger
+from haystack.utils.experiment_tracking import Tracker as tracker
from haystack.modeling.visual import BUSH_SEP
@@ -157,11 +157,11 @@ class Evaluator:
for head_num, head in enumerate(results):
logger.info("\n _________ {} _________".format(head["task_name"]))
for metric_name, metric_val in head.items():
- # log with ML framework (e.g. Mlflow)
+                # log with the experiment tracking framework (e.g. Mlflow)
if logging:
if not metric_name in ["preds", "labels"] and not metric_name.startswith("_"):
if isinstance(metric_val, numbers.Number):
- MlLogger.log_metrics(
+ tracker.track_metrics(
metrics={f"{dataset_name}_{metric_name}_{head['task_name']}": metric_val}, step=steps
)
# print via standard python logger
diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py
index fcd304339..85b828b22 100644
--- a/haystack/modeling/infer.py
+++ b/haystack/modeling/infer.py
@@ -21,7 +21,6 @@ from haystack.modeling.utils import (
)
from haystack.modeling.data_handler.inputs import QAInput
from haystack.modeling.model.adaptive_model import AdaptiveModel, BaseAdaptiveModel
-from haystack.modeling.logger import MLFlowLogger
from haystack.modeling.model.predictions import QAPred
@@ -74,8 +73,6 @@ class Inferencer:
:return: An instance of the Inferencer.
"""
- MLFlowLogger.disable()
-
# Init device and distributed settings
self.devices, n_gpu = initialize_device_settings(use_cuda=gpu, multi_gpu=False)
diff --git a/haystack/modeling/logger.py b/haystack/modeling/logger.py
deleted file mode 100644
index e57c8d0f7..000000000
--- a/haystack/modeling/logger.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import logging
-import mlflow
-from requests.exceptions import ConnectionError
-
-
-logger = logging.getLogger(__name__)
-
-
-class BaseMLLogger:
- """
- Base class for tracking experiments.
-
- This class can be extended to implement custom logging backends like MLFlow, Tensorboard, or Sacred.
- """
-
- disable_logging = False
-
- def __init__(self, tracking_uri, **kwargs):
- self.tracking_uri = tracking_uri
-
- def init_experiment(self, tracking_uri):
- raise NotImplementedError()
-
- @classmethod
- def log_metrics(cls, metrics, step):
- raise NotImplementedError()
-
- @classmethod
- def log_artifacts(cls, self):
- raise NotImplementedError()
-
- @classmethod
- def log_params(cls, params):
- raise NotImplementedError()
-
-
-class StdoutLogger(BaseMLLogger):
- """Minimal logger printing metrics and params to stdout.
- Useful for services like AWS SageMaker, where you parse metrics from the actual logs"""
-
- def init_experiment(self, experiment_name, run_name=None, nested=True):
- logger.info(f"\n **** Starting experiment '{experiment_name}' (Run: {run_name}) ****")
-
- @classmethod
- def log_metrics(cls, metrics, step):
- logger.info(f"Logged metrics at step {step}: \n {metrics}")
-
- @classmethod
- def log_params(cls, params):
- logger.info(f"Logged parameters: \n {params}")
-
- @classmethod
- def log_artifacts(cls, dir_path, artifact_path=None):
- raise NotImplementedError
-
- @classmethod
- def end_run(cls):
- logger.info(f"**** End of Experiment **** ")
-
-
-class MLFlowLogger(BaseMLLogger):
- """
- Logger for MLFlow experiment tracking.
- """
-
- def init_experiment(self, experiment_name, run_name=None, nested=True):
- if not self.disable_logging:
- try:
- mlflow.set_tracking_uri(self.tracking_uri)
- mlflow.set_experiment(experiment_name)
- mlflow.start_run(run_name=run_name, nested=nested)
- except ConnectionError:
- raise Exception(
- f"MLFlow cannot connect to the remote server at {self.tracking_uri}.\n"
- f"MLFlow also supports logging runs locally to files. Set the MLFlowLogger "
- f"tracking_uri to an empty string to use that."
- )
-
- @classmethod
- def log_metrics(cls, metrics, step):
- if not cls.disable_logging:
- try:
- mlflow.log_metrics(metrics, step=step)
- except ConnectionError:
- logger.warning(f"ConnectionError in logging metrics to MLFlow.")
- except Exception as e:
- logger.warning(f"Failed to log metrics: {e}")
-
- @classmethod
- def log_params(cls, params):
- if not cls.disable_logging:
- try:
- mlflow.log_params(params)
- except ConnectionError:
- logger.warning("ConnectionError in logging params to MLFlow")
- except Exception as e:
- logger.warning(f"Failed to log params: {e}")
-
- @classmethod
- def log_artifacts(cls, dir_path, artifact_path=None):
- if not cls.disable_logging:
- try:
- mlflow.log_artifacts(dir_path, artifact_path)
- except ConnectionError:
- logger.warning(f"ConnectionError in logging artifacts to MLFlow")
- except Exception as e:
- logger.warning(f"Failed to log artifacts: {e}")
-
- @classmethod
- def end_run(cls):
- if not cls.disable_logging:
- mlflow.end_run()
-
- @classmethod
- def disable(cls):
- logger.info("ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.")
- cls.disable_logging = True
-
-
-class TensorBoardLogger(BaseMLLogger):
- """
- PyTorch TensorBoard Logger
- """
-
- def __init__(self, **kwargs):
- try:
- from tensorboardX import SummaryWriter # pylint: disable=import-error
- except (ImportError, ModuleNotFoundError):
- logger.info(
- "tensorboardX not found, can't initialize TensorBoardLogger. "
- "Enable it with 'pip install tensorboardX'."
- )
-
- TensorBoardLogger.summary_writer = SummaryWriter()
- super().__init__(**kwargs)
-
- @classmethod
- def log_metrics(cls, metrics, step):
- for key, value in metrics.items():
- TensorBoardLogger.summary_writer.add_scalar(tag=key, scalar_value=value, global_step=step)
-
- @classmethod
- def log_params(cls, params):
- for key, value in params.items():
- TensorBoardLogger.summary_writer.add_text(tag=key, text_string=str(value))
diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py
index 9061ed51c..ac126e485 100644
--- a/haystack/modeling/model/adaptive_model.py
+++ b/haystack/modeling/model/adaptive_model.py
@@ -15,7 +15,7 @@ from transformers.convert_graph_to_onnx import convert, quantize as quantize_mod
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import PredictionHead, QuestionAnsweringHead
-from haystack.modeling.logger import MLFlowLogger as MlLogger
+from haystack.utils.experiment_tracking import Tracker as tracker
logger = logging.getLogger(__name__)
@@ -556,7 +556,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
"lm_output_types": ",".join(self.lm_output_types),
}
try:
- MlLogger.log_params(params)
+ tracker.track_params(params)
except Exception as e:
logger.warning(f"ML logging didn't work: {e}")
diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py
index cc157c335..e960fb01d 100644
--- a/haystack/modeling/model/biadaptive_model.py
+++ b/haystack/modeling/model/biadaptive_model.py
@@ -10,7 +10,7 @@ from torch import nn
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import PredictionHead, TextSimilarityHead
-from haystack.modeling.logger import MLFlowLogger as MlLogger
+from haystack.utils.experiment_tracking import Tracker as tracker
logger = logging.getLogger(__name__)
@@ -335,7 +335,7 @@ class BiAdaptiveModel(nn.Module):
"prediction_heads": ",".join([head.__class__.__name__ for head in self.prediction_heads]),
}
try:
- MlLogger.log_params(params)
+ tracker.track_params(params)
except Exception as e:
logger.warning(f"ML logging didn't work: {e}")
diff --git a/haystack/modeling/model/optimization.py b/haystack/modeling/model/optimization.py
index c41d56186..403bca715 100644
--- a/haystack/modeling/model/optimization.py
+++ b/haystack/modeling/model/optimization.py
@@ -10,7 +10,7 @@ from torch.nn import DataParallel
from torch.nn.parallel import DistributedDataParallel
from haystack.modeling.model.adaptive_model import AdaptiveModel
-from haystack.modeling.logger import MLFlowLogger as MlLogger
+from haystack.utils.experiment_tracking import Tracker as tracker
logger = logging.getLogger(__name__)
@@ -161,7 +161,7 @@ def initialize_optimizer(
schedule_opts["num_training_steps"] = num_train_optimization_steps
# Log params
- MlLogger.log_params({"use_amp": use_amp, "num_train_optimization_steps": schedule_opts["num_training_steps"]})
+ tracker.track_params({"use_amp": use_amp, "num_train_optimization_steps": schedule_opts["num_training_steps"]})
# Get optimizer from pytorch, transformers or apex
optimizer = _get_optim(model, optimizer_opts)
@@ -189,8 +189,8 @@ def _get_optim(model, opts: Dict):
# Logging
logger.info(f"Loading optimizer `{optimizer_name}`: '{opts}'")
- MlLogger.log_params(opts)
- MlLogger.log_params({"optimizer_name": optimizer_name})
+ tracker.track_params(opts)
+ tracker.track_params({"optimizer_name": optimizer_name})
weight_decay = opts.pop("weight_decay", None)
no_decay = opts.pop("no_decay", None)
@@ -279,15 +279,15 @@ def get_scheduler(optimizer, opts):
# convert from warmup proportion to steps if required
if "num_warmup_steps" in allowed_args and "num_warmup_steps" not in opts and "warmup_proportion" in opts:
opts["num_warmup_steps"] = int(opts["warmup_proportion"] * opts["num_training_steps"])
- MlLogger.log_params({"warmup_proportion": opts["warmup_proportion"]})
+ tracker.track_params({"warmup_proportion": opts["warmup_proportion"]})
# only pass args that are supported by the constructor
constructor_opts = {k: v for k, v in opts.items() if k in allowed_args}
# Logging
logger.info(f"Loading schedule `{schedule_name}`: '{constructor_opts}'")
- MlLogger.log_params(constructor_opts)
- MlLogger.log_params({"schedule_name": schedule_name})
+ tracker.track_params(constructor_opts)
+ tracker.track_params({"schedule_name": schedule_name})
scheduler = sched_constructor(optimizer, **constructor_opts)
scheduler.opts = opts # save the opts with the scheduler to use in load/save
diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py
index 0de639a02..9d3e8cfe6 100644
--- a/haystack/modeling/model/triadaptive_model.py
+++ b/haystack/modeling/model/triadaptive_model.py
@@ -9,7 +9,7 @@ from torch import nn
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import PredictionHead
-from haystack.modeling.logger import MLFlowLogger as MlLogger
+from haystack.utils.experiment_tracking import Tracker as tracker
logger = logging.getLogger(__name__)
@@ -369,7 +369,7 @@ class TriAdaptiveModel(nn.Module):
"prediction_heads": ",".join([head.__class__.__name__ for head in self.prediction_heads]),
}
try:
- MlLogger.log_params(params)
+ tracker.track_params(params)
except Exception as e:
logger.warning(f"ML logging didn't work: {e}")
diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py
index 9e813fd16..e029e9732 100644
--- a/haystack/modeling/training/base.py
+++ b/haystack/modeling/training/base.py
@@ -19,7 +19,7 @@ from haystack.modeling.evaluation.eval import Evaluator
from haystack.modeling.model.adaptive_model import AdaptiveModel
from haystack.modeling.model.optimization import get_scheduler
from haystack.modeling.utils import GracefulKiller
-from haystack.modeling.logger import MLFlowLogger as MlLogger
+from haystack.utils.experiment_tracking import Tracker as tracker
try:
from apex import amp
@@ -161,7 +161,7 @@ class Trainer:
Useful to achieve larger effective batch sizes that would not fit in GPU memory.
:param local_rank: Local rank of process when distributed training via DDP is used.
:param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
- :param log_learning_rate: Whether to log learning rate to Mlflow
+        :param log_learning_rate: Whether to log the learning rate to the experiment tracker (e.g. Mlflow).
:param log_loss_every: Log current train loss after this many train steps.
:param checkpoint_on_sigterm: save a checkpoint for the Trainer when a SIGTERM signal is sent. The checkpoint
can be used to resume training. It is useful in frameworks like AWS SageMaker with Spot instances where
@@ -377,9 +377,9 @@ class Trainer:
loss = self.adjust_loss(loss)
if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
if self.local_rank in [-1, 0]:
- MlLogger.log_metrics({"Train_loss_total": float(loss.detach().cpu().numpy())}, step=self.global_step)
+ tracker.track_metrics({"Train_loss_total": float(loss.detach().cpu().numpy())}, step=self.global_step)
if self.log_learning_rate:
- MlLogger.log_metrics({"learning_rate": self.lr_schedule.get_last_lr()[0]}, step=self.global_step)
+ tracker.track_metrics({"learning_rate": self.lr_schedule.get_last_lr()[0]}, step=self.global_step)
if self.use_amp:
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
@@ -406,7 +406,7 @@ class Trainer:
def log_params(self):
params = {"epochs": self.epochs, "n_gpu": self.n_gpu, "device": self.device}
- MlLogger.log_params(params)
+ tracker.track_params(params)
@classmethod
def create_or_load_checkpoint(
@@ -700,7 +700,7 @@ class DistillationTrainer(Trainer):
Useful to achieve larger effective batch sizes that would not fit in GPU memory.
:param local_rank: Local rank of process when distributed training via DDP is used.
:param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
- :param log_learning_rate: Whether to log learning rate to Mlflow
+        :param log_learning_rate: Whether to log the learning rate to the experiment tracker (e.g. Mlflow).
:param log_loss_every: Log current train loss after this many train steps.
:param checkpoint_on_sigterm: save a checkpoint for the Trainer when a SIGTERM signal is sent. The checkpoint
can be used to resume training. It is useful in frameworks like AWS SageMaker with Spot instances where
@@ -842,7 +842,7 @@ class TinyBERTDistillationTrainer(Trainer):
Useful to achieve larger effective batch sizes that would not fit in GPU memory.
:param local_rank: Local rank of process when distributed training via DDP is used.
:param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
- :param log_learning_rate: Whether to log learning rate to Mlflow
+        :param log_learning_rate: Whether to log the learning rate to the experiment tracker (e.g. Mlflow).
:param log_loss_every: Log current train loss after this many train steps.
:param checkpoint_on_sigterm: save a checkpoint for the Trainer when a SIGTERM signal is sent. The checkpoint
can be used to resume training. It is useful in frameworks like AWS SageMaker with Spot instances where
diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py
index bf343cfe2..2fc2008a2 100644
--- a/haystack/nodes/retriever/dense.py
+++ b/haystack/nodes/retriever/dense.py
@@ -455,7 +455,7 @@ class DensePassageRetriever(BaseRetriever):
use_amp=use_amp,
)
- # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
+        # 7. Let it grow! Watch the tracked metrics live on your experiment tracker (e.g. Mlflow)
trainer.train()
self.model.save(Path(save_dir), lm1_name=query_encoder_save_dir, lm2_name=passage_encoder_save_dir)
@@ -985,7 +985,7 @@ class TableTextRetriever(BaseRetriever):
use_amp=use_amp,
)
- # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
+        # 7. Let it grow! Watch the tracked metrics live on your experiment tracker (e.g. Mlflow)
trainer.train()
self.model.save(
diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py
index 70963bbac..a60206bfb 100644
--- a/haystack/pipelines/base.py
+++ b/haystack/pipelines/base.py
@@ -1,6 +1,11 @@
from __future__ import annotations
from typing import Dict, List, Optional, Any, Set, Tuple, Union
+try:
+ from typing import Literal
+except ImportError:
+ from typing_extensions import Literal # type: ignore
+
import copy
import json
import inspect
@@ -46,6 +51,7 @@ from haystack.nodes.base import BaseComponent
from haystack.nodes.retriever.base import BaseRetriever
from haystack.document_stores.base import BaseDocumentStore
from haystack.telemetry import send_event
+from haystack.utils.experiment_tracking import MLflowTrackingHead, Tracker as tracker
logger = logging.getLogger(__name__)
@@ -53,6 +59,7 @@ logger = logging.getLogger(__name__)
ROOT_NODE_TO_PIPELINE_NAME = {"query": "query", "file": "indexing"}
CODE_GEN_DEFAULT_COMMENT = "This code has been generated."
+TRACKING_TOOL_TO_HEAD = {"mlflow": MLflowTrackingHead}
class RootNode(BaseComponent):
@@ -770,6 +777,201 @@ class Pipeline(BasePipeline):
ndcg, map_, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
return ndcg, map_, recall, precision
+ @classmethod
+ def execute_eval_run(
+ cls,
+ index_pipeline: Pipeline,
+ query_pipeline: Pipeline,
+ evaluation_set_labels: List[MultiLabel],
+ corpus_file_paths: List[str],
+ experiment_name: str,
+ experiment_run_name: str,
+ experiment_tracking_tool: Literal["mlflow", None] = None,
+ experiment_tracking_uri: Optional[str] = None,
+ corpus_file_metas: List[Dict[str, Any]] = None,
+ corpus_meta: Dict[str, Any] = {},
+ evaluation_set_meta: Dict[str, Any] = {},
+ pipeline_meta: Dict[str, Any] = {},
+ index_params: dict = {},
+ query_params: dict = {},
+ sas_model_name_or_path: str = None,
+ sas_batch_size: int = 32,
+ sas_use_gpu: bool = True,
+ add_isolated_node_eval: bool = False,
+ reuse_index: bool = False,
+ ) -> EvaluationResult:
+ """
+ Starts an experiment run that first indexes the specified files (forming a corpus) using the index pipeline
+ and subsequently evaluates the query pipeline on the provided labels (forming an evaluation set) using pipeline.eval().
+ Parameters and results (metrics and predictions) of the run are tracked by an experiment tracking tool for further analysis.
+ You can specify the experiment tracking tool by setting the params `experiment_tracking_tool` and `experiment_tracking_uri`
+ or by passing a (custom) tracking head to Tracker.set_tracking_head().
+        Note that `experiment_tracking_tool` currently supports only `mlflow` (a minimal sketch of setting a custom tracking head follows the usage example below).
+
+        For easier comparison, you can pass additional metadata about the corpus (corpus_meta), the evaluation set (evaluation_set_meta), and the pipelines (pipeline_meta).
+        For example, you can give them names or IDs to identify them across experiment runs.
+
+        This method executes an experiment run. Each experiment run is part of at least one experiment.
+        An experiment typically consists of multiple runs to be compared (e.g. using different retrievers in the query pipeline).
+        Experiment tracking tools usually share these concepts of experiments and runs and provide additional functionality to easily compare runs across experiments.
+
+        For example, you can call execute_eval_run() multiple times with different retrievers in your query pipeline and compare the runs in mlflow:
+
+ ```python
+ | for retriever_type, query_pipeline in zip(["sparse", "dpr", "embedding"], [sparse_pipe, dpr_pipe, embedding_pipe]):
+ | eval_result = Pipeline.execute_eval_run(
+ | index_pipeline=index_pipeline,
+ | query_pipeline=query_pipeline,
+ | evaluation_set_labels=labels,
+ | corpus_file_paths=file_paths,
+ | corpus_file_metas=file_metas,
+ | experiment_tracking_tool="mlflow",
+ | experiment_tracking_uri="http://localhost:5000",
+ | experiment_name="my-retriever-experiment",
+ | experiment_run_name=f"run_{retriever_type}",
+ | pipeline_meta={"name": f"my-pipeline-{retriever_type}"},
+ | evaluation_set_meta={"name": "my-evalset"},
+        |     corpus_meta={"name": "my-corpus"},
+ | reuse_index=False
+ | )
+ ```
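+
+        If you prefer to configure tracking yourself instead of passing `experiment_tracking_tool` and `experiment_tracking_uri`, a minimal sketch (assuming an MLflow server reachable at the URI below) could look like this:
+
+        ```python
+        |   from haystack.utils.experiment_tracking import MLflowTrackingHead, Tracker as tracker
+        |
+        |   # Register the tracking head once; execute_eval_run() then reuses it if
+        |   # experiment_tracking_tool and experiment_tracking_uri are left unset.
+        |   tracker.set_tracking_head(MLflowTrackingHead(tracking_uri="http://localhost:5000"))
+        ```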
+
+ :param index_pipeline: The indexing pipeline to use.
+ :param query_pipeline: The query pipeline to evaluate.
+        :param evaluation_set_labels: The labels to evaluate on, forming an evaluation set.
+        :param corpus_file_paths: The files to be indexed and searched during evaluation, forming a corpus.
+        :param experiment_name: The name of the experiment.
+        :param experiment_run_name: The name of the experiment run.
+        :param experiment_tracking_tool: The experiment tracking tool to be used. Currently, only "mlflow" is supported.
+                                         If left unset, the current TrackingHead specified by Tracker.set_tracking_head() is used.
+        :param experiment_tracking_uri: The URI of the experiment tracking server to be used. Must be specified if experiment_tracking_tool is set.
+                                        You can use deepset's public mlflow server via https://public-mlflow.deepset.ai/.
+                                        Note that artifact logging (e.g. pipeline YAML or evaluation result CSVs) is currently not allowed on deepset's public mlflow server, as this might expose sensitive data.
+ :param corpus_file_metas: The optional metadata to be stored for each corpus file (e.g. title).
+ :param corpus_meta: Metadata about the corpus to track (e.g. name, date, author, version).
+        :param evaluation_set_meta: Metadata about the evaluation set to track (e.g. name, date, author, version).
+ :param pipeline_meta: Metadata about the pipelines to track (e.g. name, author, version).
+ :param index_params: The params to use during indexing (see pipeline.run's params).
+ :param query_params: The params to use during querying (see pipeline.run's params).
+        :param sas_model_name_or_path: Name or path of a "Semantic Answer Similarity (SAS)" model. When set, the model is used to calculate the similarity between predictions and labels and to generate the SAS metric.
+ The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
+ Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
+ More info in the paper: https://arxiv.org/abs/2108.06130
+ Models:
+ - You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data.
+ Not all cross encoders can be used because of different return types.
+ If you use custom cross encoders please make sure they work with the sentence_transformers.CrossEncoder class.
+ - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+ - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
+ - Large model for German only: "deepset/gbert-large-sts"
+ :param sas_batch_size: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS.
+ :param sas_use_gpu: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+ Falls back to CPU if no GPU is available.
+ :param add_isolated_node_eval: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
+ This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
+ If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance.
+ If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance.
+ The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node.
+ To this end, labels are used as input to the node instead of the output of the previous node in the pipeline.
+ The generated dataframes in the EvaluationResult then contain additional rows, which can be distinguished from the integrated evaluation results based on the
+ values "integrated" or "isolated" in the column "eval_mode". The evaluation report then additionally lists the upper bound of each node's evaluation metrics.
+ :param reuse_index: Whether to reuse an existing non-empty index and to keep the index after evaluation.
+ If True, the index will be kept after evaluation and no indexing will take place if the index already contains documents. Otherwise, it will be deleted immediately afterwards.
+ Defaults to False.
+ """
+ if experiment_tracking_tool is not None:
+ tracking_head_cls = TRACKING_TOOL_TO_HEAD.get(experiment_tracking_tool, None)
+ if tracking_head_cls is None:
+ raise HaystackError(
+ f"Please specify a valid experiment_tracking_tool. Possible values are: {TRACKING_TOOL_TO_HEAD.keys()}"
+ )
+ if experiment_tracking_uri is None:
+ raise HaystackError("experiment_tracking_uri must be specified if experiment_tracking_tool is set.")
+ tracking_head = tracking_head_cls(tracking_uri=experiment_tracking_uri)
+ tracker.set_tracking_head(tracking_head)
+
+ try:
+ tracker.init_experiment(
+ experiment_name=experiment_name, run_name=experiment_run_name, tags={experiment_name: "True"}
+ )
+ tracker.track_params(
+ {
+ "dataset_label_count": len(evaluation_set_labels),
+ "dataset": evaluation_set_meta,
+ "sas_model_name_or_path": sas_model_name_or_path,
+ "sas_batch_size": sas_batch_size,
+ "sas_use_gpu": sas_use_gpu,
+ "pipeline_index_params": index_params,
+ "pipeline_query_params": query_params,
+ "pipeline": pipeline_meta,
+ "corpus_file_count": len(corpus_file_paths),
+ "corpus": corpus_meta,
+ "type": "offline/evaluation",
+ }
+ )
+
+ # check index before eval
+ document_store = index_pipeline.get_document_store()
+ if document_store is None:
+ raise HaystackError("Document store not found. Please provide pipelines with a proper document store.")
+ document_count = document_store.get_document_count()
+
+ if document_count > 0:
+ if not reuse_index:
+ raise HaystackError(f"Index '{document_store.index}' is not empty. Please provide an empty index.")
+ else:
+ logger.info(f"indexing {len(corpus_file_paths)} documents...")
+ index_pipeline.run(file_paths=corpus_file_paths, meta=corpus_file_metas, params=index_params)
+ document_count = document_store.get_document_count()
+ logger.info(f"indexing {len(evaluation_set_labels)} files to {document_count} documents finished.")
+
+ tracker.track_params({"pipeline_index_document_count": document_count})
+
+ eval_result = query_pipeline.eval(
+ labels=evaluation_set_labels,
+ params=query_params,
+ sas_model_name_or_path=sas_model_name_or_path,
+ sas_batch_size=sas_batch_size,
+ sas_use_gpu=sas_use_gpu,
+ add_isolated_node_eval=add_isolated_node_eval,
+ )
+
+ integrated_metrics = eval_result.calculate_metrics()
+ integrated_top_1_metrics = eval_result.calculate_metrics(simulated_top_k_reader=1)
+ metrics = {"integrated": integrated_metrics, "integrated_top_1": integrated_top_1_metrics}
+ if add_isolated_node_eval:
+ isolated_metrics = eval_result.calculate_metrics(eval_mode="isolated")
+ isolated_top_1_metrics = eval_result.calculate_metrics(eval_mode="isolated", simulated_top_k_reader=1)
+ metrics["isolated"] = isolated_metrics
+ metrics["isolated_top_1"] = isolated_top_1_metrics
+ tracker.track_metrics(metrics, step=0)
+
+ with tempfile.TemporaryDirectory() as temp_dir:
+ eval_result_dir = Path(temp_dir) / "eval_result"
+ eval_result_dir.mkdir(exist_ok=True)
+ eval_result.save(out_dir=eval_result_dir)
+ tracker.track_artifacts(eval_result_dir, artifact_path="eval_result")
+ with open(Path(temp_dir) / "pipelines.yaml", "w") as outfile:
+ index_config = index_pipeline.get_config()
+ query_config = query_pipeline.get_config()
+ components = list(
+ {c["name"]: c for c in (index_config["components"] + query_config["components"])}.values()
+ )
+ pipelines = index_config["pipelines"] + query_config["pipelines"]
+ config = {"version": index_config["version"], "components": components, "pipelines": pipelines}
+ yaml.dump(config, outfile, default_flow_style=False)
+ tracker.track_artifacts(temp_dir)
+
+ # Clean up document store
+ if not reuse_index and document_store.index is not None:
+ logger.info(f"Cleaning up: deleting index '{document_store.index}'...")
+ document_store.delete_index(document_store.index)
+
+ finally:
+ tracker.end_run()
+
+ return eval_result
+
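For orientation (illustrative, not part of the patch), the returned `EvaluationResult` can be inspected or persisted with the same calls used inside `execute_eval_run()`:

```python
# Assuming `eval_result` was returned by Pipeline.execute_eval_run(...) as above.
metrics = eval_result.calculate_metrics()  # integrated metrics per node
isolated_metrics = eval_result.calculate_metrics(eval_mode="isolated")  # only if add_isolated_node_eval=True
eval_result.save(out_dir="my_eval_result")  # hypothetical output directory
```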
@send_event
def eval(
self,
diff --git a/haystack/telemetry.py b/haystack/telemetry.py
index 75661e13f..801a26924 100644
--- a/haystack/telemetry.py
+++ b/haystack/telemetry.py
@@ -5,34 +5,26 @@
You can opt-out of sharing usage statistics by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page.
You can log all events to the local file specified in LOG_PATH for inspection by setting the environment variable HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED to "True".
"""
-from typing import List, Dict, Any, Optional
-
import os
-import sys
+from typing import Any, Dict, List, Optional
import uuid
import logging
-import platform
from enum import Enum
from functools import wraps
from pathlib import Path
import yaml
-import torch
import posthog
-import transformers
-from haystack import __version__
+from haystack.environment import HAYSTACK_EXECUTION_CONTEXT, get_or_create_env_meta_data
posthog.api_key = "phc_F5v11iI2YHkoP6Er3cPILWSrLhY3D6UY4dEMga4eoaa"
posthog.host = "https://tm.hs.deepset.ai"
HAYSTACK_TELEMETRY_ENABLED = "HAYSTACK_TELEMETRY_ENABLED"
HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED = "HAYSTACK_TELEMETRY_LOGGING_TO_FILE_ENABLED"
-HAYSTACK_EXECUTION_CONTEXT = "HAYSTACK_EXECUTION_CONTEXT"
-HAYSTACK_DOCKER_CONTAINER = "HAYSTACK_DOCKER_CONTAINER"
CONFIG_PATH = Path("~/.haystack/config.yaml").expanduser()
LOG_PATH = Path("~/.haystack/telemetry.log").expanduser()
-telemetry_meta_data: Dict[str, Any] = {}
user_id: Optional[str] = None
logger = logging.getLogger(__name__)
@@ -49,7 +41,7 @@ def print_telemetry_report():
"""
if is_telemetry_enabled():
user_id = _get_or_create_user_id()
- meta_data = _get_or_create_telemetry_meta_data()
+ meta_data = get_or_create_env_meta_data()
print({**{"user_id": user_id}, **meta_data})
else:
print("Telemetry is disabled.")
@@ -152,7 +144,7 @@ def send_custom_event(event: str = "", payload: Dict[str, Any] = {}):
:param payload: A dictionary containing event meta data, e.g., parameter settings
"""
- event_properties = {**(NonPrivateParameters.apply_filter(payload)), **_get_or_create_telemetry_meta_data()}
+ event_properties = {**(NonPrivateParameters.apply_filter(payload)), **get_or_create_env_meta_data()}
if user_id is None:
raise RuntimeError("User id was not initialized")
try:
@@ -224,53 +216,6 @@ def _get_or_create_user_id() -> str:
return user_id
-def _get_or_create_telemetry_meta_data() -> Dict[str, Any]:
- """
- Collects meta data about the setup that is used with Haystack, such as: operating system, python version, Haystack version, transformers version, pytorch version, number of GPUs, execution environment, and the value stored in the env variable HAYSTACK_EXECUTION_CONTEXT.
- """
- global telemetry_meta_data # pylint: disable=global-statement
- if not telemetry_meta_data:
- telemetry_meta_data = {
- "os_version": platform.release(),
- "os_family": platform.system(),
- "os_machine": platform.machine(),
- "python_version": platform.python_version(),
- "haystack_version": __version__,
- "transformers_version": transformers.__version__,
- "torch_version": torch.__version__,
- "torch_cuda_version": torch.version.cuda if torch.cuda.is_available() else 0,
- "n_gpu": torch.cuda.device_count() if torch.cuda.is_available() else 0,
- "n_cpu": os.cpu_count(),
- "context": os.environ.get(HAYSTACK_EXECUTION_CONTEXT),
- "execution_env": _get_execution_environment(),
- }
- return telemetry_meta_data
-
-
-def _get_execution_environment():
- """
- Identifies the execution environment that Haystack is running in.
- Options are: colab notebook, kubernetes, CPU/GPU docker container, test environment, jupyter notebook, python script
- """
- if os.environ.get("CI", "False").lower() == "true":
- execution_env = "ci"
- elif "google.colab" in sys.modules:
- execution_env = "colab"
- elif "KUBERNETES_SERVICE_HOST" in os.environ:
- execution_env = "kubernetes"
- elif HAYSTACK_DOCKER_CONTAINER in os.environ:
- execution_env = os.environ.get(HAYSTACK_DOCKER_CONTAINER)
- # check if pytest is imported
- elif "pytest" in sys.modules:
- execution_env = "test"
- else:
- try:
- execution_env = get_ipython().__class__.__name__ # pylint: disable=undefined-variable
- except NameError:
- execution_env = "script"
- return execution_env
-
-
def _read_telemetry_config():
"""
Loads the config from the file specified in CONFIG_PATH
diff --git a/haystack/utils/__init__.py b/haystack/utils/__init__.py
index f04cd1336..8f0e8572c 100644
--- a/haystack/utils/__init__.py
+++ b/haystack/utils/__init__.py
@@ -19,3 +19,10 @@ from haystack.utils.export_utils import (
)
from haystack.utils.squad_data import SquadData
from haystack.utils.context_matching import calculate_context_similarity, match_context, match_contexts
+from haystack.utils.experiment_tracking import (
+ Tracker,
+ NoTrackingHead,
+ BaseTrackingHead,
+ MLflowTrackingHead,
+ StdoutTrackingHead,
+)
diff --git a/haystack/utils/experiment_tracking.py b/haystack/utils/experiment_tracking.py
new file mode 100644
index 000000000..c16c0538d
--- /dev/null
+++ b/haystack/utils/experiment_tracking.py
@@ -0,0 +1,188 @@
+from abc import ABC, abstractmethod
+import logging
+from pathlib import Path
+from typing import Any, Dict, Union
+import mlflow
+from requests.exceptions import ConnectionError
+
+from haystack.environment import get_or_create_env_meta_data
+
+
+logger = logging.getLogger(__name__)
+
+
+def flatten_dict(dict_to_flatten: dict, prefix: str = ""):
+ flat_dict = {}
+ for k, v in dict_to_flatten.items():
+ if isinstance(v, dict):
+ flat_dict.update(flatten_dict(v, prefix + k + "_"))
+ else:
+ flat_dict[prefix + k] = v
+ return flat_dict
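For illustration (not part of the patch), `flatten_dict` turns nested metric and param dicts into flat, `_`-joined keys, which is the shape MLflow's logging APIs expect:

```python
# Illustrative only: the sample data is made up.
nested = {"integrated": {"Retriever": {"recall_single_hit": 0.9}}}
assert flatten_dict(nested) == {"integrated_Retriever_recall_single_hit": 0.9}
```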
+
+
+class BaseTrackingHead(ABC):
+ """
+ Base class for tracking experiments.
+
+ This class can be extended to implement custom logging backends like MLflow, WandB, or TensorBoard.
+ """
+
+ @abstractmethod
+ def init_experiment(
+ self, experiment_name: str, run_name: str = None, tags: Dict[str, Any] = None, nested: bool = False
+ ):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def track_metrics(self, metrics: Dict[str, Any], step: int):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def track_params(self, params: Dict[str, Any]):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def end_run(self):
+ raise NotImplementedError()
+
+
+class NoTrackingHead(BaseTrackingHead):
+ """
+ Null object implementation of a tracking head: it does nothing.
+ """
+
+ def init_experiment(
+ self, experiment_name: str, run_name: str = None, tags: Dict[str, Any] = None, nested: bool = False
+ ):
+ pass
+
+ def track_metrics(self, metrics: Dict[str, Any], step: int):
+ pass
+
+ def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None):
+ pass
+
+ def track_params(self, params: Dict[str, Any]):
+ pass
+
+ def end_run(self):
+ pass
+
+
+class Tracker:
+ """
+ Facade for tracking experiments.
+ """
+
+ tracker: BaseTrackingHead = NoTrackingHead()
+
+ @classmethod
+ def init_experiment(
+ cls, experiment_name: str, run_name: str = None, tags: Dict[str, Any] = None, nested: bool = False
+ ):
+ cls.tracker.init_experiment(experiment_name=experiment_name, run_name=run_name, tags=tags, nested=nested)
+
+ @classmethod
+ def track_metrics(cls, metrics: Dict[str, Any], step: int):
+ cls.tracker.track_metrics(metrics=metrics, step=step)
+
+ @classmethod
+ def track_artifacts(cls, dir_path: Union[str, Path], artifact_path: str = None):
+ cls.tracker.track_artifacts(dir_path=dir_path, artifact_path=artifact_path)
+
+ @classmethod
+ def track_params(cls, params: Dict[str, Any]):
+ cls.tracker.track_params(params=params)
+
+ @classmethod
+ def end_run(cls):
+ cls.tracker.end_run()
+
+ @classmethod
+ def set_tracking_head(cls, tracker: BaseTrackingHead):
+ cls.tracker = tracker
+
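A minimal usage sketch of the `Tracker` facade (illustrative only; the URI, names, and values below are made up):

```python
from haystack.utils import MLflowTrackingHead, Tracker

# Send all tracking calls to an MLflow server (hypothetical local URI).
Tracker.set_tracking_head(MLflowTrackingHead(tracking_uri="http://localhost:5000"))

Tracker.init_experiment(experiment_name="my-experiment", run_name="run-1")
Tracker.track_params({"retriever": "dpr"})
Tracker.track_metrics({"recall": 0.85}, step=0)
Tracker.end_run()
```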
+
+class StdoutTrackingHead(BaseTrackingHead):
+ """
+ Experiment tracking head that prints metrics and params to stdout.
+ Useful for services like AWS SageMaker, where you parse metrics from the actual logs.
+ """
+
+ def init_experiment(
+ self, experiment_name: str, run_name: str = None, tags: Dict[str, Any] = None, nested: bool = False
+ ):
+ logger.info(f"\n **** Starting experiment '{experiment_name}' (Run: {run_name}) ****")
+
+ def track_metrics(self, metrics: Dict[str, Any], step: int):
+ logger.info(f"Logged metrics at step {step}: \n {metrics}")
+
+ def track_params(self, params: Dict[str, Any]):
+ logger.info(f"Logged parameters: \n {params}")
+
+ def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None):
+ logger.warning(f"Cannot log artifacts with StdoutLogger: \n {dir_path}")
+
+ def end_run(self):
+ logger.info(f"**** End of Experiment **** ")
+
+
+class MLflowTrackingHead(BaseTrackingHead):
+ def __init__(self, tracking_uri: str, auto_track_environment: bool = True) -> None:
+ """
+ Experiment tracking head for MLflow.
+ """
+ super().__init__()
+ self.tracking_uri = tracking_uri
+ self.auto_track_environment = auto_track_environment
+
+ def init_experiment(
+ self, experiment_name: str, run_name: str = None, tags: Dict[str, Any] = None, nested: bool = False
+ ):
+ try:
+ mlflow.set_tracking_uri(self.tracking_uri)
+ mlflow.set_experiment(experiment_name)
+ mlflow.start_run(run_name=run_name, nested=nested, tags=tags)
+ logger.info(f"Tracking run {run_name} of experiment {experiment_name} by mlflow under {self.tracking_uri}")
+ if self.auto_track_environment:
+ mlflow.log_params(flatten_dict({"environment": get_or_create_env_meta_data()}))
+ except ConnectionError:
+ raise Exception(
+ f"MLflow cannot connect to the remote server at {self.tracking_uri}.\n"
+ f"MLflow also supports logging runs locally to files. Set the MLflowTrackingHead "
+ f"tracking_uri to an empty string to use that."
+ )
+
+ def track_metrics(self, metrics: Dict[str, Any], step: int):
+ try:
+ metrics = flatten_dict(metrics)
+ mlflow.log_metrics(metrics, step=step)
+ except ConnectionError:
+ logger.warning(f"ConnectionError in logging metrics to MLflow.")
+ except Exception as e:
+ logger.warning(f"Failed to log metrics: {e}")
+
+ def track_params(self, params: Dict[str, Any]):
+ try:
+ params = flatten_dict(params)
+ mlflow.log_params(params)
+ except ConnectionError:
+ logger.warning("ConnectionError in logging params to MLflow")
+ except Exception as e:
+ logger.warning(f"Failed to log params: {e}")
+
+ def track_artifacts(self, dir_path: Union[str, Path], artifact_path: str = None):
+ try:
+ mlflow.log_artifacts(dir_path, artifact_path)
+ except ConnectionError:
+ logger.warning(f"ConnectionError in logging artifacts to MLflow")
+ except Exception as e:
+ logger.warning(f"Failed to log artifacts: {e}")
+
+ def end_run(self):
+ mlflow.end_run()
diff --git a/setup.cfg b/setup.cfg
index 6e6d0e01e..b8a03ad33 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -91,7 +91,7 @@ install_requires =
# Metrics and logging
seqeval
- mlflow<=1.13.1
+ mlflow
# Elasticsearch
elasticsearch>=7.7,<=7.10