From 11cf94a9652a577732941f27ad59eb7c8bc5063e Mon Sep 17 00:00:00 2001
From: Sara Zan
Date: Tue, 15 Mar 2022 11:17:26 +0100
Subject: [PATCH] Pipeline's YAML: syntax validation (#2226)

* Add BasePipeline.validate_config, BasePipeline.validate_yaml, and some new custom exception classes
* Make error composition work properly
* Clarify typing
* Help mypy a bit more
* Update Documentation & Code Style
* Enable autogenerated docs for Milvus1 and 2 separately
* Revert "Enable autogenerated docs for Milvus1 and 2 separately"
  This reverts commit 282be4a78a6e95862a9b4c924fc3dea5ca71e28d.
* Update Documentation & Code Style
* Re-enable 'additionalProperties: False'
* Add pipeline.type to JSON Schema, was somehow forgotten
* Disable additionalProperties on the pipeline properties too
* Fix json-schemas for 1.1.0 and 1.2.0 (should not do it again in the future)
* Call super in PipelineValidationError
* Improve _read_pipeline_config_from_yaml's error handling
* Fix generate_json_schema.py to include document stores
* Fix json schemas (retro-fix 1.1.0 again)
* Improve custom error printing, add link to docs
* Add function in BaseComponent to list its subclasses in a module
* Make some document stores base classes abstract
* Add marker 'integration' in pytest flags
* Slightly improve validation of pipelines at load
* Add tests for YAML loading and validation
* Make custom_query Optional for validation issues
* Fix bug in _read_pipeline_config_from_yaml
* Improve error handling in BasePipeline and Pipeline and add DAG check
* Move json schema generation into haystack/nodes/_json_schema.py (useful for tests)
* Simplify errors slightly
* Add some YAML validation tests
* Remove load_from_config from BasePipeline, it was never used anyway
* Improve tests
* Include json-schemas in package
* Fix conftest imports
* Make BasePipeline abstract
* Improve mocking by making the test independent from the YAML version
* Add exportable_to_yaml decorator to forget about set_config on mock nodes
* Fix mypy errors
* Comment out one monkeypatch
* Fix typing again
* Improve error message for validation
* Add required properties to pipelines
* Fix YAML version for REST API YAMLs to 1.2.0
* Fix load_from_yaml call in load_from_deepset_cloud
* Fix HaystackError.__getattr__
* Add super().__init__() in most nodes and document stores, comment out set_config
* Remove type from REST API pipelines
* Remove useless init from doc2answers
* Call super in Seq2SeqGenerator
* Fix typo in deepsetcloud.py
* Fix REST API indexing error mismatch and mock version of JSON schema in all tests
* Working on pipeline tests
* Improve error printing slightly
* Add back test_pipeline.yaml
* _json_schema.py supports different versions with identical schemas
* Add type to 0.7 schema for backwards compatibility
* Fix small bug in _json_schema.py
* Try alternative to generate json schemas on the CI
* Update Documentation & Code Style
* Make Linux CI match autoformat CI
* Fix super-init-not-called
* Remove accidentally committed file
* Update Documentation & Code Style
* Fix test_summarizer_translation.py's import
* Mock YAML in a few suites, split and simplify test_pipeline_debug_and_validation.py::test_invalid_run_args
* Fix json schema for Ray tests too
* Update Documentation & Code Style
* Reintroduce validation
* Use unstable version in tests and REST API
* Make unstable support the latest versions
* Update Documentation & Code Style
* Remove needless fixture
* Make type in pipeline optional in the strings validation
* Fix schemas
* Fix string validation for pipeline type
* Improve validate_config_strings
* Remove type from test pipelines
* Update Documentation & Code Style
* Fix test_pipeline
* Remove more type from pipelines
* Temporary CI patch
* Fix issue with exportable_to_yaml never invoking the wrapped init
* Remove stray file
* Pipeline tests are green again
* Linux CI now needs .[all] to generate the schema
* Bugfixes, pipeline tests seem to be green
* Fix typo in version after merge
* Implement missing methods in Weaviate
* Try to prevent FAISS tests from running in the Milvus1 test suite
* Fix some stray test paths and FAISS index dumping
* Fix pytest markers list
* Temporarily disable cache to be able to see test failures
* Fix pyproject.toml syntax
* Use only tmp_path
* Fix preprocessor signature after merge
* Fix FAISS bug
* Fix Ray test
* Fix documentation issue by removing quotes from faiss type
* Update Documentation & Code Style
* Use document properly in preprocessor tests
* Update Documentation & Code Style
* Make preprocessor capable of handling documents
* Import document
* Revert support for documents in preprocessor, do later
* Fix bug in _json_schema.py that was breaking validation
* Re-enable cache
* Update Documentation & Code Style
* Simplify calling _json_schema.py from the CI
* Remove redundant ABC inheritance
* Ensure exportable_to_yaml works only on implementations
* Rename subclass to class_ in Meta
* Make run() and get_config() abstract in BasePipeline
* Revert unintended change in preprocessor
* Move outgoing_edges_input_node check inside try block
* Rename VALID_CODE_GEN_INPUT_REGEX into VALID_INPUT_REGEX
* Add check for a RecursionError on validate_config_strings
* Address usages of _pipeline_config in data silo and elasticsearch
* Rename _pipeline_config into _init_parameters
* Fix pytest marker and remove unused imports
* Remove most redundant ABCs
* Rename _init_parameters into _component_configuration
* Remove set_config and type from _component_configuration's dict
* Remove last instances of set_config and replace with super().__init__()
* Implement __init_subclass__ approach
* Simplify checks on the existence of _component_configuration
* Fix FAISS issue
* Dynamic generation of node schemas & weed out old schemas
* Add debatable test
* Add docstring to debatable test
* Implement positive diff between schemas
* Improve diff printing
* Rename REST API YAML files to trigger IDE validation
* Fix typing issues
* Fix more typing
* Fix typo in YAML filename
* Remove needless type:ignore
* Add tests
* Fix tests & validation feedback for accessory classes in custom nodes
* Refactor RAGeneratorType out
* Fix broken import in conftest
* Improve source error handling
* Remove unused import in test_eval.py breaking tests
* Fix tests matching the changed error messages too
* Normalize generate_openapi_specs.py and generate_json_schema.py in the actions
* Fix path to generate_openapi_specs.py in autoformat.yml
* Update Documentation & Code Style
* Add test for FAISSDocumentStore-like situations (superclass with init params)
* Update Documentation & Code Style
* Fix indentation
* Remove commented set_config
* Store model_name_or_path in FARMReader to use in DistillationDataSilo
* Rename _component_configuration into _component_config
* Update Documentation & Code Style
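In practice, this PR makes YAML loading fail fast with actionable errors. As a rough
illustration of the resulting behavior (a sketch based on the APIs added in this diff;
the YAML filename below is hypothetical, not a file from this repository):

```python
# Loading a pipeline YAML now validates it against the autogenerated JSON schema
# before any component is instantiated; violations raise the new error types.
from haystack.pipelines import Pipeline
from haystack.errors import PipelineConfigError, PipelineSchemaError

try:
    pipeline = Pipeline.load_from_yaml("my_pipelines.haystack-pipeline.yml")
except (PipelineConfigError, PipelineSchemaError) as error:
    print(error)  # custom errors now append a link to the relevant docs page
```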
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 .github/utils/generate_json_schema.py   | 307 +-
 .github/utils/generate_openapi_specs.py |  32 +
 .github/workflows/autoformat.yml        |   9 +-
 .github/workflows/linux_ci.yml          |   9 +-
docs/_src/api/api/document_store.md | 50 +- docs/_src/api/api/generator.md | 4 +- docs/_src/api/api/pipelines.md | 63 +- docs/_src/api/api/preprocessor.md | 1 + docs/_src/api/api/retriever.md | 2 +- .../api/openapi/generate_openapi_specs.py | 31 - haystack/__init__.py | 2 +- haystack/document_stores/deepsetcloud.py | 9 +- haystack/document_stores/elasticsearch.py | 42 +- haystack/document_stores/faiss.py | 35 +- haystack/document_stores/graphdb.py | 3 +- haystack/document_stores/memory.py | 12 +- haystack/document_stores/milvus1.py | 20 +- haystack/document_stores/milvus2.py | 23 +- haystack/document_stores/sql.py | 9 +- haystack/document_stores/weaviate.py | 53 +- haystack/errors.py | 67 +- haystack/modeling/data_handler/data_silo.py | 2 +- haystack/nodes/_json_schema.py | 486 ++++ .../nodes/answer_generator/transformers.py | 29 +- haystack/nodes/base.py | 87 +- haystack/nodes/connector/crawler.py | 2 + .../nodes/document_classifier/transformers.py | 14 +- haystack/nodes/evaluator/evaluator.py | 2 + haystack/nodes/extractor/entity.py | 2 +- haystack/nodes/file_classifier/file_type.py | 3 +- haystack/nodes/file_converter/azure.py | 14 +- haystack/nodes/file_converter/base.py | 4 +- haystack/nodes/file_converter/image.py | 4 +- haystack/nodes/file_converter/parsr.py | 13 +- haystack/nodes/file_converter/pdf.py | 5 +- haystack/nodes/file_converter/tika.py | 4 +- haystack/nodes/other/docs2answers.py | 3 - haystack/nodes/other/join_answers.py | 3 +- haystack/nodes/other/join_docs.py | 3 +- haystack/nodes/other/route_documents.py | 3 +- haystack/nodes/preprocessor/base.py | 4 + haystack/nodes/preprocessor/preprocessor.py | 17 +- haystack/nodes/query_classifier/sklearn.py | 3 +- .../nodes/query_classifier/transformers.py | 4 +- .../question_generator/question_generator.py | 12 +- .../nodes/ranker/sentence_transformers.py | 4 +- haystack/nodes/reader/farm.py | 25 +- haystack/nodes/reader/table.py | 25 +- haystack/nodes/reader/transformers.py | 14 +- haystack/nodes/retriever/base.py | 5 + haystack/nodes/retriever/dense.py | 56 +- haystack/nodes/retriever/sparse.py | 8 +- haystack/nodes/retriever/text2sparql.py | 3 +- haystack/nodes/summarizer/transformers.py | 13 +- haystack/nodes/translator/transformers.py | 9 +- haystack/pipelines/base.py | 209 +- haystack/pipelines/config.py | 154 +- ...on => haystack-pipeline-1.0.0.schema.json} | 1092 ++++++- .../haystack-pipeline-1.2.0.schema.json | 2590 ----------------- .../haystack-pipeline-1.2.1rc0.schema.json | 989 ++++++- ...=> haystack-pipeline-unstable.schema.json} | 1096 ++++++- json-schemas/haystack-pipeline.schema.json | 118 +- pyproject.toml | 22 +- rest_api/config.py | 2 +- rest_api/controller/file_upload.py | 7 +- ...l => pipeline_empty.haystack-pipeline.yml} | 5 +- ...s.yaml => pipelines.haystack-pipeline.yml} | 4 +- ...ml => pipelines_dpr.haystack-pipeline.yml} | 4 +- .../test_pipeline.haystack-pipeline.yml | 47 + .../test/samples/pipeline/test_pipeline.yaml | 103 - .../test_pipeline_faiss_indexing.yaml | 31 - .../test_pipeline_faiss_retrieval.yaml | 19 - .../test_pipeline_tfidfretriever.yaml | 25 - rest_api/test/test_rest_api.py | 15 +- setup.cfg | 5 + test/__init__.py | 0 test/conftest.py | 111 +- test/samples/dc/pipeline_config.json | 4 +- test/samples/pipeline/test_pipeline.yaml | 21 +- .../test_pipeline_faiss_indexing.yaml | 3 +- .../test_pipeline_faiss_retrieval.yaml | 3 +- .../test_pipeline_tfidfretriever.yaml | 3 +- test/samples/pipeline/test_ray_pipeline.yaml | 46 + test/test_distillation.py | 2 +- test/test_document_store.py | 2 +- 
test/test_eval.py | 6 +- test/test_faiss_and_milvus.py | 2 +- test/test_file_converter.py | 2 +- test/test_filetype_classifier.py | 20 +- test/test_generator.py | 2 +- test/test_modeling_dpr.py | 12 +- test/test_modeling_processor.py | 2 +- .../test_modeling_processor_saving_loading.py | 6 +- test/test_pipeline.py | 419 +-- test/test_pipeline_debug_and_validation.py | 45 +- test/test_pipeline_yaml.py | 670 +++++ test/test_preprocessor.py | 2 +- test/test_ray.py | 31 +- test/test_retriever.py | 14 +- test/test_standard_pipelines.py | 2 +- test/test_summarizer_translation.py | 2 +- test/test_tokenization.py | 4 +- test/test_utils.py | 2 +- test/test_weaviate.py | 2 +- 104 files changed, 5456 insertions(+), 4189 deletions(-) create mode 100644 .github/utils/generate_openapi_specs.py delete mode 100644 docs/_src/api/openapi/generate_openapi_specs.py create mode 100644 haystack/nodes/_json_schema.py rename json-schemas/{haystack-pipeline-1.1.0.schema.json => haystack-pipeline-1.0.0.schema.json} (71%) delete mode 100644 json-schemas/haystack-pipeline-1.2.0.schema.json rename json-schemas/{haystack-pipeline-0.7.schema.json => haystack-pipeline-unstable.schema.json} (70%) rename rest_api/pipeline/{pipeline_empty.yaml => pipeline_empty.haystack-pipeline.yml} (88%) rename rest_api/pipeline/{pipelines.yaml => pipelines.haystack-pipeline.yml} (96%) rename rest_api/pipeline/{pipelines_dpr.yaml => pipelines_dpr.haystack-pipeline.yml} (96%) create mode 100644 rest_api/test/samples/pipeline/test_pipeline.haystack-pipeline.yml delete mode 100644 rest_api/test/samples/pipeline/test_pipeline.yaml delete mode 100644 rest_api/test/samples/pipeline/test_pipeline_faiss_indexing.yaml delete mode 100644 rest_api/test/samples/pipeline/test_pipeline_faiss_retrieval.yaml delete mode 100644 rest_api/test/samples/pipeline/test_pipeline_tfidfretriever.yaml create mode 100644 test/__init__.py create mode 100644 test/samples/pipeline/test_ray_pipeline.yaml create mode 100644 test/test_pipeline_yaml.py diff --git a/.github/utils/generate_json_schema.py b/.github/utils/generate_json_schema.py index 2eecb1644..36f124d36 100644 --- a/.github/utils/generate_json_schema.py +++ b/.github/utils/generate_json_schema.py @@ -1,307 +1,10 @@ -import json +import sys import logging -import subprocess -from pathlib import Path -from typing import Any, Dict, Optional, Set, Tuple -from haystack import __version__ -import haystack.document_stores -import haystack.nodes -import pydantic.schema -from fastapi.dependencies.utils import get_typed_signature -from pydantic import BaseConfig, BaseSettings, Required, SecretStr, create_model -from pydantic.fields import ModelField -from pydantic.schema import SkipField, TypeModelOrEnum, TypeModelSet, encode_default -from pydantic.schema import field_singleton_schema as _field_singleton_schema -from pydantic.typing import is_callable_type -from pydantic.utils import lenient_issubclass - -schema_version = __version__ -filename = f"haystack-pipeline-{schema_version}.schema.json" -destination_path = Path(__file__).parent.parent.parent / "json-schemas" / filename +logging.basicConfig(level=logging.INFO) -class Settings(BaseSettings): - input_token: SecretStr - github_repository: str +sys.path.append(".") +from haystack.nodes._json_schema import update_json_schema - -# Monkey patch Pydantic's field_singleton_schema to convert classes and functions to -# strings in JSON Schema -def field_singleton_schema( - field: ModelField, - *, - by_alias: bool, - model_name_map: Dict[TypeModelOrEnum, str], - 
ref_template: str, - schema_overrides: bool = False, - ref_prefix: Optional[str] = None, - known_models: TypeModelSet, -) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]: - try: - return _field_singleton_schema( - field, - by_alias=by_alias, - model_name_map=model_name_map, - ref_template=ref_template, - schema_overrides=schema_overrides, - ref_prefix=ref_prefix, - known_models=known_models, - ) - except (ValueError, SkipField): - schema: Dict[str, Any] = {"type": "string"} - - if isinstance(field.default, type) or is_callable_type(field.default): - default = field.default.__name__ - else: - default = field.default - if not field.required: - schema["default"] = encode_default(default) - return schema, {}, set() - - -# Monkeypatch Pydantic's field_singleton_schema -pydantic.schema.field_singleton_schema = field_singleton_schema - - -class Config(BaseConfig): - extra = "forbid" - - -def get_json_schema(): - """ - Generate JSON schema for Haystack pipelines. - """ - schema_definitions = {} - additional_definitions = {} - - modules_with_nodes = [haystack.nodes, haystack.document_stores] - possible_nodes = [] - for module in modules_with_nodes: - for importable_name in dir(module): - imported = getattr(module, importable_name) - possible_nodes.append((module, imported)) - # TODO: decide if there's a better way to not include Base classes other than by - # the prefix "Base" in the name. Maybe it could make sense to have a list of - # all the valid nodes to include in the main source code and then using that here. - for module, node in possible_nodes: - if lenient_issubclass(node, haystack.nodes.BaseComponent) and not node.__name__.startswith("Base"): - logging.info(f"Processing node: {node.__name__}") - init_method = getattr(node, "__init__", None) - if init_method: - signature = get_typed_signature(init_method) - param_fields = [ - param - for param in signature.parameters.values() - if param.kind not in {param.VAR_POSITIONAL, param.VAR_KEYWORD} - ] - # Remove self parameter - param_fields.pop(0) - param_fields_kwargs: Dict[str, Any] = {} - for param in param_fields: - logging.info(f"--- processing param: {param.name}") - annotation = Any - if param.annotation != param.empty: - annotation = param.annotation - default = Required - if param.default != param.empty: - default = param.default - param_fields_kwargs[param.name] = (annotation, default) - model = create_model(f"{node.__name__}ComponentParams", __config__=Config, **param_fields_kwargs) - model.update_forward_refs(**model.__dict__) - params_schema = model.schema() - params_schema["title"] = "Parameters" - params_schema[ - "description" - ] = "Each parameter can reference other components defined in the same YAML file." - if "definitions" in params_schema: - params_definitions = params_schema.pop("definitions") - additional_definitions.update(params_definitions) - component_schema = { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string", - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": f"{node.__name__}", - }, - "params": params_schema, - }, - "required": ["type", "name"], - "additionalProperties": False, - } - schema_definitions[f"{node.__name__}Component"] = component_schema - - all_definitions = {**schema_definitions, **additional_definitions} - component_refs = [{"$ref": f"#/definitions/{name}"} for name in schema_definitions] - pipeline_schema = { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": f"https://haystack.deepset.ai/json-schemas/{filename}", - "title": "Haystack Pipeline", - "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", - "type": "object", - "properties": { - "version": { - "title": "Version", - "description": "Version of the Haystack Pipeline file.", - "type": "string", - "const": schema_version, - }, - "components": { - "title": "Components", - "description": "Component nodes and their configurations, to later be used in the pipelines section. Define here all the building blocks for the pipelines.", - "type": "array", - "items": {"anyOf": component_refs}, - "required": ["type", "name"], - "additionalProperties": False, - }, - "pipelines": { - "title": "Pipelines", - "description": "Multiple pipelines can be defined using the components from the same YAML file.", - "type": "array", - "items": { - "type": "object", - "properties": { - "name": {"title": "Name", "description": "Name of the pipeline.", "type": "string"}, - "nodes": { - "title": "Nodes", - "description": "Nodes to be used by this particular pipeline", - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.", - "type": "string", - }, - "inputs": { - "title": "Inputs", - "description": "Input parameters for this node.", - "type": "array", - "items": {"type": "string"}, - }, - }, - "additionalProperties": False, - }, - "required": ["name", "nodes"], - "additionalProperties": False, - }, - }, - "additionalProperties": False, - }, - }, - }, - "required": ["version", "components", "pipelines"], - "additionalProperties": False, - "definitions": all_definitions, - } - return pipeline_schema - - -def list_indexed_versions(index): - """ - Given the schema index as a parsed JSON, - return a list of all the versions it contains. - """ - indexed_versions = [] - for version_entry in index["oneOf"]: - for property_entry in version_entry["allOf"]: - if "properties" in property_entry.keys(): - indexed_versions.append(property_entry["properties"]["version"]["const"]) - return indexed_versions - - -def cleanup_rc_versions(index): - """ - Given the schema index as a parsed JSON, - removes any existing (unstable) rc version from it. - """ - new_versions_list = [] - for version_entry in index["oneOf"]: - for property_entry in version_entry["allOf"]: - if "properties" in property_entry.keys(): - if "rc" not in property_entry["properties"]["version"]["const"]: - new_versions_list.append(version_entry) - break - index["oneOf"] = new_versions_list - return index - - -def new_version_entry(version): - """ - Returns a new entry for the version index JSON schema. 
-    """
-    return {
-        "allOf": [
-            {"properties": {"version": {"const": version}}},
-            {
-                "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/"
-                f"haystack-pipeline-{version}.schema.json"
-            },
-        ]
-    }
-
-
-def generate_json_schema():
-    # Create new schema file
-    pipeline_schema = get_json_schema()
-    destination_path.parent.mkdir(parents=True, exist_ok=True)
-    destination_path.write_text(json.dumps(pipeline_schema, indent=2))
-
-    # Update schema index
-    index = []
-    index_path = Path(__file__).parent.parent.parent / "json-schemas" / "haystack-pipeline.schema.json"
-    with open(index_path, "r") as index_file:
-        index = json.load(index_file)
-    if index:
-        index = cleanup_rc_versions(index)
-        indexed_versions = list_indexed_versions(index)
-        if not any(version == schema_version for version in indexed_versions):
-            index["oneOf"].append(new_version_entry(schema_version))
-        with open(index_path, "w") as index_file:
-            json.dump(index, index_file, indent=4)
-
-
-def main():
-    from github import Github
-
-    generate_json_schema()
-    logging.basicConfig(level=logging.INFO)
-    settings = Settings()
-    logging.info(f"Using config: {settings.json()}")
-    g = Github(settings.input_token.get_secret_value())
-    repo = g.get_repo(settings.github_repository)
-
-    logging.info("Setting up GitHub Actions git user")
-    subprocess.run(["git", "config", "user.name", "github-actions"], check=True)
-    subprocess.run(["git", "config", "user.email", "github-actions@github.com"], check=True)
-    branch_name = "generate-json-schema"
-    logging.info(f"Creating a new branch {branch_name}")
-    subprocess.run(["git", "checkout", "-b", branch_name], check=True)
-    logging.info("Adding updated file")
-    subprocess.run(["git", "add", str(destination_path)], check=True)
-    logging.info("Committing updated file")
-    message = "⬆ Upgrade JSON Schema file"
-    subprocess.run(["git", "commit", "-m", message], check=True)
-    logging.info("Pushing branch")
-    subprocess.run(["git", "push", "origin", branch_name], check=True)
-    logging.info("Creating PR")
-    pr = repo.create_pull(title=message, body=message, base="master", head=branch_name)
-    logging.info(f"Created PR: {pr.number}")
-    logging.info("Finished")
-
-
-if __name__ == "__main__":
-    # If you only want to generate the JSON Schema file without submitting a PR
-    # uncomment this line:
-    generate_json_schema()
-
-    # and comment this line:
-    # main()
+update_json_schema(update_index=True)
diff --git a/.github/utils/generate_openapi_specs.py b/.github/utils/generate_openapi_specs.py
new file mode 100644
index 000000000..ce2634ba2
--- /dev/null
+++ b/.github/utils/generate_openapi_specs.py
@@ -0,0 +1,32 @@
+import json
+from pathlib import Path
+import os
+import sys
+import shutil
+
+REST_PATH = Path("./rest_api").absolute()
+PIPELINE_PATH = str(REST_PATH / "pipeline" / "pipeline_empty.haystack-pipeline.yml")
+APP_PATH = str(REST_PATH / "application.py")
+DOCS_PATH = Path("./docs") / "_src" / "api" / "openapi"
+
+os.environ["PIPELINE_YAML_PATH"] = PIPELINE_PATH
+
+print(f"Loading OpenAPI specs from {APP_PATH} with pipeline at {PIPELINE_PATH}")
+
+sys.path.append(".")
+from rest_api.application import get_openapi_specs, haystack_version
+
+# Generate the openapi specs
+specs = get_openapi_specs()
+
+# Dump the specs into a JSON file
+with open(DOCS_PATH / "openapi.json", "w") as f:
+    json.dump(specs, f, indent=4)
+
+# Remove rc versions of the specs from the docs folder
+# (this script runs from the repo root, so list DOCS_PATH explicitly)
+for specs_file in os.listdir(DOCS_PATH):
+    if os.path.isfile(DOCS_PATH / specs_file) and "rc" in specs_file and Path(specs_file).suffix == ".json":
+        os.remove(DOCS_PATH / specs_file)
+
+# Add versioned copy
+shutil.copy(DOCS_PATH / "openapi.json", DOCS_PATH / f"openapi-{haystack_version}.json")
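`get_openapi_specs` is imported from `rest_api.application`, whose body is not part of this
diff. As a rough idea of what such a helper usually amounts to in a FastAPI app (a hypothetical
sketch; the `app` object and this implementation are assumptions, not code from this PR):

```python
# Hypothetical sketch, not part of this patch: FastAPI applications expose
# their OpenAPI document via app.openapi(), which returns a plain dict.
from rest_api.application import app  # assumed to be the FastAPI instance

def get_openapi_specs() -> dict:
    return app.openapi()
```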
diff --git a/.github/workflows/autoformat.yml b/.github/workflows/autoformat.yml
index 42aaeda17..8225e98ad 100644
--- a/.github/workflows/autoformat.yml
+++ b/.github/workflows/autoformat.yml
@@ -40,7 +40,7 @@ jobs:
       - name: Install Dependencies
         run: |
           pip install --upgrade pip
-          pip install .[test]
+          pip install .[all]
           pip install rest_api/
           pip install ui/
           pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cpu.html
@@ -69,14 +69,11 @@ jobs:
 
       # Generates the OpenAPI specs file to be used on the documentation website
       - name: Generate OpenAPI Specs
-        run: |
-          pip install rest_api/
-          cd docs/_src/api/openapi/
-          python generate_openapi_specs.py
+        run: python .github/utils/generate_openapi_specs.py
 
       # Generates a new JSON schema for the pipeline YAML validation
       - name: Generate JSON schema for pipelines
-        run: python ./.github/utils/generate_json_schema.py
+        run: python .github/utils/generate_json_schema.py
 
       # Commit the files to GitHub
       - name: Commit files
diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index 7a38d4c0d..530626e8e 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -193,14 +193,11 @@ jobs:
 
       # Generates the OpenAPI specs file to be used on the documentation website
       - name: Generate OpenAPI Specs
-        run: |
-          pip install rest_api/
-          cd docs/_src/api/openapi/
-          python generate_openapi_specs.py
+        run: python .github/utils/generate_openapi_specs.py
 
       # Generates a new JSON schema for the pipeline YAML validation
       - name: Generate JSON schema for pipelines
-        run: python ./.github/utils/generate_json_schema.py
+        run: python .github/utils/generate_json_schema.py
 
       # If there is anything to commit, fail
       # Note: this CI action mirrors autoformat.yml, with the difference that it
@@ -287,7 +284,7 @@ jobs:
       - name: Install Dependencies
         if: steps.cache.outputs.cache-hit != 'true'
         run: |
           pip install --upgrade pip
-          pip install .[test]
+          pip install .[all]
           pip install rest_api/
           pip install ui/
           pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cpu.html
diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 02469ee1a..755b42dc8 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -2174,7 +2174,7 @@ the vector embeddings are indexed in a FAISS Index.
#### \_\_init\_\_ ```python -def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: "Optional[faiss.swigfaiss.Index]" = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, **kwargs, ,) +def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, **kwargs, ,) ``` **Arguments**: @@ -3565,6 +3565,54 @@ operation. None + + +#### delete\_labels + +```python +def delete_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### get\_all\_labels + +```python +def get_all_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### get\_label\_count + +```python +def get_label_count() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### write\_labels + +```python +def write_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + # Module graphdb diff --git a/docs/_src/api/api/generator.md b/docs/_src/api/api/generator.md index 3356f6b56..5fa8caa06 100644 --- a/docs/_src/api/api/generator.md +++ b/docs/_src/api/api/generator.md @@ -90,7 +90,7 @@ i.e. the model can easily adjust to domain documents even after training has fin #### \_\_init\_\_ ```python -def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: RAGeneratorType = RAGeneratorType.TOKEN, top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True) +def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True) ``` Load a RAG model from Transformers along with passage_embedding_model. @@ -104,7 +104,7 @@ See https://huggingface.co/transformers/model_doc/rag.html for more details See https://huggingface.co/models for full list of available models. - `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - `retriever`: `DensePassageRetriever` used to embedded passages for the docs passed to `predict()`. This is optional and is only needed if the docs you pass don't already contain embeddings in `Document.embedding`. -- `generator_type`: Which RAG generator implementation to use? 
RAG-TOKEN or RAG-SEQUENCE +- `generator_type`: Which RAG generator implementation to use ("token" or "sequence") - `top_k`: Number of independently generated text to return - `max_length`: Maximum length of generated text - `min_length`: Minimum length of generated text diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index b7200da26..7dbbff393 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -17,7 +17,7 @@ RootNode feeds inputs together with corresponding params to a Pipeline. ## BasePipeline ```python -class BasePipeline() +class BasePipeline(ABC) ``` Base class for pipelines, providing the most basic methods to load and save them in different ways. @@ -28,10 +28,11 @@ See also the `Pipeline` class for the actual pipeline logic. #### get\_config ```python +@abstractmethod def get_config(return_defaults: bool = False) -> dict ``` -Returns a configuration for the Pipeline that can be used with `BasePipeline.load_from_config()`. +Returns a configuration for the Pipeline that can be used with `Pipeline.load_from_config()`. **Arguments**: @@ -81,6 +82,7 @@ Default value is True. ```python @classmethod +@abstractmethod def load_from_config(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) ``` @@ -137,6 +139,7 @@ variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an ```python @classmethod +@abstractmethod def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) ``` @@ -519,6 +522,62 @@ Create a Graphviz visualization of the pipeline. - `path`: the path to save the image. + + +#### load\_from\_yaml + +```python +@classmethod +def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +``` + +Load Pipeline from a YAML file defining the individual components and how they're tied together to form + +a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +Here's a sample configuration: + + ```yaml + | version: '1.0' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: ElasticsearchRetriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | - name: MyReader + | inputs: [MyESRetriever] + ``` + +Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed. +If the pipeline loads correctly regardless, save again the pipeline using `Pipeline.save_to_yaml()` to remove the warning. + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. 
For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. + #### load\_from\_config diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md index ed54f7901..aab435cb2 100644 --- a/docs/_src/api/api/preprocessor.md +++ b/docs/_src/api/api/preprocessor.md @@ -15,6 +15,7 @@ class BasePreProcessor(BaseComponent) #### process ```python +@abstractmethod def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict] ``` diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 7086a2844..0a3e638b0 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -107,7 +107,7 @@ class ElasticsearchRetriever(BaseRetriever) #### \_\_init\_\_ ```python -def __init__(document_store: KeywordDocumentStore, top_k: int = 10, custom_query: str = None) +def __init__(document_store: KeywordDocumentStore, top_k: int = 10, custom_query: Optional[str] = None) ``` **Arguments**: diff --git a/docs/_src/api/openapi/generate_openapi_specs.py b/docs/_src/api/openapi/generate_openapi_specs.py deleted file mode 100644 index bb74de954..000000000 --- a/docs/_src/api/openapi/generate_openapi_specs.py +++ /dev/null @@ -1,31 +0,0 @@ -import json -from pathlib import Path -import os -import sys -import shutil - -sys.path.append("../../../../") - -rest_path = Path("../../../../rest_api").absolute() -pipeline_path = str(rest_path / "pipeline" / "pipeline_empty.yaml") -app_path = str(rest_path / "application.py") -print(f"Loading OpenAPI specs from {app_path} with pipeline at {pipeline_path}") - -os.environ["PIPELINE_YAML_PATH"] = pipeline_path - -from rest_api.application import get_openapi_specs, haystack_version - -# Generate the openapi specs -specs = get_openapi_specs() - -# Dump the specs into a JSON file -with open("openapi.json", "w") as f: - json.dump(specs, f, indent=4) - -# Remove rc versions of the specs from the folder -for specs_file in os.listdir(): - if os.path.isfile(specs_file) and "rc" in specs_file and Path(specs_file).suffix == ".json": - os.remove(specs_file) - -# Add versioned copy -shutil.copy("openapi.json", f"openapi-{haystack_version}.json") diff --git a/haystack/__init__.py b/haystack/__init__.py index f023dc7ef..743cb9742 100644 --- a/haystack/__init__.py +++ b/haystack/__init__.py @@ -7,7 +7,7 @@ except (ModuleNotFoundError, ImportError): # Python <= 3.7 import importlib_metadata as metadata # type: ignore -__version__ = metadata.version("farm-haystack") +__version__: str = str(metadata.version("farm-haystack")) # This configuration must be done before any import to apply to all submodules diff --git a/haystack/document_stores/deepsetcloud.py b/haystack/document_stores/deepsetcloud.py index 3a76f2313..fe4506691 100644 --- a/haystack/document_stores/deepsetcloud.py +++ b/haystack/document_stores/deepsetcloud.py @@ -65,14 +65,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): f"{indexing_info['pending_file_count']} files are pending to be indexed. 
Indexing status: {indexing_info['status']}" ) - self.set_config( - workspace=workspace, - index=index, - duplicate_documents=duplicate_documents, - api_endpoint=api_endpoint, - similarity=similarity, - return_embedding=return_embedding, - ) + super().__init__() def get_all_documents( self, diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index b9f572ebc..54b91d0d3 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -140,41 +140,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): :param use_system_proxy: Whether to use system proxy. """ - # save init parameters to enable export of component config as YAML - self.set_config( - host=host, - port=port, - username=username, - password=password, - api_key_id=api_key_id, - api_key=api_key, - aws4auth=aws4auth, - index=index, - label_index=label_index, - search_fields=search_fields, - content_field=content_field, - name_field=name_field, - embedding_field=embedding_field, - embedding_dim=embedding_dim, - custom_mapping=custom_mapping, - excluded_meta_data=excluded_meta_data, - analyzer=analyzer, - scheme=scheme, - ca_certs=ca_certs, - verify_certs=verify_certs, - create_index=create_index, - duplicate_documents=duplicate_documents, - refresh_type=refresh_type, - similarity=similarity, - timeout=timeout, - return_embedding=return_embedding, - index_type=index_type, - scroll=scroll, - skip_missing_embeddings=skip_missing_embeddings, - synonyms=synonyms, - synonym_type=synonym_type, - use_system_proxy=use_system_proxy, - ) + super().__init__() self.client = self._init_elastic_client( host=host, @@ -352,11 +318,12 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): if self.search_fields: for search_field in self.search_fields: if search_field in mapping["properties"] and mapping["properties"][search_field]["type"] != "text": + host_data = self.client.transport.hosts[0] raise Exception( f"The search_field '{search_field}' of index '{index_name}' with type '{mapping['properties'][search_field]['type']}' " f"does not have the right type 'text' to be queried in fulltext search. Please use only 'text' type properties as search_fields. " f"This error might occur if you are trying to use haystack 1.0 and above with an existing elasticsearch index created with a previous version of haystack." - f"In this case deleting the index with `curl -X DELETE \"{self.pipeline_config['params']['host']}:{self.pipeline_config['params']['port']}/{index_name}\"` will fix your environment. " + f"In this case deleting the index with `curl -X DELETE \"{host_data['host']}:{host_data['port']}/{index_name}\"` will fix your environment. " f"Note, that all data stored in the index will be lost!" ) if self.embedding_field: @@ -1823,11 +1790,12 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore): search_field in mappings["properties"] and mappings["properties"][search_field]["type"] != "text" ): + host_data = self.client.transport.hosts[0] raise Exception( f"The search_field '{search_field}' of index '{index_name}' with type '{mappings['properties'][search_field]['type']}' " f"does not have the right type 'text' to be queried in fulltext search. Please use only 'text' type properties as search_fields. " f"This error might occur if you are trying to use haystack 1.0 and above with an existing elasticsearch index created with a previous version of haystack." 
- f"In this case deleting the index with `curl -X DELETE \"{self.pipeline_config['params']['host']}:{self.pipeline_config['params']['port']}/{index_name}\"` will fix your environment. " + f"In this case deleting the index with `curl -X DELETE \"{host_data['host']}:{host_data['port']}/{index_name}\"` will fix your environment. " f"Note, that all data stored in the index will be lost!" ) diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py index b9a2cf30e..0d62446a1 100644 --- a/haystack/document_stores/faiss.py +++ b/haystack/document_stores/faiss.py @@ -1,15 +1,15 @@ -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Union, List, Optional, Dict, Generator if TYPE_CHECKING: from haystack.nodes.retriever import BaseRetriever import json import logging -from pathlib import Path -from typing import Union, List, Optional, Dict, Generator -from tqdm.auto import tqdm import warnings import numpy as np +from copy import deepcopy +from pathlib import Path +from tqdm.auto import tqdm from inspect import Signature, signature try: @@ -22,7 +22,6 @@ except (ImportError, ModuleNotFoundError) as ie: _optional_component_not_installed(__name__, "faiss", ie) - from haystack.schema import Document from haystack.document_stores.base import get_batches_from_generator @@ -47,7 +46,7 @@ class FAISSDocumentStore(SQLDocumentStore): vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", - faiss_index: "Optional[faiss.swigfaiss.Index]" = None, + faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", @@ -112,21 +111,6 @@ class FAISSDocumentStore(SQLDocumentStore): self.__class__.__init__(self, **init_params) # pylint: disable=non-parent-init-called return - # save init parameters to enable export of component config as YAML - self.set_config( - sql_url=sql_url, - vector_dim=vector_dim, - embedding_dim=embedding_dim, - faiss_index_factory_str=faiss_index_factory_str, - return_embedding=return_embedding, - duplicate_documents=duplicate_documents, - index=index, - similarity=similarity, - embedding_field=embedding_field, - progress_bar=progress_bar, - isolation_level=isolation_level, - ) - if similarity in ("dot_product", "cosine"): self.similarity = similarity self.metric_type = faiss.METRIC_INNER_PRODUCT @@ -614,8 +598,15 @@ class FAISSDocumentStore(SQLDocumentStore): config_path = index_path.with_suffix(".json") faiss.write_index(self.faiss_indexes[self.index], str(index_path)) + + config_to_save = deepcopy(self._component_config["params"]) + keys_to_remove = ["faiss_index", "faiss_index_path"] + for key in keys_to_remove: + if key in config_to_save.keys(): + del config_to_save[key] + with open(config_path, "w") as ipp: - json.dump(self.pipeline_config["params"], ipp) + json.dump(config_to_save, ipp, default=str) def _load_init_params_from_config( self, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None diff --git a/haystack/document_stores/graphdb.py b/haystack/document_stores/graphdb.py index c737c3e49..f1d56e664 100644 --- a/haystack/document_stores/graphdb.py +++ b/haystack/document_stores/graphdb.py @@ -38,8 +38,7 @@ class GraphDBKnowledgeGraph(BaseKnowledgeGraph): :param index: name of the index (also called repository) stored in the GraphDB instance :param prefixes: definitions of namespaces with a new line after each namespace, e.g., PREFIX hp: """ - # save init parameters to enable export of component 
config as YAML - self.set_config(host=host, port=port, username=username, password=password, index=index, prefixes=prefixes) + super().__init__() self.url = f"http://{host}:{port}" self.index = index diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py index 6270251db..f877f0e7b 100644 --- a/haystack/document_stores/memory.py +++ b/haystack/document_stores/memory.py @@ -66,17 +66,7 @@ class InMemoryDocumentStore(BaseDocumentStore): Since the data is originally stored in CPU memory there is little risk of overruning memory when running on CPU. """ - # save init parameters to enable export of component config as YAML - self.set_config( - index=index, - label_index=label_index, - embedding_field=embedding_field, - embedding_dim=embedding_dim, - return_embedding=return_embedding, - similarity=similarity, - progress_bar=progress_bar, - duplicate_documents=duplicate_documents, - ) + super().__init__() self.indexes: Dict[str, Dict] = defaultdict(dict) self.index: str = index diff --git a/haystack/document_stores/milvus1.py b/haystack/document_stores/milvus1.py index d351e194b..fdeb648d2 100644 --- a/haystack/document_stores/milvus1.py +++ b/haystack/document_stores/milvus1.py @@ -107,25 +107,7 @@ class Milvus1DocumentStore(SQLDocumentStore): exists. :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) """ - # save init parameters to enable export of component config as YAML - self.set_config( - sql_url=sql_url, - milvus_url=milvus_url, - connection_pool=connection_pool, - index=index, - vector_dim=vector_dim, - embedding_dim=embedding_dim, - index_file_size=index_file_size, - similarity=similarity, - index_type=index_type, - index_param=index_param, - search_param=search_param, - duplicate_documents=duplicate_documents, - return_embedding=return_embedding, - embedding_field=embedding_field, - progress_bar=progress_bar, - isolation_level=isolation_level, - ) + super().__init__() self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool) diff --git a/haystack/document_stores/milvus2.py b/haystack/document_stores/milvus2.py index 087f0a4b5..3120e92d1 100644 --- a/haystack/document_stores/milvus2.py +++ b/haystack/document_stores/milvus2.py @@ -126,29 +126,8 @@ class Milvus2DocumentStore(SQLDocumentStore): exists. 
:param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) """ + super().__init__() - # save init parameters to enable export of component config as YAML - self.set_config( - sql_url=sql_url, - host=host, - port=port, - connection_pool=connection_pool, - index=index, - vector_dim=vector_dim, - embedding_dim=embedding_dim, - index_file_size=index_file_size, - similarity=similarity, - index_type=index_type, - index_param=index_param, - search_param=search_param, - duplicate_documents=duplicate_documents, - id_field=id_field, - return_embedding=return_embedding, - embedding_field=embedding_field, - progress_bar=progress_bar, - custom_fields=custom_fields, - isolation_level=isolation_level, - ) connections.add_connection(default={"host": host, "port": port}) connections.connect() diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index aa076fc05..0cca02ea0 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -134,15 +134,8 @@ class SQLDocumentStore(BaseDocumentStore): :param check_same_thread: Set to False to mitigate multithreading issues in older SQLite versions (see https://docs.sqlalchemy.org/en/14/dialects/sqlite.html?highlight=check_same_thread#threading-pooling-behavior) :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) """ + super().__init__() - # save init parameters to enable export of component config as YAML - self.set_config( - url=url, - index=index, - label_index=label_index, - duplicate_documents=duplicate_documents, - check_same_thread=check_same_thread, - ) create_engine_params = {} if isolation_level: create_engine_params["isolation_level"] = isolation_level diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index baf6355e5..d096399f6 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -105,25 +105,8 @@ class WeaviateDocumentStore(BaseDocumentStore): """ if similarity != "cosine": raise ValueError(f"Weaviate only supports cosine similarity, but you provided {similarity}") - # save init parameters to enable export of component config as YAML - self.set_config( - host=host, - port=port, - timeout_config=timeout_config, - username=username, - password=password, - index=index, - embedding_dim=embedding_dim, - content_field=content_field, - name_field=name_field, - similarity=similarity, - index_type=index_type, - custom_schema=custom_schema, - return_embedding=return_embedding, - embedding_field=embedding_field, - progress_bar=progress_bar, - duplicate_documents=duplicate_documents, - ) + + super().__init__() # Connect to Weaviate server using python binding weaviate_url = f"{host}:{port}" @@ -1162,3 +1145,35 @@ class WeaviateDocumentStore(BaseDocumentStore): docs_to_delete = [doc for doc in docs_to_delete if doc.id in ids] for doc in docs_to_delete: self.weaviate_client.data_object.delete(doc.id) + + def delete_labels(self): + """ + Implemented to respect BaseDocumentStore's contract. + + Weaviate does not support labels (yet). + """ + raise NotImplementedError("Weaviate does not support labels (yet).") + + def get_all_labels(self): + """ + Implemented to respect BaseDocumentStore's contract. + + Weaviate does not support labels (yet). 
+        """
+        raise NotImplementedError("Weaviate does not support labels (yet).")
+
+    def get_label_count(self):
+        """
+        Implemented to respect BaseDocumentStore's contract.
+
+        Weaviate does not support labels (yet).
+        """
+        raise NotImplementedError("Weaviate does not support labels (yet).")
+
+    def write_labels(self):
+        """
+        Implemented to respect BaseDocumentStore's contract.
+
+        Weaviate does not support labels (yet).
+        """
+        raise NotImplementedError("Weaviate does not support labels (yet).")
diff --git a/haystack/errors.py b/haystack/errors.py
index fbd13de99..dd937e6ca 100644
--- a/haystack/errors.py
+++ b/haystack/errors.py
@@ -1,8 +1,71 @@
 # coding: utf8
-"""Custom Errors for Haystack stacks"""
+"""Custom Errors for Haystack"""
+
+from typing import Optional
 
 
-class DuplicateDocumentError(ValueError):
+class HaystackError(Exception):
+    """
+    Any error generated by Haystack.
+
+    This error wraps its source transparently in such a way that its attributes
+    can be accessed directly: for example, if the original error has a `message` attribute,
+    `HaystackError.message` will exist and have the expected content.
+    """
+
+    def __init__(self, message: Optional[str] = None, docs_link: Optional[str] = None):
+        super().__init__()
+        if message:
+            self.message = message
+        self.docs_link = docs_link
+
+    def __getattr__(self, attr):
+        # If self.__cause__ is None, it will raise the expected AttributeError
+        return getattr(self.__cause__, attr)
+
+    def __str__(self):
+        if self.docs_link:
+            docs_message = f"\n\nCheck out the documentation at {self.docs_link}"
+            return self.message + docs_message
+        return self.message
+
+    def __repr__(self):
+        return str(self)
+
+
+class PipelineError(HaystackError):
+    """Exception for issues raised within a pipeline"""
+
+    def __init__(
+        self, message: Optional[str] = None, docs_link: Optional[str] = "https://haystack.deepset.ai/pipelines"
+    ):
+        super().__init__(message=message, docs_link=docs_link)
+
+
+class PipelineSchemaError(PipelineError):
+    """Exception for issues arising when reading/building the JSON schema of pipelines"""
+
+    pass
+
+
+class PipelineConfigError(PipelineError):
+    """Exception for issues raised within a pipeline's config file"""
+
+    def __init__(
+        self,
+        message: Optional[str] = None,
+        docs_link: Optional[str] = "https://haystack.deepset.ai/pipelines#yaml-file-definitions",
+    ):
+        super().__init__(message=message, docs_link=docs_link)
+
+
+class DocumentStoreError(HaystackError):
+    """Exception for issues that occur in a document store"""
+
+    pass
+
+
+class DuplicateDocumentError(DocumentStoreError, ValueError):
     """Exception for Duplicate document"""
 
     pass
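A quick illustration of the attribute delegation implemented by `HaystackError.__getattr__`
above (a sketch; `BackendError` is a made-up class, not part of this patch):

```python
from haystack.errors import DocumentStoreError

class BackendError(Exception):
    def __init__(self, status_code: int):
        super().__init__(f"backend returned {status_code}")
        self.status_code = status_code

try:
    try:
        raise BackendError(status_code=503)
    except BackendError as err:
        raise DocumentStoreError("Write failed") from err
except DocumentStoreError as error:
    print(error.message)      # "Write failed", set by HaystackError.__init__
    print(error.status_code)  # 503, delegated to the cause via __getattr__
```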
diff --git a/haystack/modeling/data_handler/data_silo.py b/haystack/modeling/data_handler/data_silo.py
index 5a7590d28..ae2a51c52 100644
--- a/haystack/modeling/data_handler/data_silo.py
+++ b/haystack/modeling/data_handler/data_silo.py
@@ -884,7 +884,7 @@ class DistillationDataSilo(DataSilo):
             "max_seq_len": self.processor.max_seq_len,
             "dev_split": self.processor.dev_split,
             "tasks": self.processor.tasks,
-            "teacher_name_or_path": self.teacher.pipeline_config["params"]["model_name_or_path"],
+            "teacher_name_or_path": self.teacher.model_name_or_path,
             "data_silo_type": self.__class__.__name__,
         }
         checksum = get_dict_checksum(payload_dict)
diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py
new file mode 100644
index 000000000..b98230099
--- /dev/null
+++ b/haystack/nodes/_json_schema.py
@@ -0,0 +1,486 @@
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple
+
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+import os
+import re
+import sys
+import json
+import inspect
+from pathlib import Path
+from copy import deepcopy
+from difflib import SequenceMatcher
+
+import pydantic.schema
+from pydantic import BaseConfig, BaseSettings, Required, SecretStr, create_model
+from pydantic.typing import ForwardRef, evaluate_forwardref, is_callable_type
+from pydantic.fields import ModelField
+from pydantic.schema import (
+    SkipField,
+    TypeModelOrEnum,
+    TypeModelSet,
+    encode_default,
+    field_singleton_schema as _field_singleton_schema,
+)
+
+from haystack import __version__ as haystack_version
+from haystack.errors import HaystackError, PipelineSchemaError
+from haystack.nodes.base import BaseComponent
+
+
+JSON_SCHEMAS_PATH = Path(__file__).parent.parent.parent / "json-schemas"
+SCHEMA_URL = "https://haystack.deepset.ai/json-schemas/"
+
+# Allows accessory classes (like enums and helpers) to be registered as valid input for
+# custom nodes' init parameters. For now we disable this feature, but flipping this variable
+# re-enables it. Mind that string validation will still cut out most attempts to load anything
+# other than enums and class constants: see Pipeline.load_from_config()
+ALLOW_ACCESSORY_CLASSES = False
+
+
+class Settings(BaseSettings):
+    input_token: SecretStr
+    github_repository: str
+
+
+# Monkey patch Pydantic's field_singleton_schema to convert classes and functions to
+# strings in JSON Schema
+def field_singleton_schema(
+    field: ModelField,
+    *,
+    by_alias: bool,
+    model_name_map: Dict[TypeModelOrEnum, str],
+    ref_template: str,
+    schema_overrides: bool = False,
+    ref_prefix: Optional[str] = None,
+    known_models: TypeModelSet,
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    try:
+        return _field_singleton_schema(
+            field,
+            by_alias=by_alias,
+            model_name_map=model_name_map,
+            ref_template=ref_template,
+            schema_overrides=schema_overrides,
+            ref_prefix=ref_prefix,
+            known_models=known_models,
+        )
+    except (ValueError, SkipField):
+        schema: Dict[str, Any] = {"type": "string"}
+
+        if isinstance(field.default, type) or is_callable_type(field.default):
+            default = field.default.__name__
+        else:
+            default = field.default
+        if not field.required:
+            schema["default"] = encode_default(default)
+        return schema, {}, set()
+
+
+# Monkeypatch Pydantic's field_singleton_schema
+pydantic.schema.field_singleton_schema = field_singleton_schema
+
+
+# From FastAPI's internals
+def get_typed_signature(call: Callable[..., Any]) -> inspect.Signature:
+    signature = inspect.signature(call)
+    globalns = getattr(call, "__globals__", {})
+    typed_params = [
+        inspect.Parameter(
+            name=param.name, kind=param.kind, default=param.default, annotation=get_typed_annotation(param, globalns)
+        )
+        for param in signature.parameters.values()
+    ]
+    typed_signature = inspect.Signature(typed_params)
+    return typed_signature
+
+
+# From FastAPI's internals
+def get_typed_annotation(param: inspect.Parameter, globalns: Dict[str, Any]) -> Any:
+    annotation = param.annotation
+    if isinstance(annotation, str):
+        annotation = ForwardRef(annotation)
+        annotation = evaluate_forwardref(annotation, globalns, globalns)
+    return annotation
+
+
+class Config(BaseConfig):
+    extra = "forbid"  # type: ignore
+
+
+def find_subclasses_in_modules(importable_modules: List[str], include_base_classes: bool = False):
+    """
+    This function returns a list `(module, class)` of all the classes that can be imported
+    dynamically, for example from a pipeline YAML definition or to generate documentation.
+
+    By default it won't include Base classes, which should be abstract.
+    """
+    return [
+        (module, clazz)
+        for module in importable_modules
+        for _, clazz in inspect.getmembers(sys.modules[module])
+        if (
+            inspect.isclass(clazz)
+            and not inspect.isabstract(clazz)
+            and issubclass(clazz, BaseComponent)
+            and (include_base_classes or not clazz.__name__.startswith("Base"))
+        )
+    ]
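For orientation, this helper is typically called like this (a sketch; the module list mirrors
the default used by `get_json_schema` below):

```python
# Sketch: list every importable, non-abstract BaseComponent subclass
# from the two packages the schema generator scans by default.
import haystack.nodes
import haystack.document_stores
from haystack.nodes._json_schema import find_subclasses_in_modules

for module, clazz in find_subclasses_in_modules(
    importable_modules=["haystack.document_stores", "haystack.nodes"]
):
    print(f"{module}.{clazz.__name__}")  # e.g. "haystack.nodes.FARMReader"
```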
+def find_subclasses_in_modules(importable_modules: List[str], include_base_classes: bool = False):
+    """
+    Returns a list of `(module, class)` tuples for all the classes that can be imported
+    dynamically, for example from a pipeline YAML definition or to generate documentation.
+
+    By default it won't include Base classes, which should be abstract.
+    """
+    return [
+        (module, clazz)
+        for module in importable_modules
+        for _, clazz in inspect.getmembers(sys.modules[module])
+        if (
+            inspect.isclass(clazz)
+            and not inspect.isabstract(clazz)
+            and issubclass(clazz, BaseComponent)
+            and (include_base_classes or not clazz.__name__.startswith("Base"))
+        )
+    ]
+
+
+def create_schema_for_node(node: BaseComponent) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """
+    Create the JSON schema for a single BaseComponent subclass,
+    including all accessory classes.
+
+    :returns: the schema for the node and all accessory classes,
+        and a dict with the reference to the node only.
+    """
+    if not hasattr(node, "__name__"):
+        raise PipelineSchemaError(f"Node {node} has no __name__ attribute, cannot create a schema for it.")
+
+    node_name = getattr(node, "__name__")
+
+    logger.info(f"Processing node: {node_name}")
+
+    # Read the relevant init parameters from __init__'s signature
+    init_method = getattr(node, "__init__", None)
+    if not init_method:
+        raise PipelineSchemaError(f"Could not read the __init__ method of {node_name} to create its schema.")
+
+    signature = get_typed_signature(init_method)
+    param_fields = [
+        param for param in signature.parameters.values() if param.kind not in {param.VAR_POSITIONAL, param.VAR_KEYWORD}
+    ]
+    # Remove the self parameter
+    param_fields.pop(0)
+    param_fields_kwargs: Dict[str, Any] = {}
+
+    # Read all the parameters extracted from the __init__ method, with their types and default values
+    for param in param_fields:
+        annotation = Any
+        if param.annotation != param.empty:
+            annotation = param.annotation
+        default = Required
+        if param.default != param.empty:
+            default = param.default
+        param_fields_kwargs[param.name] = (annotation, default)
+
+    # Create the model with Pydantic and extract the schema
+    model = create_model(f"{node_name}ComponentParams", __config__=Config, **param_fields_kwargs)
+    model.update_forward_refs(**model.__dict__)
+    params_schema = model.schema()
+    params_schema["title"] = "Parameters"
+    desc = "Each parameter can reference other components defined in the same YAML file."
+    params_schema["description"] = desc
+
+    # Definitions for accessory classes will show up here
+    params_definitions = {}
+    if "definitions" in params_schema:
+        if ALLOW_ACCESSORY_CLASSES:
+            params_definitions = params_schema.pop("definitions")
+        else:
+            raise PipelineSchemaError(
+                f"Node {node_name} takes object instances as parameters "
+                "in its __init__ function. This is currently not allowed: "
+                "please use only Python primitives"
+            )
+
+    # Write out the schema and ref and return them
+    component_name = f"{node_name}Component"
+    component_schema = {
+        component_name: {
+            "type": "object",
+            "properties": {
+                "name": {
+                    "title": "Name",
+                    "description": "Custom name for the component. Helpful for visualization and debugging.",
+                    "type": "string",
+                },
+                "type": {
+                    "title": "Type",
+                    "description": "Haystack Class name for the component.",
+                    "type": "string",
+                    "const": f"{node_name}",
+                },
+                "params": params_schema,
+            },
+            "required": ["type", "name"],
+            "additionalProperties": False,
+        },
+        **params_definitions,
+    }
+    return component_schema, {"$ref": f"#/definitions/{component_name}"}
+
+
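As a rough usage sketch (the `ToyRanker` node is made up for illustration), `create_schema_for_node()` returns both the full definition and the `$ref` that `get_json_schema()` below collects:

```python
class ToyRanker(BaseComponent):
    outgoing_edges = 1

    def __init__(self, top_k: int = 10):
        super().__init__()
        self.top_k = top_k

    def run(self, **kwargs):
        ...

schema, ref = create_schema_for_node(ToyRanker)
assert ref == {"$ref": "#/definitions/ToyRankerComponent"}
# The init parameter appears in the schema, typed and with its default value:
params = schema["ToyRankerComponent"]["properties"]["params"]
assert params["properties"]["top_k"]["default"] == 10
```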
+ """ + schema_definitions = {} # All the schemas for the node and accessory classes + node_refs = [] # References to the nodes only (accessory classes cannot be listed among the nodes in a config) + + # List all known nodes in the given modules + possible_nodes = find_subclasses_in_modules(importable_modules=modules) + + # Build the definitions and refs for the nodes + for _, node in possible_nodes: + node_definition, node_ref = create_schema_for_node(node) + schema_definitions.update(node_definition) + node_refs.append(node_ref) + + pipeline_schema = { + "$schema": "http://json-schema.org/draft-07/schema", + "$id": f"{SCHEMA_URL}{filename}", + "title": "Haystack Pipeline", + "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", + "type": "object", + "properties": { + "version": { + "title": "Version", + "description": "Version of the Haystack Pipeline file.", + "type": "string", + "oneOf": [{"const": version} for version in compatible_versions], + }, + "components": { + "title": "Components", + "description": "Component nodes and their configurations, to later be used in the pipelines section. Define here all the building blocks for the pipelines.", + "type": "array", + "items": {"anyOf": node_refs}, + "required": ["type", "name"], + "additionalProperties": True, # To allow for custom components in IDEs - will be set to False at validation time. + }, + "pipelines": { + "title": "Pipelines", + "description": "Multiple pipelines can be defined using the components from the same YAML file.", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"title": "Name", "description": "Name of the pipeline.", "type": "string"}, + "nodes": { + "title": "Nodes", + "description": "Nodes to be used by this particular pipeline", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.", + "type": "string", + }, + "inputs": { + "title": "Inputs", + "description": "Input parameters for this node.", + "type": "array", + "items": {"type": "string"}, + }, + }, + "required": ["name", "inputs"], + "additionalProperties": False, + }, + "required": ["name", "nodes"], + "additionalProperties": False, + }, + "additionalProperties": False, + }, + "additionalProperties": False, + }, + }, + }, + "required": ["version", "components", "pipelines"], + "additionalProperties": False, + "definitions": schema_definitions, + } + return pipeline_schema + + +def inject_definition_in_schema(node: BaseComponent, schema: Dict[str, Any]) -> Dict[str, Any]: + """ + Given a node and a schema in dict form, injects the JSON schema for the new component + so that pipelines containing such note can be validated against it. 
+def inject_definition_in_schema(node: BaseComponent, schema: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Given a node and a schema in dict form, injects the JSON schema for the new component
+    so that pipelines containing such node can be validated against it.
+
+    :returns: the updated schema
+    """
+    schema_definition, node_ref = create_schema_for_node(node)
+    schema["definitions"].update(schema_definition)
+    schema["properties"]["components"]["items"]["anyOf"].append(node_ref)
+    logger.info(f"Added definition for {getattr(node, '__name__')}")
+    return schema
+
+
+def natural_sort(list_to_sort: List[str]) -> List[str]:
+    """Sorts a list keeping numbers in the correct numerical order"""
+    convert = lambda text: int(text) if text.isdigit() else text.lower()
+    alphanumeric_key = lambda key: [convert(c) for c in re.split("([0-9]+)", key)]
+    return sorted(list_to_sort, key=alphanumeric_key)
+
+
+def load(path: Path) -> Dict[str, Any]:
+    """Shorthand for loading a JSON file"""
+    with open(path, "r") as json_file:
+        return json.load(json_file)
+
+
+def dump(data: Dict[str, Any], path: Path) -> None:
+    """Shorthand for dumping to a JSON file"""
+    with open(path, "w") as json_file:
+        json.dump(data, json_file, indent=2)
+
+
+def new_version_entry(version):
+    """
+    Returns a new entry for the version index JSON schema.
+    """
+    return {
+        "allOf": [
+            {"properties": {"version": {"oneOf": [{"const": version}]}}},
+            {
+                "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/"
+                f"haystack-pipeline-{version}.schema.json"
+            },
+        ]
+    }
+
+
+def update_json_schema(
+    update_index: bool,
+    destination_path: Path = JSON_SCHEMAS_PATH,
+    index_path: Path = JSON_SCHEMAS_PATH / "haystack-pipeline.schema.json",
+):
+    # Locate the latest schema's path
+    latest_schema_path = destination_path / Path(
+        natural_sort(os.listdir(destination_path))[-3]
+    )  # -1 is the index, -2 is the unstable schema
+    logger.info(f"Latest schema: {latest_schema_path}")
+    latest_schema = load(latest_schema_path)
+
+    # List the versions supported by the latest schema
+    supported_versions_block = deepcopy(latest_schema["properties"]["version"]["oneOf"])
+    supported_versions = [entry["const"].replace('"', "") for entry in supported_versions_block]
+    logger.info(f"Versions supported by this schema: {supported_versions}")
+
+    # Create a new schema with the same filename and versions embedded, to be identical to the latest one.
+    new_schema = get_json_schema(latest_schema_path.name, supported_versions)
+
+    # Check for backwards compatibility with difflib's SequenceMatcher
+    # (https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher)
+    # If the opcodes contain only "insert" and "equal", that means the new schema
+    # only added lines and did not remove anything from the previous schema.
+    # We decided that additions alone imply backwards compatibility.
+    # Any other opcode ("replace", "delete") implies that something has been removed
+    # in the new schema, which breaks backwards compatibility and means we should
+    # store a new, separate schema.
+    # People wishing to upgrade from the older schema version will have to change
+    # the version in their YAML to avoid failing validation.
+    latest_schema_string = json.dumps(latest_schema)
+    new_schema_string = json.dumps(new_schema)
+    matcher = SequenceMatcher(None, latest_schema_string, new_schema_string)
+    schema_diff = matcher.get_opcodes()
+    is_backwards_incompatible = any(opcode[0] not in ["insert", "equal"] for opcode in schema_diff)
+
+    unstable_versions_block = []
+
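To make the compatibility rule above concrete, here is a tiny standalone illustration of the opcode check (the strings are made up):

```python
from difflib import SequenceMatcher

old, new = '{"a": 1}', '{"a": 1, "b": 2}'
opcodes = [op[0] for op in SequenceMatcher(None, old, new).get_opcodes()]
print(opcodes)  # ['equal', 'insert', 'equal']
# Only "insert" and "equal" appear: an addition-only change,
# which the logic above treats as backwards compatible.
print(any(op not in ["insert", "equal"] for op in opcodes))  # False
```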
+    # If the two schemas are incompatible, we need a new file.
+    # Update the schema's filename and supported versions, then save it.
+    if is_backwards_incompatible:
+
+        # Print a quick diff to explain the differences
+        logger.info("The schemas are NOT backwards compatible. This is the list of INCOMPATIBLE changes only:")
+        for tag, i1, i2, j1, j2 in schema_diff:
+            if tag not in ["equal", "insert"]:
+                logger.info("{!r:>8} --> {!r}".format(latest_schema_string[i1:i2], new_schema_string[j1:j2]))
+
+        filename = f"haystack-pipeline-{haystack_version}.schema.json"
+        logger.info(f"Adding {filename} to the schema folder.")
+
+        # Let's check if the schema changed without a version change
+        if haystack_version in supported_versions and len(supported_versions) > 1:
+            logger.info(
+                f"Version {haystack_version} was supported by the latest schema "
+                f"(supported versions: {supported_versions}). "
+                f"Removing support for version {haystack_version} from it."
+            )
+
+            supported_versions_block = [
+                entry for entry in supported_versions_block if entry["const"].replace('"', "") != haystack_version
+            ]
+            latest_schema["properties"]["version"]["oneOf"] = supported_versions_block
+            dump(latest_schema, latest_schema_path)
+
+            # Update the JSON schema index too
+            if update_index:
+                index = load(index_path)
+                index["oneOf"][-1]["allOf"][0]["properties"]["version"]["oneOf"] = supported_versions_block
+                dump(index, index_path)
+
+        # Dump the new schema file
+        new_schema["$id"] = f"{SCHEMA_URL}{filename}"
+        unstable_versions_block = [{"const": haystack_version}]
+        new_schema["properties"]["version"]["oneOf"] = [{"const": haystack_version}]
+        dump(new_schema, destination_path / filename)
+
+        # Update the schema index with a whole new entry
+        if update_index:
+            index = load(index_path)
+            new_entry = new_version_entry(haystack_version)
+            if all(new_entry != entry for entry in index["oneOf"]):
+                index["oneOf"].append(new_version_entry(haystack_version))
+            dump(index, index_path)
+
+    # If the two schemas are compatible, there is no need to write a new one:
+    # just add the new version to the list of versions supported by
+    # the latest schema, if it's not there yet.
+    else:
+
+        # Print a quick diff to explain the differences
+        if not schema_diff or all(tag[0] == "equal" for tag in schema_diff):
+            logger.info("The schemas are identical, won't create a new file.")
+        else:
+            logger.info("The schemas are backwards compatible, overwriting the latest schema.")
+            logger.info("This is the list of changes:")
+            for tag, i1, i2, j1, j2 in schema_diff:
+                if tag != "equal":
+                    logger.info("{!r:>8} --> {!r}".format(latest_schema_string[i1:i2], new_schema_string[j1:j2]))
+
+        # Overwrite the latest schema (safe to do for additions)
+        dump(new_schema, latest_schema_path)
+
+        if haystack_version in supported_versions:
+            unstable_versions_block = supported_versions_block
+            logger.info(
+                f"Version {haystack_version} was already supported "
+                f"(supported versions: {supported_versions})"
+            )
+        else:
+            logger.info(
+                f"This version ({haystack_version}) was not listed "
+                f"(supported versions: {supported_versions}): "
+                "updating the supported versions list."
+            )
+
+            # Update the latest schema's list of supported versions
+            supported_versions_block.append({"const": haystack_version})
+            unstable_versions_block = supported_versions_block
+            latest_schema["properties"]["version"]["oneOf"] = supported_versions_block
+            dump(latest_schema, latest_schema_path)
+
+            # Update the JSON schema index too
+            if update_index:
+                index = load(index_path)
+                index["oneOf"][-1]["allOf"][0]["properties"]["version"]["oneOf"] = supported_versions_block
+                dump(index, index_path)
+
+    # Update the unstable schema (for tests and internal use).
+    unstable_filename = "haystack-pipeline-unstable.schema.json"
+    unstable_schema = deepcopy(new_schema)
+    unstable_schema["$id"] = f"{SCHEMA_URL}{unstable_filename}"
+    unstable_schema["properties"]["version"]["oneOf"] = [{"const": "unstable"}] + unstable_versions_block
+    dump(unstable_schema, destination_path / unstable_filename)
diff --git a/haystack/nodes/answer_generator/transformers.py b/haystack/nodes/answer_generator/transformers.py
index 874897f33..2580000ac 100644
--- a/haystack/nodes/answer_generator/transformers.py
+++ b/haystack/nodes/answer_generator/transformers.py
@@ -23,11 +23,6 @@ from haystack.nodes.retriever.dense import DensePassageRetriever
 logger = logging.getLogger(__name__)
 
 
-class RAGeneratorType(Enum):
-    TOKEN = (1,)
-    SEQUENCE = 2
-
-
 class RAGenerator(BaseGenerator):
     """
     Implementation of Facebook's Retrieval-Augmented Generator (https://arxiv.org/abs/2005.11401) based on
@@ -76,7 +71,7 @@ class RAGenerator(BaseGenerator):
         model_name_or_path: str = "facebook/rag-token-nq",
         model_version: Optional[str] = None,
         retriever: Optional[DensePassageRetriever] = None,
-        generator_type: RAGeneratorType = RAGeneratorType.TOKEN,
+        generator_type: str = "token",
         top_k: int = 2,
         max_length: int = 200,
         min_length: int = 2,
@@ -94,7 +89,7 @@ class RAGenerator(BaseGenerator):
                                    See https://huggingface.co/models for full list of available models.
         :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
         :param retriever: `DensePassageRetriever` used to embed passages for the docs passed to `predict()`. This is optional and is only needed if the docs you pass don't already contain embeddings in `Document.embedding`.
-        :param generator_type: Which RAG generator implementation to use? RAG-TOKEN or RAG-SEQUENCE
+        :param generator_type: Which RAG generator implementation to use ("token" or "sequence")
         :param top_k: Number of independently generated texts to return
         :param max_length: Maximum length of generated text
         :param min_length: Minimum length of generated text
@@ -103,21 +98,7 @@ class RAGenerator(BaseGenerator):
         :param prefix: The prefix used by the generator's tokenizer.
         :param use_gpu: Whether to use GPU. Falls back on CPU if no GPU is available.
         """
-
-        # save init parameters to enable export of component config as YAML
-        self.set_config(
-            model_name_or_path=model_name_or_path,
-            model_version=model_version,
-            retriever=retriever,
-            generator_type=generator_type,
-            top_k=top_k,
-            max_length=max_length,
-            min_length=min_length,
-            num_beams=num_beams,
-            embed_title=embed_title,
-            prefix=prefix,
-            use_gpu=use_gpu,
-        )
+        super().__init__()
 
         self.model_name_or_path = model_name_or_path
         self.max_length = max_length
@@ -138,7 +119,7 @@ class RAGenerator(BaseGenerator):
 
         self.tokenizer = RagTokenizer.from_pretrained(model_name_or_path)
 
-        if self.generator_type == RAGeneratorType.SEQUENCE:
+        if self.generator_type == "sequence":
             raise NotImplementedError("RagSequenceForGeneration is not implemented yet")
             # TODO: Enable when transformers have it. Refer https://github.com/huggingface/transformers/issues/7905
             # Also refer https://github.com/huggingface/transformers/issues/7829
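With the `RAGeneratorType` enum gone, the generator type is a plain string that the schema machinery can validate as a Python primitive. A usage sketch (illustrative only; instantiating it downloads the model):

```python
from haystack.nodes import RAGenerator

generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    generator_type="token",  # plain string instead of RAGeneratorType.TOKEN
    top_k=2,
)
# generator_type="sequence" would raise NotImplementedError, as the hunk above shows.
```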
""" - + super().__init__() self.model_name_or_path = model_name_or_path self.max_length = max_length self.min_length = min_length diff --git a/haystack/nodes/base.py b/haystack/nodes/base.py index b947c9beb..a647163d0 100644 --- a/haystack/nodes/base.py +++ b/haystack/nodes/base.py @@ -1,42 +1,84 @@ from __future__ import annotations -from typing import Any, Callable, Optional, Dict, List, Tuple, Optional +from typing import Any, Optional, Dict, List, Tuple, Optional -import io -from functools import wraps +import sys from copy import deepcopy -from abc import abstractmethod +from abc import ABC, abstractmethod +from functools import wraps import inspect import logging from haystack.schema import Document, MultiLabel +from haystack.errors import HaystackError logger = logging.getLogger(__name__) -class BaseComponent: +def exportable_to_yaml(init_func): + """ + Decorator that saves the init parameters of a node that later can + be used with exporting YAML configuration of a Pipeline. + """ + + @wraps(init_func) + def wrapper_exportable_to_yaml(self, *args, **kwargs): + + # Call the actuall __init__ function with all the arguments + init_func(self, *args, **kwargs) + + # Warn for unnamed input params - should be rare + if args: + logger.warning( + "Unnamed __init__ parameters will not be saved to YAML if Pipeline.save_to_yaml() is called!" + ) + # Create the configuration dictionary if it doesn't exist yet + if not self._component_config: + self._component_config = {"params": {}, "type": type(self).__name__} + + # Make sure it runs only on the __init__of the implementations, not in superclasses + if init_func.__qualname__ == f"{self.__class__.__name__}.{init_func.__name__}": + + # Store all the named input parameters in self._component_config + for k, v in kwargs.items(): + if isinstance(v, BaseComponent): + self._component_config["params"][k] = v._component_config + elif v is not None: + self._component_config["params"][k] = v + + return wrapper_exportable_to_yaml + + +class BaseComponent(ABC): """ A base class for implementing nodes in a Pipeline. """ outgoing_edges: int - subclasses: dict = {} - pipeline_config: dict = {} name: Optional[str] = None + _subclasses: dict = {} + _component_config: dict = {} + # __init_subclass__ is invoked when a subclass of BaseComponent is _imported_ + # (not instantiated). It works approximately as a metaclass. def __init_subclass__(cls, **kwargs): - """ - Automatically keeps track of all available subclasses. - Enables generic load() for all specific component implementations. - """ + super().__init_subclass__(**kwargs) - cls.subclasses[cls.__name__] = cls + + # Automatically registers all the init parameters in + # an instance attribute called `_component_config`, + # used to save this component to YAML. See exportable_to_yaml() + cls.__init__ = exportable_to_yaml(cls.__init__) + + # Keeps track of all available subclasses by name. + # Enables generic load() for all specific component implementations. 
+ cls._subclasses[cls.__name__] = cls @classmethod def get_subclass(cls, component_type: str): - if component_type not in cls.subclasses.keys(): - raise Exception(f"Haystack component with the name '{component_type}' does not exist.") - subclass = cls.subclasses[component_type] + if component_type not in cls._subclasses.keys(): + raise HaystackError(f"Haystack component with the name '{component_type}' does not exist.") + subclass = cls._subclasses[component_type] return subclass @classmethod @@ -165,18 +207,3 @@ class BaseComponent: output["params"] = params return output, stream - - def set_config(self, **kwargs): - """ - Save the init parameters of a component that later can be used with exporting - YAML configuration of a Pipeline. - - :param kwargs: all parameters passed to the __init__() of the Component. - """ - if not self.pipeline_config: - self.pipeline_config = {"params": {}, "type": type(self).__name__} - for k, v in kwargs.items(): - if isinstance(v, BaseComponent): - self.pipeline_config["params"][k] = v.pipeline_config - elif v is not None: - self.pipeline_config["params"][k] = v diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index d5a4ab747..470255ed9 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -58,6 +58,8 @@ class Crawler(BaseComponent): All URLs not matching at least one of the regular expressions will be dropped. :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content """ + super().__init__() + IN_COLAB = "google.colab" in sys.modules options = webdriver.chrome.options.Options() diff --git a/haystack/nodes/document_classifier/transformers.py b/haystack/nodes/document_classifier/transformers.py index 649f96401..738900564 100644 --- a/haystack/nodes/document_classifier/transformers.py +++ b/haystack/nodes/document_classifier/transformers.py @@ -101,18 +101,8 @@ class TransformersDocumentClassifier(BaseDocumentClassifier): :param batch_size: batch size to be processed at once :param classification_field: Name of Document's meta field to be used for classification. If left unset, Document.content is used by default. """ - # save init parameters to enable export of component config as YAML - self.set_config( - model_name_or_path=model_name_or_path, - model_version=model_version, - tokenizer=tokenizer, - use_gpu=use_gpu, - return_all_scores=return_all_scores, - labels=labels, - task=task, - batch_size=batch_size, - classification_field=classification_field, - ) + super().__init__() + if labels and task == "text-classification": logger.warning( f"Provided labels {labels} will be ignored for task text-classification. Set task to " diff --git a/haystack/nodes/evaluator/evaluator.py b/haystack/nodes/evaluator/evaluator.py index 75acf7104..26851ea77 100644 --- a/haystack/nodes/evaluator/evaluator.py +++ b/haystack/nodes/evaluator/evaluator.py @@ -41,6 +41,7 @@ class EvalDocuments(BaseComponent): "EvalDocuments node is deprecated and will be removed in a future version. " "Please use pipeline.eval() instead." ) + super().__init__() self.init_counts() self.no_answer_warning = False self.debug = debug @@ -205,6 +206,7 @@ class EvalAnswers(BaseComponent): "EvalAnswers node is deprecated and will be removed in a future version. " "Please use pipeline.eval() instead." 
) + super().__init__() self.log: List = [] self.debug = debug self.skip_incorrect_retrieval = skip_incorrect_retrieval diff --git a/haystack/nodes/extractor/entity.py b/haystack/nodes/extractor/entity.py index ba53f7e6a..3498f496e 100644 --- a/haystack/nodes/extractor/entity.py +++ b/haystack/nodes/extractor/entity.py @@ -21,8 +21,8 @@ class EntityExtractor(BaseComponent): outgoing_edges = 1 def __init__(self, model_name_or_path: str = "dslim/bert-base-NER", use_gpu: bool = True): + super().__init__() - self.set_config(model_name_or_path=model_name_or_path) self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) diff --git a/haystack/nodes/file_classifier/file_type.py b/haystack/nodes/file_classifier/file_type.py index 17741225a..78b3b87f6 100644 --- a/haystack/nodes/file_classifier/file_type.py +++ b/haystack/nodes/file_classifier/file_type.py @@ -30,7 +30,8 @@ class FileTypeClassifier(BaseComponent): if len(set(supported_types)) != len(supported_types): raise ValueError("supported_types can't contain duplicate values.") - self.set_config(supported_types=supported_types) + super().__init__() + self.supported_types = supported_types def _get_extension(self, file_paths: List[Path]) -> str: diff --git a/haystack/nodes/file_converter/azure.py b/haystack/nodes/file_converter/azure.py index 98e817352..a4a6db81b 100644 --- a/haystack/nodes/file_converter/azure.py +++ b/haystack/nodes/file_converter/azure.py @@ -57,17 +57,7 @@ class AzureConverter(BaseConverter): This parameter lets you choose, whether to merge multiple column header rows to a single row. """ - # save init parameters to enable export of component config as YAML - self.set_config( - endpoint=endpoint, - credential_key=credential_key, - model_id=model_id, - valid_languages=valid_languages, - save_json=save_json, - preceding_context_len=preceding_context_len, - following_context_len=following_context_len, - merge_multiple_column_headers=merge_multiple_column_headers, - ) + super().__init__(valid_languages=valid_languages) self.document_analysis_client = DocumentAnalysisClient( endpoint=endpoint, credential=AzureKeyCredential(credential_key) @@ -79,8 +69,6 @@ class AzureConverter(BaseConverter): self.following_context_len = following_context_len self.merge_multiple_column_headers = merge_multiple_column_headers - super().__init__(valid_languages=valid_languages) - def convert( self, file_path: Path, diff --git a/haystack/nodes/file_converter/base.py b/haystack/nodes/file_converter/base.py index 4d3b1b07a..d30791b93 100644 --- a/haystack/nodes/file_converter/base.py +++ b/haystack/nodes/file_converter/base.py @@ -27,9 +27,7 @@ class BaseConverter(BaseComponent): not one of the valid languages, then it might likely be encoding error resulting in garbled text. 
""" - - # save init parameters to enable export of component config as YAML - self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) + super().__init__() self.remove_numeric_tables = remove_numeric_tables self.valid_languages = valid_languages diff --git a/haystack/nodes/file_converter/image.py b/haystack/nodes/file_converter/image.py index 8e6531ed8..e9ddac105 100644 --- a/haystack/nodes/file_converter/image.py +++ b/haystack/nodes/file_converter/image.py @@ -35,9 +35,7 @@ class ImageToTextConverter(BaseConverter): # List of available languages print(pytesseract.get_languages(config='')) """ - - # save init parameters to enable export of component config as YAML - self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) + super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) verify_installation = subprocess.run(["tesseract -v"], shell=True) if verify_installation.returncode == 127: diff --git a/haystack/nodes/file_converter/parsr.py b/haystack/nodes/file_converter/parsr.py index a525ba91d..f6f6d3a5d 100644 --- a/haystack/nodes/file_converter/parsr.py +++ b/haystack/nodes/file_converter/parsr.py @@ -55,18 +55,7 @@ class ParsrConverter(BaseConverter): not one of the valid languages, then it might likely be encoding error resulting in garbled text. """ - # save init parameters to enable export of component config as YAML - self.set_config( - parsr_url=parsr_url, - extractor=extractor, - table_detection_mode=table_detection_mode, - preceding_context_len=preceding_context_len, - following_context_len=following_context_len, - remove_page_headers=remove_page_headers, - remove_page_footers=remove_page_footers, - remove_table_of_contents=remove_table_of_contents, - valid_languages=valid_languages, - ) + super().__init__(valid_languages=valid_languages) try: ping = requests.get(parsr_url) diff --git a/haystack/nodes/file_converter/pdf.py b/haystack/nodes/file_converter/pdf.py index 67e02d697..52446d33e 100644 --- a/haystack/nodes/file_converter/pdf.py +++ b/haystack/nodes/file_converter/pdf.py @@ -33,8 +33,7 @@ class PDFToTextConverter(BaseConverter): not one of the valid languages, then it might likely be encoding error resulting in garbled text. """ - # save init parameters to enable export of component config as YAML - self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) + super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) verify_installation = subprocess.run(["pdftotext -v"], shell=True) if verify_installation.returncode == 127: @@ -170,8 +169,6 @@ class PDFToTextOCRConverter(BaseConverter): # init image to text instance self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages) - # save init parameters to enable export of component config as YAML - self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) def convert( diff --git a/haystack/nodes/file_converter/tika.py b/haystack/nodes/file_converter/tika.py index 1a8867ddc..a0e015d9b 100644 --- a/haystack/nodes/file_converter/tika.py +++ b/haystack/nodes/file_converter/tika.py @@ -59,9 +59,7 @@ class TikaConverter(BaseConverter): not one of the valid languages, then it might likely be encoding error resulting in garbled text. 
""" - - # save init parameters to enable export of component config as YAML - self.set_config(tika_url=tika_url, remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) + super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) ping = requests.get(tika_url) if ping.status_code != 200: diff --git a/haystack/nodes/other/docs2answers.py b/haystack/nodes/other/docs2answers.py index 59b7341f6..7d537354d 100644 --- a/haystack/nodes/other/docs2answers.py +++ b/haystack/nodes/other/docs2answers.py @@ -13,9 +13,6 @@ class Docs2Answers(BaseComponent): outgoing_edges = 1 - def __init__(self): - self.set_config() - def run(self, query: str, documents: List[Document]): # type: ignore # conversion from Document -> Answer answers: List[Answer] = [] diff --git a/haystack/nodes/other/join_answers.py b/haystack/nodes/other/join_answers.py index e96652dfc..6a5c05a8c 100644 --- a/haystack/nodes/other/join_answers.py +++ b/haystack/nodes/other/join_answers.py @@ -26,8 +26,7 @@ class JoinAnswers(BaseComponent): weights is not None and join_mode == "concatenate" ), "Weights are not compatible with 'concatenate' join_mode" - # Save init parameters to enable export of component config as YAML - self.set_config(join_mode=join_mode, weights=weights, top_k_join=top_k_join) + super().__init__() self.join_mode = join_mode self.weights = [float(i) / sum(weights) for i in weights] if weights else None diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py index 96fed82b3..aa977591e 100644 --- a/haystack/nodes/other/join_docs.py +++ b/haystack/nodes/other/join_docs.py @@ -39,8 +39,7 @@ class JoinDocuments(BaseComponent): weights is not None and join_mode == "concatenate" ), "Weights are not compatible with 'concatenate' join_mode." - # save init parameters to enable export of component config as YAML - self.set_config(join_mode=join_mode, weights=weights, top_k_join=top_k_join) + super().__init__() self.join_mode = join_mode self.weights = [float(i) / sum(weights) for i in weights] if weights else None diff --git a/haystack/nodes/other/route_documents.py b/haystack/nodes/other/route_documents.py index f9ba7e3ed..81fd67c9f 100644 --- a/haystack/nodes/other/route_documents.py +++ b/haystack/nodes/other/route_documents.py @@ -32,8 +32,7 @@ class RouteDocuments(BaseComponent): "to group the documents to." 
) - # Save init parameters to enable export of component config as YAML - self.set_config(split_by=split_by, metadata_values=metadata_values) + super().__init__() self.split_by = split_by self.metadata_values = metadata_values diff --git a/haystack/nodes/preprocessor/base.py b/haystack/nodes/preprocessor/base.py index f2d73a37e..3ccf2fecc 100644 --- a/haystack/nodes/preprocessor/base.py +++ b/haystack/nodes/preprocessor/base.py @@ -1,11 +1,13 @@ from typing import List, Dict, Any, Optional, Union +from abc import abstractmethod from haystack.nodes.base import BaseComponent class BasePreProcessor(BaseComponent): outgoing_edges = 1 + @abstractmethod def process( self, documents: Union[dict, List[dict]], @@ -23,6 +25,7 @@ class BasePreProcessor(BaseComponent): """ raise NotImplementedError + @abstractmethod def clean( self, document: dict, @@ -33,6 +36,7 @@ class BasePreProcessor(BaseComponent): ) -> Dict[str, Any]: raise NotImplementedError + @abstractmethod def split( self, document: dict, diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index b774fcb63..8a7d39ce0 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -72,18 +72,7 @@ class PreProcessor(BasePreProcessor): the number of words will be <= split_length. :param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more. """ - - # save init parameters to enable export of component config as YAML - self.set_config( - clean_whitespace=clean_whitespace, - clean_header_footer=clean_header_footer, - clean_empty_lines=clean_empty_lines, - remove_substrings=remove_substrings, - split_by=split_by, - split_length=split_length, - split_overlap=split_overlap, - split_respect_sentence_boundary=split_respect_sentence_boundary, - ) + super().__init__() try: nltk.data.find("tokenizers/punkt") @@ -131,9 +120,9 @@ class PreProcessor(BasePreProcessor): ret = [] - if type(documents) == dict: + if isinstance(documents, dict): ret = self._process_single(document=documents, **kwargs) # type: ignore - elif type(documents) == list: + elif isinstance(documents, list): ret = self._process_batch(documents=list(documents), **kwargs) else: diff --git a/haystack/nodes/query_classifier/sklearn.py b/haystack/nodes/query_classifier/sklearn.py index bfa6b7a70..b17ba7a32 100644 --- a/haystack/nodes/query_classifier/sklearn.py +++ b/haystack/nodes/query_classifier/sklearn.py @@ -72,8 +72,7 @@ class SklearnQueryClassifier(BaseQueryClassifier): ): raise TypeError("model_name_or_path and vectorizer_name_or_path must either be of type Path or str") - # save init parameters to enable export of component config as YAML - self.set_config(model_name_or_path=model_name_or_path, vectorizer_name_or_path=vectorizer_name_or_path) + super().__init__() if isinstance(model_name_or_path, Path): file_url = urllib.request.pathname2url(r"{}".format(model_name_or_path)) diff --git a/haystack/nodes/query_classifier/transformers.py b/haystack/nodes/query_classifier/transformers.py index f6a0500b4..1bc30a184 100644 --- a/haystack/nodes/query_classifier/transformers.py +++ b/haystack/nodes/query_classifier/transformers.py @@ -63,8 +63,8 @@ class TransformersQueryClassifier(BaseQueryClassifier): :param model_name_or_path: Transformer based fine tuned mini bert model for query classification :param use_gpu: Whether to use GPU (if available). 
""" - # save init parameters to enable export of component config as YAML - self.set_config(model_name_or_path=model_name_or_path) + super().__init__() + self.devices, _ = initialize_device_settings(use_cuda=use_gpu) device = 0 if self.devices[0].type == "cuda" else -1 diff --git a/haystack/nodes/question_generator/question_generator.py b/haystack/nodes/question_generator/question_generator.py index ba3f411b5..22dbf1ede 100644 --- a/haystack/nodes/question_generator/question_generator.py +++ b/haystack/nodes/question_generator/question_generator.py @@ -47,21 +47,11 @@ class QuestionGenerator(BaseComponent): :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. :param use_gpu: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available. """ + super().__init__() self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) self.model.to(str(self.devices[0])) self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - self.set_config( - model_name_or_path=model_name_or_path, - model_version=model_version, - max_length=max_length, - num_beams=num_beams, - no_repeat_ngram_size=no_repeat_ngram_size, - length_penalty=length_penalty, - early_stopping=early_stopping, - split_length=split_length, - split_overlap=split_overlap, - ) self.num_beams = num_beams self.max_length = max_length self.no_repeat_ngram_size = no_repeat_ngram_size diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 54a672e7c..67defe5fe 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -52,9 +52,7 @@ class SentenceTransformersRanker(BaseRanker): :param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). """ - - # save init parameters to enable export of component config as YAML - self.set_config(model_name_or_path=model_name_or_path, model_version=model_version, top_k=top_k) + super().__init__() self.top_k = top_k diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index 78f90309d..ccecc772f 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -113,30 +113,8 @@ class FARMReader(BaseReader): the local token will be used, which must be previously created via `transformer-cli login`. 
Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained """ + super().__init__() - # save init parameters to enable export of component config as YAML - self.set_config( - model_name_or_path=model_name_or_path, - model_version=model_version, - context_window_size=context_window_size, - batch_size=batch_size, - use_gpu=use_gpu, - no_ans_boost=no_ans_boost, - return_no_answer=return_no_answer, - top_k=top_k, - top_k_per_candidate=top_k_per_candidate, - top_k_per_sample=top_k_per_sample, - num_processes=num_processes, - max_seq_len=max_seq_len, - doc_stride=doc_stride, - progress_bar=progress_bar, - duplicate_filtering=duplicate_filtering, - proxies=proxies, - local_files_only=local_files_only, - force_download=force_download, - use_confidence_scores=use_confidence_scores, - **kwargs, - ) self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) self.return_no_answers = return_no_answer @@ -175,6 +153,7 @@ class FARMReader(BaseReader): self.use_gpu = use_gpu self.progress_bar = progress_bar self.use_confidence_scores = use_confidence_scores + self.model_name_or_path = model_name_or_path # Used in distillation, see DistillationDataSilo._get_checksum() def _training_procedure( self, diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index d5076e061..4d343b1a5 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -95,17 +95,7 @@ class TableReader(BaseReader): query + table exceed max_seq_len, the table will be truncated by removing rows until the input size fits the model. """ - # Save init parameters to enable export of component config as YAML - self.set_config( - model_name_or_path=model_name_or_path, - model_version=model_version, - tokenizer=tokenizer, - use_gpu=use_gpu, - top_k=top_k, - top_k_per_candidate=top_k_per_candidate, - return_no_answer=return_no_answer, - max_seq_len=max_seq_len, - ) + super().__init__() self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) config = TapasConfig.from_pretrained(model_name_or_path) @@ -480,18 +470,7 @@ class RCIReader(BaseReader): query + table exceed max_seq_len, the table will be truncated by removing rows until the input size fits the model. 
""" - # Save init parameters to enable export of component config as YAML - self.set_config( - row_model_name_or_path=row_model_name_or_path, - column_model_name_or_path=column_model_name_or_path, - row_model_version=row_model_version, - column_model_version=column_model_version, - row_tokenizer=row_tokenizer, - column_tokenizer=column_tokenizer, - use_gpu=use_gpu, - top_k=top_k, - max_seq_len=max_seq_len, - ) + super().__init__() self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) self.row_model = AutoModelForSequenceClassification.from_pretrained( diff --git a/haystack/nodes/reader/transformers.py b/haystack/nodes/reader/transformers.py index 5051648ab..baf51cb09 100644 --- a/haystack/nodes/reader/transformers.py +++ b/haystack/nodes/reader/transformers.py @@ -60,19 +60,7 @@ class TransformersReader(BaseReader): :param max_seq_len: max sequence length of one input text for the model :param doc_stride: length of striding window for splitting long texts (used if len(text) > max_seq_len) """ - # save init parameters to enable export of component config as YAML - self.set_config( - model_name_or_path=model_name_or_path, - model_version=model_version, - tokenizer=tokenizer, - context_window_size=context_window_size, - use_gpu=use_gpu, - top_k=top_k, - doc_stride=doc_stride, - top_k_per_candidate=top_k_per_candidate, - return_no_answers=return_no_answers, - max_seq_len=max_seq_len, - ) + super().__init__() self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) device = 0 if self.devices[0].type == "cuda" else -1 diff --git a/haystack/nodes/retriever/base.py b/haystack/nodes/retriever/base.py index b000adac7..4e1f9828e 100644 --- a/haystack/nodes/retriever/base.py +++ b/haystack/nodes/retriever/base.py @@ -8,6 +8,7 @@ from tqdm import tqdm from copy import deepcopy from haystack.schema import Document, MultiLabel +from haystack.errors import HaystackError from haystack.nodes.base import BaseComponent from haystack.document_stores.base import BaseDocumentStore, BaseKnowledgeGraph @@ -240,6 +241,10 @@ class BaseRetriever(BaseComponent): headers: Optional[Dict[str, str]] = None, ): if root_node == "Query": + if not query: + raise HaystackError( + "Must provide a 'query' parameter for retrievers in pipelines where Query is the root node." + ) self.query_count += 1 run_query_timed = self.timing(self.run_query, "query_time") output, stream = run_query_timed(query=query, filters=filters, top_k=top_k, index=index, headers=headers) diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 238adc172..5d56245fc 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -108,25 +108,7 @@ class DensePassageRetriever(BaseRetriever): the local token will be used, which must be previously created via `transformer-cli login`. 
Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained """ - # save init parameters to enable export of component config as YAML - self.set_config( - document_store=document_store, - query_embedding_model=query_embedding_model, - passage_embedding_model=passage_embedding_model, - model_version=model_version, - max_seq_len_query=max_seq_len_query, - max_seq_len_passage=max_seq_len_passage, - top_k=top_k, - use_gpu=use_gpu, - batch_size=batch_size, - embed_title=embed_title, - use_fast_tokenizers=use_fast_tokenizers, - infer_tokenizer_classes=infer_tokenizer_classes, - similarity_function=similarity_function, - progress_bar=progress_bar, - devices=devices, - use_auth_token=use_auth_token, - ) + super().__init__() if devices is not None: self.devices = devices @@ -606,27 +588,7 @@ class TableTextRetriever(BaseRetriever): the local token will be used, which must be previously created via `transformer-cli login`. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained """ - # save init parameters to enable export of component config as YAML - self.set_config( - document_store=document_store, - query_embedding_model=query_embedding_model, - passage_embedding_model=passage_embedding_model, - table_embedding_model=table_embedding_model, - model_version=model_version, - max_seq_len_query=max_seq_len_query, - max_seq_len_passage=max_seq_len_passage, - max_seq_len_table=max_seq_len_table, - top_k=top_k, - use_gpu=use_gpu, - batch_size=batch_size, - embed_meta_fields=embed_meta_fields, - use_fast_tokenizers=use_fast_tokenizers, - infer_tokenizer_classes=infer_tokenizer_classes, - similarity_function=similarity_function, - progress_bar=progress_bar, - devices=devices, - use_auth_token=use_auth_token, - ) + super().__init__() if devices is not None: self.devices = devices @@ -1145,19 +1107,7 @@ class EmbeddingRetriever(BaseRetriever): the local token will be used, which must be previously created via `transformer-cli login`. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained """ - # save init parameters to enable export of component config as YAML - self.set_config( - document_store=document_store, - embedding_model=embedding_model, - model_version=model_version, - use_gpu=use_gpu, - batch_size=batch_size, - max_seq_len=max_seq_len, - model_format=model_format, - pooling_strategy=pooling_strategy, - emb_extraction_layer=emb_extraction_layer, - top_k=top_k, - ) + super().__init__() if devices is not None: self.devices = devices diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index cd25e7196..cba1ac613 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) class ElasticsearchRetriever(BaseRetriever): - def __init__(self, document_store: KeywordDocumentStore, top_k: int = 10, custom_query: str = None): + def __init__(self, document_store: KeywordDocumentStore, top_k: int = 10, custom_query: Optional[str] = None): """ :param document_store: an instance of an ElasticsearchDocumentStore to retrieve documents from. :param custom_query: query string as per Elasticsearch DSL with a mandatory query placeholder(query). 
@@ -87,8 +87,7 @@ class ElasticsearchRetriever(BaseRetriever): :param top_k: How many documents to return per query. """ - # save init parameters to enable export of component config as YAML - self.set_config(document_store=document_store, top_k=top_k, custom_query=custom_query) + super().__init__() self.document_store: KeywordDocumentStore = document_store self.top_k = top_k self.custom_query = custom_query @@ -176,8 +175,7 @@ class TfidfRetriever(BaseRetriever): :param top_k: How many documents to return per query. :param auto_fit: Whether to automatically update tf-idf matrix by calling fit() after new documents have been added """ - # save init parameters to enable export of component config as YAML - self.set_config(document_store=document_store, top_k=top_k, auto_fit=auto_fit) + super().__init__() self.vectorizer = TfidfVectorizer( lowercase=True, stop_words=None, token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1) diff --git a/haystack/nodes/retriever/text2sparql.py b/haystack/nodes/retriever/text2sparql.py index 7c75523b7..9db92e320 100644 --- a/haystack/nodes/retriever/text2sparql.py +++ b/haystack/nodes/retriever/text2sparql.py @@ -23,8 +23,7 @@ class Text2SparqlRetriever(BaseGraphRetriever): :param model_name_or_path: Name of or path to a pre-trained BartForConditionalGeneration model. :param top_k: How many SPARQL queries to generate per text query. """ - # save init parameters to enable export of component config as YAML - self.set_config(knowledge_graph=knowledge_graph, model_name_or_path=model_name_or_path, top_k=top_k) + super().__init__() self.knowledge_graph = knowledge_graph # TODO We should extend this to any seq2seq models and use the AutoModel class diff --git a/haystack/nodes/summarizer/transformers.py b/haystack/nodes/summarizer/transformers.py index c8fe4ef83..d5976b9c1 100644 --- a/haystack/nodes/summarizer/transformers.py +++ b/haystack/nodes/summarizer/transformers.py @@ -82,18 +82,7 @@ class TransformersSummarizer(BaseSummarizer): be summarized. Important: The summary will depend on the order of the supplied documents! """ - # save init parameters to enable export of component config as YAML - self.set_config( - model_name_or_path=model_name_or_path, - model_version=model_version, - tokenizer=tokenizer, - max_length=max_length, - min_length=min_length, - use_gpu=use_gpu, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - separator_for_single_summary=separator_for_single_summary, - generate_single_summary=generate_single_summary, - ) + super().__init__() self.devices, _ = initialize_device_settings(use_cuda=use_gpu) device = 0 if self.devices[0].type == "cuda" else -1 diff --git a/haystack/nodes/translator/transformers.py b/haystack/nodes/translator/transformers.py index 94fce890c..82eeaa559 100644 --- a/haystack/nodes/translator/transformers.py +++ b/haystack/nodes/translator/transformers.py @@ -60,14 +60,7 @@ class TransformersTranslator(BaseTranslator): :param clean_up_tokenization_spaces: Whether or not to clean up the tokenization spaces. (default True) :param use_gpu: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available. 
""" - - # save init parameters to enable export of component config as YAML - self.set_config( - model_name_or_path=model_name_or_path, - tokenizer_name=tokenizer_name, - max_seq_len=max_seq_len, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - ) + super().__init__() self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) self.max_seq_len = max_seq_len diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index dfc27639e..c10e3e9a2 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -1,4 +1,5 @@ from __future__ import annotations +from os import pipe from typing import Dict, List, Optional, Any import copy @@ -10,7 +11,12 @@ import numpy as np import pandas as pd from pathlib import Path import networkx as nx +from abc import ABC, abstractmethod +from jsonschema import Draft7Validator +from jsonschema.exceptions import ValidationError +from jsonschema import _utils as jsonschema_utils from pandas.core.frame import DataFrame +from transformers import pipelines import yaml from networkx import DiGraph from networkx.drawing.nx_agraph import to_agraph @@ -19,7 +25,14 @@ from haystack.nodes.evaluator.evaluator import ( calculate_f1_str_multi, semantic_answer_similarity, ) -from haystack.pipelines.config import get_component_definitions, get_pipeline_definition, read_pipeline_config_from_yaml +from haystack.pipelines.config import ( + JSON_SCHEMAS_PATH, + get_component_definitions, + get_pipeline_definition, + read_pipeline_config_from_yaml, + validate_config_strings, + validate_config, +) from haystack.pipelines.utils import generate_code, print_eval_report from haystack.utils import DeepsetCloud @@ -32,6 +45,7 @@ except: from haystack import __version__ from haystack.schema import EvaluationResult, MultiLabel, Document +from haystack.errors import PipelineError, PipelineConfigError from haystack.nodes.base import BaseComponent from haystack.nodes.retriever.base import BaseRetriever from haystack.document_stores.base import BaseDocumentStore @@ -55,22 +69,24 @@ class RootNode(BaseComponent): return {}, "output_1" -class BasePipeline: +class BasePipeline(ABC): """ Base class for pipelines, providing the most basic methods to load and save them in different ways. See also the `Pipeline` class for the actual pipeline logic. """ + @abstractmethod def run(self, **kwargs): - raise NotImplementedError + raise NotImplementedError("This is an abstract method. Use Pipeline or RayPipeline instead.") + @abstractmethod def get_config(self, return_defaults: bool = False) -> dict: """ - Returns a configuration for the Pipeline that can be used with `BasePipeline.load_from_config()`. + Returns a configuration for the Pipeline that can be used with `Pipeline.load_from_config()`. :param return_defaults: whether to output parameters that have the default values. """ - raise NotImplementedError + raise NotImplementedError("This is an abstract method. Use Pipeline or RayPipeline instead.") def to_code( self, pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = False @@ -121,6 +137,7 @@ class BasePipeline: logger.error("Could not create notebook cell. Make sure you're running in a notebook environment.") @classmethod + @abstractmethod def load_from_config( cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True ): @@ -169,26 +186,10 @@ class BasePipeline: variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. 
Note that an `_` sign must be used to specify nested hierarchical properties. """ - pipeline_definition = get_pipeline_definition(pipeline_config=pipeline_config, pipeline_name=pipeline_name) - if pipeline_definition["type"] == "Pipeline": - return Pipeline.load_from_config( - pipeline_config=pipeline_config, - pipeline_name=pipeline_name, - overwrite_with_env_variables=overwrite_with_env_variables, - ) - elif pipeline_definition["type"] == "RayPipeline": - return RayPipeline.load_from_config( - pipeline_config=pipeline_config, - pipeline_name=pipeline_name, - overwrite_with_env_variables=overwrite_with_env_variables, - ) - else: - raise KeyError( - f"Pipeline Type '{pipeline_definition['type']}' is not a valid. The available types are" - f"'Pipeline' and 'RayPipeline'." - ) + raise NotImplementedError("This is an abstract method. Use Pipeline or RayPipeline instead.") @classmethod + @abstractmethod def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True): """ Load Pipeline from a YAML file defining the individual components and how they're tied together to form @@ -235,21 +236,7 @@ class BasePipeline: variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an `_` sign must be used to specify nested hierarchical properties. """ - - pipeline_config = read_pipeline_config_from_yaml(path) - if pipeline_config["version"] != __version__: - logger.warning( - f"YAML version ({pipeline_config['version']}) does not match with Haystack version ({__version__}). " - "Issues may occur during loading. " - "To fix this warning, save again this pipeline with the current Haystack version using Pipeline.save_to_yaml(), " - "check out our migration guide at https://haystack.deepset.ai/overview/migration " - f"or downgrade to haystack version {__version__}." - ) - return cls.load_from_config( - pipeline_config=pipeline_config, - pipeline_name=pipeline_name, - overwrite_with_env_variables=overwrite_with_env_variables, - ) + raise NotImplementedError("This is an abstract method. Use Pipeline or RayPipeline instead.") @classmethod def load_from_deepset_cloud( @@ -302,6 +289,7 @@ class BasePipeline: ) component_config["params"] = params + del pipeline_config["name"] # Would fail validation otherwise pipeline = cls.load_from_config( pipeline_config=pipeline_config, pipeline_name=pipeline_name, @@ -501,43 +489,64 @@ class Pipeline(BasePipeline): In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output must be specified explicitly as "QueryClassifier.output_2". """ + valid_root_nodes = ["Query", "File"] if self.root_node is None: root_node = inputs[0] - if root_node in ["Query", "File"]: + if root_node in valid_root_nodes: self.root_node = root_node self.graph.add_node(root_node, component=RootNode()) else: - raise KeyError(f"Root node '{root_node}' is invalid. Available options are 'Query' and 'File'.") + raise PipelineConfigError( + f"Root node '{root_node}' is invalid. Available options are {valid_root_nodes}." + ) component.name = name self.graph.add_node(name, component=component, inputs=inputs) if len(self.graph.nodes) == 2: # first node added; connect with Root - assert len(inputs) == 1 and inputs[0].split(".")[0] == self.root_node, ( - f"The '{name}' node can only input from {self.root_node}. 
" - f"Set the 'inputs' parameter to ['{self.root_node}']" - ) + if not len(inputs) == 1 and inputs[0].split(".")[0] == self.root_node: + raise PipelineConfigError( + f"The '{name}' node can only input from {self.root_node}. " + f"Set the 'inputs' parameter to ['{self.root_node}']" + ) self.graph.add_edge(self.root_node, name, label="output_1") return - for i in inputs: - if "." in i: - [input_node_name, input_edge_name] = i.split(".") - assert "output_" in input_edge_name, f"'{input_edge_name}' is not a valid edge name." + for input_node in inputs: + if "." in input_node: + [input_node_name, input_edge_name] = input_node.split(".") + if not "output_" in input_edge_name: + raise PipelineConfigError(f"'{input_edge_name}' is not a valid edge name.") + outgoing_edges_input_node = self.graph.nodes[input_node_name]["component"].outgoing_edges - assert int(input_edge_name.split("_")[1]) <= outgoing_edges_input_node, ( - f"Cannot connect '{input_edge_name}' from '{input_node_name}' as it only has " - f"{outgoing_edges_input_node} outgoing edge(s)." - ) + if not int(input_edge_name.split("_")[1]) <= outgoing_edges_input_node: + raise PipelineConfigError( + f"Cannot connect '{input_edge_name}' from '{input_node_name}' as it only has " + f"{outgoing_edges_input_node} outgoing edge(s)." + ) else: - outgoing_edges_input_node = self.graph.nodes[i]["component"].outgoing_edges - assert outgoing_edges_input_node == 1, ( - f"Adding an edge from {i} to {name} is ambiguous as {i} has {outgoing_edges_input_node} edges. " - f"Please specify the output explicitly." - ) - input_node_name = i + try: + outgoing_edges_input_node = self.graph.nodes[input_node]["component"].outgoing_edges + if not outgoing_edges_input_node == 1: + raise PipelineConfigError( + f"Adding an edge from {input_node} to {name} is ambiguous as {input_node} has {outgoing_edges_input_node} edges. " + f"Please specify the output explicitly." + ) + + except KeyError as e: + raise PipelineConfigError( + f"Cannot find node '{input_node}'. Make sure you're not using more " + f"than one root node ({valid_root_nodes}) in the same pipeline and that a node " + f"called '{input_node}' is defined." + ) from e + + input_node_name = input_node input_edge_name = "output_1" self.graph.add_edge(input_node_name, name, label=input_edge_name) + if not nx.is_directed_acyclic_graph(self.graph): + self.graph.remove_node(name) + raise PipelineConfigError(f"Cannot add '{name}': it will create a loop in the pipeline.") + def get_node(self, name: str) -> Optional[BaseComponent]: """ Get a node from the Pipeline. @@ -968,6 +977,61 @@ class Pipeline(BasePipeline): graphviz.layout("dot") graphviz.draw(path) + @classmethod + def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True): + """ + Load Pipeline from a YAML file defining the individual components and how they're tied together to form + a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must + be passed. 
     def get_node(self, name: str) -> Optional[BaseComponent]:
         """
         Get a node from the Pipeline.
@@ -968,6 +977,61 @@ class Pipeline(BasePipeline):
         graphviz.layout("dot")
         graphviz.draw(path)
 
+    @classmethod
+    def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True):
+        """
+        Load Pipeline from a YAML file defining the individual components and how they're tied together to form
+        a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must
+        be passed.
+
+        Here's a sample configuration:
+
+        ```yaml
+        |   version: '1.2.0'
+        |
+        |   components:    # define all the building-blocks for Pipeline
+        |   - name: MyReader       # custom-name for the component; helpful for visualization & debugging
+        |     type: FARMReader    # Haystack Class name for the component
+        |     params:
+        |       no_ans_boost: -10
+        |       model_name_or_path: deepset/roberta-base-squad2
+        |   - name: MyESRetriever
+        |     type: ElasticsearchRetriever
+        |     params:
+        |       document_store: MyDocumentStore    # params can reference other components defined in the YAML
+        |       custom_query: null
+        |   - name: MyDocumentStore
+        |     type: ElasticsearchDocumentStore
+        |     params:
+        |       index: haystack_test
+        |
+        |   pipelines:    # multiple Pipelines can be defined using the components from above
+        |   - name: my_query_pipeline    # a simple extractive-qa Pipeline
+        |     nodes:
+        |     - name: MyESRetriever
+        |       inputs: [Query]
+        |     - name: MyReader
+        |       inputs: [MyESRetriever]
+        ```
+
+        Note that a warning is printed if the version in the YAML does not match the Haystack version.
+        If the pipeline loads correctly regardless, save the pipeline again with `Pipeline.save_to_yaml()` to remove the warning.
+
+        :param path: path of the YAML file.
+        :param pipeline_name: if the YAML contains multiple pipelines, the pipeline_name to load must be set.
+        :param overwrite_with_env_variables: Overwrite the YAML configuration with environment variables. For example,
+                                             to change index name param for an ElasticsearchDocumentStore, an env
+                                             variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
+                                             `_` sign must be used to specify nested hierarchical properties.
+        """
+
+        pipeline_config = read_pipeline_config_from_yaml(path)
+        return cls.load_from_config(
+            pipeline_config=pipeline_config,
+            pipeline_name=pipeline_name,
+            overwrite_with_env_variables=overwrite_with_env_variables,
+        )
+
     @classmethod
     def load_from_config(
         cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True
@@ -1017,6 +1081,8 @@ class Pipeline(BasePipeline):
                                              variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
                                              `_` sign must be used to specify nested hierarchical properties.
         """
+        validate_config(pipeline_config)
+
         pipeline_definition = get_pipeline_definition(pipeline_config=pipeline_config, pipeline_name=pipeline_name)
         component_definitions = get_component_definitions(
             pipeline_config=pipeline_config, overwrite_with_env_variables=overwrite_with_env_variables
@@ -1063,8 +1129,16 @@ class Pipeline(BasePipeline):
                 instance = BaseComponent.load_from_args(component_type=component_type, **component_params)
             components[name] = instance
+
+        except KeyError as ke:
+            raise PipelineConfigError(
+                f"Failed loading pipeline component '{name}': "
+                "seems like the component does not exist. Did you spell its name correctly?"
+            ) from ke
         except Exception as e:
-            raise Exception(f"Failed loading pipeline component '{name}': {e}")
+            raise PipelineConfigError(
+                f"Failed loading pipeline component '{name}'. See the stacktrace above for more information."
+            ) from e
         return instance
 
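Since `load_from_config` now calls `validate_config` up front, a malformed configuration fails before any component is instantiated. A rough sketch of both paths; the component choices are illustrative, and the version pin assumes a release that the bundled schema accepts:

```python
from haystack.errors import PipelineConfigError
from haystack.pipelines import Pipeline

config = {
    "version": "1.2.0",
    "components": [
        {"name": "MyDocumentStore", "type": "InMemoryDocumentStore"},
        {"name": "MyRetriever", "type": "TfidfRetriever", "params": {"document_store": "MyDocumentStore"}},
    ],
    "pipelines": [{"name": "query", "nodes": [{"name": "MyRetriever", "inputs": ["Query"]}]}],
}
pipeline = Pipeline.load_from_config(config, pipeline_name="query")

# 'inputs' is now a required property of each node, so this fails
# schema validation before any component is loaded:
config["pipelines"][0]["nodes"][0].pop("inputs")
try:
    Pipeline.load_from_config(config, pipeline_name="query")
except PipelineConfigError as e:
    print(e)
```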
""" pipeline_name = ROOT_NODE_TO_PIPELINE_NAME[self.root_node.lower()] - pipelines: dict = {pipeline_name: {"name": pipeline_name, "type": self.__class__.__name__, "nodes": []}} + pipelines: dict = {pipeline_name: {"name": pipeline_name, "nodes": []}} components = {} for node in self.graph.nodes: if node == self.root_node: continue component_instance = self.graph.nodes.get(node)["component"] - component_type = component_instance.pipeline_config["type"] - component_params = component_instance.pipeline_config["params"] + + component_type = component_instance._component_config["type"] + component_params = component_instance._component_config["params"] components[node] = {"name": node, "type": component_type, "params": {}} component_parent_classes = inspect.getmro(type(component_instance)) @@ -1112,7 +1187,7 @@ class Pipeline(BasePipeline): sub_component = param_value sub_component_type_name = sub_component["type"] sub_component_signature = inspect.signature( - BaseComponent.subclasses[sub_component_type_name] + BaseComponent._subclasses[sub_component_type_name] ).parameters sub_component_params = { k: v @@ -1313,14 +1388,6 @@ class RayPipeline(Pipeline): :param address: The IP address for the Ray cluster. If set to None, a local Ray instance is started. """ pipeline_config = read_pipeline_config_from_yaml(path) - if pipeline_config["version"] != __version__: - logger.warning( - f"YAML version ({pipeline_config['version']}) does not match with Haystack version ({__version__}). " - "Issues may occur during loading. " - "To fix this warning, save again this pipeline with the current Haystack version using Pipeline.save_to_yaml(), " - "check out our migration guide at https://haystack.deepset.ai/overview/migration " - f"or downgrade to haystack version {__version__}." - ) return RayPipeline.load_from_config( pipeline_config=pipeline_config, pipeline_name=pipeline_name, diff --git a/haystack/pipelines/config.py b/haystack/pipelines/config.py index be46dfa71..bba20f30f 100644 --- a/haystack/pipelines/config.py +++ b/haystack/pipelines/config.py @@ -1,17 +1,26 @@ +from typing import Any, Dict, List, Optional + +import re +import os import copy import logging -import os from pathlib import Path -import re -from typing import Any, Dict, List, Optional from networkx import DiGraph import yaml +import json +from jsonschema.validators import Draft7Validator +from jsonschema.exceptions import ValidationError + +from haystack import __version__ +from haystack.nodes.base import BaseComponent +from haystack.nodes._json_schema import inject_definition_in_schema, JSON_SCHEMAS_PATH +from haystack.errors import PipelineConfigError, PipelineSchemaError, HaystackError logger = logging.getLogger(__name__) -VALID_CODE_GEN_INPUT_REGEX = re.compile(r"^[-a-zA-Z0-9_/.:]+$") +VALID_INPUT_REGEX = re.compile(r"^[-a-zA-Z0-9_/.:]+$") def get_pipeline_definition(pipeline_config: Dict[str, Any], pipeline_name: Optional[str] = None) -> Dict[str, Any]: @@ -26,11 +35,14 @@ def get_pipeline_definition(pipeline_config: Dict[str, Any], pipeline_name: Opti if len(pipeline_config["pipelines"]) == 1: pipeline_definition = pipeline_config["pipelines"][0] else: - raise Exception("The YAML contains multiple pipelines. Please specify the pipeline name to load.") + raise PipelineConfigError("The YAML contains multiple pipelines. 
@@ -26,11 +35,14 @@ def get_pipeline_definition(pipeline_config: Dict[str, Any], pipeline_name: Opti
         if len(pipeline_config["pipelines"]) == 1:
             pipeline_definition = pipeline_config["pipelines"][0]
         else:
-            raise Exception("The YAML contains multiple pipelines. Please specify the pipeline name to load.")
+            raise PipelineConfigError("The YAML contains multiple pipelines. Please specify the pipeline name to load.")
     else:
         pipelines_in_definitions = list(filter(lambda p: p["name"] == pipeline_name, pipeline_config["pipelines"]))
         if not pipelines_in_definitions:
-            raise KeyError(f"Cannot find any pipeline with name '{pipeline_name}' declared in the YAML file.")
+            raise PipelineConfigError(
+                f"Cannot find any pipeline with name '{pipeline_name}' declared in the YAML file. "
+                f"Existing pipelines: {[p['name'] for p in pipeline_config['pipelines']]}"
+            )
         pipeline_definition = pipelines_in_definitions[0]
 
     return pipeline_definition
 
@@ -62,20 +74,29 @@ def read_pipeline_config_from_yaml(path: Path):
         return yaml.safe_load(stream)
 
-def validate_config(pipeline_config: Dict[str, Any]):
-    for component in pipeline_config["components"]:
-        _validate_user_input(component["name"])
-        _validate_user_input(component["type"])
-        for k, v in component.get("params", {}).items():
-            _validate_user_input(k)
-            _validate_user_input(v)
-    for pipeline in pipeline_config["pipelines"]:
-        _validate_user_input(pipeline["name"])
-        _validate_user_input(pipeline["type"])
-        for node in pipeline["nodes"]:
-            _validate_user_input(node["name"])
-            for input in node["inputs"]:
-                _validate_user_input(input)
+def validate_config_strings(pipeline_config: Any):
+    """
+    Ensures that strings used in the pipelines configuration
+    contain only alphanumeric characters and basic punctuation.
+    """
+    try:
+        if isinstance(pipeline_config, dict):
+            for key, value in pipeline_config.items():
+                validate_config_strings(key)
+                validate_config_strings(value)
+
+        elif isinstance(pipeline_config, list):
+            for value in pipeline_config:
+                validate_config_strings(value)
+
+        else:
+            if not VALID_INPUT_REGEX.match(str(pipeline_config)):
+                raise PipelineConfigError(
+                    f"'{pipeline_config}' is not a valid variable name or value. "
+                    "Use alphanumeric characters or dash, underscore and colon only."
+                )
+    except RecursionError as e:
+        raise PipelineConfigError("The given pipeline configuration is recursive, can't validate it.") from e
 
 
 def build_component_dependency_graph(
@@ -111,9 +132,96 @@ def build_component_dependency_graph(
     return graph
 
-def _validate_user_input(input: str):
-    if isinstance(input, str) and not VALID_CODE_GEN_INPUT_REGEX.match(input):
-        raise ValueError(f"'{input}' is not a valid config variable name. Use word characters only.")
+def validate_yaml(path: Path):
+    """
+    Validates the given YAML file using the autogenerated JSON schema.
+
+    :param path: path of the YAML file to validate
+    :return: None if validation is successful
+    :raise: `PipelineConfigError` in case of issues.
+    """
+    pipeline_config = read_pipeline_config_from_yaml(path)
+    validate_config(pipeline_config=pipeline_config)
+    logger.debug(f"'{path}' contains valid Haystack pipelines.")
 
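A short sketch of how the two new validation entry points can be called directly (the YAML path is hypothetical):

```python
from pathlib import Path

from haystack.errors import PipelineConfigError
from haystack.pipelines.config import validate_config_strings, validate_yaml

# validate_config_strings walks dicts and lists recursively:
validate_config_strings({"name": "MyReader", "params": {"top_k": 10}})  # passes silently

try:
    validate_config_strings({"name": "MyReader; rm -rf /"})
except PipelineConfigError as e:
    print(e)  # "'MyReader; rm -rf /' is not a valid variable name or value. ..."

# validate_yaml reads the file and runs the full JSON-schema validation:
validate_yaml(Path("pipelines.yaml"))  # hypothetical file
```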
+ """ + validate_config_strings(pipeline_config) + + with open(JSON_SCHEMAS_PATH / f"haystack-pipeline-unstable.schema.json", "r") as schema_file: + schema = json.load(schema_file) + + compatible_versions = [version["const"].replace('"', "") for version in schema["properties"]["version"]["oneOf"]] + loaded_custom_nodes = [] + + while True: + + try: + Draft7Validator(schema).validate(instance=pipeline_config) + + if pipeline_config["version"] == "unstable": + logging.warning( + "You seem to be using the 'unstable' version of the schema to validate " + "your pipeline configuration.\n" + "This is NOT RECOMMENDED in production environments, as pipelines " + "might manage to load and then misbehave without warnings.\n" + f"Please pin your configurations to '{__version__}' to ensure stability." + ) + + elif pipeline_config["version"] not in compatible_versions: + raise PipelineConfigError( + f"Cannot load pipeline configuration of version {pipeline_config['version']} " + f"in Haystack version {__version__} " + f"(only versions {compatible_versions} are compatible with this Haystack release).\n" + "Please check out the release notes (https://github.com/deepset-ai/haystack/releases/latest), " + "the documentation (https://haystack.deepset.ai/components/pipelines#yaml-file-definitions) " + "and fix your configuration accordingly." + ) + break + + except ValidationError as validation: + + # If the validation comes from an unknown node, try to find it and retry: + if list(validation.relative_schema_path) == ["properties", "components", "items", "anyOf"]: + if validation.instance["type"] not in loaded_custom_nodes: + + logger.info( + f"Missing definition for node of type {validation.instance['type']}. Looking into local classes..." + ) + missing_component = BaseComponent.get_subclass(validation.instance["type"]) + schema = inject_definition_in_schema(node=missing_component, schema=schema) + loaded_custom_nodes.append(validation.instance["type"]) + continue + + # A node with the given name was imported, but something else is wrong with it. + # Probably it references unknown classes in its init parameters. + raise PipelineSchemaError( + f"Cannot process node of type {validation.instance['type']}. Make sure its __init__ function " + "does not reference external classes, but uses only Python primitive types." + ) from validation + + # Format the error to make it as clear as possible + error_path = [ + i + for i in list(validation.relative_schema_path)[:-1] + if repr(i) != "'items'" and repr(i) != "'properties'" + ] + error_location = "->".join(repr(index) for index in error_path) + if error_location: + error_location = f"The error is in {error_location}." + + raise PipelineConfigError( + f"Validation failed. {validation.message}. {error_location} " "See the stacktrace for more information." 
+ ) from validation + + logging.debug(f"Pipeline configuration is valid.") def _overwrite_with_env_variables(component_definition: Dict[str, Any]): diff --git a/json-schemas/haystack-pipeline-1.1.0.schema.json b/json-schemas/haystack-pipeline-1.0.0.schema.json similarity index 71% rename from json-schemas/haystack-pipeline-1.1.0.schema.json rename to json-schemas/haystack-pipeline-1.0.0.schema.json index 088561ddc..6524ed657 100644 --- a/json-schemas/haystack-pipeline-1.1.0.schema.json +++ b/json-schemas/haystack-pipeline-1.0.0.schema.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline-1.1.0.schema.json", + "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline-1.0.0.schema.json", "title": "Haystack Pipeline", "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", "type": "object", @@ -9,7 +9,17 @@ "title": "Version", "description": "Version of the Haystack Pipeline file.", "type": "string", - "const": "1.1.0" + "oneOf": [ + { + "const": "1.0.0" + }, + { + "const": "1.1.0" + }, + { + "const": "1.2.0" + } + ] }, "components": { "title": "Components", @@ -17,6 +27,36 @@ "type": "array", "items": { "anyOf": [ + { + "$ref": "#/definitions/DeepsetCloudDocumentStoreComponent" + }, + { + "$ref": "#/definitions/ElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/FAISSDocumentStoreComponent" + }, + { + "$ref": "#/definitions/GraphDBKnowledgeGraphComponent" + }, + { + "$ref": "#/definitions/InMemoryDocumentStoreComponent" + }, + { + "$ref": "#/definitions/Milvus2DocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenDistroElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenSearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/SQLDocumentStoreComponent" + }, + { + "$ref": "#/definitions/WeaviateDocumentStoreComponent" + }, { "$ref": "#/definitions/AzureConverterComponent" }, @@ -59,6 +99,9 @@ { "$ref": "#/definitions/ImageToTextConverterComponent" }, + { + "$ref": "#/definitions/JoinAnswersComponent" + }, { "$ref": "#/definitions/JoinDocumentsComponent" }, @@ -86,6 +129,9 @@ { "$ref": "#/definitions/RCIReaderComponent" }, + { + "$ref": "#/definitions/RouteDocumentsComponent" + }, { "$ref": "#/definitions/SentenceTransformersRankerComponent" }, @@ -134,7 +180,7 @@ "type", "name" ], - "additionalProperties": false + "additionalProperties": true }, "pipelines": { "title": "Pipelines", @@ -169,6 +215,10 @@ } } }, + "required": [ + "name", + "inputs" + ], "additionalProperties": false }, "required": [ @@ -176,7 +226,8 @@ "nodes" ], "additionalProperties": false - } + }, + "additionalProperties": false }, "additionalProperties": false } @@ -189,6 +240,929 @@ ], "additionalProperties": false, "definitions": { + "DeepsetCloudDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DeepsetCloudDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "workspace": { + "title": "Workspace", + "default": "default", + "type": "string" + }, + "index": { + "title": "Index", + "default": "default", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "api_endpoint": { + "title": "Api Endpoint", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "scheme": { + "title": "Scheme", + "default": "http", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": true, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + 
}, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product" + }, + "timeout": { + "title": "Timeout", + "default": 30 + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FAISSDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FAISSDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///faiss_document_store.db", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "faiss_index_factory_str": { + "title": "Faiss Index Factory Str", + "default": "Flat", + "type": "string" + }, + "faiss_index": { + "title": "Faiss Index", + "type": "string", + "default": null + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "faiss_index_path": { + "title": "Faiss Index Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "faiss_config_path": { + "title": "Faiss Config Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "GraphDBKnowledgeGraphComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "GraphDBKnowledgeGraph" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": 7200, + "type": "integer" + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "index": { + "title": "Index", + "type": "string" + }, + "prefixes": { + "title": "Prefixes", + "default": "", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "InMemoryDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "InMemoryDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "scoring_batch_size": { + "title": "Scoring Batch Size", + "default": 500000, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Milvus2DocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Milvus2DocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": "19530", + "type": "string" + }, + "connection_pool": { + "title": "Connection Pool", + "default": "SingletonThread", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "index_file_size": { + "title": "Index File Size", + "default": 1024, + "type": "integer" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "IVF_FLAT", + "type": "string" + }, + "index_param": { + "title": "Index Param", + "type": "object" + }, + "search_param": { + "title": "Search Param", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "id_field": { + "title": "Id Field", + "default": "id", + "type": "string" + }, + "custom_fields": { + "title": "Custom Fields", + "type": "array", + "items": {} + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + }, + "consistency_level": { + "title": "Consistency Level", + "default": 0, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenDistroElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenDistroElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "https://admin:admin@localhost:9200/" + }, + "similarity": { + "title": "Similarity", + "default": "cosine" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenSearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenSearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "verify_certs": { + "title": "Verify Certs", + "default": false + }, + "scheme": { + "title": "Scheme", + "default": "https" + }, + "username": { + "title": "Username", + "default": "admin" + }, + "password": { + "title": "Password", + "default": "admin" + }, + "port": { + "title": "Port", + "default": 9200 + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SQLDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SQLDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "url": { + "title": "Url", + "default": "sqlite://", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "check_same_thread": { + "title": "Check Same Thread", + "default": false, + "type": "boolean" + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "WeaviateDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "WeaviateDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "http://localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 8080, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "timeout_config": { + "title": "Timeout Config", + "default": [ + 5, + 15 + ], + "type": "array", + "items": {} + }, + "username": { + "title": "Username", + "type": "string" + }, + "password": { + "title": "Password", + "type": "string" + }, + "index": { + "title": "Index", + "default": "Document", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "hnsw", + "type": "string" + }, + "custom_schema": { + "title": "Custom Schema", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "AzureConverterComponent": { "type": "object", "properties": { @@ -1093,6 +2067,51 @@ ], "additionalProperties": false }, + "JoinAnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "JoinAnswers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "join_mode": { + "title": "Join Mode", + "default": "concatenate", + "type": "string" + }, + "weights": { + "title": "Weights", + "type": "array", + "items": { + "type": "number" + } + }, + "top_k_join": { + "title": "Top K Join", + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "JoinDocumentsComponent": { "type": "object", "properties": { @@ -1307,12 +2326,12 @@ }, "preceding_context_len": { "title": "Preceding Context Len", - "default": 1, + "default": 3, "type": "integer" }, "following_context_len": { "title": "Following Context Len", - "default": 1, + "default": 3, "type": "integer" }, "remove_page_headers": { @@ -1584,6 +2603,16 @@ ], "additionalProperties": false }, + "RAGeneratorType": { + "title": "RAGeneratorType", + "description": "An enumeration.", + "enum": [ + [ + 1 + ], + 2 + ] + }, "RCIReaderComponent": { "type": "object", "properties": { @@ -1654,6 +2683,47 @@ ], "additionalProperties": false }, + "RouteDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RouteDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "split_by": { + "title": "Split By", + "default": "content_type", + "type": "string" + }, + "metadata_values": { + "title": "Metadata Values", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "SentenceTransformersRankerComponent": { "type": "object", "properties": { @@ -2583,16 +3653,6 @@ "name" ], "additionalProperties": false - }, - "RAGeneratorType": { - "title": "RAGeneratorType", - "description": "An enumeration.", - "enum": [ - [ - 1 - ], - 2 - ] } } } \ No newline at end of file diff --git a/json-schemas/haystack-pipeline-1.2.0.schema.json b/json-schemas/haystack-pipeline-1.2.0.schema.json deleted file mode 100644 index eb36978cd..000000000 --- a/json-schemas/haystack-pipeline-1.2.0.schema.json +++ /dev/null @@ -1,2590 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline-1.2.0.schema.json", - "title": "Haystack Pipeline", - "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", - "type": "object", - "properties": { - "version": { - "title": "Version", - "description": "Version of the Haystack Pipeline file.", - "type": "string", - "const": "1.2.0" - }, - "components": { - "title": "Components", - "description": "Component nodes and their configurations, to later be used in the pipelines section. 
Define here all the building blocks for the pipelines.", - "type": "array", - "items": { - "anyOf": [ - { - "$ref": "#/definitions/AzureConverterComponent" - }, - { - "$ref": "#/definitions/CrawlerComponent" - }, - { - "$ref": "#/definitions/DensePassageRetrieverComponent" - }, - { - "$ref": "#/definitions/Docs2AnswersComponent" - }, - { - "$ref": "#/definitions/DocxToTextConverterComponent" - }, - { - "$ref": "#/definitions/ElasticsearchFilterOnlyRetrieverComponent" - }, - { - "$ref": "#/definitions/ElasticsearchRetrieverComponent" - }, - { - "$ref": "#/definitions/EmbeddingRetrieverComponent" - }, - { - "$ref": "#/definitions/EntityExtractorComponent" - }, - { - "$ref": "#/definitions/EvalAnswersComponent" - }, - { - "$ref": "#/definitions/EvalDocumentsComponent" - }, - { - "$ref": "#/definitions/FARMReaderComponent" - }, - { - "$ref": "#/definitions/FileTypeClassifierComponent" - }, - { - "$ref": "#/definitions/ImageToTextConverterComponent" - }, - { - "$ref": "#/definitions/JoinDocumentsComponent" - }, - { - "$ref": "#/definitions/MarkdownConverterComponent" - }, - { - "$ref": "#/definitions/PDFToTextConverterComponent" - }, - { - "$ref": "#/definitions/PDFToTextOCRConverterComponent" - }, - { - "$ref": "#/definitions/ParsrConverterComponent" - }, - { - "$ref": "#/definitions/PreProcessorComponent" - }, - { - "$ref": "#/definitions/QuestionGeneratorComponent" - }, - { - "$ref": "#/definitions/RAGeneratorComponent" - }, - { - "$ref": "#/definitions/RCIReaderComponent" - }, - { - "$ref": "#/definitions/SentenceTransformersRankerComponent" - }, - { - "$ref": "#/definitions/Seq2SeqGeneratorComponent" - }, - { - "$ref": "#/definitions/SklearnQueryClassifierComponent" - }, - { - "$ref": "#/definitions/TableReaderComponent" - }, - { - "$ref": "#/definitions/TableTextRetrieverComponent" - }, - { - "$ref": "#/definitions/Text2SparqlRetrieverComponent" - }, - { - "$ref": "#/definitions/TextConverterComponent" - }, - { - "$ref": "#/definitions/TfidfRetrieverComponent" - }, - { - "$ref": "#/definitions/TikaConverterComponent" - }, - { - "$ref": "#/definitions/TransformersDocumentClassifierComponent" - }, - { - "$ref": "#/definitions/TransformersQueryClassifierComponent" - }, - { - "$ref": "#/definitions/TransformersReaderComponent" - }, - { - "$ref": "#/definitions/TransformersSummarizerComponent" - }, - { - "$ref": "#/definitions/TransformersTranslatorComponent" - } - ] - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "pipelines": { - "title": "Pipelines", - "description": "Multiple pipelines can be defined using the components from the same YAML file.", - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Name of the pipeline.", - "type": "string" - }, - "nodes": { - "title": "Nodes", - "description": "Nodes to be used by this particular pipeline", - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "The name of this particular node in the pipeline. 
This should be one of the names from the components defined in the same file.", - "type": "string" - }, - "inputs": { - "title": "Inputs", - "description": "Input parameters for this node.", - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false - }, - "required": [ - "name", - "nodes" - ], - "additionalProperties": false - } - }, - "additionalProperties": false - } - } - }, - "required": [ - "version", - "components", - "pipelines" - ], - "additionalProperties": false, - "definitions": { - "AzureConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "AzureConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "endpoint": { - "title": "Endpoint", - "type": "string" - }, - "credential_key": { - "title": "Credential Key", - "type": "string" - }, - "model_id": { - "title": "Model Id", - "default": "prebuilt-document", - "type": "string" - }, - "valid_languages": { - "title": "Valid Languages", - "type": "array", - "items": { - "type": "string" - } - }, - "save_json": { - "title": "Save Json", - "default": false, - "type": "boolean" - }, - "preceding_context_len": { - "title": "Preceding Context Len", - "default": 3, - "type": "integer" - }, - "following_context_len": { - "title": "Following Context Len", - "default": 3, - "type": "integer" - }, - "merge_multiple_column_headers": { - "title": "Merge Multiple Column Headers", - "default": true, - "type": "boolean" - } - }, - "required": [ - "endpoint", - "credential_key" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "CrawlerComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "Crawler" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "output_dir": { - "title": "Output Dir", - "type": "string" - }, - "urls": { - "title": "Urls", - "type": "array", - "items": { - "type": "string" - } - }, - "crawler_depth": { - "title": "Crawler Depth", - "default": 1, - "type": "integer" - }, - "filter_urls": { - "title": "Filter Urls", - "type": "array", - "items": {} - }, - "overwrite_existing_files": { - "title": "Overwrite Existing Files", - "default": true - } - }, - "required": [ - "output_dir" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "DensePassageRetrieverComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "DensePassageRetriever" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "document_store": { - "title": "Document Store", - "type": "string" - }, - "query_embedding_model": { - "title": "Query Embedding Model", - "default": "facebook/dpr-question_encoder-single-nq-base", - "anyOf": [ - { - "type": "string", - "format": "path" - }, - { - "type": "string" - } - ] - }, - "passage_embedding_model": { - "title": "Passage Embedding Model", - "default": "facebook/dpr-ctx_encoder-single-nq-base", - "anyOf": [ - { - "type": "string", - "format": "path" - }, - { - "type": "string" - } - ] - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "max_seq_len_query": { - "title": "Max Seq Len Query", - "default": 64, - "type": "integer" - }, - "max_seq_len_passage": { - "title": "Max Seq Len Passage", - "default": 256, - "type": "integer" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "batch_size": { - "title": "Batch Size", - "default": 16, - "type": "integer" - }, - "embed_title": { - "title": "Embed Title", - "default": true, - "type": "boolean" - }, - "use_fast_tokenizers": { - "title": "Use Fast Tokenizers", - "default": true, - "type": "boolean" - }, - "infer_tokenizer_classes": { - "title": "Infer Tokenizer Classes", - "default": false, - "type": "boolean" - }, - "similarity_function": { - "title": "Similarity Function", - "default": "dot_product", - "type": "string" - }, - "global_loss_buffer_size": { - "title": "Global Loss Buffer Size", - "default": 150000, - "type": "integer" - }, - "progress_bar": { - "title": "Progress Bar", - "default": true, - "type": "boolean" - }, - "devices": { - "title": "Devices", - "type": "array", - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "string" - }, - { - "type": "string" - } - ] - } - }, - "use_auth_token": { - "title": "Use Auth Token", - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "string" - } - ] - } - }, - "required": [ - "document_store" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "Docs2AnswersComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "Docs2Answers" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": {}, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "DocxToTextConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "DocxToTextConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "remove_numeric_tables": { - "title": "Remove Numeric Tables", - "default": false, - "type": "boolean" - }, - "valid_languages": { - "title": "Valid Languages", - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "ElasticsearchFilterOnlyRetrieverComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "ElasticsearchFilterOnlyRetriever" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "document_store": { - "title": "Document Store", - "type": "string" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "custom_query": { - "title": "Custom Query", - "type": "string" - } - }, - "required": [ - "document_store" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "ElasticsearchRetrieverComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "ElasticsearchRetriever" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "document_store": { - "title": "Document Store", - "type": "string" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "custom_query": { - "title": "Custom Query", - "type": "string" - } - }, - "required": [ - "document_store" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "EmbeddingRetrieverComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "EmbeddingRetriever" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "document_store": { - "title": "Document Store", - "type": "string" - }, - "embedding_model": { - "title": "Embedding Model", - "type": "string" - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "batch_size": { - "title": "Batch Size", - "default": 32, - "type": "integer" - }, - "max_seq_len": { - "title": "Max Seq Len", - "default": 512, - "type": "integer" - }, - "model_format": { - "title": "Model Format", - "default": "farm", - "type": "string" - }, - "pooling_strategy": { - "title": "Pooling Strategy", - "default": "reduce_mean", - "type": "string" - }, - "emb_extraction_layer": { - "title": "Emb Extraction Layer", - "default": -1, - "type": "integer" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "progress_bar": { - "title": "Progress Bar", - "default": true, - "type": "boolean" - }, - "devices": { - "title": "Devices", - "type": "array", - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "string" - }, - { - "type": "string" - } - ] - } - }, - "use_auth_token": { - "title": "Use Auth Token", - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "string" - } - ] - } - }, - "required": [ - "document_store", - "embedding_model" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "EntityExtractorComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "EntityExtractor" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "dslim/bert-base-NER", - "type": "string" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "EvalAnswersComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "EvalAnswers" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "skip_incorrect_retrieval": { - "title": "Skip Incorrect Retrieval", - "default": true, - "type": "boolean" - }, - "open_domain": { - "title": "Open Domain", - "default": true, - "type": "boolean" - }, - "sas_model": { - "title": "Sas Model", - "type": "string" - }, - "debug": { - "title": "Debug", - "default": false, - "type": "boolean" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "EvalDocumentsComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "EvalDocuments" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "debug": { - "title": "Debug", - "default": false, - "type": "boolean" - }, - "open_domain": { - "title": "Open Domain", - "default": true, - "type": "boolean" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "FARMReaderComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "FARMReader" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "type": "string" - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "context_window_size": { - "title": "Context Window Size", - "default": 150, - "type": "integer" - }, - "batch_size": { - "title": "Batch Size", - "default": 50, - "type": "integer" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "no_ans_boost": { - "title": "No Ans Boost", - "default": 0.0, - "type": "number" - }, - "return_no_answer": { - "title": "Return No Answer", - "default": false, - "type": "boolean" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "top_k_per_candidate": { - "title": "Top K Per Candidate", - "default": 3, - "type": "integer" - }, - "top_k_per_sample": { - "title": "Top K Per Sample", - "default": 1, - "type": "integer" - }, - "num_processes": { - "title": "Num Processes", - "type": "integer" - }, - "max_seq_len": { - "title": "Max Seq Len", - "default": 256, - "type": "integer" - }, - "doc_stride": { - "title": "Doc Stride", - "default": 128, - "type": "integer" - }, - "progress_bar": { - "title": "Progress Bar", - "default": true, - "type": "boolean" - }, - "duplicate_filtering": { - "title": "Duplicate Filtering", - "default": 0, - "type": "integer" - }, - "use_confidence_scores": { - "title": "Use Confidence Scores", - "default": true, - "type": "boolean" - }, - "proxies": { - "title": "Proxies", - "type": "object", - "additionalProperties": { - "type": "string" - } - }, - "local_files_only": { - "title": "Local Files Only", - "default": false - }, - "force_download": { - "title": "Force Download", - "default": false - }, - "use_auth_token": { - "title": "Use Auth Token", - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "string" - } - ] - } - }, - "required": [ - "model_name_or_path" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "FileTypeClassifierComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "FileTypeClassifier" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "supported_types": { - "title": "Supported Types", - "default": [ - "txt", - "pdf", - "md", - "docx", - "html" - ], - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "ImageToTextConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "ImageToTextConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "remove_numeric_tables": { - "title": "Remove Numeric Tables", - "default": false, - "type": "boolean" - }, - "valid_languages": { - "title": "Valid Languages", - "default": [ - "eng" - ], - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "JoinDocumentsComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "JoinDocuments" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "join_mode": { - "title": "Join Mode", - "default": "concatenate", - "type": "string" - }, - "weights": { - "title": "Weights", - "type": "array", - "items": { - "type": "number" - } - }, - "top_k_join": { - "title": "Top K Join", - "type": "integer" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "MarkdownConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "MarkdownConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "remove_numeric_tables": { - "title": "Remove Numeric Tables", - "default": false, - "type": "boolean" - }, - "valid_languages": { - "title": "Valid Languages", - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "PDFToTextConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "PDFToTextConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "remove_numeric_tables": { - "title": "Remove Numeric Tables", - "default": false, - "type": "boolean" - }, - "valid_languages": { - "title": "Valid Languages", - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." 
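Each of these *Component definitions validates one entry in the components list of a pipeline YAML file. As a minimal sketch (the component name and parameter values are illustrative), a PDFToTextConverter entry conforming to the schema above could look like:

    components:
      - name: PdfConverter
        type: PDFToTextConverter
        params:
          remove_numeric_tables: true
          valid_languages: [de, en]

Because params carries "additionalProperties": false, a misspelled parameter is rejected at validation time rather than silently ignored.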
- } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "PDFToTextOCRConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "PDFToTextOCRConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "remove_numeric_tables": { - "title": "Remove Numeric Tables", - "default": false, - "type": "boolean" - }, - "valid_languages": { - "title": "Valid Languages", - "default": [ - "eng" - ], - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "ParsrConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "ParsrConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "parsr_url": { - "title": "Parsr Url", - "default": "http://localhost:3001", - "type": "string" - }, - "extractor": { - "title": "Extractor", - "default": "pdfminer", - "enum": [ - "pdfminer", - "pdfjs" - ], - "type": "string" - }, - "table_detection_mode": { - "title": "Table Detection Mode", - "default": "lattice", - "enum": [ - "lattice", - "stream" - ], - "type": "string" - }, - "preceding_context_len": { - "title": "Preceding Context Len", - "default": 1, - "type": "integer" - }, - "following_context_len": { - "title": "Following Context Len", - "default": 1, - "type": "integer" - }, - "remove_page_headers": { - "title": "Remove Page Headers", - "default": false, - "type": "boolean" - }, - "remove_page_footers": { - "title": "Remove Page Footers", - "default": false, - "type": "boolean" - }, - "remove_table_of_contents": { - "title": "Remove Table Of Contents", - "default": false, - "type": "boolean" - }, - "valid_languages": { - "title": "Valid Languages", - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "PreProcessorComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "PreProcessor" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "clean_whitespace": { - "title": "Clean Whitespace", - "default": true, - "type": "boolean" - }, - "clean_header_footer": { - "title": "Clean Header Footer", - "default": false, - "type": "boolean" - }, - "clean_empty_lines": { - "title": "Clean Empty Lines", - "default": true, - "type": "boolean" - }, - "split_by": { - "title": "Split By", - "default": "word", - "type": "string" - }, - "split_length": { - "title": "Split Length", - "default": 200, - "type": "integer" - }, - "split_overlap": { - "title": "Split Overlap", - "default": 0, - "type": "integer" - }, - "split_respect_sentence_boundary": { - "title": "Split Respect Sentence Boundary", - "default": true, - "type": "boolean" - }, - "language": { - "title": "Language", - "default": "en", - "type": "string" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "QuestionGeneratorComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "QuestionGenerator" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "valhalla/t5-base-e2e-qg" - }, - "model_version": { - "title": "Model Version" - }, - "num_beams": { - "title": "Num Beams", - "default": 4 - }, - "max_length": { - "title": "Max Length", - "default": 256 - }, - "no_repeat_ngram_size": { - "title": "No Repeat Ngram Size", - "default": 3 - }, - "length_penalty": { - "title": "Length Penalty", - "default": 1.5 - }, - "early_stopping": { - "title": "Early Stopping", - "default": true - }, - "split_length": { - "title": "Split Length", - "default": 50 - }, - "split_overlap": { - "title": "Split Overlap", - "default": 10 - }, - "use_gpu": { - "title": "Use Gpu", - "default": true - }, - "prompt": { - "title": "Prompt", - "default": "generate questions:" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "RAGeneratorComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "RAGenerator" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "facebook/rag-token-nq", - "type": "string" - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "retriever": { - "title": "Retriever", - "type": "string", - "default": null - }, - "generator_type": { - "default": [ - 1 - ], - "allOf": [ - { - "$ref": "#/definitions/RAGeneratorType" - } - ] - }, - "top_k": { - "title": "Top K", - "default": 2, - "type": "integer" - }, - "max_length": { - "title": "Max Length", - "default": 200, - "type": "integer" - }, - "min_length": { - "title": "Min Length", - "default": 2, - "type": "integer" - }, - "num_beams": { - "title": "Num Beams", - "default": 2, - "type": "integer" - }, - "embed_title": { - "title": "Embed Title", - "default": true, - "type": "boolean" - }, - "prefix": { - "title": "Prefix", - "type": "string" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "RCIReaderComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "RCIReader" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "row_model_name_or_path": { - "title": "Row Model Name Or Path", - "default": "michaelrglass/albert-base-rci-wikisql-row", - "type": "string" - }, - "column_model_name_or_path": { - "title": "Column Model Name Or Path", - "default": "michaelrglass/albert-base-rci-wikisql-col", - "type": "string" - }, - "row_model_version": { - "title": "Row Model Version", - "type": "string" - }, - "column_model_version": { - "title": "Column Model Version", - "type": "string" - }, - "row_tokenizer": { - "title": "Row Tokenizer", - "type": "string" - }, - "column_tokenizer": { - "title": "Column Tokenizer", - "type": "string" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "max_seq_len": { - "title": "Max Seq Len", - "default": 256, - "type": "integer" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "SentenceTransformersRankerComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "SentenceTransformersRanker" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "anyOf": [ - { - "type": "string" - }, - { - "type": "string", - "format": "path" - } - ] - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "devices": { - "title": "Devices", - "type": "array", - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "string" - }, - { - "type": "string" - } - ] - } - } - }, - "required": [ - "model_name_or_path" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "Seq2SeqGeneratorComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "Seq2SeqGenerator" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "type": "string" - }, - "input_converter": { - "title": "Input Converter", - "type": "string", - "default": null - }, - "top_k": { - "title": "Top K", - "default": 1, - "type": "integer" - }, - "max_length": { - "title": "Max Length", - "default": 200, - "type": "integer" - }, - "min_length": { - "title": "Min Length", - "default": 2, - "type": "integer" - }, - "num_beams": { - "title": "Num Beams", - "default": 8, - "type": "integer" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - } - }, - "required": [ - "model_name_or_path" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "SklearnQueryClassifierComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "SklearnQueryClassifier" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", - "anyOf": [ - { - "type": "string" - }, - {} - ] - }, - "vectorizer_name_or_path": { - "title": "Vectorizer Name Or Path", - "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", - "anyOf": [ - { - "type": "string" - }, - {} - ] - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." 
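Where a schema lists entries under the required array inside params, as Seq2SeqGenerator and SentenceTransformersRanker above do for model_name_or_path, a YAML entry must set that parameter explicitly or validation fails. A sketch with an illustrative model identifier:

    components:
      - name: Ranker
        type: SentenceTransformersRanker
        params:
          model_name_or_path: cross-encoder/ms-marco-MiniLM-L-12-v2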
- } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TableReaderComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TableReader" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "google/tapas-base-finetuned-wtq", - "type": "string" - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "tokenizer": { - "title": "Tokenizer", - "type": "string" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "top_k_per_candidate": { - "title": "Top K Per Candidate", - "default": 3, - "type": "integer" - }, - "return_no_answer": { - "title": "Return No Answer", - "default": false, - "type": "boolean" - }, - "max_seq_len": { - "title": "Max Seq Len", - "default": 256, - "type": "integer" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TableTextRetrieverComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TableTextRetriever" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "document_store": { - "title": "Document Store", - "type": "string" - }, - "query_embedding_model": { - "title": "Query Embedding Model", - "default": "deepset/bert-small-mm_retrieval-question_encoder", - "anyOf": [ - { - "type": "string", - "format": "path" - }, - { - "type": "string" - } - ] - }, - "passage_embedding_model": { - "title": "Passage Embedding Model", - "default": "deepset/bert-small-mm_retrieval-passage_encoder", - "anyOf": [ - { - "type": "string", - "format": "path" - }, - { - "type": "string" - } - ] - }, - "table_embedding_model": { - "title": "Table Embedding Model", - "default": "deepset/bert-small-mm_retrieval-table_encoder", - "anyOf": [ - { - "type": "string", - "format": "path" - }, - { - "type": "string" - } - ] - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "max_seq_len_query": { - "title": "Max Seq Len Query", - "default": 64, - "type": "integer" - }, - "max_seq_len_passage": { - "title": "Max Seq Len Passage", - "default": 256, - "type": "integer" - }, - "max_seq_len_table": { - "title": "Max Seq Len Table", - "default": 256, - "type": "integer" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "batch_size": { - "title": "Batch Size", - "default": 16, - "type": "integer" - }, - "embed_meta_fields": { - "title": "Embed Meta Fields", - "default": [ - "name", - "section_title", - "caption" - ], - "type": "array", - "items": { - "type": "string" - } - }, - "use_fast_tokenizers": { - "title": "Use Fast Tokenizers", - "default": true, - "type": "boolean" - }, - 
"infer_tokenizer_classes": { - "title": "Infer Tokenizer Classes", - "default": false, - "type": "boolean" - }, - "similarity_function": { - "title": "Similarity Function", - "default": "dot_product", - "type": "string" - }, - "global_loss_buffer_size": { - "title": "Global Loss Buffer Size", - "default": 150000, - "type": "integer" - }, - "progress_bar": { - "title": "Progress Bar", - "default": true, - "type": "boolean" - }, - "devices": { - "title": "Devices", - "type": "array", - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "string" - }, - { - "type": "string" - } - ] - } - }, - "use_auth_token": { - "title": "Use Auth Token", - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "string" - } - ] - } - }, - "required": [ - "document_store" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "Text2SparqlRetrieverComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "Text2SparqlRetriever" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "knowledge_graph": { - "title": "Knowledge Graph" - }, - "model_name_or_path": { - "title": "Model Name Or Path" - }, - "top_k": { - "title": "Top K", - "default": 1, - "type": "integer" - } - }, - "required": [ - "knowledge_graph", - "model_name_or_path" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TextConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TextConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "remove_numeric_tables": { - "title": "Remove Numeric Tables", - "default": false, - "type": "boolean" - }, - "valid_languages": { - "title": "Valid Languages", - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TfidfRetrieverComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TfidfRetriever" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "document_store": { - "title": "Document Store", - "type": "string" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "auto_fit": { - "title": "Auto Fit", - "default": true - } - }, - "required": [ - "document_store" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TikaConverterComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TikaConverter" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "tika_url": { - "title": "Tika Url", - "default": "http://localhost:9998/tika", - "type": "string" - }, - "remove_numeric_tables": { - "title": "Remove Numeric Tables", - "default": false, - "type": "boolean" - }, - "valid_languages": { - "title": "Valid Languages", - "type": "array", - "items": { - "type": "string" - } - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TransformersDocumentClassifierComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TransformersDocumentClassifier" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "bhadresh-savani/distilbert-base-uncased-emotion", - "type": "string" - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "tokenizer": { - "title": "Tokenizer", - "type": "string" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "return_all_scores": { - "title": "Return All Scores", - "default": false, - "type": "boolean" - }, - "task": { - "title": "Task", - "default": "text-classification", - "type": "string" - }, - "labels": { - "title": "Labels", - "type": "array", - "items": { - "type": "string" - } - }, - "batch_size": { - "title": "Batch Size", - "default": -1, - "type": "integer" - }, - "classification_field": { - "title": "Classification Field", - "type": "string" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TransformersQueryClassifierComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TransformersQueryClassifier" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "shahrukhx01/bert-mini-finetune-question-detection", - "anyOf": [ - { - "type": "string", - "format": "path" - }, - { - "type": "string" - } - ] - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TransformersReaderComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TransformersReader" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "distilbert-base-uncased-distilled-squad", - "type": "string" - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "tokenizer": { - "title": "Tokenizer", - "type": "string" - }, - "context_window_size": { - "title": "Context Window Size", - "default": 70, - "type": "integer" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "top_k": { - "title": "Top K", - "default": 10, - "type": "integer" - }, - "top_k_per_candidate": { - "title": "Top K Per Candidate", - "default": 4, - "type": "integer" - }, - "return_no_answers": { - "title": "Return No Answers", - "default": true, - "type": "boolean" - }, - "max_seq_len": { - "title": "Max Seq Len", - "default": 256, - "type": "integer" - }, - "doc_stride": { - "title": "Doc Stride", - "default": 128, - "type": "integer" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TransformersSummarizerComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. 
Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TransformersSummarizer" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "default": "google/pegasus-xsum", - "type": "string" - }, - "model_version": { - "title": "Model Version", - "type": "string" - }, - "tokenizer": { - "title": "Tokenizer", - "type": "string" - }, - "max_length": { - "title": "Max Length", - "default": 200, - "type": "integer" - }, - "min_length": { - "title": "Min Length", - "default": 5, - "type": "integer" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - }, - "clean_up_tokenization_spaces": { - "title": "Clean Up Tokenization Spaces", - "default": true, - "type": "boolean" - }, - "separator_for_single_summary": { - "title": "Separator For Single Summary", - "default": " ", - "type": "string" - }, - "generate_single_summary": { - "title": "Generate Single Summary", - "default": false, - "type": "boolean" - } - }, - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "TransformersTranslatorComponent": { - "type": "object", - "properties": { - "name": { - "title": "Name", - "description": "Custom name for the component. Helpful for visualization and debugging.", - "type": "string" - }, - "type": { - "title": "Type", - "description": "Haystack Class name for the component.", - "type": "string", - "const": "TransformersTranslator" - }, - "params": { - "title": "Parameters", - "type": "object", - "properties": { - "model_name_or_path": { - "title": "Model Name Or Path", - "type": "string" - }, - "tokenizer_name": { - "title": "Tokenizer Name", - "type": "string" - }, - "max_seq_len": { - "title": "Max Seq Len", - "type": "integer" - }, - "clean_up_tokenization_spaces": { - "title": "Clean Up Tokenization Spaces", - "default": true, - "type": "boolean" - }, - "use_gpu": { - "title": "Use Gpu", - "default": true, - "type": "boolean" - } - }, - "required": [ - "model_name_or_path" - ], - "additionalProperties": false, - "description": "Each parameter can reference other components defined in the same YAML file." 
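By contrast, components whose parameters all carry defaults, such as TransformersSummarizer above, validate even without a params block, since only type and name appear in the component-level required list:

    components:
      - name: Summarizer
        type: TransformersSummarizer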
- } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "RAGeneratorType": { - "title": "RAGeneratorType", - "description": "An enumeration.", - "enum": [ - [ - 1 - ], - 2 - ] - } - } -} \ No newline at end of file diff --git a/json-schemas/haystack-pipeline-1.2.1rc0.schema.json b/json-schemas/haystack-pipeline-1.2.1rc0.schema.json index 45353fcc7..064b81792 100644 --- a/json-schemas/haystack-pipeline-1.2.1rc0.schema.json +++ b/json-schemas/haystack-pipeline-1.2.1rc0.schema.json @@ -9,7 +9,11 @@ "title": "Version", "description": "Version of the Haystack Pipeline file.", "type": "string", - "const": "1.2.1rc0" + "oneOf": [ + { + "const": "1.2.1rc0" + } + ] }, "components": { "title": "Components", @@ -17,6 +21,36 @@ "type": "array", "items": { "anyOf": [ + { + "$ref": "#/definitions/DeepsetCloudDocumentStoreComponent" + }, + { + "$ref": "#/definitions/ElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/FAISSDocumentStoreComponent" + }, + { + "$ref": "#/definitions/GraphDBKnowledgeGraphComponent" + }, + { + "$ref": "#/definitions/InMemoryDocumentStoreComponent" + }, + { + "$ref": "#/definitions/Milvus2DocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenDistroElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenSearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/SQLDocumentStoreComponent" + }, + { + "$ref": "#/definitions/WeaviateDocumentStoreComponent" + }, { "$ref": "#/definitions/AzureConverterComponent" }, @@ -140,7 +174,7 @@ "type", "name" ], - "additionalProperties": false + "additionalProperties": true }, "pipelines": { "title": "Pipelines", @@ -175,6 +209,10 @@ } } }, + "required": [ + "name", + "inputs" + ], "additionalProperties": false }, "required": [ @@ -182,7 +220,8 @@ "nodes" ], "additionalProperties": false - } + }, + "additionalProperties": false }, "additionalProperties": false } @@ -195,6 +234,929 @@ ], "additionalProperties": false, "definitions": { + "DeepsetCloudDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DeepsetCloudDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "workspace": { + "title": "Workspace", + "default": "default", + "type": "string" + }, + "index": { + "title": "Index", + "default": "default", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "api_endpoint": { + "title": "Api Endpoint", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "scheme": { + "title": "Scheme", + "default": "http", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": true, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product" + }, + "timeout": { + "title": "Timeout", + "default": 30 + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
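With document stores now part of the schema, a pipeline YAML can declare them as ordinary components. All ElasticsearchDocumentStore parameters are optional; in the sketch below, host and port simply restate the defaults shown above:

    components:
      - name: DocumentStore
        type: ElasticsearchDocumentStore
        params:
          host: localhost
          port: 9200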
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FAISSDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FAISSDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///faiss_document_store.db", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "faiss_index_factory_str": { + "title": "Faiss Index Factory Str", + "default": "Flat", + "type": "string" + }, + "faiss_index": { + "title": "Faiss Index", + "type": "string", + "default": null + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "faiss_index_path": { + "title": "Faiss Index Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "faiss_config_path": { + "title": "Faiss Config Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "GraphDBKnowledgeGraphComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "GraphDBKnowledgeGraph" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": 7200, + "type": "integer" + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "index": { + "title": "Index", + "type": "string" + }, + "prefixes": { + "title": "Prefixes", + "default": "", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "InMemoryDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "InMemoryDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "scoring_batch_size": { + "title": "Scoring Batch Size", + "default": 500000, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Milvus2DocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Milvus2DocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": "19530", + "type": "string" + }, + "connection_pool": { + "title": "Connection Pool", + "default": "SingletonThread", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "index_file_size": { + "title": "Index File Size", + "default": 1024, + "type": "integer" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "IVF_FLAT", + "type": "string" + }, + "index_param": { + "title": "Index Param", + "type": "object" + }, + "search_param": { + "title": "Search Param", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "id_field": { + "title": "Id Field", + "default": "id", + "type": "string" + }, + "custom_fields": { + "title": "Custom Fields", + "type": "array", + "items": {} + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "isolation_level": { + "title": "Isolation 
Level", + "type": "string" + }, + "consistency_level": { + "title": "Consistency Level", + "default": 0, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenDistroElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenDistroElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "https://admin:admin@localhost:9200/" + }, + "similarity": { + "title": "Similarity", + "default": "cosine" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenSearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenSearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "verify_certs": { + "title": "Verify Certs", + "default": false + }, + "scheme": { + "title": "Scheme", + "default": "https" + }, + "username": { + "title": "Username", + "default": "admin" + }, + "password": { + "title": "Password", + "default": "admin" + }, + "port": { + "title": "Port", + "default": 9200 + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SQLDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SQLDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "url": { + "title": "Url", + "default": "sqlite://", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "check_same_thread": { + "title": "Check Same Thread", + "default": false, + "type": "boolean" + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "WeaviateDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "WeaviateDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "http://localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 8080, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "timeout_config": { + "title": "Timeout Config", + "default": [ + 5, + 15 + ], + "type": "array", + "items": {} + }, + "username": { + "title": "Username", + "type": "string" + }, + "password": { + "title": "Password", + "type": "string" + }, + "index": { + "title": "Index", + "default": "Document", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "hnsw", + "type": "string" + }, + "custom_schema": { + "title": "Custom Schema", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "AzureConverterComponent": { "type": "object", "properties": { @@ -1581,14 +2543,9 @@ "default": null }, "generator_type": { - "default": [ - 1 - ], - "allOf": [ - { - "$ref": "#/definitions/RAGeneratorType" - } - ] + "title": "Generator Type", + "default": "token", + "type": "string" }, "top_k": { "title": "Top K", @@ -2675,16 +3632,6 @@ "name" ], "additionalProperties": false - }, - "RAGeneratorType": { - "title": "RAGeneratorType", - "description": "An enumeration.", - "enum": [ - [ - 1 - ], - 2 - ] } } } \ No newline at end of file diff --git a/json-schemas/haystack-pipeline-0.7.schema.json b/json-schemas/haystack-pipeline-unstable.schema.json similarity index 70% rename from json-schemas/haystack-pipeline-0.7.schema.json rename to json-schemas/haystack-pipeline-unstable.schema.json index 6796f43cf..4f49f3631 100644 --- a/json-schemas/haystack-pipeline-0.7.schema.json +++ b/json-schemas/haystack-pipeline-unstable.schema.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline.0.7.schema.json", + "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline-unstable.schema.json", "title": "Haystack Pipeline", "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. 
For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", "type": "object", @@ -9,7 +9,14 @@ "title": "Version", "description": "Version of the Haystack Pipeline file.", "type": "string", - "const": "0.7" + "oneOf": [ + { + "const": "unstable" + }, + { + "const": "1.2.1rc0" + } + ] }, "components": { "title": "Components", @@ -17,6 +24,36 @@ "type": "array", "items": { "anyOf": [ + { + "$ref": "#/definitions/DeepsetCloudDocumentStoreComponent" + }, + { + "$ref": "#/definitions/ElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/FAISSDocumentStoreComponent" + }, + { + "$ref": "#/definitions/GraphDBKnowledgeGraphComponent" + }, + { + "$ref": "#/definitions/InMemoryDocumentStoreComponent" + }, + { + "$ref": "#/definitions/Milvus2DocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenDistroElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenSearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/SQLDocumentStoreComponent" + }, + { + "$ref": "#/definitions/WeaviateDocumentStoreComponent" + }, { "$ref": "#/definitions/AzureConverterComponent" }, @@ -59,6 +96,9 @@ { "$ref": "#/definitions/ImageToTextConverterComponent" }, + { + "$ref": "#/definitions/JoinAnswersComponent" + }, { "$ref": "#/definitions/JoinDocumentsComponent" }, @@ -86,6 +126,9 @@ { "$ref": "#/definitions/RCIReaderComponent" }, + { + "$ref": "#/definitions/RouteDocumentsComponent" + }, { "$ref": "#/definitions/SentenceTransformersRankerComponent" }, @@ -134,7 +177,7 @@ "type", "name" ], - "additionalProperties": false + "additionalProperties": true }, "pipelines": { "title": "Pipelines", @@ -169,6 +212,10 @@ } } }, + "required": [ + "name", + "inputs" + ], "additionalProperties": false }, "required": [ @@ -176,7 +223,8 @@ "nodes" ], "additionalProperties": false - } + }, + "additionalProperties": false }, "additionalProperties": false } @@ -189,6 +237,929 @@ ], "additionalProperties": false, "definitions": { + "DeepsetCloudDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DeepsetCloudDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "workspace": { + "title": "Workspace", + "default": "default", + "type": "string" + }, + "index": { + "title": "Index", + "default": "default", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "api_endpoint": { + "title": "Api Endpoint", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "scheme": { + "title": "Scheme", + "default": "http", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": true, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product" + }, + "timeout": { + "title": "Timeout", + "default": 30 + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
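Taken together, a minimal pipeline YAML accepted by the renamed unstable schema could look like the sketch below: the version string must match one of the oneOf constants, each node now has to declare both name and inputs, and TfidfRetriever's required document_store parameter references another component by name, as the params descriptions note.

    version: unstable
    components:
      - name: DocumentStore
        type: InMemoryDocumentStore
      - name: Retriever
        type: TfidfRetriever
        params:
          document_store: DocumentStore
    pipelines:
      - name: query
        nodes:
          - name: Retriever
            inputs: [Query]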
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FAISSDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FAISSDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///faiss_document_store.db", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "faiss_index_factory_str": { + "title": "Faiss Index Factory Str", + "default": "Flat", + "type": "string" + }, + "faiss_index": { + "title": "Faiss Index", + "type": "string", + "default": null + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "faiss_index_path": { + "title": "Faiss Index Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "faiss_config_path": { + "title": "Faiss Config Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "GraphDBKnowledgeGraphComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "GraphDBKnowledgeGraph" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": 7200, + "type": "integer" + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "index": { + "title": "Index", + "type": "string" + }, + "prefixes": { + "title": "Prefixes", + "default": "", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "InMemoryDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "InMemoryDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "scoring_batch_size": { + "title": "Scoring Batch Size", + "default": 500000, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Milvus2DocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Milvus2DocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": "19530", + "type": "string" + }, + "connection_pool": { + "title": "Connection Pool", + "default": "SingletonThread", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "index_file_size": { + "title": "Index File Size", + "default": 1024, + "type": "integer" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "IVF_FLAT", + "type": "string" + }, + "index_param": { + "title": "Index Param", + "type": "object" + }, + "search_param": { + "title": "Search Param", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "id_field": { + "title": "Id Field", + "default": "id", + "type": "string" + }, + "custom_fields": { + "title": "Custom Fields", + "type": "array", + "items": {} + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "isolation_level": { + "title": "Isolation 
Level", + "type": "string" + }, + "consistency_level": { + "title": "Consistency Level", + "default": 0, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenDistroElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenDistroElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "https://admin:admin@localhost:9200/" + }, + "similarity": { + "title": "Similarity", + "default": "cosine" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenSearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenSearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "verify_certs": { + "title": "Verify Certs", + "default": false + }, + "scheme": { + "title": "Scheme", + "default": "https" + }, + "username": { + "title": "Username", + "default": "admin" + }, + "password": { + "title": "Password", + "default": "admin" + }, + "port": { + "title": "Port", + "default": 9200 + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SQLDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SQLDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "url": { + "title": "Url", + "default": "sqlite://", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "check_same_thread": { + "title": "Check Same Thread", + "default": false, + "type": "boolean" + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "WeaviateDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "WeaviateDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "http://localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 8080, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "timeout_config": { + "title": "Timeout Config", + "default": [ + 5, + 15 + ], + "type": "array", + "items": {} + }, + "username": { + "title": "Username", + "type": "string" + }, + "password": { + "title": "Password", + "type": "string" + }, + "index": { + "title": "Index", + "default": "Document", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "hnsw", + "type": "string" + }, + "custom_schema": { + "title": "Custom Schema", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "AzureConverterComponent": { "type": "object", "properties": { @@ -1093,6 +2064,51 @@ ], "additionalProperties": false }, + "JoinAnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "JoinAnswers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "join_mode": { + "title": "Join Mode", + "default": "concatenate", + "type": "string" + }, + "weights": { + "title": "Weights", + "type": "array", + "items": { + "type": "number" + } + }, + "top_k_join": { + "title": "Top K Join", + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
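The description that recurs in these definitions ("Each parameter can reference other components defined in the same YAML file") is the mechanism that lets, for example, a retriever point at a document store by its component name. A minimal sketch, using component types that appear elsewhere in this patch (the file name is an arbitrary assumption):

# Sketch: a param value equal to another component's name ("DocumentStore")
# is resolved to that component instance when the YAML is loaded.
from pathlib import Path

from haystack.pipelines import Pipeline

yaml_config = """
version: 'unstable'

components:
  - name: DocumentStore
    type: InMemoryDocumentStore
  - name: Retriever
    type: TfidfRetriever
    params:
      document_store: DocumentStore   # a reference by name, not a plain string

pipelines:
  - name: query
    nodes:
      - name: Retriever
        inputs: [Query]
"""

path = Path("sample.haystack-pipeline.yml")  # assumed file name
path.write_text(yaml_config)
pipeline = Pipeline.load_from_yaml(path, pipeline_name="query")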
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "JoinDocumentsComponent": { "type": "object", "properties": { @@ -1381,6 +2397,14 @@ "default": true, "type": "boolean" }, + "remove_substrings": { + "title": "Remove Substrings", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, "split_by": { "title": "Split By", "default": "word", @@ -1522,14 +2546,9 @@ "default": null }, "generator_type": { - "default": [ - 1 - ], - "allOf": [ - { - "$ref": "#/definitions/RAGeneratorType" - } - ] + "title": "Generator Type", + "default": "token", + "type": "string" }, "top_k": { "title": "Top K", @@ -1646,6 +2665,47 @@ ], "additionalProperties": false }, + "RouteDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RouteDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "split_by": { + "title": "Split By", + "default": "content_type", + "type": "string" + }, + "metadata_values": { + "title": "Metadata Values", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "SentenceTransformersRankerComponent": { "type": "object", "properties": { @@ -2575,16 +3635,6 @@ "name" ], "additionalProperties": false - }, - "RAGeneratorType": { - "title": "RAGeneratorType", - "description": "An enumeration.", - "enum": [ - [ - 1 - ], - 2 - ] } } -} +} \ No newline at end of file diff --git a/json-schemas/haystack-pipeline.schema.json b/json-schemas/haystack-pipeline.schema.json index 23fb7504e..bc868e97e 100644 --- a/json-schemas/haystack-pipeline.schema.json +++ b/json-schemas/haystack-pipeline.schema.json @@ -1,65 +1,69 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline-1.1.0.schema.json", - "title": "Haystack Pipeline", - "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", - "type": "object", - "oneOf": [ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline-1.1.0.schema.json", + "title": "Haystack Pipeline", + "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. 
For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", + "type": "object", + "oneOf": [ + { + "allOf": [ { - "allOf": [ + "properties": { + "version": { + "oneOf": [ { - "properties": { - "version": { - "const": "0.7" - } - } - }, - { - "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-0.7.schema.json" + "const": "unstable" } - ] + ] + } + } }, { - "allOf": [ - { - "properties": { - "version": { - "const": "1.1.0" - } - } - }, - { - "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.1.0.schema.json" - } - ] - }, - { - "allOf": [ - { - "properties": { - "version": { - "const": "1.2.0" - } - } - }, - { - "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.2.0.schema.json" - } - ] - }, - { - "allOf": [ - { - "properties": { - "version": { - "const": "1.2.1rc0" - } - } - }, - { - "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.2.1rc0.schema.json" - } - ] + "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-unstable.schema.json" } - ] + ] + }, + { + "allOf": [ + { + "properties": { + "version": { + "oneOf": [ + { + "const": "1.0.0" + }, + { + "const": "1.1.0" + }, + { + "const": "1.2.0" + } + ] + } + } + }, + { + "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.0.0.schema.json" + } + ] + }, + { + "allOf": [ + { + "properties": { + "version": { + "oneOf": [ + { + "const": "1.2.1rc0" + } + ] + } + } + }, + { + "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.2.1rc0.schema.json" + } + ] + } + ] } \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ce74864ee..26ef9a2d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,14 +93,18 @@ min-similarity-lines=6 minversion = "6.0" addopts = "--strict-markers" markers = [ - "slow: marks tests as slow (deselect with '-m \"not slow\"')", - "tika: marks tests which require tika container (deselect with '-m \"not tika\"')", - "elasticsearch: marks tests which require elasticsearch container (deselect with '-m \"not elasticsearch\"')", - "graphdb: marks tests which require graphdb container (deselect with '-m \"not graphdb\"')", - "generator: marks generator tests (deselect with '-m \"not generator\"')", - "pipeline: marks tests with pipeline", - "summarizer: marks summarizer tests", - "weaviate: marks tests that require weaviate container", - "embedding_dim: marks usage of document store with non-default embedding dimension (e.g @pytest.mark.embedding_dim(128))", + "integration: integration tests (deselect with '-m \"not integration\"')", + "slow: slow tests (deselect with '-m \"not slow\"')", + "tika: require tika container (deselect with '-m \"not tika\"')", + "elasticsearch: require elasticsearch container (deselect with '-m \"not elasticsearch\"')", + "graphdb: require graphdb container (deselect with '-m \"not graphdb\"')", + "generator: generator tests (deselect with '-m \"not generator\"')", + "pipeline: tests with pipelines", + "summarizer: summarizer tests", + "weaviate: require weaviate container", + "faiss: uses FAISS", + "milvus: requires a Milvus 2 setup", + "milvus1: requires a Milvus 1 container", + "embedding_dim: uses a document store with non-default embedding dimension (e.g 
@pytest.mark.embedding_dim(128))", ] log_cli = true \ No newline at end of file diff --git a/rest_api/config.py b/rest_api/config.py index cedbe8c4f..2211743a2 100644 --- a/rest_api/config.py +++ b/rest_api/config.py @@ -3,7 +3,7 @@ from pathlib import Path PIPELINE_YAML_PATH = os.getenv( - "PIPELINE_YAML_PATH", str((Path(__file__).parent / "pipeline" / "pipelines.yaml").absolute()) + "PIPELINE_YAML_PATH", str((Path(__file__).parent / "pipeline" / "pipelines.haystack-pipeline.yml").absolute()) ) QUERY_PIPELINE_NAME = os.getenv("QUERY_PIPELINE_NAME", "query") INDEXING_PIPELINE_NAME = os.getenv("INDEXING_PIPELINE_NAME", "indexing") diff --git a/rest_api/controller/file_upload.py b/rest_api/controller/file_upload.py index da35f7421..2c1b97c8a 100644 --- a/rest_api/controller/file_upload.py +++ b/rest_api/controller/file_upload.py @@ -11,6 +11,7 @@ from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends from pydantic import BaseModel from haystack.pipelines.base import Pipeline +from haystack.errors import PipelineConfigError from haystack.pipelines.config import get_component_definitions, get_pipeline_definition, read_pipeline_config_from_yaml from rest_api.config import PIPELINE_YAML_PATH, FILE_UPLOAD_PATH, INDEXING_PIPELINE_NAME from rest_api.controller.utils import as_form @@ -43,10 +44,10 @@ try: INDEXING_PIPELINE = None else: INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) -except KeyError: - INDEXING_PIPELINE = None - logger.warning("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.") +except PipelineConfigError as e: + INDEXING_PIPELINE = None + logger.error(f"{e.message}. File Upload API will not be available.") # create directory for uploading files os.makedirs(FILE_UPLOAD_PATH, exist_ok=True) diff --git a/rest_api/pipeline/pipeline_empty.yaml b/rest_api/pipeline/pipeline_empty.haystack-pipeline.yml similarity index 88% rename from rest_api/pipeline/pipeline_empty.yaml rename to rest_api/pipeline/pipeline_empty.haystack-pipeline.yml index 5d1cde422..569f19dfa 100644 --- a/rest_api/pipeline/pipeline_empty.yaml +++ b/rest_api/pipeline/pipeline_empty.haystack-pipeline.yml @@ -1,6 +1,5 @@ # Dummy pipeline, used when the CI needs to load the REST API to extract the OpenAPI specs. DO NOT USE. 
- -version: '1.1.0' +version: 'unstable' components: - name: FileTypeClassifier @@ -8,8 +7,6 @@ components: pipelines: - name: query - type: Query nodes: - name: FileTypeClassifier inputs: [File] - diff --git a/rest_api/pipeline/pipelines.yaml b/rest_api/pipeline/pipelines.haystack-pipeline.yml similarity index 96% rename from rest_api/pipeline/pipelines.yaml rename to rest_api/pipeline/pipelines.haystack-pipeline.yml index a3928f896..72565a41f 100644 --- a/rest_api/pipeline/pipelines.yaml +++ b/rest_api/pipeline/pipelines.haystack-pipeline.yml @@ -1,4 +1,4 @@ -version: '1.1.0' +version: 'unstable' components: # define all the building-blocks for Pipeline - name: DocumentStore @@ -30,14 +30,12 @@ components: # define all the building-blocks for Pipeline pipelines: - name: query # a sample extractive-qa Pipeline - type: Query nodes: - name: Retriever inputs: [Query] - name: Reader inputs: [Retriever] - name: indexing - type: Indexing nodes: - name: FileTypeClassifier inputs: [File] diff --git a/rest_api/pipeline/pipelines_dpr.yaml b/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml similarity index 96% rename from rest_api/pipeline/pipelines_dpr.yaml rename to rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml index 907385d85..eaf426921 100644 --- a/rest_api/pipeline/pipelines_dpr.yaml +++ b/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml @@ -1,4 +1,4 @@ -version: '1.1.0' +version: 'unstable' components: # define all the building-blocks for Pipeline - name: DocumentStore @@ -30,14 +30,12 @@ components: # define all the building-blocks for Pipeline pipelines: - name: query # a sample extractive-qa Pipeline - type: Query nodes: - name: Retriever inputs: [Query] - name: Reader inputs: [Retriever] - name: indexing - type: Indexing nodes: - name: FileTypeClassifier inputs: [File] diff --git a/rest_api/test/samples/pipeline/test_pipeline.haystack-pipeline.yml b/rest_api/test/samples/pipeline/test_pipeline.haystack-pipeline.yml new file mode 100644 index 000000000..93316ec9f --- /dev/null +++ b/rest_api/test/samples/pipeline/test_pipeline.haystack-pipeline.yml @@ -0,0 +1,47 @@ +version: '1.1.0' + +components: + - name: Reader + type: FARMReader + params: + no_ans_boost: -10 + model_name_or_path: deepset/roberta-base-squad2 + num_processes: 0 + - name: ESRetriever + type: ElasticsearchRetriever + params: + document_store: DocumentStore + custom_query: null + - name: DocumentStore + type: ElasticsearchDocumentStore + params: + index: haystack_test + label_index: haystack_test_label + - name: Preprocessor + type: PreProcessor + params: + clean_whitespace: true + - name: PDFConverter + type: PDFToTextConverter + params: + remove_numeric_tables: false + + +pipelines: + - name: test-query + nodes: + - name: ESRetriever + inputs: [Query] + - name: Reader + inputs: [ESRetriever] + + - name: test-indexing + nodes: + - name: PDFConverter + inputs: [File] + - name: Preprocessor + inputs: [PDFConverter] + - name: ESRetriever + inputs: [Preprocessor] + - name: DocumentStore + inputs: [ESRetriever] diff --git a/rest_api/test/samples/pipeline/test_pipeline.yaml b/rest_api/test/samples/pipeline/test_pipeline.yaml deleted file mode 100644 index 7d84214a2..000000000 --- a/rest_api/test/samples/pipeline/test_pipeline.yaml +++ /dev/null @@ -1,103 +0,0 @@ -version: '1.1.0' - -components: - - name: Reader - type: FARMReader - params: - no_ans_boost: -10 - model_name_or_path: deepset/roberta-base-squad2 - num_processes: 0 - - name: ESRetriever - type: ElasticsearchRetriever - params: - document_store: 
DocumentStore - custom_query: null - - name: DocumentStore - type: ElasticsearchDocumentStore - params: - index: haystack_test - label_index: haystack_test_label - - name: PDFConverter - type: PDFToTextConverter - params: - remove_numeric_tables: false - - name: Preprocessor - type: PreProcessor - params: - clean_whitespace: true - - name: IndexTimeDocumentClassifier - type: TransformersDocumentClassifier - params: - batch_size: 16 - use_gpu: -1 - - name: QueryTimeDocumentClassifier - type: TransformersDocumentClassifier - params: - use_gpu: -1 - - -pipelines: - - name: query_pipeline - type: Pipeline - nodes: - - name: ESRetriever - inputs: [Query] - - name: Reader - inputs: [ESRetriever] - - - name: ray_query_pipeline - type: RayPipeline - nodes: - - name: ESRetriever - replicas: 2 - inputs: [ Query ] - - name: Reader - inputs: [ ESRetriever ] - - - name: query_pipeline_with_document_classifier - type: Pipeline - nodes: - - name: ESRetriever - inputs: [Query] - - name: QueryTimeDocumentClassifier - inputs: [ESRetriever] - - name: Reader - inputs: [QueryTimeDocumentClassifier] - - - name: indexing_pipeline - type: Pipeline - nodes: - - name: PDFConverter - inputs: [File] - - name: Preprocessor - inputs: [PDFConverter] - - name: ESRetriever - inputs: [Preprocessor] - - name: DocumentStore - inputs: [ESRetriever] - - - name: indexing_text_pipeline - type: Pipeline - nodes: - - name: TextConverter - inputs: [File] - - name: Preprocessor - inputs: [TextConverter] - - name: ESRetriever - inputs: [Preprocessor] - - name: DocumentStore - inputs: [ESRetriever] - - - name: indexing_pipeline_with_classifier - type: Pipeline - nodes: - - name: PDFConverter - inputs: [File] - - name: Preprocessor - inputs: [PDFConverter] - - name: IndexTimeDocumentClassifier - inputs: [Preprocessor] - - name: ESRetriever - inputs: [IndexTimeDocumentClassifier] - - name: DocumentStore - inputs: [ESRetriever] diff --git a/rest_api/test/samples/pipeline/test_pipeline_faiss_indexing.yaml b/rest_api/test/samples/pipeline/test_pipeline_faiss_indexing.yaml deleted file mode 100644 index d11ec5413..000000000 --- a/rest_api/test/samples/pipeline/test_pipeline_faiss_indexing.yaml +++ /dev/null @@ -1,31 +0,0 @@ -version: '1.1.0' - -components: - - name: DPRRetriever - type: DensePassageRetriever - params: - document_store: NewFAISSDocumentStore - - name: PDFConverter - type: PDFToTextConverter - params: - remove_numeric_tables: false - - name: Preprocessor - type: PreProcessor - params: - clean_whitespace: true - - name: NewFAISSDocumentStore - type: FAISSDocumentStore - - -pipelines: - - name: indexing_pipeline - type: Pipeline - nodes: - - name: PDFConverter - inputs: [File] - - name: Preprocessor - inputs: [PDFConverter] - - name: DPRRetriever - inputs: [Preprocessor] - - name: NewFAISSDocumentStore - inputs: [DPRRetriever] diff --git a/rest_api/test/samples/pipeline/test_pipeline_faiss_retrieval.yaml b/rest_api/test/samples/pipeline/test_pipeline_faiss_retrieval.yaml deleted file mode 100644 index 39130ef01..000000000 --- a/rest_api/test/samples/pipeline/test_pipeline_faiss_retrieval.yaml +++ /dev/null @@ -1,19 +0,0 @@ -version: '1.1.0' - -components: - - name: DPRRetriever - type: DensePassageRetriever - params: - document_store: ExistingFAISSDocumentStore - - name: ExistingFAISSDocumentStore - type: FAISSDocumentStore - params: - faiss_index_path: 'existing_faiss_document_store' - - -pipelines: - - name: query_pipeline - type: Pipeline - nodes: - - name: DPRRetriever - inputs: [Query] diff --git 
a/rest_api/test/samples/pipeline/test_pipeline_tfidfretriever.yaml b/rest_api/test/samples/pipeline/test_pipeline_tfidfretriever.yaml deleted file mode 100644 index 8a0079723..000000000 --- a/rest_api/test/samples/pipeline/test_pipeline_tfidfretriever.yaml +++ /dev/null @@ -1,25 +0,0 @@ -version: '1.1.0' - -components: - - name: Reader - type: FARMReader - params: - no_ans_boost: -10 - model_name_or_path: deepset/minilm-uncased-squad2 - num_processes: 0 - - name: Retriever - type: TfidfRetriever - params: - document_store: DocumentStore - - name: DocumentStore - type: InMemoryDocumentStore - - -pipelines: - - name: query_pipeline - type: Pipeline - nodes: - - name: Retriever - inputs: [Query] - - name: Reader - inputs: [Retriever] diff --git a/rest_api/test/test_rest_api.py b/rest_api/test/test_rest_api.py index 1aa0c41d8..43d99079a 100644 --- a/rest_api/test/test_rest_api.py +++ b/rest_api/test/test_rest_api.py @@ -45,9 +45,10 @@ def exclude_no_answer(responses): @pytest.fixture() def client() -> TestClient: os.environ["PIPELINE_YAML_PATH"] = str( - (Path(__file__).parent / "samples" / "pipeline" / "test_pipeline.yaml").absolute() + (Path(__file__).parent / "samples" / "pipeline" / "test_pipeline.haystack-pipeline.yml").absolute() ) - os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline" + os.environ["INDEXING_PIPELINE_NAME"] = "test-indexing" + os.environ["QUERY_PIPELINE_NAME"] = "test-query" client = TestClient(app) client.post(url="/documents/delete_by_filters", data='{"filters": {}}') @@ -217,15 +218,7 @@ def test_query_with_invalid_filter(populated_client: TestClient): assert len(response_json["answers"]) == 0 -def test_query_with_no_documents_and_no_answers(): - os.environ["PIPELINE_YAML_PATH"] = str( - (Path(__file__).parent / "samples" / "pipeline" / "test_pipeline.yaml").absolute() - ) - os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline" - client = TestClient(app) - - # Clean up to make sure the docstore is empty - client.post(url="/documents/delete_by_filters", data='{"filters": {}}') +def test_query_with_no_documents_and_no_answers(client: TestClient): query = {"query": "Who made the PDF specification?"} response = client.post(url="/query", json=query) assert 200 == response.status_code diff --git a/setup.cfg b/setup.cfg index 1036b1c11..d3b7cfd45 100644 --- a/setup.cfg +++ b/setup.cfg @@ -75,6 +75,9 @@ install_requires = # pip unfortunately backtracks into the databind direction ultimately getting lost. azure-core<1.23 + # TEMPORARY!!! 
+ azure-core<1.23.0 + # Preprocessing more_itertools # for windowing python-docx @@ -106,6 +109,8 @@ exclude = test* tutorials* ui* +include = + json-schemas [options.extras_require] diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/conftest.py b/test/conftest.py index 2efb852c9..3e7716163 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,8 +1,9 @@ +from typing import List, Optional, Tuple, Dict + import subprocess import time from subprocess import run from sys import platform -import os import gc import uuid import logging @@ -15,6 +16,8 @@ import psutil import pytest import requests +from haystack.nodes.base import BaseComponent, MultiLabel + try: from milvus import Milvus @@ -27,7 +30,6 @@ try: from elasticsearch import Elasticsearch from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore import weaviate - from haystack.document_stores.weaviate import WeaviateDocumentStore from haystack.document_stores import MilvusDocumentStore from haystack.document_stores.graphdb import GraphDBKnowledgeGraph @@ -39,19 +41,15 @@ except (ImportError, ModuleNotFoundError) as ie: _optional_component_not_installed("test", "test", ie) +from haystack.document_stores import BaseDocumentStore, DeepsetCloudDocumentStore, InMemoryDocumentStore -from haystack.document_stores import DeepsetCloudDocumentStore, InMemoryDocumentStore - +from haystack.nodes import BaseReader, BaseRetriever from haystack.nodes.answer_generator.transformers import Seq2SeqGenerator - -from haystack.nodes.answer_generator.transformers import RAGenerator, RAGeneratorType -from haystack.modeling.infer import Inferencer, QAInferencer +from haystack.nodes.answer_generator.transformers import RAGenerator from haystack.nodes.ranker import SentenceTransformersRanker from haystack.nodes.document_classifier.transformers import TransformersDocumentClassifier from haystack.nodes.retriever.sparse import ElasticsearchFilterOnlyRetriever, ElasticsearchRetriever, TfidfRetriever from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetriever, TableTextRetriever -from haystack.schema import Document - from haystack.nodes.reader.farm import FARMReader from haystack.nodes.reader.transformers import TransformersReader from haystack.nodes.reader.table import TableReader, RCIReader @@ -59,6 +57,10 @@ from haystack.nodes.summarizer.transformers import TransformersSummarizer from haystack.nodes.translator import TransformersTranslator from haystack.nodes.question_generator import QuestionGenerator +from haystack.modeling.infer import Inferencer, QAInferencer + +from haystack.schema import Document + # To manually run the tests with default PostgreSQL instead of SQLite, switch the lines below SQL_TYPE = "sqlite" @@ -96,22 +98,28 @@ def pytest_collection_modifyitems(config, items): # add pytest markers for tests that are not explicitly marked but include some keywords # in the test name (e.g. 
test_elasticsearch_client would get the "elasticsearch" marker)
+        # TODO evaluate if we need all of these (the non-document-store ones seem to be unused)
         if "generator" in item.nodeid:
             item.add_marker(pytest.mark.generator)
         elif "summarizer" in item.nodeid:
             item.add_marker(pytest.mark.summarizer)
         elif "tika" in item.nodeid:
             item.add_marker(pytest.mark.tika)
-        elif "elasticsearch" in item.nodeid:
-            item.add_marker(pytest.mark.elasticsearch)
-        elif "graphdb" in item.nodeid:
-            item.add_marker(pytest.mark.graphdb)
         elif "pipeline" in item.nodeid:
             item.add_marker(pytest.mark.pipeline)
         elif "slow" in item.nodeid:
             item.add_marker(pytest.mark.slow)
+        elif "elasticsearch" in item.nodeid:
+            item.add_marker(pytest.mark.elasticsearch)
+        elif "graphdb" in item.nodeid:
+            item.add_marker(pytest.mark.graphdb)
         elif "weaviate" in item.nodeid:
             item.add_marker(pytest.mark.weaviate)
+        elif "faiss" in item.nodeid:
+            item.add_marker(pytest.mark.faiss)
+        elif "milvus" in item.nodeid:
+            item.add_marker(pytest.mark.milvus)
+            item.add_marker(pytest.mark.milvus1)
 
     # if the cli argument "--document_store_type" is used, we want to skip all tests that have markers of other docstores
     # Example: pytest -v test_document_store.py --document_store_type="memory" => skip all tests marked with "elasticsearch"
@@ -139,6 +147,81 @@
     item.add_marker(skip_milvus)
 
 
+#
+# Empty mocks, as a base for unit tests.
+#
+# Monkeypatch the methods you need with either a mock implementation
+# or a unittest.mock.MagicMock object (https://docs.python.org/3/library/unittest.mock.html)
+#
+
+
+class MockNode(BaseComponent):
+    outgoing_edges = 1
+
+    def run(self, *a, **k):
+        pass
+
+
+class MockDocumentStore(BaseDocumentStore):
+    outgoing_edges = 1
+
+    def _create_document_field_map(self, *a, **k):
+        pass
+
+    def delete_documents(self, *a, **k):
+        pass
+
+    def delete_labels(self, *a, **k):
+        pass
+
+    def get_all_documents(self, *a, **k):
+        pass
+
+    def get_all_documents_generator(self, *a, **k):
+        pass
+
+    def get_all_labels(self, *a, **k):
+        pass
+
+    def get_document_by_id(self, *a, **k):
+        pass
+
+    def get_document_count(self, *a, **k):
+        pass
+
+    def get_documents_by_id(self, *a, **k):
+        pass
+
+    def get_label_count(self, *a, **k):
+        pass
+
+    def query_by_embedding(self, *a, **k):
+        pass
+
+    def write_documents(self, *a, **k):
+        pass
+
+    def write_labels(self, *a, **k):
+        pass
+
+
+class MockRetriever(BaseRetriever):
+    outgoing_edges = 1
+
+    def retrieve(self, query: str, top_k: int):
+        pass
+
+
+class MockReader(BaseReader):
+    outgoing_edges = 1
+
+    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
+        pass
+
+    def predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None):
+        pass
+
+
 @pytest.fixture(scope="function", autouse=True)
 def gc_cleanup(request):
     """
@@ -295,7 +378,7 @@ def deepset_cloud_document_store(deepset_cloud_fixture):
 
 
 @pytest.fixture(scope="function")
 def rag_generator():
-    return RAGenerator(model_name_or_path="facebook/rag-token-nq", generator_type=RAGeneratorType.TOKEN, max_length=20)
+    return RAGenerator(model_name_or_path="facebook/rag-token-nq", generator_type="token", max_length=20)
 
 
 @pytest.fixture(scope="function")
diff --git a/test/samples/dc/pipeline_config.json b/test/samples/dc/pipeline_config.json
index 9efb574b0..5594f4a0a 100644
--- a/test/samples/dc/pipeline_config.json
+++ b/test/samples/dc/pipeline_config.json
@@ -1,5 +1,5 @@
 {
-    "version": "0.9",
+    "version": "unstable",
"name": "document_retrieval_1", "components": [ { @@ -40,7 +40,6 @@ "pipelines": [ { "name": "query", - "type": "Query", "nodes": [ { "name": "Retriever", @@ -52,7 +51,6 @@ }, { "name": "indexing", - "type": "Indexing", "nodes": [ { "name": "TextFileConverter", diff --git a/test/samples/pipeline/test_pipeline.yaml b/test/samples/pipeline/test_pipeline.yaml index 7d84214a2..b1306ce60 100644 --- a/test/samples/pipeline/test_pipeline.yaml +++ b/test/samples/pipeline/test_pipeline.yaml @@ -1,4 +1,4 @@ -version: '1.1.0' +version: 'unstable' components: - name: Reader @@ -11,7 +11,6 @@ components: type: ElasticsearchRetriever params: document_store: DocumentStore - custom_query: null - name: DocumentStore type: ElasticsearchDocumentStore params: @@ -29,33 +28,22 @@ components: type: TransformersDocumentClassifier params: batch_size: 16 - use_gpu: -1 + use_gpu: false - name: QueryTimeDocumentClassifier type: TransformersDocumentClassifier params: - use_gpu: -1 + use_gpu: false pipelines: - name: query_pipeline - type: Pipeline nodes: - name: ESRetriever inputs: [Query] - name: Reader inputs: [ESRetriever] - - name: ray_query_pipeline - type: RayPipeline - nodes: - - name: ESRetriever - replicas: 2 - inputs: [ Query ] - - name: Reader - inputs: [ ESRetriever ] - - name: query_pipeline_with_document_classifier - type: Pipeline nodes: - name: ESRetriever inputs: [Query] @@ -65,7 +53,6 @@ pipelines: inputs: [QueryTimeDocumentClassifier] - name: indexing_pipeline - type: Pipeline nodes: - name: PDFConverter inputs: [File] @@ -77,7 +64,6 @@ pipelines: inputs: [ESRetriever] - name: indexing_text_pipeline - type: Pipeline nodes: - name: TextConverter inputs: [File] @@ -89,7 +75,6 @@ pipelines: inputs: [ESRetriever] - name: indexing_pipeline_with_classifier - type: Pipeline nodes: - name: PDFConverter inputs: [File] diff --git a/test/samples/pipeline/test_pipeline_faiss_indexing.yaml b/test/samples/pipeline/test_pipeline_faiss_indexing.yaml index d11ec5413..9fb2a254f 100644 --- a/test/samples/pipeline/test_pipeline_faiss_indexing.yaml +++ b/test/samples/pipeline/test_pipeline_faiss_indexing.yaml @@ -1,4 +1,4 @@ -version: '1.1.0' +version: 'unstable' components: - name: DPRRetriever @@ -19,7 +19,6 @@ components: pipelines: - name: indexing_pipeline - type: Pipeline nodes: - name: PDFConverter inputs: [File] diff --git a/test/samples/pipeline/test_pipeline_faiss_retrieval.yaml b/test/samples/pipeline/test_pipeline_faiss_retrieval.yaml index 39130ef01..89a5cbf48 100644 --- a/test/samples/pipeline/test_pipeline_faiss_retrieval.yaml +++ b/test/samples/pipeline/test_pipeline_faiss_retrieval.yaml @@ -1,4 +1,4 @@ -version: '1.1.0' +version: 'unstable' components: - name: DPRRetriever @@ -13,7 +13,6 @@ components: pipelines: - name: query_pipeline - type: Pipeline nodes: - name: DPRRetriever inputs: [Query] diff --git a/test/samples/pipeline/test_pipeline_tfidfretriever.yaml b/test/samples/pipeline/test_pipeline_tfidfretriever.yaml index 8a0079723..b954d4282 100644 --- a/test/samples/pipeline/test_pipeline_tfidfretriever.yaml +++ b/test/samples/pipeline/test_pipeline_tfidfretriever.yaml @@ -1,4 +1,4 @@ -version: '1.1.0' +version: 'unstable' components: - name: Reader @@ -17,7 +17,6 @@ components: pipelines: - name: query_pipeline - type: Pipeline nodes: - name: Retriever inputs: [Query] diff --git a/test/samples/pipeline/test_ray_pipeline.yaml b/test/samples/pipeline/test_ray_pipeline.yaml new file mode 100644 index 000000000..3ec3864b0 --- /dev/null +++ b/test/samples/pipeline/test_ray_pipeline.yaml @@ -0,0 +1,46 
@@ +version: 'unstable' + +components: + - name: Reader + type: FARMReader + params: + no_ans_boost: -10 + model_name_or_path: deepset/roberta-base-squad2 + num_processes: 0 + - name: ESRetriever + type: ElasticsearchRetriever + params: + document_store: DocumentStore + - name: DocumentStore + type: ElasticsearchDocumentStore + params: + index: haystack_test + label_index: haystack_test_label + - name: PDFConverter + type: PDFToTextConverter + params: + remove_numeric_tables: false + - name: Preprocessor + type: PreProcessor + params: + clean_whitespace: true + - name: IndexTimeDocumentClassifier + type: TransformersDocumentClassifier + params: + batch_size: 16 + use_gpu: false + - name: QueryTimeDocumentClassifier + type: TransformersDocumentClassifier + params: + use_gpu: false + + +pipelines: + - name: ray_query_pipeline + type: RayPipeline + nodes: + - name: ESRetriever + replicas: 2 + inputs: [ Query ] + - name: Reader + inputs: [ ESRetriever ] diff --git a/test/test_distillation.py b/test/test_distillation.py index 11418f053..ec4df8594 100644 --- a/test/test_distillation.py +++ b/test/test_distillation.py @@ -3,7 +3,7 @@ from haystack.nodes import FARMReader from haystack.modeling.data_handler.processor import UnlabeledTextProcessor import torch -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH def create_checkpoint(model): diff --git a/test/test_document_store.py b/test/test_document_store.py index 68b102544..3c0733a97 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -8,7 +8,7 @@ from unittest.mock import Mock from elasticsearch import Elasticsearch from elasticsearch.exceptions import RequestError -from conftest import ( +from .conftest import ( deepset_cloud_fixture, get_document_store, MOCK_DC, diff --git a/test/test_eval.py b/test/test_eval.py index 9d9a959ae..3bbd6c637 100644 --- a/test/test_eval.py +++ b/test/test_eval.py @@ -1,11 +1,8 @@ import pytest import sys -from pathlib import Path from haystack.document_stores.base import BaseDocumentStore from haystack.document_stores.memory import InMemoryDocumentStore from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore -from haystack.nodes.answer_generator.transformers import RAGenerator, RAGeneratorType -from haystack.nodes.retriever.dense import EmbeddingRetriever from haystack.nodes.preprocessor import PreProcessor from haystack.nodes.evaluator import EvalAnswers, EvalDocuments from haystack.nodes.query_classifier.transformers import TransformersQueryClassifier @@ -19,10 +16,9 @@ from haystack.pipelines.standard_pipelines import ( RetrieverQuestionGenerationPipeline, TranslationWrapperPipeline, ) -from haystack.nodes.summarizer.transformers import TransformersSummarizer from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH @pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Causes OOM on windows github runner") diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py index 7c702f4c7..ba7c85d3c 100644 --- a/test/test_faiss_and_milvus.py +++ b/test/test_faiss_and_milvus.py @@ -13,7 +13,7 @@ from haystack.document_stores.weaviate import WeaviateDocumentStore from haystack.pipelines import Pipeline from haystack.nodes.retriever.dense import EmbeddingRetriever -from conftest import ensure_ids_are_correct_uuids +from .conftest import ensure_ids_are_correct_uuids DOCUMENTS = [ diff --git a/test/test_file_converter.py 
b/test/test_file_converter.py index 7b16a36d9..c5a731771 100644 --- a/test/test_file_converter.py +++ b/test/test_file_converter.py @@ -13,7 +13,7 @@ from haystack.nodes import ( ParsrConverter, ) -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH @pytest.mark.tika diff --git a/test/test_filetype_classifier.py b/test/test_filetype_classifier.py index c13858003..1676c4eb0 100644 --- a/test/test_filetype_classifier.py +++ b/test/test_filetype_classifier.py @@ -3,9 +3,9 @@ from pathlib import Path from haystack.nodes.file_classifier.file_type import FileTypeClassifier, DEFAULT_TYPES -def test_filetype_classifier_single_file(tmpdir): +def test_filetype_classifier_single_file(tmp_path): node = FileTypeClassifier() - test_files = [tmpdir / f"test.{extension}" for extension in DEFAULT_TYPES] + test_files = [tmp_path / f"test.{extension}" for extension in DEFAULT_TYPES] for edge_index, test_file in enumerate(test_files): output, edge = node.run(test_file) @@ -13,35 +13,35 @@ def test_filetype_classifier_single_file(tmpdir): assert output == {"file_paths": [test_file]} -def test_filetype_classifier_many_files(tmpdir): +def test_filetype_classifier_many_files(tmp_path): node = FileTypeClassifier() for edge_index, extension in enumerate(DEFAULT_TYPES): - test_files = [tmpdir / f"test_{idx}.{extension}" for idx in range(10)] + test_files = [tmp_path / f"test_{idx}.{extension}" for idx in range(10)] output, edge = node.run(test_files) assert edge == f"output_{edge_index+1}" assert output == {"file_paths": test_files} -def test_filetype_classifier_many_files_mixed_extensions(tmpdir): +def test_filetype_classifier_many_files_mixed_extensions(tmp_path): node = FileTypeClassifier() - test_files = [tmpdir / f"test.{extension}" for extension in DEFAULT_TYPES] + test_files = [tmp_path / f"test.{extension}" for extension in DEFAULT_TYPES] with pytest.raises(ValueError): node.run(test_files) -def test_filetype_classifier_unsupported_extension(tmpdir): +def test_filetype_classifier_unsupported_extension(tmp_path): node = FileTypeClassifier() - test_file = tmpdir / f"test.really_weird_extension" + test_file = tmp_path / f"test.really_weird_extension" with pytest.raises(ValueError): node.run(test_file) -def test_filetype_classifier_custom_extensions(tmpdir): +def test_filetype_classifier_custom_extensions(tmp_path): node = FileTypeClassifier(supported_types=["my_extension"]) - test_file = tmpdir / f"test.my_extension" + test_file = tmp_path / f"test.my_extension" output, edge = node.run(test_file) assert edge == f"output_1" assert output == {"file_paths": [test_file]} diff --git a/test/test_generator.py b/test/test_generator.py index f5a217b22..a245d873b 100644 --- a/test/test_generator.py +++ b/test/test_generator.py @@ -9,7 +9,7 @@ from haystack.nodes.answer_generator import Seq2SeqGenerator from haystack.pipelines import TranslationWrapperPipeline, GenerativeQAPipeline -from conftest import DOCS_WITH_EMBEDDINGS +from .conftest import DOCS_WITH_EMBEDDINGS # Keeping few (retriever,document_store) combination to reduce test time diff --git a/test/test_modeling_dpr.py b/test/test_modeling_dpr.py index f8d83ab86..c6a30c021 100644 --- a/test/test_modeling_dpr.py +++ b/test/test_modeling_dpr.py @@ -603,7 +603,7 @@ def test_dpr_context_only(): assert tensor_names == ["passage_input_ids", "passage_segment_ids", "passage_attention_mask", "label_ids"] -def test_dpr_processor_save_load(): +def test_dpr_processor_save_load(tmp_path): d = { "query": "big little lies season 2 how many episodes ?", 
"passages": [ @@ -646,9 +646,9 @@ def test_dpr_processor_save_load(): metric="text_similarity_metric", shuffle_negatives=False, ) - processor.save(save_dir="testsave/dpr_processor") + processor.save(save_dir=f"{tmp_path}/testsave/dpr_processor") dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False) - loadedprocessor = TextSimilarityProcessor.load_from_dir(load_dir="testsave/dpr_processor") + loadedprocessor = TextSimilarityProcessor.load_from_dir(load_dir=f"{tmp_path}/testsave/dpr_processor") dataset2, tensor_names, _ = loadedprocessor.dataset_from_dicts(dicts=[d], return_baskets=False) assert np.array_equal(dataset.tensors[0], dataset2.tensors[0]) @@ -667,7 +667,7 @@ def test_dpr_processor_save_load(): {"query": "facebook/dpr-question_encoder-single-nq-base", "passage": "facebook/dpr-ctx_encoder-single-nq-base"}, ], ) -def test_dpr_processor_save_load_non_bert_tokenizer(query_and_passage_model): +def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_model): """ This test compares 1) a model that was loaded from model hub with 2) a model from model hub that was saved to disk and then loaded from disk and @@ -729,7 +729,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(query_and_passage_model): model.connect_heads_with_processor(processor.tasks, require_labels=False) # save model that was loaded from model hub to disk - save_dir = "testsave/dpr_model" + save_dir = f"{tmp_path}/testsave/dpr_model" query_encoder_dir = "query_encoder" passage_encoder_dir = "passage_encoder" model.save(Path(save_dir), lm1_name=query_encoder_dir, lm2_name=passage_encoder_dir) @@ -841,7 +841,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(query_and_passage_model): assert np.array_equal(all_embeddings["query"][0], all_embeddings2["query"][0]) # save the model that was loaded from disk to disk - save_dir = "testsave/dpr_model" + save_dir = f"{tmp_path}/testsave/dpr_model" query_encoder_dir = "query_encoder" passage_encoder_dir = "passage_encoder" loaded_model.save(Path(save_dir), lm1_name=query_encoder_dir, lm2_name=passage_encoder_dir) diff --git a/test/test_modeling_processor.py b/test/test_modeling_processor.py index bc5af647a..a6643d737 100644 --- a/test/test_modeling_processor.py +++ b/test/test_modeling_processor.py @@ -6,7 +6,7 @@ from transformers import AutoTokenizer from haystack.modeling.data_handler.processor import SquadProcessor from haystack.modeling.model.tokenization import Tokenizer -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH # during inference (parameter return_baskets = False) we do not convert labels diff --git a/test/test_modeling_processor_saving_loading.py b/test/test_modeling_processor_saving_loading.py index 410e776b9..801044896 100644 --- a/test/test_modeling_processor_saving_loading.py +++ b/test/test_modeling_processor_saving_loading.py @@ -6,10 +6,10 @@ from haystack.modeling.model.tokenization import Tokenizer from haystack.modeling.utils import set_all_seeds import torch -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH -def test_processor_saving_loading(caplog): +def test_processor_saving_loading(tmp_path, caplog): if caplog is not None: caplog.set_level(logging.CRITICAL) @@ -31,7 +31,7 @@ def test_processor_saving_loading(caplog): dicts = processor.file_to_dicts(file=SAMPLES_PATH / "qa" / "dev-sample.json") data, tensor_names, _ = processor.dataset_from_dicts(dicts=dicts, indices=[1]) - save_dir = Path("testsave/processor") + save_dir = tmp_path / 
Path("testsave/processor") processor.save(save_dir) processor = processor.load_from_dir(save_dir) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index ed0465476..ce7bdcade 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -9,6 +9,7 @@ import pandas as pd import pytest from requests import PreparedRequest import responses +import logging import yaml from haystack import __version__, Document, Answer, JoinAnswers @@ -19,21 +20,33 @@ from haystack.nodes.other.join_docs import JoinDocuments from haystack.nodes.base import BaseComponent from haystack.nodes.retriever.base import BaseRetriever from haystack.nodes.retriever.sparse import ElasticsearchRetriever -from haystack.pipelines import Pipeline, DocumentSearchPipeline, RootNode, ExtractiveQAPipeline -from haystack.pipelines.config import _validate_user_input, validate_config +from haystack.pipelines import Pipeline, DocumentSearchPipeline, RootNode +from haystack.pipelines.config import validate_config_strings from haystack.pipelines.utils import generate_code +from haystack.errors import PipelineConfigError from haystack.nodes import DensePassageRetriever, EmbeddingRetriever, RouteDocuments, PreProcessor, TextConverter - -from conftest import MOCK_DC, DC_API_ENDPOINT, DC_API_KEY, DC_TEST_INDEX, SAMPLES_PATH, deepset_cloud_fixture from haystack.utils.deepsetcloud import DeepsetCloudError +from .conftest import ( + MOCK_DC, + DC_API_ENDPOINT, + DC_API_KEY, + DC_TEST_INDEX, + SAMPLES_PATH, + MockDocumentStore, + MockRetriever, + deepset_cloud_fixture, +) + class ParentComponent(BaseComponent): outgoing_edges = 1 def __init__(self, dependent: BaseComponent) -> None: super().__init__() - self.set_config(dependent=dependent) + + def run(*args, **kwargs): + logging.info("ParentComponent run() was called") class ParentComponent2(BaseComponent): @@ -41,157 +54,38 @@ class ParentComponent2(BaseComponent): def __init__(self, dependent: BaseComponent) -> None: super().__init__() - self.set_config(dependent=dependent) + + def run(*args, **kwargs): + logging.info("ParentComponent2 run() was called") class ChildComponent(BaseComponent): def __init__(self, some_key: str = None) -> None: super().__init__() - self.set_config(some_key=some_key) + + def run(*args, **kwargs): + logging.info("ChildComponent run() was called") -@pytest.mark.elasticsearch -@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) -def test_load_and_save_yaml(document_store, tmp_path): - # test correct load of indexing pipeline from yaml - pipeline = Pipeline.load_from_yaml( - SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="indexing_pipeline" - ) - pipeline.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf") - # test correct load of query pipeline from yaml - pipeline = Pipeline.load_from_yaml(SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="query_pipeline") - prediction = pipeline.run( - query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}} - ) - assert prediction["query"] == "Who made the PDF specification?" 
- assert prediction["answers"][0].answer == "Adobe Systems" - assert "_debug" not in prediction.keys() +class DummyRetriever(MockRetriever): + def __init__(self, document_store): + self.document_store = document_store - # test invalid pipeline name - with pytest.raises(Exception): - Pipeline.load_from_yaml(path=SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="invalid") - # test config export - pipeline.save_to_yaml(tmp_path / "test.yaml") - with open(tmp_path / "test.yaml", "r", encoding="utf-8") as stream: - saved_yaml = stream.read() - expected_yaml = f""" - components: - - name: ESRetriever - params: - document_store: ElasticsearchDocumentStore - type: ElasticsearchRetriever - - name: ElasticsearchDocumentStore - params: - index: haystack_test - label_index: haystack_test_label - type: ElasticsearchDocumentStore - - name: Reader - params: - model_name_or_path: deepset/roberta-base-squad2 - no_ans_boost: -10 - num_processes: 0 - type: FARMReader - pipelines: - - name: query - nodes: - - inputs: - - Query - name: ESRetriever - - inputs: - - ESRetriever - name: Reader - type: Pipeline - version: {__version__} - """ - assert saved_yaml.replace(" ", "").replace("\n", "") == expected_yaml.replace(" ", "").replace("\n", "") + def run(self): + test = "test" + return {"test": test}, "output_1" -@pytest.mark.elasticsearch -@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) -def test_load_and_save_yaml_prebuilt_pipelines(document_store, tmp_path): - # populating index - pipeline = Pipeline.load_from_yaml( - SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="indexing_pipeline" - ) - pipeline.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf") - # test correct load of query pipeline from yaml - pipeline = ExtractiveQAPipeline.load_from_yaml( - SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="query_pipeline" - ) - prediction = pipeline.run( - query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}} - ) - assert prediction["query"] == "Who made the PDF specification?" - assert prediction["answers"][0].answer == "Adobe Systems" - assert "_debug" not in prediction.keys() - - # test invalid pipeline name - with pytest.raises(Exception): - ExtractiveQAPipeline.load_from_yaml( - path=SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="invalid" - ) - # test config export - pipeline.save_to_yaml(tmp_path / "test.yaml") - with open(tmp_path / "test.yaml", "r", encoding="utf-8") as stream: - saved_yaml = stream.read() - expected_yaml = f""" - components: - - name: ESRetriever - params: - document_store: ElasticsearchDocumentStore - type: ElasticsearchRetriever - - name: ElasticsearchDocumentStore - params: - index: haystack_test - label_index: haystack_test_label - type: ElasticsearchDocumentStore - - name: Reader - params: - model_name_or_path: deepset/roberta-base-squad2 - no_ans_boost: -10 - num_processes: 0 - type: FARMReader - pipelines: - - name: query - nodes: - - inputs: - - Query - name: ESRetriever - - inputs: - - ESRetriever - name: Reader - type: Pipeline - version: {__version__} - """ - assert saved_yaml.replace(" ", "").replace("\n", "") == expected_yaml.replace(" ", "").replace("\n", "") - - -def test_load_tfidfretriever_yaml(tmp_path): - documents = [ - { - "content": "A Doc specifically talking about haystack. Haystack can be used to scale QA models to large document collections." 
- } - ] - pipeline = Pipeline.load_from_yaml( - SAMPLES_PATH / "pipeline" / "test_pipeline_tfidfretriever.yaml", pipeline_name="query_pipeline" - ) - with pytest.raises(Exception) as exc_info: - pipeline.run( - query="What can be used to scale QA models to large document collections?", - params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}}, - ) - exception_raised = str(exc_info.value) - assert "Retrieval requires dataframe df and tf-idf matrix" in exception_raised - - pipeline.get_node(name="Retriever").document_store.write_documents(documents=documents) - prediction = pipeline.run( - query="What can be used to scale QA models to large document collections?", - params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}}, - ) - assert prediction["query"] == "What can be used to scale QA models to large document collections?" - assert prediction["answers"][0].answer == "haystack" +class JoinNode(RootNode): + def run(self, output=None, inputs=None): + if inputs: + output = "" + for input_dict in inputs: + output += input_dict["output"] + return {"output": output}, "output_1" +@pytest.mark.integration @pytest.mark.elasticsearch def test_to_code_creates_same_pipelines(): index_pipeline = Pipeline.load_from_yaml( @@ -202,6 +96,7 @@ def test_to_code_creates_same_pipelines(): ) query_pipeline_code = query_pipeline.to_code(pipeline_variable_name="query_pipeline_from_code") index_pipeline_code = index_pipeline.to_code(pipeline_variable_name="index_pipeline_from_code") + exec(query_pipeline_code) exec(index_pipeline_code) assert locals()["query_pipeline_from_code"] is not None @@ -216,8 +111,7 @@ def test_get_config_creates_dependent_component(): pipeline = Pipeline() pipeline.add_node(component=parent, name="parent", inputs=["Query"]) - expected_pipelines = [{"name": "query", "type": "Pipeline", "nodes": [{"name": "parent", "inputs": ["Query"]}]}] - + expected_pipelines = [{"name": "query", "nodes": [{"name": "parent", "inputs": ["Query"]}]}] expected_components = [ {"name": "parent", "type": "ParentComponent", "params": {"dependent": "ChildComponent"}}, {"name": "ChildComponent", "type": "ChildComponent", "params": {}}, @@ -249,7 +143,6 @@ def test_get_config_creates_only_one_dependent_component_referenced_by_multiple_ expected_pipelines = [ { "name": "query", - "type": "Pipeline", "nodes": [ {"name": "Parent1", "inputs": ["Query"]}, {"name": "Parent2", "inputs": ["Query"]}, @@ -286,7 +179,6 @@ def test_get_config_creates_two_different_dependent_components_of_same_type(): expected_pipelines = [ { "name": "query", - "type": "Pipeline", "nodes": [ {"name": "ParentA", "inputs": ["Query"]}, {"name": "ParentB", "inputs": ["Query"]}, @@ -302,8 +194,34 @@ def test_get_config_creates_two_different_dependent_components_of_same_type(): assert expected_component in config["components"] +def test_get_config_component_with_superclass_arguments(): + class CustomBaseDocumentStore(MockDocumentStore): + def __init__(self, base_parameter: str): + self.base_parameter = base_parameter + + class CustomDocumentStore(CustomBaseDocumentStore): + def __init__(self, sub_parameter: int): + super().__init__(base_parameter="something") + self.sub_parameter = sub_parameter + + class CustomRetriever(MockRetriever): + def __init__(self, document_store): + super().__init__() + self.document_store = document_store + + document_store = CustomDocumentStore(sub_parameter=10) + retriever = CustomRetriever(document_store=document_store) + pipeline = Pipeline() + pipeline.add_node(retriever, name="Retriever", inputs=["Query"]) + + 
pipeline.get_config() + assert pipeline.get_document_store().sub_parameter == 10 + assert pipeline.get_document_store().base_parameter == "something" + + def test_generate_code_simple_pipeline(): config = { + "version": "unstable", "components": [ { "name": "retri", @@ -316,7 +234,7 @@ def test_generate_code_simple_pipeline(): "params": {"index": "my-index"}, }, ], - "pipelines": [{"name": "query", "type": "Pipeline", "nodes": [{"name": "retri", "inputs": ["Query"]}]}], + "pipelines": [{"name": "query", "nodes": [{"name": "retri", "inputs": ["Query"]}]}], } code = generate_code(pipeline_config=config, pipeline_variable_name="p", generate_imports=False) @@ -331,15 +249,15 @@ def test_generate_code_simple_pipeline(): def test_generate_code_imports(): pipeline_config = { + "version": "unstable", "components": [ {"name": "DocumentStore", "type": "ElasticsearchDocumentStore"}, {"name": "retri", "type": "ElasticsearchRetriever", "params": {"document_store": "DocumentStore"}}, - {"name": "retri2", "type": "EmbeddingRetriever", "params": {"document_store": "DocumentStore"}}, + {"name": "retri2", "type": "TfidfRetriever", "params": {"document_store": "DocumentStore"}}, ], "pipelines": [ { "name": "Query", - "type": "Pipeline", "nodes": [{"name": "retri", "inputs": ["Query"]}, {"name": "retri2", "inputs": ["Query"]}], } ], @@ -348,12 +266,12 @@ def test_generate_code_imports(): code = generate_code(pipeline_config=pipeline_config, pipeline_variable_name="p", generate_imports=True) assert code == ( "from haystack.document_stores import ElasticsearchDocumentStore\n" - "from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever\n" + "from haystack.nodes import ElasticsearchRetriever, TfidfRetriever\n" "from haystack.pipelines import Pipeline\n" "\n" "document_store = ElasticsearchDocumentStore()\n" "retri = ElasticsearchRetriever(document_store=document_store)\n" - "retri_2 = EmbeddingRetriever(document_store=document_store)\n" + "retri_2 = TfidfRetriever(document_store=document_store)\n" "\n" "p = Pipeline()\n" 'p.add_node(component=retri, name="retri", inputs=["Query"])\n' @@ -363,11 +281,12 @@ def test_generate_code_imports(): def test_generate_code_imports_no_pipeline_cls(): pipeline_config = { + "version": "unstable", "components": [ {"name": "DocumentStore", "type": "ElasticsearchDocumentStore"}, {"name": "retri", "type": "ElasticsearchRetriever", "params": {"document_store": "DocumentStore"}}, ], - "pipelines": [{"name": "Query", "type": "Pipeline", "nodes": [{"name": "retri", "inputs": ["Query"]}]}], + "pipelines": [{"name": "Query", "nodes": [{"name": "retri", "inputs": ["Query"]}]}], } code = generate_code( @@ -390,11 +309,12 @@ def test_generate_code_imports_no_pipeline_cls(): def test_generate_code_comment(): pipeline_config = { + "version": "unstable", "components": [ {"name": "DocumentStore", "type": "ElasticsearchDocumentStore"}, {"name": "retri", "type": "ElasticsearchRetriever", "params": {"document_store": "DocumentStore"}}, ], - "pipelines": [{"name": "Query", "type": "Pipeline", "nodes": [{"name": "retri", "inputs": ["Query"]}]}], + "pipelines": [{"name": "Query", "nodes": [{"name": "retri", "inputs": ["Query"]}]}], } comment = "This is my comment\n...and here is a new line" @@ -416,17 +336,17 @@ def test_generate_code_comment(): def test_generate_code_is_component_order_invariant(): pipeline_config = { + "version": "unstable", "pipelines": [ { "name": "Query", - "type": "Pipeline", "nodes": [ {"name": "EsRetriever", "inputs": ["Query"]}, {"name": "EmbeddingRetriever", 
"inputs": ["Query"]}, {"name": "JoinResults", "inputs": ["EsRetriever", "EmbeddingRetriever"]}, ], } - ] + ], } doc_store = {"name": "ElasticsearchDocumentStore", "type": "ElasticsearchDocumentStore"} @@ -471,52 +391,45 @@ def test_generate_code_is_component_order_invariant(): @pytest.mark.parametrize("input", ["\btest", " test", "#test", "+test", "\ttest", "\ntest", "test()"]) def test_validate_user_input_invalid(input): - with pytest.raises(ValueError, match="is not a valid config variable name"): - _validate_user_input(input) + with pytest.raises(PipelineConfigError, match="is not a valid variable name or value"): + validate_config_strings(input) @pytest.mark.parametrize( "input", ["test", "testName", "test_name", "test-name", "test-name1234", "http://localhost:8000/my-path"] ) def test_validate_user_input_valid(input): - _validate_user_input(input) + validate_config_strings(input) def test_validate_pipeline_config_invalid_component_name(): - with pytest.raises(ValueError, match="is not a valid config variable name"): - validate_config({"components": [{"name": "\btest"}]}) + with pytest.raises(PipelineConfigError, match="is not a valid variable name or value"): + validate_config_strings({"components": [{"name": "\btest"}]}) def test_validate_pipeline_config_invalid_component_type(): - with pytest.raises(ValueError, match="is not a valid config variable name"): - validate_config({"components": [{"name": "test", "type": "\btest"}]}) + with pytest.raises(PipelineConfigError, match="is not a valid variable name or value"): + validate_config_strings({"components": [{"name": "test", "type": "\btest"}]}) def test_validate_pipeline_config_invalid_component_param(): - with pytest.raises(ValueError, match="is not a valid config variable name"): - validate_config({"components": [{"name": "test", "type": "test", "params": {"key": "\btest"}}]}) + with pytest.raises(PipelineConfigError, match="is not a valid variable name or value"): + validate_config_strings({"components": [{"name": "test", "type": "test", "params": {"key": "\btest"}}]}) def test_validate_pipeline_config_invalid_component_param_key(): - with pytest.raises(ValueError, match="is not a valid config variable name"): - validate_config({"components": [{"name": "test", "type": "test", "params": {"\btest": "test"}}]}) + with pytest.raises(PipelineConfigError, match="is not a valid variable name or value"): + validate_config_strings({"components": [{"name": "test", "type": "test", "params": {"\btest": "test"}}]}) def test_validate_pipeline_config_invalid_pipeline_name(): - with pytest.raises(ValueError, match="is not a valid config variable name"): - validate_config({"components": [{"name": "test", "type": "test"}], "pipelines": [{"name": "\btest"}]}) - - -def test_validate_pipeline_config_invalid_pipeline_type(): - with pytest.raises(ValueError, match="is not a valid config variable name"): - validate_config( - {"components": [{"name": "test", "type": "test"}], "pipelines": [{"name": "test", "type": "\btest"}]} - ) + with pytest.raises(PipelineConfigError, match="is not a valid variable name or value"): + validate_config_strings({"components": [{"name": "test", "type": "test"}], "pipelines": [{"name": "\btest"}]}) def test_validate_pipeline_config_invalid_pipeline_node_name(): - with pytest.raises(ValueError, match="is not a valid config variable name"): - validate_config( + with pytest.raises(PipelineConfigError, match="is not a valid variable name or value"): + validate_config_strings( { "components": [{"name": "test", "type": "test"}], 
"pipelines": [{"name": "test", "type": "test", "nodes": [{"name": "\btest"}]}], @@ -525,8 +438,8 @@ def test_validate_pipeline_config_invalid_pipeline_node_name(): def test_validate_pipeline_config_invalid_pipeline_node_inputs(): - with pytest.raises(ValueError, match="is not a valid config variable name"): - validate_config( + with pytest.raises(PipelineConfigError, match="is not a valid variable name or value"): + validate_config_strings( { "components": [{"name": "test", "type": "test"}], "pipelines": [{"name": "test", "type": "test", "nodes": [{"name": "test", "inputs": ["\btest"]}]}], @@ -534,6 +447,15 @@ def test_validate_pipeline_config_invalid_pipeline_node_inputs(): ) +def test_validate_pipeline_config_recursive_config(): + pipeline_config = {} + node = {"config": pipeline_config} + pipeline_config["node"] = node + + with pytest.raises(PipelineConfigError, match="recursive"): + validate_config_strings(pipeline_config) + + @pytest.mark.usefixtures(deepset_cloud_fixture.__name__) @responses.activate def test_load_from_deepset_cloud_query(): @@ -1212,46 +1134,42 @@ def test_undeploy_on_deepset_cloud_timeout(): ) -# @pytest.mark.slow -# @pytest.mark.elasticsearch -# @pytest.mark.parametrize( -# "retriever_with_docs, document_store_with_docs", -# [("elasticsearch", "elasticsearch")], -# indirect=True, -# ) -@pytest.mark.parametrize( - "retriever_with_docs,document_store_with_docs", - [ - ("dpr", "elasticsearch"), - ("dpr", "faiss"), - ("dpr", "memory"), - ("dpr", "milvus1"), - ("embedding", "elasticsearch"), - ("embedding", "faiss"), - ("embedding", "memory"), - ("embedding", "milvus1"), - ("elasticsearch", "elasticsearch"), - ("es_filter_only", "elasticsearch"), - ("tfidf", "memory"), - ], - indirect=True, -) -def test_graph_creation(retriever_with_docs, document_store_with_docs): +def test_graph_creation_invalid_edge(): + docstore = MockDocumentStore() + retriever = DummyRetriever(document_store=docstore) pipeline = Pipeline() - pipeline.add_node(name="ES", component=retriever_with_docs, inputs=["Query"]) + pipeline.add_node(name="DocStore", component=docstore, inputs=["Query"]) - with pytest.raises(AssertionError): - pipeline.add_node(name="Reader", component=retriever_with_docs, inputs=["ES.output_2"]) + with pytest.raises(PipelineConfigError, match="'output_2' from 'DocStore'"): + pipeline.add_node(name="Retriever", component=retriever, inputs=["DocStore.output_2"]) - with pytest.raises(AssertionError): - pipeline.add_node(name="Reader", component=retriever_with_docs, inputs=["ES.wrong_edge_label"]) - with pytest.raises(Exception): - pipeline.add_node(name="Reader", component=retriever_with_docs, inputs=["InvalidNode"]) +def test_graph_creation_non_existing_edge(): + docstore = MockDocumentStore() + retriever = DummyRetriever(document_store=docstore) + pipeline = Pipeline() + pipeline.add_node(name="DocStore", component=docstore, inputs=["Query"]) - with pytest.raises(Exception): - pipeline = Pipeline() - pipeline.add_node(name="ES", component=retriever_with_docs, inputs=["InvalidNode"]) + with pytest.raises(PipelineConfigError, match="'wrong_edge_label' is not a valid edge name"): + pipeline.add_node(name="Retriever", component=retriever, inputs=["DocStore.wrong_edge_label"]) + + +def test_graph_creation_invalid_node(): + docstore = MockDocumentStore() + retriever = DummyRetriever(document_store=docstore) + pipeline = Pipeline() + pipeline.add_node(name="DocStore", component=docstore, inputs=["Query"]) + + with pytest.raises(PipelineConfigError, match="Cannot find node 
'InvalidNode'"): + pipeline.add_node(name="Retriever", component=retriever, inputs=["InvalidNode"]) + + +def test_graph_creation_invalid_root_node(): + docstore = MockDocumentStore() + pipeline = Pipeline() + + with pytest.raises(PipelineConfigError, match="Root node 'InvalidNode' is invalid"): + pipeline.add_node(name="DocStore", component=docstore, inputs=["InvalidNode"]) def test_parallel_paths_in_pipeline_graph(): @@ -1414,10 +1332,7 @@ def test_pipeline_components(): def test_pipeline_get_document_store_from_components(): - class DummyDocumentStore(BaseDocumentStore): - pass - - doc_store = DummyDocumentStore() + doc_store = MockDocumentStore() pipeline = Pipeline() pipeline.add_node(name="A", component=doc_store, inputs=["File"]) @@ -1425,11 +1340,8 @@ def test_pipeline_get_document_store_from_components(): def test_pipeline_get_document_store_from_components_multiple_doc_stores(): - class DummyDocumentStore(BaseDocumentStore): - pass - - doc_store_a = DummyDocumentStore() - doc_store_b = DummyDocumentStore() + doc_store_a = MockDocumentStore() + doc_store_b = MockDocumentStore() pipeline = Pipeline() pipeline.add_node(name="A", component=doc_store_a, inputs=["File"]) pipeline.add_node(name="B", component=doc_store_b, inputs=["File"]) @@ -1439,18 +1351,7 @@ def test_pipeline_get_document_store_from_components_multiple_doc_stores(): def test_pipeline_get_document_store_from_retriever(): - class DummyRetriever(BaseRetriever): - def __init__(self, document_store): - self.document_store = document_store - - def run(self): - test = "test" - return {"test": test}, "output_1" - - class DummyDocumentStore(BaseDocumentStore): - pass - - doc_store = DummyDocumentStore() + doc_store = MockDocumentStore() retriever = DummyRetriever(document_store=doc_store) pipeline = Pipeline() pipeline.add_node(name="A", component=retriever, inputs=["Query"]) @@ -1459,26 +1360,7 @@ def test_pipeline_get_document_store_from_retriever(): def test_pipeline_get_document_store_from_dual_retriever(): - class DummyRetriever(BaseRetriever): - def __init__(self, document_store): - self.document_store = document_store - - def run(self): - test = "test" - return {"test": test}, "output_1" - - class DummyDocumentStore(BaseDocumentStore): - pass - - class JoinNode(RootNode): - def run(self, output=None, inputs=None): - if inputs: - output = "" - for input_dict in inputs: - output += input_dict["output"] - return {"output": output}, "output_1" - - doc_store = DummyDocumentStore() + doc_store = MockDocumentStore() retriever_a = DummyRetriever(document_store=doc_store) retriever_b = DummyRetriever(document_store=doc_store) pipeline = Pipeline() @@ -1490,27 +1372,8 @@ def test_pipeline_get_document_store_from_dual_retriever(): def test_pipeline_get_document_store_multiple_doc_stores_from_dual_retriever(): - class DummyRetriever(BaseRetriever): - def __init__(self, document_store): - self.document_store = document_store - - def run(self): - test = "test" - return {"test": test}, "output_1" - - class DummyDocumentStore(BaseDocumentStore): - pass - - class JoinNode(RootNode): - def run(self, output=None, inputs=None): - if inputs: - output = "" - for input_dict in inputs: - output += input_dict["output"] - return {"output": output}, "output_1" - - doc_store_a = DummyDocumentStore() - doc_store_b = DummyDocumentStore() + doc_store_a = MockDocumentStore() + doc_store_b = MockDocumentStore() retriever_a = DummyRetriever(document_store=doc_store_a) retriever_b = DummyRetriever(document_store=doc_store_b) pipeline = Pipeline() diff 
--git a/test/test_pipeline_debug_and_validation.py b/test/test_pipeline_debug_and_validation.py index a71d3c987..851b8a456 100644 --- a/test/test_pipeline_debug_and_validation.py +++ b/test/test_pipeline_debug_and_validation.py @@ -6,7 +6,19 @@ import pytest from haystack.pipelines import Pipeline, RootNode from haystack.nodes import FARMReader, ElasticsearchRetriever -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH, MockRetriever as BaseMockRetriever, MockReader + + +class MockRetriever(BaseMockRetriever): + def retrieve(self, *args, **kwargs): + top_k = None + if "top_k" in kwargs.keys(): + top_k = kwargs["top_k"] + elif len(args) > 0: + top_k = args[-1] + + if top_k and not isinstance(top_k, int): + raise ValueError("TEST ERROR!") @pytest.mark.elasticsearch @@ -132,19 +144,34 @@ def test_global_debug_attributes_override_node_ones(document_store_with_docs, tm assert prediction["_debug"]["Reader"]["output"] -def test_invalid_run_args(): - pipeline = Pipeline.load_from_yaml(SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="query_pipeline") - with pytest.raises(Exception) as exc: - pipeline.run(params={"ESRetriever": {"top_k": 10}}) - assert "run() missing 1 required positional argument: 'query'" in str(exc.value) +def test_missing_top_level_arg(): + pipeline = Pipeline() + pipeline.add_node(component=MockRetriever(), name="Retriever", inputs=["Query"]) + pipeline.add_node(component=MockReader(), name="Reader", inputs=["Retriever"]) with pytest.raises(Exception) as exc: - pipeline.run(invalid_query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}}) + pipeline.run(params={"Retriever": {"top_k": 10}}) + assert "Must provide a 'query' parameter" in str(exc.value) + + +def test_unexpected_top_level_arg(): + pipeline = Pipeline() + pipeline.add_node(component=MockRetriever(), name="Retriever", inputs=["Query"]) + pipeline.add_node(component=MockReader(), name="Reader", inputs=["Retriever"]) + + with pytest.raises(Exception) as exc: + pipeline.run(invalid_query="Who made the PDF specification?", params={"Retriever": {"top_k": 10}}) assert "run() got an unexpected keyword argument 'invalid_query'" in str(exc.value) + +def test_unexpected_node_arg(): + pipeline = Pipeline() + pipeline.add_node(component=MockRetriever(), name="Retriever", inputs=["Query"]) + pipeline.add_node(component=MockReader(), name="Reader", inputs=["Retriever"]) + with pytest.raises(Exception) as exc: - pipeline.run(query="Who made the PDF specification?", params={"ESRetriever": {"invalid": 10}}) - assert "Invalid parameter 'invalid' for the node 'ESRetriever'" in str(exc.value) + pipeline.run(query="Who made the PDF specification?", params={"Retriever": {"invalid": 10}}) + assert "Invalid parameter 'invalid' for the node 'Retriever'" in str(exc.value) def test_debug_info_propagation(): diff --git a/test/test_pipeline_yaml.py b/test/test_pipeline_yaml.py new file mode 100644 index 000000000..cf32bdbcc --- /dev/null +++ b/test/test_pipeline_yaml.py @@ -0,0 +1,670 @@ +import pytest +import json +import numpy as np +import networkx as nx +from enum import Enum +from pydantic.dataclasses import dataclass + +import haystack +from haystack import Pipeline +from haystack import document_stores +from haystack.document_stores.base import BaseDocumentStore +from haystack.nodes import _json_schema +from haystack.nodes import FileTypeClassifier +from haystack.errors import HaystackError, PipelineConfigError, PipelineSchemaError + +from .conftest import SAMPLES_PATH, MockNode, 
MockDocumentStore, MockReader, MockRetriever +from . import conftest + + +# +# Fixtures +# + + +@pytest.fixture(autouse=True) +def mock_json_schema(request, monkeypatch, tmp_path): + """ + JSON schema with the unstable version and only mocked nodes. + """ + # Do not patch integration tests + if "integration" in request.keywords: + return + + # Mock the subclasses list to make it very small, containing only mock nodes + monkeypatch.setattr( + haystack.nodes._json_schema, + "find_subclasses_in_modules", + lambda *a, **k: [(conftest, MockDocumentStore), (conftest, MockReader), (conftest, MockRetriever)], + ) + # Point the JSON schema path to tmp_path + monkeypatch.setattr(haystack.pipelines.config, "JSON_SCHEMAS_PATH", tmp_path) + + # Generate mock schema in tmp_path + filename = f"haystack-pipeline-unstable.schema.json" + test_schema = _json_schema.get_json_schema(filename=filename, compatible_versions=["unstable"]) + + with open(tmp_path / filename, "w") as schema_file: + json.dump(test_schema, schema_file, indent=4) + + +# +# Integration +# + + +@pytest.mark.integration +@pytest.mark.elasticsearch +def test_load_and_save_from_yaml(tmp_path): + config_path = SAMPLES_PATH / "pipeline" / "test_pipeline.yaml" + + # Test the indexing pipeline: + # Load it + indexing_pipeline = Pipeline.load_from_yaml(path=config_path, pipeline_name="indexing_pipeline") + + # Check if it works + indexing_pipeline.get_document_store().delete_documents() + assert indexing_pipeline.get_document_store().get_document_count() == 0 + indexing_pipeline.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf") + assert indexing_pipeline.get_document_store().get_document_count() > 0 + + # Save it + new_indexing_config = tmp_path / "test_indexing.yaml" + indexing_pipeline.save_to_yaml(new_indexing_config) + + # Re-load it and compare the resulting pipelines + new_indexing_pipeline = Pipeline.load_from_yaml(path=new_indexing_config) + assert nx.is_isomorphic(new_indexing_pipeline.graph, indexing_pipeline.graph) + + # Check that modifying a pipeline modifies the output YAML + modified_indexing_pipeline = Pipeline.load_from_yaml(path=new_indexing_config) + modified_indexing_pipeline.add_node(FileTypeClassifier(), name="file_classifier", inputs=["File"]) + assert not nx.is_isomorphic(new_indexing_pipeline.graph, modified_indexing_pipeline.graph) + + # Test the query pipeline: + # Load it + query_pipeline = Pipeline.load_from_yaml(path=config_path, pipeline_name="query_pipeline") + + # Check if it works + prediction = query_pipeline.run( + query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}} + ) + assert prediction["query"] == "Who made the PDF specification?" 
+ assert prediction["answers"][0].answer == "Adobe Systems" + assert "_debug" not in prediction.keys() + + # Save it + new_query_config = tmp_path / "test_query.yaml" + query_pipeline.save_to_yaml(new_query_config) + + # Re-load it and compare the resulting pipelines + new_query_pipeline = Pipeline.load_from_yaml(path=new_query_config) + assert nx.is_isomorphic(new_query_pipeline.graph, query_pipeline.graph) + + # Check that different pipelines produce different files + assert not nx.is_isomorphic(new_query_pipeline.graph, new_indexing_pipeline.graph) + + +# +# Unit +# + + +def test_load_yaml(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: retriever + type: MockRetriever + - name: reader + type: MockReader + pipelines: + - name: query + nodes: + - name: retriever + inputs: + - Query + - name: reader + inputs: + - retriever + """ + ) + pipeline = Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert len(pipeline.graph.nodes) == 3 + assert isinstance(pipeline.get_node("retriever"), MockRetriever) + assert isinstance(pipeline.get_node("reader"), MockReader) + + +def test_load_yaml_non_existing_file(): + with pytest.raises(FileNotFoundError): + Pipeline.load_from_yaml(path=SAMPLES_PATH / "pipeline" / "I_dont_exist.yml") + + +def test_load_yaml_invalid_yaml(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write("this is not valid YAML!") + with pytest.raises(PipelineConfigError): + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + +def test_load_yaml_missing_version(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + """ + components: + - name: docstore + type: MockDocumentStore + pipelines: + - name: my_pipeline + nodes: + - name: docstore + inputs: + - Query + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "version" in str(e) + + +def test_load_yaml_non_existing_version(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + """ + version: random + components: + - name: docstore + type: MockDocumentStore + pipelines: + - name: my_pipeline + nodes: + - name: docstore + inputs: + - Query + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "version" in str(e) and "random" in str(e) + + +def test_load_yaml_incompatible_version(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + """ + version: 1.1.0 + components: + - name: docstore + type: MockDocumentStore + pipelines: + - name: my_pipeline + nodes: + - name: docstore + inputs: + - Query + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "version" in str(e) and "1.1.0" in str(e) + + +def test_load_yaml_no_components(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + pipelines: + - name: my_pipeline + nodes: + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "components" in str(e) + + +def test_load_yaml_wrong_component(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: docstore + type: ImaginaryDocumentStore + pipelines: + - 
name: my_pipeline + nodes: + - name: docstore + inputs: + - Query + """ + ) + with pytest.raises(HaystackError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "ImaginaryDocumentStore" in str(e) + + +def test_load_yaml_custom_component(tmp_path): + class CustomNode(MockNode): + def __init__(self, param: int): + self.param = param + + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + params: + param: 1 + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + +def test_load_yaml_custom_component_with_helper_class_in_init(tmp_path): + """ + This test can work from the perspective of YAML schema validation: + HelperClass is picked up correctly and everything gets loaded. + + However, for now we have decided to disable this feature. + See haystack/nodes/_json_schema.py for details. + """ + + @dataclass # Makes this test class JSON serializable + class HelperClass: + def __init__(self, another_param: str): + self.param = another_param + + class CustomNode(MockNode): + def __init__(self, some_exotic_parameter: HelperClass = HelperClass("1")): + self.some_exotic_parameter = some_exotic_parameter + + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + with pytest.raises(PipelineSchemaError, match="takes object instances as parameters in its __init__ function"): + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + +def test_load_yaml_custom_component_with_helper_class_in_yaml(tmp_path): + """ + This test can work from the perspective of YAML schema validation: + HelperClass is picked up correctly and everything gets loaded. + + However, for now we have decided to disable this feature. + See haystack/nodes/_json_schema.py for details. + """ + + class HelperClass: + def __init__(self, another_param: str): + self.param = another_param + + class CustomNode(MockNode): + def __init__(self, some_exotic_parameter: HelperClass): + self.some_exotic_parameter = some_exotic_parameter + + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + params: + some_exotic_parameter: HelperClass("hello") + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + with pytest.raises(PipelineConfigError, match="not a valid variable name or value"): + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + +def test_load_yaml_custom_component_with_enum_in_init(tmp_path): + """ + This test can work from the perspective of YAML schema validation: + Flags is picked up correctly and everything gets loaded. + + However, for now we have decided to disable this feature. + See haystack/nodes/_json_schema.py for details. 
+ """ + + class Flags(Enum): + FIRST_VALUE = 1 + SECOND_VALUE = 2 + + class CustomNode(MockNode): + def __init__(self, some_exotic_parameter: Flags = None): + self.some_exotic_parameter = some_exotic_parameter + + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + with pytest.raises(PipelineSchemaError, match="takes object instances as parameters in its __init__ function"): + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + +def test_load_yaml_custom_component_with_enum_in_yaml(tmp_path): + """ + This test can work from the perspective of YAML schema validation: + Flags is picked up correctly and everything gets loaded. + + However, for now we have decided to disable this feature. + See haystack/nodes/_json_schema.py for details. + """ + + class Flags(Enum): + FIRST_VALUE = 1 + SECOND_VALUE = 2 + + class CustomNode(MockNode): + def __init__(self, some_exotic_parameter: Flags): + self.some_exotic_parameter = some_exotic_parameter + + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + params: + some_exotic_parameter: Flags.SECOND_VALUE + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + with pytest.raises(PipelineSchemaError, match="takes object instances as parameters in its __init__ function"): + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + +def test_load_yaml_custom_component_with_external_constant(tmp_path): + """ + This is a potential pitfall. The code should work as described here. 
+ """ + + class AnotherClass: + CLASS_CONSTANT = "str" + + class CustomNode(MockNode): + def __init__(self, some_exotic_parameter: str): + self.some_exotic_parameter = some_exotic_parameter + + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + params: + some_exotic_parameter: AnotherClass.CLASS_CONSTANT # Will *NOT* be resolved + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + pipeline = Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + node = pipeline.get_node("custom_node") + assert node.some_exotic_parameter == "AnotherClass.CLASS_CONSTANT" + + +def test_load_yaml_custom_component_with_superclass(tmp_path): + class BaseCustomNode(MockNode): + pass + + class CustomNode(BaseCustomNode): + def __init__(self, some_exotic_parameter: str): + self.some_exotic_parameter = some_exotic_parameter + + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + params: + some_exotic_parameter: value + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + +def test_load_yaml_no_pipelines(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: docstore + type: MockDocumentStore + pipelines: + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "pipeline" in str(e) + + +def test_load_yaml_invalid_pipeline_name(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: docstore + type: MockDocumentStore + pipelines: + - name: my_pipeline + nodes: + - name: docstore + inputs: + - Query + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml", pipeline_name="invalid") + assert "invalid" in str(e) and "pipeline" in str(e) + + +def test_load_yaml_pipeline_with_wrong_nodes(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: docstore + type: MockDocumentStore + pipelines: + - name: my_pipeline + nodes: + - name: not_existing_node + inputs: + - Query + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "not_existing_node" in str(e) + + +def test_load_yaml_pipeline_not_acyclic_graph(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: retriever + type: MockRetriever + - name: reader + type: MockReader + pipelines: + - name: my_pipeline + nodes: + - name: retriever + inputs: + - reader + - name: reader + inputs: + - retriever + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "reader" in str(e) or "retriever" in str(e) + assert "loop" in str(e) + + +def test_load_yaml_wrong_root(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: retriever + type: MockRetriever + pipelines: + - name: my_pipeline + nodes: + - name: retriever + inputs: + - Nothing + """ + ) + 
with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "Nothing" in str(e) + assert "root" in str(e).lower() + + +def test_load_yaml_two_roots(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: retriever + type: MockRetriever + - name: retriever_2 + type: MockRetriever + pipelines: + - name: my_pipeline + nodes: + - name: retriever + inputs: + - Query + - name: retriever_2 + inputs: + - File + """ + ) + with pytest.raises(PipelineConfigError) as e: + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert "File" in str(e) or "Query" in str(e) + + +def test_load_yaml_disconnected_component(tmp_path): + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: docstore + type: MockDocumentStore + - name: retriever + type: MockRetriever + pipelines: + - name: query + nodes: + - name: docstore + inputs: + - Query + """ + ) + pipeline = Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + assert len(pipeline.graph.nodes) == 2 + assert isinstance(pipeline.get_document_store(), MockDocumentStore) + assert not pipeline.get_node("retriever") + + +def test_save_yaml(tmp_path): + pipeline = Pipeline() + pipeline.add_node(MockRetriever(), name="retriever", inputs=["Query"]) + pipeline.save_to_yaml(tmp_path / "saved_pipeline.yml") + + with open(tmp_path / "saved_pipeline.yml", "r") as saved_yaml: + content = saved_yaml.read() + + assert content.count("retriever") == 2 + assert "MockRetriever" in content + assert "Query" in content + assert f"version: {haystack.__version__}" in content + + +def test_save_yaml_overwrite(tmp_path): + pipeline = Pipeline() + retriever = MockRetriever() + pipeline.add_node(component=retriever, name="retriever", inputs=["Query"]) + + with open(tmp_path / "saved_pipeline.yml", "w") as _: + pass + + pipeline.save_to_yaml(tmp_path / "saved_pipeline.yml") + + with open(tmp_path / "saved_pipeline.yml", "r") as saved_yaml: + content = saved_yaml.read() + assert content != "" diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py index 80dac4e0d..93a5b9952 100644 --- a/test/test_preprocessor.py +++ b/test/test_preprocessor.py @@ -4,7 +4,7 @@ from haystack import Document from haystack.nodes.file_converter.pdf import PDFToTextConverter from haystack.nodes.preprocessor.preprocessor import PreProcessor -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH TEXT = """ This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. 
This is a sample sentence in diff --git a/test/test_ray.py b/test/test_ray.py index 693e2d720..6cf63582e 100644 --- a/test/test_ray.py +++ b/test/test_ray.py @@ -5,23 +5,10 @@ import ray from haystack.pipelines import RayPipeline -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH -@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) -def test_load_pipeline(document_store_with_docs): - pipeline = RayPipeline.load_from_yaml( - SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="ray_query_pipeline", num_cpus=8 - ) - prediction = pipeline.run(query="Who lives in Berlin?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}}) - - assert ray.serve.get_deployment(name="ESRetriever").num_replicas == 2 - assert ray.serve.get_deployment(name="Reader").num_replicas == 1 - assert prediction["query"] == "Who lives in Berlin?" - assert prediction["answers"][0].answer == "Carla" - - -@pytest.fixture(scope="function", autouse=True) +@pytest.fixture(autouse=True) def shutdown_ray(): yield try: @@ -30,3 +17,17 @@ def shutdown_ray(): ray.shutdown() except: pass + + +@pytest.mark.integration +@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +def test_load_pipeline(document_store_with_docs): + pipeline = RayPipeline.load_from_yaml( + SAMPLES_PATH / "pipeline" / "test_ray_pipeline.yaml", pipeline_name="ray_query_pipeline", num_cpus=8 + ) + prediction = pipeline.run(query="Who lives in Berlin?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}}) + + assert ray.serve.get_deployment(name="ESRetriever").num_replicas == 2 + assert ray.serve.get_deployment(name="Reader").num_replicas == 1 + assert prediction["query"] == "Who lives in Berlin?" + assert prediction["answers"][0].answer == "Carla" diff --git a/test/test_retriever.py b/test/test_retriever.py index a2fb39095..3b4a86e73 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -15,7 +15,7 @@ from haystack.nodes.retriever.dense import DensePassageRetriever, TableTextRetri from haystack.nodes.retriever.sparse import ElasticsearchRetriever, ElasticsearchFilterOnlyRetriever, TfidfRetriever from transformers import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH @pytest.fixture() @@ -244,8 +244,8 @@ def test_table_text_retriever_embedding(document_store, retriever, docs): @pytest.mark.parametrize("retriever", ["dpr"], indirect=True) @pytest.mark.parametrize("document_store", ["memory"], indirect=True) -def test_dpr_saving_and_loading(retriever, document_store): - retriever.save("test_dpr_save") +def test_dpr_saving_and_loading(tmp_path, retriever, document_store): + retriever.save(f"{tmp_path}/test_dpr_save") def sum_params(model): s = [] @@ -258,7 +258,7 @@ def test_dpr_saving_and_loading(retriever, document_store): original_sum_passage = sum_params(retriever.passage_encoder) del retriever - loaded_retriever = DensePassageRetriever.load("test_dpr_save", document_store) + loaded_retriever = DensePassageRetriever.load(f"{tmp_path}/test_dpr_save", document_store) loaded_sum_query = sum_params(loaded_retriever.query_encoder) loaded_sum_passage = sum_params(loaded_retriever.passage_encoder) @@ -292,8 +292,8 @@ def test_dpr_saving_and_loading(retriever, document_store): @pytest.mark.parametrize("retriever", ["table_text_retriever"], indirect=True) @pytest.mark.embedding_dim(512) -def test_table_text_retriever_saving_and_loading(retriever, 
document_store): - retriever.save("test_table_text_retriever_save") +def test_table_text_retriever_saving_and_loading(tmp_path, retriever, document_store): + retriever.save(f"{tmp_path}/test_table_text_retriever_save") def sum_params(model): s = [] @@ -307,7 +307,7 @@ def test_table_text_retriever_saving_and_loading(retriever, document_store): original_sum_table = sum_params(retriever.table_encoder) del retriever - loaded_retriever = TableTextRetriever.load("test_table_text_retriever_save", document_store) + loaded_retriever = TableTextRetriever.load(f"{tmp_path}/test_table_text_retriever_save", document_store) loaded_sum_query = sum_params(loaded_retriever.query_encoder) loaded_sum_passage = sum_params(loaded_retriever.passage_encoder) diff --git a/test/test_standard_pipelines.py b/test/test_standard_pipelines.py index bec71e53d..00764d4b9 100644 --- a/test/test_standard_pipelines.py +++ b/test/test_standard_pipelines.py @@ -16,7 +16,7 @@ from haystack.nodes import ( ) from haystack.schema import Document -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH @pytest.mark.parametrize( diff --git a/test/test_summarizer_translation.py b/test/test_summarizer_translation.py index e37f6ecca..5f263c670 100644 --- a/test/test_summarizer_translation.py +++ b/test/test_summarizer_translation.py @@ -2,7 +2,7 @@ import pytest from haystack.pipelines import TranslationWrapperPipeline, SearchSummarizationPipeline from haystack.nodes import DensePassageRetriever, EmbeddingRetriever -from test_summarizer import SPLIT_DOCS +from .test_summarizer import SPLIT_DOCS # Keeping few (retriever,document_store) combination to reduce test time @pytest.mark.slow diff --git a/test/test_tokenization.py b/test/test_tokenization.py index d87f23129..0fdb2e084 100644 --- a/test/test_tokenization.py +++ b/test/test_tokenization.py @@ -192,7 +192,7 @@ def test_bert_tokenizer_all_meta(caplog): ] -def test_save_load(caplog): +def test_save_load(tmp_path, caplog): caplog.set_level(logging.CRITICAL) lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"] @@ -209,7 +209,7 @@ def test_save_load(caplog): for tokenizer in tokenizers: tokenizer_type = tokenizer.__class__.__name__ - save_dir = f"testsave/{tokenizer_type}" + save_dir = f"{tmp_path}/testsave/{tokenizer_type}" tokenizer.save_pretrained(save_dir) tokenizer_loaded = Tokenizer.load(save_dir, tokenizer_class=tokenizer_type) encoded_before = tokenizer.encode_plus(basic_text).encodings[0] diff --git a/test/test_utils.py b/test/test_utils.py index d6b10d6b7..97c2e4ac6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -7,7 +7,7 @@ from haystack.utils.cleaning import clean_wiki_text from haystack.utils.augment_squad import augment_squad from haystack.utils.squad_data import SquadData -from conftest import SAMPLES_PATH +from .conftest import SAMPLES_PATH def test_convert_files_to_dicts(): diff --git a/test/test_weaviate.py b/test/test_weaviate.py index 5e46e015f..0216caadc 100644 --- a/test/test_weaviate.py +++ b/test/test_weaviate.py @@ -1,7 +1,7 @@ import numpy as np import pytest from haystack.schema import Document -from conftest import get_document_store +from .conftest import get_document_store import uuid embedding_dim = 768
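
A quick usage sketch of the validation behavior the new tests pin down (not part of the patch itself). It assumes a Haystack checkout with this change applied; validate_config_strings and PipelineConfigError are the names introduced in the diff, and both inputs mirror the test cases in test_pipeline.py above:

from haystack.pipelines.config import validate_config_strings
from haystack.errors import PipelineConfigError

# Strings that could smuggle executable code into a config are rejected,
# whether they appear as component names, types, or parameter values.
try:
    validate_config_strings({"components": [{"name": "test", "type": "test()"}]})
except PipelineConfigError as error:
    print(f"Rejected: {error}")  # "... is not a valid variable name or value"

# Self-referencing configurations are detected instead of recursing forever.
config = {}
config["node"] = {"config": config}
try:
    validate_config_strings(config)
except PipelineConfigError as error:
    print(f"Rejected: {error}")  # the message mentions the recursive structure

Schema- and graph-level problems (missing or unknown version, nonexistent nodes, loops, invalid root nodes) surface through Pipeline.load_from_yaml instead, as exercised by the new test_pipeline_yaml.py suite.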