From ae712fe6bf087c717f3e38e4e87d2347165fc12b Mon Sep 17 00:00:00 2001 From: Sara Zan Date: Fri, 1 Apr 2022 14:37:34 +0200 Subject: [PATCH] Upgrade `weaviate-client` to `3.3.3` and fix `get_all_documents` (#1895) * Fix 'bug' on Weaviate only returning max. 100 docs on get_all_documents * Add type * Update Weaviate version on the CI * Fix bug on get_document_count where there are no documents * Add more info in the docstrings of get_all_documents and get_all_documents_generator * Add latest docstring and tutorial changes * Apply Black * Update Documentation & Code Style * Trigger pipeline * Update Documentation & Code Style * Include StefanBogdan feedback * Fix mypy issues and LogicalFilterClause * Add more types * Update Documentation & Code Style * update setup.cfg * Upgrade weaviate containers too * Allow to filter for content field in Weaviate * Use convert_to_weaviate instead of convert_to_pinecone * Fix _get_all_documents_in_index * Update docstrings and docs * Catching an exception in get_document(s)_by_id Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: bogdankostic --- .github/workflows/linux_ci.yml | 2 +- CONTRIBUTING.md | 2 +- docs/_src/api/api/document_store.md | 46 ++++++- docs/_src/usage/usage/document_store.md | 4 +- docs/v0.10.0/_src/api/api/document_store.md | 14 +- docs/v0.9.0/_src/api/api/document_store.md | 14 +- docs/v1.0.0/_src/api/api/document_store.md | 14 +- docs/v1.1.0/_src/api/api/document_store.md | 14 +- docs/v1.2.0/_src/api/api/document_store.md | 14 +- docs/v1.3.0/_src/api/api/document_store.md | 14 +- .../v1.3.0/_src/usage/usage/document_store.md | 2 +- haystack/document_stores/filter_utils.py | 5 +- haystack/document_stores/weaviate.py | 128 +++++++++++++----- haystack/utils/doc_store.py | 2 +- setup.cfg | 2 +- test/conftest.py | 2 +- test/test_document_store.py | 47 ++++--- 17 files changed, 216 insertions(+), 110 deletions(-) diff --git a/.github/workflows/linux_ci.yml 
b/.github/workflows/linux_ci.yml index c4544dd15..1a5581019 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -261,7 +261,7 @@ jobs: sudo docker-compose ps - name: Run Weaviate - run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2 + run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 - name: Run GraphDB run: docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bb31128b4..37cf48455 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -54,7 +54,7 @@ wget https://github.com/milvus-io/milvus/releases/download/v2.0.0/milvus-standal docker-compose up -d # Weaviate -docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2 +docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 # GraphDB docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11 diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 0f8c4ee1c..697657fb8 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -3144,7 +3144,7 @@ class WeaviateDocumentStore(BaseDocumentStore) ``` Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. 
-(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) +(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) Some of the key differences in contrast to FAISS & Milvus: 1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up @@ -3157,7 +3157,7 @@ Weaviate python client is used to connect to the server, more details are here https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html Usage: -1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) +1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) 2. Init a WeaviateDocumentStore in Haystack Limitations: @@ -3174,7 +3174,7 @@ def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, **Arguments**: - `host`: Weaviate server connection URL for storing and processing documents and vectors. -For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" +For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" - `port`: port of Weaviate instance - `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds). - `username`: username (standard authentication via http_auth) @@ -3188,11 +3188,11 @@ If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will 'cosine' is recommended for Sentence Transformers. - `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. Currently, HSNW is only supported. 
-See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html +See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html - `custom_schema`: Allows to create custom schema in Weaviate, for more details -See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html +See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html - `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers" -For more details, See https://www.semi.technology/developers/weaviate/current/modules/ +For more details, See https://weaviate.io/developers/weaviate/current/modules/ - `return_embedding`: To return document embedding. - `embedding_field`: Name of field containing an embedding vector. - `progress_bar`: Whether to show a tqdm progress bar or not. @@ -3295,6 +3295,22 @@ def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, U Get documents from the document store. +Note this limitation from the changelog of Weaviate 1.8.0: + +.. quote:: + Due to the increasing cost of each page outlined above, there is a limit to + how many objects can be retrieved using pagination. By default setting the sum + of offset and limit to higher than 10,000 objects, will lead to an error. + If you must retrieve more than 10,000 objects, you can increase this limit by + setting the environment variable `QUERY_MAXIMUM_RESULTS=<desired-value>`. + + Warning: Setting this to arbitrarily high values can make the memory consumption + of a single query explode and single queries can slow down the entire cluster. + We recommend setting this value to the lowest possible value that does not + interfere with your users' expectations. + +(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0) + **Arguments**: - `index`: Name of the index to get the documents from. If None, the @@ -3341,6 +3357,22 @@ Get documents from the document store. 
Under-the-hood, documents are fetched in document store and yielded as individual documents. This method can be used to iteratively process a large number of documents without having to load all documents in memory. +Note this limitation from the changelog of Weaviate 1.8.0: + +.. quote:: + Due to the increasing cost of each page outlined above, there is a limit to + how many objects can be retrieved using pagination. By default setting the sum + of offset and limit to higher than 10,000 objects, will lead to an error. + If you must retrieve more than 10,000 objects, you can increase this limit by + setting the environment variable `QUERY_MAXIMUM_RESULTS=<desired-value>`. + + Warning: Setting this to arbitrarily high values can make the memory consumption + of a single query explode and single queries can slow down the entire cluster. + We recommend setting this value to the lowest possible value that does not + interfere with your users' expectations. + +(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0) + **Arguments**: - `index`: Name of the index to get the documents from. If None, the @@ -3454,7 +3486,7 @@ operation. ``` - `top_k`: How many documents to return per query. - `custom_query`: Custom query that will executed using query.raw method, for more details refer -https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html +https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents diff --git a/docs/_src/usage/usage/document_store.md b/docs/_src/usage/usage/document_store.md index bca65eac2..ab18f9bb1 100644 --- a/docs/_src/usage/usage/document_store.md +++ b/docs/_src/usage/usage/document_store.md @@ -128,9 +128,9 @@ document_store = SQLDocumentStore()
The `WeaviateDocumentStore` requires a running Weaviate Server. -You can start a basic instance like this (see the [Weaviate docs](https://www.semi.technology/developers/weaviate/current/) for details): +You can start a basic instance like this (see the [Weaviate docs](https://weaviate.io/developers/weaviate/current/) for details): ``` - docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2 + docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 ``` Afterwards, you can use it in Haystack: diff --git a/docs/v0.10.0/_src/api/api/document_store.md b/docs/v0.10.0/_src/api/api/document_store.md index 13d2e7025..3e12d5f05 100644 --- a/docs/v0.10.0/_src/api/api/document_store.md +++ b/docs/v0.10.0/_src/api/api/document_store.md @@ -1522,7 +1522,7 @@ class WeaviateDocumentStore(BaseDocumentStore) ``` Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. -(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) +(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) Some of the key differences in contrast to FAISS & Milvus: 1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up @@ -1533,7 +1533,7 @@ Weaviate python client is used to connect to the server, more details are here https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html Usage: -1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) +1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) 2. 
Init a WeaviateDocumentStore in Haystack @@ -1546,7 +1546,7 @@ Usage: **Arguments**: - `host`: Weaviate server connection URL for storing and processing documents and vectors. - For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" + For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" - `port`: port of Weaviate instance - `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds). - `username`: username (standard authentication via http_auth) @@ -1560,11 +1560,11 @@ Usage: - `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default. - `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. Currently, HSNW is only supported. - See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html + See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html - `custom_schema`: Allows to create custom schema in Weaviate, for more details - See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html + See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html - `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers" - For more details, See https://www.semi.technology/developers/weaviate/current/modules/ + For more details, See https://weaviate.io/developers/weaviate/current/modules/ - `return_embedding`: To return document embedding. - `embedding_field`: Name of field containing an embedding vector. - `progress_bar`: Whether to show a tqdm progress bar or not. @@ -1695,7 +1695,7 @@ that are most relevant to the query as defined by Weaviate semantic search. 
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field - `top_k`: How many documents to return per query. - `custom_query`: Custom query that will executed using query.raw method, for more details refer - https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html + https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents diff --git a/docs/v0.9.0/_src/api/api/document_store.md b/docs/v0.9.0/_src/api/api/document_store.md index 52a1aa018..ffde1fc5a 100644 --- a/docs/v0.9.0/_src/api/api/document_store.md +++ b/docs/v0.9.0/_src/api/api/document_store.md @@ -1477,7 +1477,7 @@ class WeaviateDocumentStore(BaseDocumentStore) ``` Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. -(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) +(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) Some of the key differences in contrast to FAISS & Milvus: 1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up @@ -1488,7 +1488,7 @@ Weaviate python client is used to connect to the server, more details are here https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html Usage: -1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) +1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) 2. Init a WeaviateDocumentStore in Haystack @@ -1501,7 +1501,7 @@ Usage: **Arguments**: - `host`: Weaviate server connection URL for storing and processing documents and vectors. 
- For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" + For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" - `port`: port of Weaviate instance - `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds). - `username`: username (standard authentication via http_auth) @@ -1515,11 +1515,11 @@ Usage: - `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default. - `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. Currently, HSNW is only supported. - See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html + See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html - `custom_schema`: Allows to create custom schema in Weaviate, for more details - See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html + See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html - `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers" - For more details, See https://www.semi.technology/developers/weaviate/current/modules/ + For more details, See https://weaviate.io/developers/weaviate/current/modules/ - `return_embedding`: To return document embedding. - `embedding_field`: Name of field containing an embedding vector. - `progress_bar`: Whether to show a tqdm progress bar or not. @@ -1650,7 +1650,7 @@ that are most relevant to the query as defined by Weaviate semantic search. - `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field - `top_k`: How many documents to return per query. 
- `custom_query`: Custom query that will executed using query.raw method, for more details refer - https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html + https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents diff --git a/docs/v1.0.0/_src/api/api/document_store.md b/docs/v1.0.0/_src/api/api/document_store.md index 8942d0ea0..b3fdfc62e 100644 --- a/docs/v1.0.0/_src/api/api/document_store.md +++ b/docs/v1.0.0/_src/api/api/document_store.md @@ -1678,7 +1678,7 @@ class WeaviateDocumentStore(BaseDocumentStore) ``` Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. -(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) +(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) Some of the key differences in contrast to FAISS & Milvus: 1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up @@ -1690,7 +1690,7 @@ Weaviate python client is used to connect to the server, more details are here https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html Usage: -1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) +1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) 2. Init a WeaviateDocumentStore in Haystack Limitations: @@ -1706,7 +1706,7 @@ The current implementation is not supporting the storage of labels, so you canno **Arguments**: - `host`: Weaviate server connection URL for storing and processing documents and vectors. 
- For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" + For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" - `port`: port of Weaviate instance - `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds). - `username`: username (standard authentication via http_auth) @@ -1720,11 +1720,11 @@ The current implementation is not supporting the storage of labels, so you canno 'cosine' is recommended for Sentence Transformers. - `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. Currently, HSNW is only supported. - See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html + See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html - `custom_schema`: Allows to create custom schema in Weaviate, for more details - See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html + See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html - `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers" - For more details, See https://www.semi.technology/developers/weaviate/current/modules/ + For more details, See https://weaviate.io/developers/weaviate/current/modules/ - `return_embedding`: To return document embedding. - `embedding_field`: Name of field containing an embedding vector. - `progress_bar`: Whether to show a tqdm progress bar or not. @@ -1863,7 +1863,7 @@ that are most relevant to the query as defined by Weaviate semantic search. - `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field - `top_k`: How many documents to return per query. 
- `custom_query`: Custom query that will executed using query.raw method, for more details refer - https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html + https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents diff --git a/docs/v1.1.0/_src/api/api/document_store.md b/docs/v1.1.0/_src/api/api/document_store.md index b71419c56..c0dbf7979 100644 --- a/docs/v1.1.0/_src/api/api/document_store.md +++ b/docs/v1.1.0/_src/api/api/document_store.md @@ -1757,7 +1757,7 @@ class WeaviateDocumentStore(BaseDocumentStore) ``` Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. -(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) +(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) Some of the key differences in contrast to FAISS & Milvus: 1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up @@ -1770,7 +1770,7 @@ Weaviate python client is used to connect to the server, more details are here https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html Usage: -1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) +1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) 2. Init a WeaviateDocumentStore in Haystack Limitations: @@ -1786,7 +1786,7 @@ The current implementation is not supporting the storage of labels, so you canno **Arguments**: - `host`: Weaviate server connection URL for storing and processing documents and vectors. 
- For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" + For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" - `port`: port of Weaviate instance - `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds). - `username`: username (standard authentication via http_auth) @@ -1800,11 +1800,11 @@ The current implementation is not supporting the storage of labels, so you canno 'cosine' is recommended for Sentence Transformers. - `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. Currently, HSNW is only supported. - See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html + See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html - `custom_schema`: Allows to create custom schema in Weaviate, for more details - See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html + See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html - `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers" - For more details, See https://www.semi.technology/developers/weaviate/current/modules/ + For more details, See https://weaviate.io/developers/weaviate/current/modules/ - `return_embedding`: To return document embedding. - `embedding_field`: Name of field containing an embedding vector. - `progress_bar`: Whether to show a tqdm progress bar or not. @@ -1943,7 +1943,7 @@ that are most relevant to the query as defined by Weaviate semantic search. - `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field - `top_k`: How many documents to return per query. 
- `custom_query`: Custom query that will executed using query.raw method, for more details refer - https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html + https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents diff --git a/docs/v1.2.0/_src/api/api/document_store.md b/docs/v1.2.0/_src/api/api/document_store.md index d8b8d17c5..97d6c7167 100644 --- a/docs/v1.2.0/_src/api/api/document_store.md +++ b/docs/v1.2.0/_src/api/api/document_store.md @@ -2755,7 +2755,7 @@ class WeaviateDocumentStore(BaseDocumentStore) ``` Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. -(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) +(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) Some of the key differences in contrast to FAISS & Milvus: 1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up @@ -2768,7 +2768,7 @@ Weaviate python client is used to connect to the server, more details are here https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html Usage: -1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) +1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) 2. Init a WeaviateDocumentStore in Haystack Limitations: @@ -2785,7 +2785,7 @@ def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, **Arguments**: - `host`: Weaviate server connection URL for storing and processing documents and vectors. 
-For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" +For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" - `port`: port of Weaviate instance - `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds). - `username`: username (standard authentication via http_auth) @@ -2799,11 +2799,11 @@ If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will 'cosine' is recommended for Sentence Transformers. - `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. Currently, HSNW is only supported. -See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html +See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html - `custom_schema`: Allows to create custom schema in Weaviate, for more details -See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html +See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html - `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers" -For more details, See https://www.semi.technology/developers/weaviate/current/modules/ +For more details, See https://weaviate.io/developers/weaviate/current/modules/ - `return_embedding`: To return document embedding. - `embedding_field`: Name of field containing an embedding vector. - `progress_bar`: Whether to show a tqdm progress bar or not. @@ -3065,7 +3065,7 @@ operation. ``` - `top_k`: How many documents to return per query. 
- `custom_query`: Custom query that will executed using query.raw method, for more details refer -https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html +https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents diff --git a/docs/v1.3.0/_src/api/api/document_store.md b/docs/v1.3.0/_src/api/api/document_store.md index 61b69bf7d..ba04facdd 100644 --- a/docs/v1.3.0/_src/api/api/document_store.md +++ b/docs/v1.3.0/_src/api/api/document_store.md @@ -3136,7 +3136,7 @@ class WeaviateDocumentStore(BaseDocumentStore) ``` Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. -(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) +(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) Some of the key differences in contrast to FAISS & Milvus: 1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up @@ -3149,7 +3149,7 @@ Weaviate python client is used to connect to the server, more details are here https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html Usage: -1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) +1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) 2. Init a WeaviateDocumentStore in Haystack Limitations: @@ -3166,7 +3166,7 @@ def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, **Arguments**: - `host`: Weaviate server connection URL for storing and processing documents and vectors. 
-For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" +For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" - `port`: port of Weaviate instance - `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds). - `username`: username (standard authentication via http_auth) @@ -3180,11 +3180,11 @@ If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will 'cosine' is recommended for Sentence Transformers. - `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. Currently, HSNW is only supported. -See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html +See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html - `custom_schema`: Allows to create custom schema in Weaviate, for more details -See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html +See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html - `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers" -For more details, See https://www.semi.technology/developers/weaviate/current/modules/ +For more details, See https://weaviate.io/developers/weaviate/current/modules/ - `return_embedding`: To return document embedding. - `embedding_field`: Name of field containing an embedding vector. - `progress_bar`: Whether to show a tqdm progress bar or not. @@ -3446,7 +3446,7 @@ operation. ``` - `top_k`: How many documents to return per query. 
- `custom_query`: Custom query that will executed using query.raw method, for more details refer -https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html +https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents diff --git a/docs/v1.3.0/_src/usage/usage/document_store.md b/docs/v1.3.0/_src/usage/usage/document_store.md index bca65eac2..c4a12c291 100644 --- a/docs/v1.3.0/_src/usage/usage/document_store.md +++ b/docs/v1.3.0/_src/usage/usage/document_store.md @@ -128,7 +128,7 @@ document_store = SQLDocumentStore()
The `WeaviateDocumentStore` requires a running Weaviate Server. -You can start a basic instance like this (see the [Weaviate docs](https://www.semi.technology/developers/weaviate/current/) for details): +You can start a basic instance like this (see the [Weaviate docs](https://weaviate.io/developers/weaviate/current/) for details): ``` docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2 ``` diff --git a/haystack/document_stores/filter_utils.py b/haystack/document_stores/filter_utils.py index 9b7bc8c5f..a758ece90 100644 --- a/haystack/document_stores/filter_utils.py +++ b/haystack/document_stores/filter_utils.py @@ -277,7 +277,10 @@ class ComparisonOperation(ABC): data_type = "valueDate" # Comparison value is a plain string except ValueError: - data_type = "valueString" + if self.field_name == "content": + data_type = "valueText" + else: + data_type = "valueString" elif isinstance(value, int): data_type = "valueInt" elif isinstance(value, float): diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index cfdb91c00..1077afd5b 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -1,13 +1,21 @@ -import hashlib +from typing import Any, Dict, Generator, List, Optional, Union import re import uuid -from typing import Dict, Generator, List, Optional, Union +import json +import hashlib +import logging from datetime import datetime -import logging -import json import numpy as np from tqdm import tqdm +import weaviate + +try: + from weaviate import client, AuthClientPassword +except (ImportError, ModuleNotFoundError) as ie: + from haystack.utils.import_utils import _optional_component_not_installed + + _optional_component_not_installed(__name__, "weaviate", ie) from haystack.schema import Document from haystack.document_stores import BaseDocumentStore @@ -15,14 +23,6 @@ from 
haystack.document_stores.base import get_batches_from_generator from haystack.document_stores.filter_utils import LogicalFilterClause from haystack.document_stores.utils import convert_date_to_rfc3339 -try: - from weaviate import client, AuthClientPassword - from weaviate import ObjectsBatchRequest -except (ImportError, ModuleNotFoundError) as ie: - from haystack.utils.import_utils import _optional_component_not_installed - - _optional_component_not_installed(__name__, "weaviate", ie) - logger = logging.getLogger(__name__) UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE) @@ -32,7 +32,7 @@ class WeaviateDocumentStore(BaseDocumentStore): """ Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. - (See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) + (See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) Some of the key differences in contrast to FAISS & Milvus: 1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up @@ -45,7 +45,7 @@ class WeaviateDocumentStore(BaseDocumentStore): https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html Usage: - 1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) + 1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) 2. Init a WeaviateDocumentStore in Haystack Limitations: @@ -74,7 +74,7 @@ class WeaviateDocumentStore(BaseDocumentStore): ): """ :param host: Weaviate server connection URL for storing and processing documents and vectors. 
- For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" + For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" :param port: port of Weaviate instance :param timeout_config: Weaviate Timeout config as a tuple of (retries, time out seconds). :param username: username (standard authentication via http_auth) @@ -88,11 +88,11 @@ class WeaviateDocumentStore(BaseDocumentStore): 'cosine' is recommended for Sentence Transformers. :param index_type: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. Currently, HSNW is only supported. - See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html + See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html :param custom_schema: Allows to create custom schema in Weaviate, for more details - See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html + See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html :param module_name: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers" - For more details, See https://www.semi.technology/developers/weaviate/current/modules/ + For more details, See https://weaviate.io/developers/weaviate/current/modules/ :param return_embedding: To return document embedding. :param embedding_field: Name of field containing an embedding vector. :param progress_bar: Whether to show a tqdm progress bar or not. 
@@ -265,8 +265,11 @@ class WeaviateDocumentStore(BaseDocumentStore): document = None id = self._sanitize_id(id=id, index=index) - - result = self.weaviate_client.data_object.get_by_id(id, with_vector=True) + result = None + try: + result = self.weaviate_client.data_object.get_by_id(id, with_vector=True) + except weaviate.exceptions.UnexpectedStatusCodeException as usce: + logging.debug(f"Weaviate could not get the document requested: {usce}") if result: document = self._convert_weaviate_result_to_document(result, return_embedding=True) return document @@ -289,7 +292,11 @@ class WeaviateDocumentStore(BaseDocumentStore): # TODO: better implementation with multiple where filters instead of chatty call below? for id in ids: id = self._sanitize_id(id=id, index=index) - result = self.weaviate_client.data_object.get_by_id(id, with_vector=True) + result = None + try: + result = self.weaviate_client.data_object.get_by_id(id, with_vector=True) + except weaviate.exceptions.UnexpectedStatusCodeException as usce: + logging.debug(f"Weaviate could not get the document requested: {usce}") if result: document = self._convert_weaviate_result_to_document(result, return_embedding=True) documents.append(document) @@ -458,7 +465,6 @@ class WeaviateDocumentStore(BaseDocumentStore): batched_documents = get_batches_from_generator(document_objects, batch_size) with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar: for document_batch in batched_documents: - docs_batch = ObjectsBatchRequest() for idx, doc in enumerate(document_batch): _doc = {**doc.to_dict(field_map=self._create_document_field_map())} _ = _doc.pop("score", None) @@ -492,10 +498,11 @@ class WeaviateDocumentStore(BaseDocumentStore): for date_field in date_fields: _doc[date_field] = convert_date_to_rfc3339(_doc[date_field]) - docs_batch.add(_doc, class_name=index, uuid=doc_id, vector=vector) - + self.weaviate_client.batch.add_data_object( + data_object=_doc, class_name=index, uuid=doc_id, 
vector=vector + ) # Ingest a batch of documents - results = self.weaviate_client.batch.create(docs_batch) + results = self.weaviate_client.batch.create_objects() # Weaviate returns errors for every failed document in the batch if results is not None: for result in results: @@ -563,15 +570,14 @@ class WeaviateDocumentStore(BaseDocumentStore): doc_count = 0 if filters: filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() - result = ( - self.weaviate_client.query.aggregate(index).with_fields("meta { count }").with_where(filter_dict).do() - ) + result = self.weaviate_client.query.aggregate(index).with_meta_count().with_where(filter_dict).do() else: - result = self.weaviate_client.query.aggregate(index).with_fields("meta { count }").do() + result = self.weaviate_client.query.aggregate(index).with_meta_count().do() if "data" in result: if "Aggregate" in result.get("data"): - doc_count = result.get("data").get("Aggregate").get(index)[0]["meta"]["count"] + if result.get("data").get("Aggregate").get(index): + doc_count = result.get("data").get("Aggregate").get(index)[0]["meta"]["count"] return doc_count @@ -586,6 +592,22 @@ class WeaviateDocumentStore(BaseDocumentStore): """ Get documents from the document store. + Note this limitation from the changelog of Weaviate 1.8.0: + + .. quote:: + Due to the increasing cost of each page outlined above, there is a limit to + how many objects can be retrieved using pagination. By default setting the sum + of offset and limit to higher than 10,000 objects, will lead to an error. + If you must retrieve more than 10,000 objects, you can increase this limit by + setting the environment variable `QUERY_MAXIMUM_RESULTS=`. + + Warning: Setting this to arbitrarily high values can make the memory consumption + of a single query explode and single queries can slow down the entire cluster. + We recommend setting this value to the lowest possible value that does not + interfere with your users' expectations. 
+ + (https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0) + :param index: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain @@ -651,10 +673,30 @@ class WeaviateDocumentStore(BaseDocumentStore): else: result = self.weaviate_client.query.get(class_name=index, properties=properties).do() - all_docs = {} - if result and "data" in result and "Get" in result.get("data"): - if result.get("data").get("Get").get(index): - all_docs = result.get("data").get("Get").get(index) + # Inherent Weaviate limitation to 100 elements forces us to loop here: + # https://weaviate-python-client.readthedocs.io/en/latest/weaviate.data.html?highlight=100#weaviate.data.DataObject.get + base_query = self.weaviate_client.query.get(class_name=index, properties=properties) + all_docs: List[Any] = [] + num_of_documents = self.get_document_count(index=index, filters=filters) + + while len(all_docs) < num_of_documents: + query = base_query + if filters: + filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() + query = query.with_where(filter_dict) + + if all_docs: + # .with_limit() must be used with .with_offset, of the latter won't work properly + # https://weaviate-python-client.readthedocs.io/en/latest/weaviate.gql.html?highlight=offset#weaviate.gql.get.GetBuilder.with_offset + query = query.with_limit(100).with_offset(offset=len(all_docs)) + + result = query.do() + + if result and "data" in result and "Get" in result.get("data"): + if result.get("data").get("Get").get(index): + all_docs += result.get("data").get("Get").get(index) + else: + raise ValueError(f"Weaviate returned ad exception: {result}") yield from all_docs @@ -671,6 +713,22 @@ class WeaviateDocumentStore(BaseDocumentStore): document store and yielded as individual documents. 
This method can be used to iteratively process a large number of documents without having to load all documents in memory. + Note this limitation from the changelog of Weaviate 1.8.0: + + .. quote:: + Due to the increasing cost of each page outlined above, there is a limit to + how many objects can be retrieved using pagination. By default setting the sum + of offset and limit to higher than 10,000 objects, will lead to an error. + If you must retrieve more than 10,000 objects, you can increase this limit by + setting the environment variable `QUERY_MAXIMUM_RESULTS=`. + + Warning: Setting this to arbitrarily high values can make the memory consumption + of a single query explode and single queries can slow down the entire cluster. + We recommend setting this value to the lowest possible value that does not + interfere with your users' expectations. + + (https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0) + :param index: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain @@ -793,7 +851,7 @@ class WeaviateDocumentStore(BaseDocumentStore): ``` :param top_k: How many documents to return per query. 
:param custom_query: Custom query that will executed using query.raw method, for more details refer - https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html + https://weaviate.io/developers/weaviate/current/graphql-references/filters.html :param index: The name of the index in the DocumentStore from which to retrieve documents """ index = self._sanitize_index_name(index) or self.index diff --git a/haystack/utils/doc_store.py b/haystack/utils/doc_store.py index b7605ccb2..d13e22c8c 100644 --- a/haystack/utils/doc_store.py +++ b/haystack/utils/doc_store.py @@ -63,7 +63,7 @@ def launch_weaviate(sleep=15): logger.debug("Starting Weaviate ...") status = subprocess.run( [ - f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.7.2" + f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.11.0" ], shell=True, ) diff --git a/setup.cfg b/setup.cfg index ae24b883c..f2e897c07 100644 --- a/setup.cfg +++ b/setup.cfg @@ -142,7 +142,7 @@ only-milvus = milvus = farm-haystack[sql,only-milvus] weaviate = - weaviate-client==2.5.0 + weaviate-client==3.3.3 only-pinecone = pinecone-client pinecone = diff --git a/test/conftest.py b/test/conftest.py index e6adb5a08..b598d8e65 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -302,7 +302,7 @@ def weaviate_fixture(): print("Starting Weaviate servers ...") status = subprocess.run(["docker rm haystack_test_weaviate"], shell=True) status = subprocess.run( - ["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.7.2"], shell=True + ["docker run -d --name haystack_test_weaviate -p 8080:8080 
semitechnologies/weaviate:1.11.0"], shell=True ) if status.returncode: raise Exception("Failed to launch Weaviate. Please check docker container logs.") diff --git a/test/test_document_store.py b/test/test_document_store.py index 016d5efcd..e42326b37 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -117,7 +117,7 @@ def test_init_elastic_doc_store_with_index_recreation(): assert len(labels) == 0 -def test_write_with_duplicate_doc_ids(document_store): +def test_write_with_duplicate_doc_ids(document_store: BaseDocumentStore): duplicate_documents = [ Document(content="Doc1", id_hash_keys=["content"]), Document(content="Doc1", id_hash_keys=["content"]), @@ -131,7 +131,7 @@ def test_write_with_duplicate_doc_ids(document_store): @pytest.mark.parametrize( "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "weaviate", "pinecone"], indirect=True ) -def test_write_with_duplicate_doc_ids_custom_index(document_store): +def test_write_with_duplicate_doc_ids_custom_index(document_store: BaseDocumentStore): duplicate_documents = [ Document(content="Doc1", id_hash_keys=["content"]), Document(content="Doc1", id_hash_keys=["content"]), @@ -164,7 +164,20 @@ def test_get_all_documents_without_filters(document_store_with_docs): assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3", "test4", "test5"} -def test_get_all_document_filter_duplicate_text_value(document_store): +def test_get_all_documents_large_quantities(document_store: BaseDocumentStore): + # Test to exclude situations like Weaviate not returning more than 100 docs by default + # https://github.com/deepset-ai/haystack/issues/1893 + docs_to_write = [ + {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)} + for i in range(1000) + ] + document_store.write_documents(docs_to_write) + documents = document_store.get_all_documents() + assert all(isinstance(d, Document) for d in documents) + assert 
len(documents) == len(docs_to_write) + + +def test_get_all_document_filter_duplicate_text_value(document_store: BaseDocumentStore): documents = [ Document(content="Doc1", meta={"f1": "0"}, id_hash_keys=["meta"]), Document(content="Doc1", meta={"f1": "1", "meta_id": "0"}, id_hash_keys=["meta"]), @@ -355,7 +368,7 @@ def test_get_document_by_id(document_store_with_docs): assert doc.content == documents[0].content -def test_get_documents_by_id(document_store): +def test_get_documents_by_id(document_store: BaseDocumentStore): # generate more documents than the elasticsearch default query size limit of 10 docs_to_generate = 15 documents = [{"content": "doc-" + str(i)} for i in range(docs_to_generate)] @@ -372,7 +385,7 @@ def test_get_documents_by_id(document_store): assert set(retrieved_ids) == set(all_ids) -def test_get_document_count(document_store): +def test_get_document_count(document_store: BaseDocumentStore): documents = [ {"content": "text1", "id": "1", "meta_field_for_count": "a"}, {"content": "text2", "id": "2", "meta_field_for_count": "b"}, @@ -385,7 +398,7 @@ def test_get_document_count(document_store): assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3 -def test_get_all_documents_generator(document_store): +def test_get_all_documents_generator(document_store: BaseDocumentStore): documents = [ {"content": "text1", "id": "1", "meta_field_for_count": "a"}, {"content": "text2", "id": "2", "meta_field_for_count": "b"}, @@ -421,7 +434,7 @@ def test_update_existing_documents(document_store, update_existing_documents): assert stored_docs[0].content == original_docs[0]["content"] -def test_write_document_meta(document_store): +def test_write_document_meta(document_store: BaseDocumentStore): documents = [ {"content": "dict_without_meta", "id": "1"}, {"content": "dict_with_meta", "meta_field": "test2", "name": "filename2", "id": "2"}, @@ -438,7 +451,7 @@ def test_write_document_meta(document_store): assert 
document_store.get_document_by_id("4").meta["meta_field"] == "test4" -def test_write_document_index(document_store): +def test_write_document_index(document_store: BaseDocumentStore): documents = [{"content": "text1", "id": "1"}, {"content": "text2", "id": "2"}] document_store.write_documents([documents[0]], index="haystack_test_one") assert len(document_store.get_all_documents(index="haystack_test_one")) == 1 @@ -453,7 +466,7 @@ def test_write_document_index(document_store): @pytest.mark.parametrize( "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True ) -def test_document_with_embeddings(document_store): +def test_document_with_embeddings(document_store: BaseDocumentStore): documents = [ {"content": "text1", "id": "1", "embedding": np.random.rand(768).astype(np.float32)}, {"content": "text2", "id": "2", "embedding": np.random.rand(768).astype(np.float64)}, @@ -720,7 +733,7 @@ def test_delete_documents_by_id_with_filters(document_store_with_docs): # exclude weaviate because it does not support storing labels @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) -def test_labels(document_store): +def test_labels(document_store: BaseDocumentStore): label = Label( query="question1", answer=Answer( @@ -808,7 +821,7 @@ def test_labels(document_store): # exclude weaviate because it does not support storing labels @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) -def test_multilabel(document_store): +def test_multilabel(document_store: BaseDocumentStore): labels = [ Label( id="standard", @@ -924,7 +937,7 @@ def test_multilabel(document_store): # exclude weaviate because it does not support storing labels @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) -def test_multilabel_no_answer(document_store): +def 
test_multilabel_no_answer(document_store: BaseDocumentStore): labels = [ Label( query="question", @@ -993,7 +1006,7 @@ def test_multilabel_no_answer(document_store): # exclude weaviate because it does not support storing labels # exclude faiss and milvus as label metadata is not implemented @pytest.mark.parametrize("document_store", ["elasticsearch", "memory"], indirect=True) -def test_multilabel_filter_aggregations(document_store): +def test_multilabel_filter_aggregations(document_store: BaseDocumentStore): labels = [ Label( id="standard", @@ -1089,7 +1102,7 @@ def test_multilabel_filter_aggregations(document_store): # exclude weaviate because it does not support storing labels # exclude faiss and milvus as label metadata is not implemented @pytest.mark.parametrize("document_store", ["elasticsearch", "memory"], indirect=True) -def test_multilabel_meta_aggregations(document_store): +def test_multilabel_meta_aggregations(document_store: BaseDocumentStore): labels = [ Label( id="standard", @@ -1180,7 +1193,7 @@ def test_multilabel_meta_aggregations(document_store): @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "milvus1", "weaviate", "pinecone"], indirect=True) # Currently update_document_meta() is not implemented for Memory doc store -def test_update_meta(document_store): +def test_update_meta(document_store: BaseDocumentStore): documents = [ Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "1"}), Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "2"}), @@ -1209,7 +1222,7 @@ def test_custom_embedding_field(document_store_type, tmp_path): @pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) -def test_get_meta_values_by_key(document_store): +def test_get_meta_values_by_key(document_store: BaseDocumentStore): documents = [ Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "11"}), Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "22"}), @@ -1271,7 +1284,7 @@ def 
test_elasticsearch_delete_index(): @pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) -def test_elasticsearch_query_with_filters_and_missing_embeddings(document_store): +def test_elasticsearch_query_with_filters_and_missing_embeddings(document_store: BaseDocumentStore): document_store.write_documents(DOCUMENTS) document_without_embedding = Document( content="Doc without embedding", meta={"name": "name_7", "year": "2021", "month": "04"}