mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-03 12:07:03 +00:00
Upgrade weaviate-client
to 3.3.3
and fix get_all_documents
(#1895)
* Fix 'bug' on Weaviate only returning max. 100 docs on get_all_documents * Add type * Update Weaviate version on the CI * Fix bug on get_document_count where there are no documents * Add more info in the docstrings of get_all_documents and get_all_documents_generator * Add latest docstring and tutorial changes * Apply Black * Update Documentation & Code Style * Trigger pipeline * Update Documentation & Code Style * Include StefanBogdan feedback * Fix mypy issues and LogicalFilterClause * Add more types * Update Documentation & Code Style * update setup.cfg * Upgrade weaviate containers too * Allow to filter for content field in Weaviate * Use convert_to_weaviate instead of convert_to_pinecone * Fix _get_all_documents_in_index * Update docstrings and docs * Catching an exception in get_document(s)_by_id Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: bogdankostic <bogdankostic@web.de>
This commit is contained in:
parent
3459020600
commit
ae712fe6bf
2
.github/workflows/linux_ci.yml
vendored
2
.github/workflows/linux_ci.yml
vendored
@ -261,7 +261,7 @@ jobs:
|
||||
sudo docker-compose ps
|
||||
|
||||
- name: Run Weaviate
|
||||
run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2
|
||||
run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0
|
||||
|
||||
- name: Run GraphDB
|
||||
run: docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11
|
||||
|
@ -54,7 +54,7 @@ wget https://github.com/milvus-io/milvus/releases/download/v2.0.0/milvus-standal
|
||||
docker-compose up -d
|
||||
|
||||
# Weaviate
|
||||
docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2
|
||||
docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0
|
||||
|
||||
# GraphDB
|
||||
docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11
|
||||
|
@ -3144,7 +3144,7 @@ class WeaviateDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
|
||||
Some of the key differences in contrast to FAISS & Milvus:
|
||||
1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
|
||||
@ -3157,7 +3157,7 @@ Weaviate python client is used to connect to the server, more details are here
|
||||
https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
|
||||
|
||||
Usage:
|
||||
1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html)
|
||||
1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
|
||||
2. Init a WeaviateDocumentStore in Haystack
|
||||
|
||||
Limitations:
|
||||
@ -3174,7 +3174,7 @@ def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int,
|
||||
**Arguments**:
|
||||
|
||||
- `host`: Weaviate server connection URL for storing and processing documents and vectors.
|
||||
For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html"
|
||||
For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
|
||||
- `port`: port of Weaviate instance
|
||||
- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
|
||||
- `username`: username (standard authentication via http_auth)
|
||||
@ -3188,11 +3188,11 @@ If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will
|
||||
'cosine' is recommended for Sentence Transformers.
|
||||
- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
|
||||
Currently, only HNSW is supported.
|
||||
See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html
|
||||
See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
|
||||
- `custom_schema`: Allows to create custom schema in Weaviate, for more details
|
||||
See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers"
|
||||
For more details, See https://www.semi.technology/developers/weaviate/current/modules/
|
||||
For more details, See https://weaviate.io/developers/weaviate/current/modules/
|
||||
- `return_embedding`: To return document embedding.
|
||||
- `embedding_field`: Name of field containing an embedding vector.
|
||||
- `progress_bar`: Whether to show a tqdm progress bar or not.
|
||||
@ -3295,6 +3295,22 @@ def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, U
|
||||
|
||||
Get documents from the document store.
|
||||
|
||||
Note this limitation from the changelog of Weaviate 1.8.0:
|
||||
|
||||
.. quote::
|
||||
Due to the increasing cost of each page outlined above, there is a limit to
|
||||
how many objects can be retrieved using pagination. By default setting the sum
|
||||
of offset and limit to higher than 10,000 objects, will lead to an error.
|
||||
If you must retrieve more than 10,000 objects, you can increase this limit by
|
||||
setting the environment variable `QUERY_MAXIMUM_RESULTS=<desired-value>`.
|
||||
|
||||
Warning: Setting this to arbitrarily high values can make the memory consumption
|
||||
of a single query explode and single queries can slow down the entire cluster.
|
||||
We recommend setting this value to the lowest possible value that does not
|
||||
interfere with your users' expectations.
|
||||
|
||||
(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0)
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Name of the index to get the documents from. If None, the
|
||||
@ -3341,6 +3357,22 @@ Get documents from the document store. Under-the-hood, documents are fetched in
|
||||
document store and yielded as individual documents. This method can be used to iteratively process
|
||||
a large number of documents without having to load all documents in memory.
|
||||
|
||||
Note this limitation from the changelog of Weaviate 1.8.0:
|
||||
|
||||
.. quote::
|
||||
Due to the increasing cost of each page outlined above, there is a limit to
|
||||
how many objects can be retrieved using pagination. By default setting the sum
|
||||
of offset and limit to higher than 10,000 objects, will lead to an error.
|
||||
If you must retrieve more than 10,000 objects, you can increase this limit by
|
||||
setting the environment variable `QUERY_MAXIMUM_RESULTS=<desired-value>`.
|
||||
|
||||
Warning: Setting this to arbitrarily high values can make the memory consumption
|
||||
of a single query explode and single queries can slow down the entire cluster.
|
||||
We recommend setting this value to the lowest possible value that does not
|
||||
interfere with your users' expectations.
|
||||
|
||||
(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0)
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Name of the index to get the documents from. If None, the
|
||||
@ -3454,7 +3486,7 @@ operation.
|
||||
```
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `custom_query`: Custom query that will be executed using the query.raw method; for more details refer to
|
||||
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
|
||||
https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a id="weaviate.WeaviateDocumentStore.query_by_embedding"></a>
|
||||
|
@ -128,9 +128,9 @@ document_store = SQLDocumentStore()
|
||||
<div class="tabcontent">
|
||||
|
||||
The `WeaviateDocumentStore` requires a running Weaviate Server.
|
||||
You can start a basic instance like this (see the [Weaviate docs](https://www.semi.technology/developers/weaviate/current/) for details):
|
||||
You can start a basic instance like this (see the [Weaviate docs](https://weaviate.io/developers/weaviate/current/) for details):
|
||||
```
|
||||
docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2
|
||||
docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0
|
||||
```
|
||||
|
||||
Afterwards, you can use it in Haystack:
|
||||
|
@ -1522,7 +1522,7 @@ class WeaviateDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
|
||||
Some of the key differences in contrast to FAISS & Milvus:
|
||||
1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
|
||||
@ -1533,7 +1533,7 @@ Weaviate python client is used to connect to the server, more details are here
|
||||
https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
|
||||
|
||||
Usage:
|
||||
1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html)
|
||||
1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
|
||||
2. Init a WeaviateDocumentStore in Haystack
|
||||
|
||||
<a name="weaviate.WeaviateDocumentStore.__init__"></a>
|
||||
@ -1546,7 +1546,7 @@ Usage:
|
||||
**Arguments**:
|
||||
|
||||
- `host`: Weaviate server connection URL for storing and processing documents and vectors.
|
||||
For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html"
|
||||
For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
|
||||
- `port`: port of Weaviate instance
|
||||
- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
|
||||
- `username`: username (standard authentication via http_auth)
|
||||
@ -1560,11 +1560,11 @@ Usage:
|
||||
- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default.
|
||||
- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
|
||||
Currently, only HNSW is supported.
|
||||
See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html
|
||||
See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
|
||||
- `custom_schema`: Allows to create custom schema in Weaviate, for more details
|
||||
See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers"
|
||||
For more details, See https://www.semi.technology/developers/weaviate/current/modules/
|
||||
For more details, See https://weaviate.io/developers/weaviate/current/modules/
|
||||
- `return_embedding`: To return document embedding.
|
||||
- `embedding_field`: Name of field containing an embedding vector.
|
||||
- `progress_bar`: Whether to show a tqdm progress bar or not.
|
||||
@ -1695,7 +1695,7 @@ that are most relevant to the query as defined by Weaviate semantic search.
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `custom_query`: Custom query that will be executed using the query.raw method; for more details refer to
|
||||
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
|
||||
https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a name="weaviate.WeaviateDocumentStore.query_by_embedding"></a>
|
||||
|
@ -1477,7 +1477,7 @@ class WeaviateDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
|
||||
Some of the key differences in contrast to FAISS & Milvus:
|
||||
1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
|
||||
@ -1488,7 +1488,7 @@ Weaviate python client is used to connect to the server, more details are here
|
||||
https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
|
||||
|
||||
Usage:
|
||||
1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html)
|
||||
1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
|
||||
2. Init a WeaviateDocumentStore in Haystack
|
||||
|
||||
<a name="weaviate.WeaviateDocumentStore.__init__"></a>
|
||||
@ -1501,7 +1501,7 @@ Usage:
|
||||
**Arguments**:
|
||||
|
||||
- `host`: Weaviate server connection URL for storing and processing documents and vectors.
|
||||
For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html"
|
||||
For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
|
||||
- `port`: port of Weaviate instance
|
||||
- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
|
||||
- `username`: username (standard authentication via http_auth)
|
||||
@ -1515,11 +1515,11 @@ Usage:
|
||||
- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default.
|
||||
- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
|
||||
Currently, only HNSW is supported.
|
||||
See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html
|
||||
See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
|
||||
- `custom_schema`: Allows to create custom schema in Weaviate, for more details
|
||||
See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers"
|
||||
For more details, See https://www.semi.technology/developers/weaviate/current/modules/
|
||||
For more details, See https://weaviate.io/developers/weaviate/current/modules/
|
||||
- `return_embedding`: To return document embedding.
|
||||
- `embedding_field`: Name of field containing an embedding vector.
|
||||
- `progress_bar`: Whether to show a tqdm progress bar or not.
|
||||
@ -1650,7 +1650,7 @@ that are most relevant to the query as defined by Weaviate semantic search.
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `custom_query`: Custom query that will be executed using the query.raw method; for more details refer to
|
||||
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
|
||||
https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a name="weaviate.WeaviateDocumentStore.query_by_embedding"></a>
|
||||
|
@ -1678,7 +1678,7 @@ class WeaviateDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
|
||||
Some of the key differences in contrast to FAISS & Milvus:
|
||||
1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
|
||||
@ -1690,7 +1690,7 @@ Weaviate python client is used to connect to the server, more details are here
|
||||
https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
|
||||
|
||||
Usage:
|
||||
1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html)
|
||||
1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
|
||||
2. Init a WeaviateDocumentStore in Haystack
|
||||
|
||||
Limitations:
|
||||
@ -1706,7 +1706,7 @@ The current implementation is not supporting the storage of labels, so you canno
|
||||
**Arguments**:
|
||||
|
||||
- `host`: Weaviate server connection URL for storing and processing documents and vectors.
|
||||
For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html"
|
||||
For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
|
||||
- `port`: port of Weaviate instance
|
||||
- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
|
||||
- `username`: username (standard authentication via http_auth)
|
||||
@ -1720,11 +1720,11 @@ The current implementation is not supporting the storage of labels, so you canno
|
||||
'cosine' is recommended for Sentence Transformers.
|
||||
- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
|
||||
Currently, only HNSW is supported.
|
||||
See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html
|
||||
See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
|
||||
- `custom_schema`: Allows to create custom schema in Weaviate, for more details
|
||||
See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers"
|
||||
For more details, See https://www.semi.technology/developers/weaviate/current/modules/
|
||||
For more details, See https://weaviate.io/developers/weaviate/current/modules/
|
||||
- `return_embedding`: To return document embedding.
|
||||
- `embedding_field`: Name of field containing an embedding vector.
|
||||
- `progress_bar`: Whether to show a tqdm progress bar or not.
|
||||
@ -1863,7 +1863,7 @@ that are most relevant to the query as defined by Weaviate semantic search.
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `custom_query`: Custom query that will be executed using the query.raw method; for more details refer to
|
||||
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
|
||||
https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a name="weaviate.WeaviateDocumentStore.query_by_embedding"></a>
|
||||
|
@ -1757,7 +1757,7 @@ class WeaviateDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
|
||||
Some of the key differences in contrast to FAISS & Milvus:
|
||||
1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
|
||||
@ -1770,7 +1770,7 @@ Weaviate python client is used to connect to the server, more details are here
|
||||
https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
|
||||
|
||||
Usage:
|
||||
1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html)
|
||||
1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
|
||||
2. Init a WeaviateDocumentStore in Haystack
|
||||
|
||||
Limitations:
|
||||
@ -1786,7 +1786,7 @@ The current implementation is not supporting the storage of labels, so you canno
|
||||
**Arguments**:
|
||||
|
||||
- `host`: Weaviate server connection URL for storing and processing documents and vectors.
|
||||
For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html"
|
||||
For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
|
||||
- `port`: port of Weaviate instance
|
||||
- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
|
||||
- `username`: username (standard authentication via http_auth)
|
||||
@ -1800,11 +1800,11 @@ The current implementation is not supporting the storage of labels, so you canno
|
||||
'cosine' is recommended for Sentence Transformers.
|
||||
- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
|
||||
Currently, only HNSW is supported.
|
||||
See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html
|
||||
See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
|
||||
- `custom_schema`: Allows to create custom schema in Weaviate, for more details
|
||||
See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers"
|
||||
For more details, See https://www.semi.technology/developers/weaviate/current/modules/
|
||||
For more details, See https://weaviate.io/developers/weaviate/current/modules/
|
||||
- `return_embedding`: To return document embedding.
|
||||
- `embedding_field`: Name of field containing an embedding vector.
|
||||
- `progress_bar`: Whether to show a tqdm progress bar or not.
|
||||
@ -1943,7 +1943,7 @@ that are most relevant to the query as defined by Weaviate semantic search.
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `custom_query`: Custom query that will be executed using the query.raw method; for more details refer to
|
||||
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
|
||||
https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a name="weaviate.WeaviateDocumentStore.query_by_embedding"></a>
|
||||
|
@ -2755,7 +2755,7 @@ class WeaviateDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
|
||||
Some of the key differences in contrast to FAISS & Milvus:
|
||||
1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
|
||||
@ -2768,7 +2768,7 @@ Weaviate python client is used to connect to the server, more details are here
|
||||
https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
|
||||
|
||||
Usage:
|
||||
1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html)
|
||||
1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
|
||||
2. Init a WeaviateDocumentStore in Haystack
|
||||
|
||||
Limitations:
|
||||
@ -2785,7 +2785,7 @@ def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int,
|
||||
**Arguments**:
|
||||
|
||||
- `host`: Weaviate server connection URL for storing and processing documents and vectors.
|
||||
For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html"
|
||||
For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
|
||||
- `port`: port of Weaviate instance
|
||||
- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
|
||||
- `username`: username (standard authentication via http_auth)
|
||||
@ -2799,11 +2799,11 @@ If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will
|
||||
'cosine' is recommended for Sentence Transformers.
|
||||
- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
|
||||
Currently, only HNSW is supported.
|
||||
See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html
|
||||
See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
|
||||
- `custom_schema`: Allows to create custom schema in Weaviate, for more details
|
||||
See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers"
|
||||
For more details, See https://www.semi.technology/developers/weaviate/current/modules/
|
||||
For more details, See https://weaviate.io/developers/weaviate/current/modules/
|
||||
- `return_embedding`: To return document embedding.
|
||||
- `embedding_field`: Name of field containing an embedding vector.
|
||||
- `progress_bar`: Whether to show a tqdm progress bar or not.
|
||||
@ -3065,7 +3065,7 @@ operation.
|
||||
```
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `custom_query`: Custom query that will be executed using the query.raw method; for more details refer to
|
||||
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
|
||||
https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a id="weaviate.WeaviateDocumentStore.query_by_embedding"></a>
|
||||
|
@ -3136,7 +3136,7 @@ class WeaviateDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
|
||||
Some of the key differences in contrast to FAISS & Milvus:
|
||||
1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
|
||||
@ -3149,7 +3149,7 @@ Weaviate python client is used to connect to the server, more details are here
|
||||
https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
|
||||
|
||||
Usage:
|
||||
1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html)
|
||||
1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
|
||||
2. Init a WeaviateDocumentStore in Haystack
|
||||
|
||||
Limitations:
|
||||
@ -3166,7 +3166,7 @@ def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int,
|
||||
**Arguments**:
|
||||
|
||||
- `host`: Weaviate server connection URL for storing and processing documents and vectors.
|
||||
For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html"
|
||||
For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
|
||||
- `port`: port of Weaviate instance
|
||||
- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
|
||||
- `username`: username (standard authentication via http_auth)
|
||||
@ -3180,11 +3180,11 @@ If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will
|
||||
'cosine' is recommended for Sentence Transformers.
|
||||
- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
|
||||
Currently, only HNSW is supported.
|
||||
See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html
|
||||
See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
|
||||
- `custom_schema`: Allows to create custom schema in Weaviate, for more details
|
||||
See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-trasnformers"
|
||||
For more details, See https://www.semi.technology/developers/weaviate/current/modules/
|
||||
For more details, See https://weaviate.io/developers/weaviate/current/modules/
|
||||
- `return_embedding`: To return document embedding.
|
||||
- `embedding_field`: Name of field containing an embedding vector.
|
||||
- `progress_bar`: Whether to show a tqdm progress bar or not.
|
||||
@ -3446,7 +3446,7 @@ operation.
|
||||
```
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `custom_query`: Custom query that will be executed using the query.raw method; for more details refer to
|
||||
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
|
||||
https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a id="weaviate.WeaviateDocumentStore.query_by_embedding"></a>
|
||||
|
@ -128,7 +128,7 @@ document_store = SQLDocumentStore()
|
||||
<div class="tabcontent">
|
||||
|
||||
The `WeaviateDocumentStore` requires a running Weaviate Server.
|
||||
You can start a basic instance like this (see the [Weaviate docs](https://www.semi.technology/developers/weaviate/current/) for details):
|
||||
You can start a basic instance like this (see the [Weaviate docs](https://weaviate.io/developers/weaviate/current/) for details):
|
||||
```
|
||||
docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2
|
||||
```
|
||||
|
@ -277,7 +277,10 @@ class ComparisonOperation(ABC):
|
||||
data_type = "valueDate"
|
||||
# Comparison value is a plain string
|
||||
except ValueError:
|
||||
data_type = "valueString"
|
||||
if self.field_name == "content":
|
||||
data_type = "valueText"
|
||||
else:
|
||||
data_type = "valueString"
|
||||
elif isinstance(value, int):
|
||||
data_type = "valueInt"
|
||||
elif isinstance(value, float):
|
||||
|
@ -1,13 +1,21 @@
|
||||
import hashlib
|
||||
from typing import Any, Dict, Generator, List, Optional, Union
|
||||
import re
|
||||
import uuid
|
||||
from typing import Dict, Generator, List, Optional, Union
|
||||
import json
|
||||
import hashlib
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
import logging
|
||||
import json
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import weaviate
|
||||
|
||||
try:
|
||||
from weaviate import client, AuthClientPassword
|
||||
except (ImportError, ModuleNotFoundError) as ie:
|
||||
from haystack.utils.import_utils import _optional_component_not_installed
|
||||
|
||||
_optional_component_not_installed(__name__, "weaviate", ie)
|
||||
|
||||
from haystack.schema import Document
|
||||
from haystack.document_stores import BaseDocumentStore
|
||||
@ -15,14 +23,6 @@ from haystack.document_stores.base import get_batches_from_generator
|
||||
from haystack.document_stores.filter_utils import LogicalFilterClause
|
||||
from haystack.document_stores.utils import convert_date_to_rfc3339
|
||||
|
||||
try:
|
||||
from weaviate import client, AuthClientPassword
|
||||
from weaviate import ObjectsBatchRequest
|
||||
except (ImportError, ModuleNotFoundError) as ie:
|
||||
from haystack.utils.import_utils import _optional_component_not_installed
|
||||
|
||||
_optional_component_not_installed(__name__, "weaviate", ie)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE)
|
||||
@ -32,7 +32,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate)
|
||||
|
||||
Some of the key differences in contrast to FAISS & Milvus:
|
||||
1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
|
||||
@ -45,7 +45,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html
|
||||
|
||||
Usage:
|
||||
1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html)
|
||||
1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html)
|
||||
2. Init a WeaviateDocumentStore in Haystack
|
||||
|
||||
Limitations:
|
||||
@ -74,7 +74,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
):
|
||||
"""
|
||||
:param host: Weaviate server connection URL for storing and processing documents and vectors.
|
||||
For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html"
|
||||
For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html"
|
||||
:param port: port of Weaviate instance
|
||||
:param timeout_config: Weaviate Timeout config as a tuple of (retries, time out seconds).
|
||||
:param username: username (standard authentication via http_auth)
|
||||
@ -88,11 +88,11 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
'cosine' is recommended for Sentence Transformers.
|
||||
:param index_type: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
|
||||
Currently, only HNSW is supported.
|
||||
See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html
|
||||
See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
|
||||
:param custom_schema: Allows to create custom schema in Weaviate, for more details
|
||||
See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
|
||||
:param module_name: Vectorization module to convert data into vectors. Default is "text2vec-transformers"
|
||||
For more details, See https://www.semi.technology/developers/weaviate/current/modules/
|
||||
For more details, See https://weaviate.io/developers/weaviate/current/modules/
|
||||
:param return_embedding: To return document embedding.
|
||||
:param embedding_field: Name of field containing an embedding vector.
|
||||
:param progress_bar: Whether to show a tqdm progress bar or not.
|
||||
@ -265,8 +265,11 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
document = None
|
||||
|
||||
id = self._sanitize_id(id=id, index=index)
|
||||
|
||||
result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)
|
||||
result = None
|
||||
try:
|
||||
result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)
|
||||
except weaviate.exceptions.UnexpectedStatusCodeException as usce:
|
||||
logging.debug(f"Weaviate could not get the document requested: {usce}")
|
||||
if result:
|
||||
document = self._convert_weaviate_result_to_document(result, return_embedding=True)
|
||||
return document
|
||||
@ -289,7 +292,11 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
# TODO: better implementation with multiple where filters instead of chatty call below?
|
||||
for id in ids:
|
||||
id = self._sanitize_id(id=id, index=index)
|
||||
result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)
|
||||
result = None
|
||||
try:
|
||||
result = self.weaviate_client.data_object.get_by_id(id, with_vector=True)
|
||||
except weaviate.exceptions.UnexpectedStatusCodeException as usce:
|
||||
logging.debug(f"Weaviate could not get the document requested: {usce}")
|
||||
if result:
|
||||
document = self._convert_weaviate_result_to_document(result, return_embedding=True)
|
||||
documents.append(document)
|
||||
@ -458,7 +465,6 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
batched_documents = get_batches_from_generator(document_objects, batch_size)
|
||||
with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar:
|
||||
for document_batch in batched_documents:
|
||||
docs_batch = ObjectsBatchRequest()
|
||||
for idx, doc in enumerate(document_batch):
|
||||
_doc = {**doc.to_dict(field_map=self._create_document_field_map())}
|
||||
_ = _doc.pop("score", None)
|
||||
@ -492,10 +498,11 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
for date_field in date_fields:
|
||||
_doc[date_field] = convert_date_to_rfc3339(_doc[date_field])
|
||||
|
||||
docs_batch.add(_doc, class_name=index, uuid=doc_id, vector=vector)
|
||||
|
||||
self.weaviate_client.batch.add_data_object(
|
||||
data_object=_doc, class_name=index, uuid=doc_id, vector=vector
|
||||
)
|
||||
# Ingest a batch of documents
|
||||
results = self.weaviate_client.batch.create(docs_batch)
|
||||
results = self.weaviate_client.batch.create_objects()
|
||||
# Weaviate returns errors for every failed document in the batch
|
||||
if results is not None:
|
||||
for result in results:
|
||||
@ -563,15 +570,14 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
doc_count = 0
|
||||
if filters:
|
||||
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
|
||||
result = (
|
||||
self.weaviate_client.query.aggregate(index).with_fields("meta { count }").with_where(filter_dict).do()
|
||||
)
|
||||
result = self.weaviate_client.query.aggregate(index).with_meta_count().with_where(filter_dict).do()
|
||||
else:
|
||||
result = self.weaviate_client.query.aggregate(index).with_fields("meta { count }").do()
|
||||
result = self.weaviate_client.query.aggregate(index).with_meta_count().do()
|
||||
|
||||
if "data" in result:
|
||||
if "Aggregate" in result.get("data"):
|
||||
doc_count = result.get("data").get("Aggregate").get(index)[0]["meta"]["count"]
|
||||
if result.get("data").get("Aggregate").get(index):
|
||||
doc_count = result.get("data").get("Aggregate").get(index)[0]["meta"]["count"]
|
||||
|
||||
return doc_count
|
||||
|
||||
@ -586,6 +592,22 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
Get documents from the document store.
|
||||
|
||||
Note this limitation from the changelog of Weaviate 1.8.0:
|
||||
|
||||
.. quote::
|
||||
Due to the increasing cost of each page outlined above, there is a limit to
|
||||
how many objects can be retrieved using pagination. By default setting the sum
|
||||
of offset and limit to higher than 10,000 objects, will lead to an error.
|
||||
If you must retrieve more than 10,000 objects, you can increase this limit by
|
||||
setting the environment variable `QUERY_MAXIMUM_RESULTS=<desired-value>`.
|
||||
|
||||
Warning: Setting this to arbitrarily high values can make the memory consumption
|
||||
of a single query explode and single queries can slow down the entire cluster.
|
||||
We recommend setting this value to the lowest possible value that does not
|
||||
interfere with your users' expectations.
|
||||
|
||||
(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0)
|
||||
|
||||
:param index: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
@ -651,10 +673,30 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
else:
|
||||
result = self.weaviate_client.query.get(class_name=index, properties=properties).do()
|
||||
|
||||
all_docs = {}
|
||||
if result and "data" in result and "Get" in result.get("data"):
|
||||
if result.get("data").get("Get").get(index):
|
||||
all_docs = result.get("data").get("Get").get(index)
|
||||
# Inherent Weaviate limitation to 100 elements forces us to loop here:
|
||||
# https://weaviate-python-client.readthedocs.io/en/latest/weaviate.data.html?highlight=100#weaviate.data.DataObject.get
|
||||
base_query = self.weaviate_client.query.get(class_name=index, properties=properties)
|
||||
all_docs: List[Any] = []
|
||||
num_of_documents = self.get_document_count(index=index, filters=filters)
|
||||
|
||||
while len(all_docs) < num_of_documents:
|
||||
query = base_query
|
||||
if filters:
|
||||
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
|
||||
query = query.with_where(filter_dict)
|
||||
|
||||
if all_docs:
|
||||
# .with_limit() must be used together with .with_offset(), or the latter won't work properly
|
||||
# https://weaviate-python-client.readthedocs.io/en/latest/weaviate.gql.html?highlight=offset#weaviate.gql.get.GetBuilder.with_offset
|
||||
query = query.with_limit(100).with_offset(offset=len(all_docs))
|
||||
|
||||
result = query.do()
|
||||
|
||||
if result and "data" in result and "Get" in result.get("data"):
|
||||
if result.get("data").get("Get").get(index):
|
||||
all_docs += result.get("data").get("Get").get(index)
|
||||
else:
|
||||
raise ValueError(f"Weaviate returned ad exception: {result}")
|
||||
|
||||
yield from all_docs
|
||||
|
||||
@ -671,6 +713,22 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
document store and yielded as individual documents. This method can be used to iteratively process
|
||||
a large number of documents without having to load all documents in memory.
|
||||
|
||||
Note this limitation from the changelog of Weaviate 1.8.0:
|
||||
|
||||
.. quote::
|
||||
Due to the increasing cost of each page outlined above, there is a limit to
|
||||
how many objects can be retrieved using pagination. By default setting the sum
|
||||
of offset and limit to higher than 10,000 objects, will lead to an error.
|
||||
If you must retrieve more than 10,000 objects, you can increase this limit by
|
||||
setting the environment variable `QUERY_MAXIMUM_RESULTS=<desired-value>`.
|
||||
|
||||
Warning: Setting this to arbitrarily high values can make the memory consumption
|
||||
of a single query explode and single queries can slow down the entire cluster.
|
||||
We recommend setting this value to the lowest possible value that does not
|
||||
interfere with your users' expectations.
|
||||
|
||||
(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0)
|
||||
|
||||
:param index: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
@ -793,7 +851,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
```
|
||||
:param top_k: How many documents to return per query.
|
||||
:param custom_query: Custom query that will be executed using the query.raw method; for more details refer to
|
||||
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
|
||||
https://weaviate.io/developers/weaviate/current/graphql-references/filters.html
|
||||
:param index: The name of the index in the DocumentStore from which to retrieve documents
|
||||
"""
|
||||
index = self._sanitize_index_name(index) or self.index
|
||||
|
@ -63,7 +63,7 @@ def launch_weaviate(sleep=15):
|
||||
logger.debug("Starting Weaviate ...")
|
||||
status = subprocess.run(
|
||||
[
|
||||
f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.7.2"
|
||||
f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.11.0"
|
||||
],
|
||||
shell=True,
|
||||
)
|
||||
|
@ -142,7 +142,7 @@ only-milvus =
|
||||
milvus =
|
||||
farm-haystack[sql,only-milvus]
|
||||
weaviate =
|
||||
weaviate-client==2.5.0
|
||||
weaviate-client==3.3.3
|
||||
only-pinecone =
|
||||
pinecone-client
|
||||
pinecone =
|
||||
|
@ -302,7 +302,7 @@ def weaviate_fixture():
|
||||
print("Starting Weaviate servers ...")
|
||||
status = subprocess.run(["docker rm haystack_test_weaviate"], shell=True)
|
||||
status = subprocess.run(
|
||||
["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.7.2"], shell=True
|
||||
["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.11.0"], shell=True
|
||||
)
|
||||
if status.returncode:
|
||||
raise Exception("Failed to launch Weaviate. Please check docker container logs.")
|
||||
|
@ -117,7 +117,7 @@ def test_init_elastic_doc_store_with_index_recreation():
|
||||
assert len(labels) == 0
|
||||
|
||||
|
||||
def test_write_with_duplicate_doc_ids(document_store):
|
||||
def test_write_with_duplicate_doc_ids(document_store: BaseDocumentStore):
|
||||
duplicate_documents = [
|
||||
Document(content="Doc1", id_hash_keys=["content"]),
|
||||
Document(content="Doc1", id_hash_keys=["content"]),
|
||||
@ -131,7 +131,7 @@ def test_write_with_duplicate_doc_ids(document_store):
|
||||
@pytest.mark.parametrize(
|
||||
"document_store", ["elasticsearch", "faiss", "memory", "milvus1", "weaviate", "pinecone"], indirect=True
|
||||
)
|
||||
def test_write_with_duplicate_doc_ids_custom_index(document_store):
|
||||
def test_write_with_duplicate_doc_ids_custom_index(document_store: BaseDocumentStore):
|
||||
duplicate_documents = [
|
||||
Document(content="Doc1", id_hash_keys=["content"]),
|
||||
Document(content="Doc1", id_hash_keys=["content"]),
|
||||
@ -164,7 +164,20 @@ def test_get_all_documents_without_filters(document_store_with_docs):
|
||||
assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3", "test4", "test5"}
|
||||
|
||||
|
||||
def test_get_all_document_filter_duplicate_text_value(document_store):
|
||||
def test_get_all_documents_large_quantities(document_store: BaseDocumentStore):
|
||||
# Test to exclude situations like Weaviate not returning more than 100 docs by default
|
||||
# https://github.com/deepset-ai/haystack/issues/1893
|
||||
docs_to_write = [
|
||||
{"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)}
|
||||
for i in range(1000)
|
||||
]
|
||||
document_store.write_documents(docs_to_write)
|
||||
documents = document_store.get_all_documents()
|
||||
assert all(isinstance(d, Document) for d in documents)
|
||||
assert len(documents) == len(docs_to_write)
|
||||
|
||||
|
||||
def test_get_all_document_filter_duplicate_text_value(document_store: BaseDocumentStore):
|
||||
documents = [
|
||||
Document(content="Doc1", meta={"f1": "0"}, id_hash_keys=["meta"]),
|
||||
Document(content="Doc1", meta={"f1": "1", "meta_id": "0"}, id_hash_keys=["meta"]),
|
||||
@ -355,7 +368,7 @@ def test_get_document_by_id(document_store_with_docs):
|
||||
assert doc.content == documents[0].content
|
||||
|
||||
|
||||
def test_get_documents_by_id(document_store):
|
||||
def test_get_documents_by_id(document_store: BaseDocumentStore):
|
||||
# generate more documents than the elasticsearch default query size limit of 10
|
||||
docs_to_generate = 15
|
||||
documents = [{"content": "doc-" + str(i)} for i in range(docs_to_generate)]
|
||||
@ -372,7 +385,7 @@ def test_get_documents_by_id(document_store):
|
||||
assert set(retrieved_ids) == set(all_ids)
|
||||
|
||||
|
||||
def test_get_document_count(document_store):
|
||||
def test_get_document_count(document_store: BaseDocumentStore):
|
||||
documents = [
|
||||
{"content": "text1", "id": "1", "meta_field_for_count": "a"},
|
||||
{"content": "text2", "id": "2", "meta_field_for_count": "b"},
|
||||
@ -385,7 +398,7 @@ def test_get_document_count(document_store):
|
||||
assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3
|
||||
|
||||
|
||||
def test_get_all_documents_generator(document_store):
|
||||
def test_get_all_documents_generator(document_store: BaseDocumentStore):
|
||||
documents = [
|
||||
{"content": "text1", "id": "1", "meta_field_for_count": "a"},
|
||||
{"content": "text2", "id": "2", "meta_field_for_count": "b"},
|
||||
@ -421,7 +434,7 @@ def test_update_existing_documents(document_store, update_existing_documents):
|
||||
assert stored_docs[0].content == original_docs[0]["content"]
|
||||
|
||||
|
||||
def test_write_document_meta(document_store):
|
||||
def test_write_document_meta(document_store: BaseDocumentStore):
|
||||
documents = [
|
||||
{"content": "dict_without_meta", "id": "1"},
|
||||
{"content": "dict_with_meta", "meta_field": "test2", "name": "filename2", "id": "2"},
|
||||
@ -438,7 +451,7 @@ def test_write_document_meta(document_store):
|
||||
assert document_store.get_document_by_id("4").meta["meta_field"] == "test4"
|
||||
|
||||
|
||||
def test_write_document_index(document_store):
|
||||
def test_write_document_index(document_store: BaseDocumentStore):
|
||||
documents = [{"content": "text1", "id": "1"}, {"content": "text2", "id": "2"}]
|
||||
document_store.write_documents([documents[0]], index="haystack_test_one")
|
||||
assert len(document_store.get_all_documents(index="haystack_test_one")) == 1
|
||||
@ -453,7 +466,7 @@ def test_write_document_index(document_store):
|
||||
@pytest.mark.parametrize(
|
||||
"document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True
|
||||
)
|
||||
def test_document_with_embeddings(document_store):
|
||||
def test_document_with_embeddings(document_store: BaseDocumentStore):
|
||||
documents = [
|
||||
{"content": "text1", "id": "1", "embedding": np.random.rand(768).astype(np.float32)},
|
||||
{"content": "text2", "id": "2", "embedding": np.random.rand(768).astype(np.float64)},
|
||||
@ -720,7 +733,7 @@ def test_delete_documents_by_id_with_filters(document_store_with_docs):
|
||||
|
||||
# exclude weaviate because it does not support storing labels
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True)
|
||||
def test_labels(document_store):
|
||||
def test_labels(document_store: BaseDocumentStore):
|
||||
label = Label(
|
||||
query="question1",
|
||||
answer=Answer(
|
||||
@ -808,7 +821,7 @@ def test_labels(document_store):
|
||||
|
||||
# exclude weaviate because it does not support storing labels
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True)
|
||||
def test_multilabel(document_store):
|
||||
def test_multilabel(document_store: BaseDocumentStore):
|
||||
labels = [
|
||||
Label(
|
||||
id="standard",
|
||||
@ -924,7 +937,7 @@ def test_multilabel(document_store):
|
||||
|
||||
# exclude weaviate because it does not support storing labels
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True)
|
||||
def test_multilabel_no_answer(document_store):
|
||||
def test_multilabel_no_answer(document_store: BaseDocumentStore):
|
||||
labels = [
|
||||
Label(
|
||||
query="question",
|
||||
@ -993,7 +1006,7 @@ def test_multilabel_no_answer(document_store):
|
||||
# exclude weaviate because it does not support storing labels
|
||||
# exclude faiss and milvus as label metadata is not implemented
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch", "memory"], indirect=True)
|
||||
def test_multilabel_filter_aggregations(document_store):
|
||||
def test_multilabel_filter_aggregations(document_store: BaseDocumentStore):
|
||||
labels = [
|
||||
Label(
|
||||
id="standard",
|
||||
@ -1089,7 +1102,7 @@ def test_multilabel_filter_aggregations(document_store):
|
||||
# exclude weaviate because it does not support storing labels
|
||||
# exclude faiss and milvus as label metadata is not implemented
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch", "memory"], indirect=True)
|
||||
def test_multilabel_meta_aggregations(document_store):
|
||||
def test_multilabel_meta_aggregations(document_store: BaseDocumentStore):
|
||||
labels = [
|
||||
Label(
|
||||
id="standard",
|
||||
@ -1180,7 +1193,7 @@ def test_multilabel_meta_aggregations(document_store):
|
||||
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "milvus1", "weaviate", "pinecone"], indirect=True)
|
||||
# Currently update_document_meta() is not implemented for Memory doc store
|
||||
def test_update_meta(document_store):
|
||||
def test_update_meta(document_store: BaseDocumentStore):
|
||||
documents = [
|
||||
Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "1"}),
|
||||
Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "2"}),
|
||||
@ -1209,7 +1222,7 @@ def test_custom_embedding_field(document_store_type, tmp_path):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
||||
def test_get_meta_values_by_key(document_store):
|
||||
def test_get_meta_values_by_key(document_store: BaseDocumentStore):
|
||||
documents = [
|
||||
Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "11"}),
|
||||
Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "22"}),
|
||||
@ -1271,7 +1284,7 @@ def test_elasticsearch_delete_index():
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
||||
def test_elasticsearch_query_with_filters_and_missing_embeddings(document_store):
|
||||
def test_elasticsearch_query_with_filters_and_missing_embeddings(document_store: BaseDocumentStore):
|
||||
document_store.write_documents(DOCUMENTS)
|
||||
document_without_embedding = Document(
|
||||
content="Doc without embedding", meta={"name": "name_7", "year": "2021", "month": "04"}
|
||||
|
Loading…
x
Reference in New Issue
Block a user