mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-05 11:38:20 +00:00
update to PineconeDocumentStore to remove dependency on SQL db (#2749)
* update to PineconeDocumentStore to remove dependency on SQL db * Update Documentation & Code Style * typing fixes * Update Documentation & Code Style * fixed embedding generator to yield Documents * Update Documentation & Code Style * fixes for final typing issues * fixes for pylint * Update Documentation & Code Style * uncomment pinecone tests * added new params to docstrings * Update Documentation & Code Style * Update Documentation & Code Style * Update haystack/document_stores/pinecone.py Co-authored-by: Sara Zan <sarazanzo94@gmail.com> * Update haystack/document_stores/pinecone.py Co-authored-by: Sara Zan <sarazanzo94@gmail.com> * Update Documentation & Code Style * Update haystack/document_stores/pinecone.py Co-authored-by: Sara Zan <sarazanzo94@gmail.com> * Update haystack/document_stores/pinecone.py Co-authored-by: Sara Zan <sarazanzo94@gmail.com> * Update haystack/document_stores/pinecone.py Co-authored-by: Sara Zan <sarazanzo94@gmail.com> * Update haystack/document_stores/pinecone.py Co-authored-by: Sara Zan <sarazanzo94@gmail.com> * changes based on comments, updated errors and install * Update Documentation & Code Style * mypy * implement simple filtering in pinecone mock * typo * typo in reverse * account for missing meta key in filtering * typo * added metadata filtering to describe index * added handling for users switching indexes in same doc store, and handling duplicate docs in write * syntax tweaks * added index option to document/embedding count calls * labels implementation in progress * added metadata fields to be indexed for pinecone tests * further changes to mock * WIP implementation of labels+multilabels * switched to rely on labels namespace rather than filter * simpler delete_labels * label fixes, remove debug code * Apply dostring fixes Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> * mypy * pylint * docs * temporarily un-mock Pinecone * Small Pinecone test suite * pylint * Add fake test key to pass 
the None check * Add again fake test key to pass the None check * Add Pinecone to default docstores and fix filters * Fix field name * Change field name * Change field value * Remove comments * forgot to upgrade pyproject.toml Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> Co-authored-by: Sara Zan <sarazanzo94@gmail.com> Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
This commit is contained in:
parent
891707ecaa
commit
9b1b03002f
@ -1,6 +1,8 @@
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--document_store_type", action="store", default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate"
|
||||
"--document_store_type",
|
||||
action="store",
|
||||
default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone",
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -4689,7 +4689,7 @@ number of labels for the given index
|
||||
## PineconeDocumentStore
|
||||
|
||||
```python
|
||||
class PineconeDocumentStore(SQLDocumentStore)
|
||||
class PineconeDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
Document store for very large scale embedding based dense retrievers like the DPR. This is a hosted document store,
|
||||
@ -4708,7 +4708,7 @@ the vector embeddings and metadata (for filtering) are indexed in a Pinecone Ind
|
||||
#### PineconeDocumentStore.\_\_init\_\_
|
||||
|
||||
```python
|
||||
def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []}, validate_index_sync: bool = True)
|
||||
def __init__(api_key: str, environment: str = "us-west1-gcp", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []}, validate_index_sync: bool = True)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
@ -4716,8 +4716,6 @@ def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sq
|
||||
- `api_key`: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)).
|
||||
- `environment`: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS regions are
|
||||
supported, contact Pinecone [here](https://www.pinecone.io/contact/) if required.
|
||||
- `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale
|
||||
deployment, Postgres is recommended.
|
||||
- `pinecone_index`: pinecone-client Index object, an index will be initialized or loaded if not specified.
|
||||
- `embedding_dim`: The embedding vector size.
|
||||
- `return_embedding`: Whether to return document embeddings.
|
||||
@ -4743,17 +4741,57 @@ Parameter options:
|
||||
created using the config you are using for initialization. Be aware that all data in the old index will be
|
||||
lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
|
||||
be recreated.
|
||||
- `metadata_config`: Which metadata fields should be indexed. Should be in the format
|
||||
`{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`.
|
||||
Indexing metadata fields is a prerequisite to allow filtering of documents by metadata values.
|
||||
- `validate_index_sync`: Whether to check that the document count equals the embedding count at initialization time
|
||||
- `metadata_config`: Which metadata fields should be indexed, part of the
|
||||
[selective metadata filtering](https://www.pinecone.io/docs/manage-indexes/#selective-metadata-indexing) feature.
|
||||
Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. By default,
|
||||
no fields are indexed.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.get_document_count"></a>
|
||||
|
||||
#### PineconeDocumentStore.get\_document\_count
|
||||
|
||||
```python
|
||||
def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
|
||||
```
|
||||
|
||||
Return the count of documents in the document store.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `filters`: Optional filters to narrow down the documents to be counted.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`), or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
- `index`: Optional index to use for the query. If not provided, the default index is used.
|
||||
- `only_documents_without_embedding`: If set to `True`, only documents without embeddings are counted.
|
||||
- `headers`: PineconeDocumentStore does not support headers.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.write_documents"></a>
|
||||
|
||||
#### PineconeDocumentStore.write\_documents
|
||||
|
||||
```python
|
||||
def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
|
||||
def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, labels: Optional[bool] = False)
|
||||
```
|
||||
|
||||
Add new documents to the DocumentStore.
|
||||
@ -4771,6 +4809,7 @@ Parameter options:
|
||||
- `"overwrite"`: Update any existing documents with the same ID when adding documents.
|
||||
- `"fail"`: An error is raised if the document ID of the document being added already exists.
|
||||
- `headers`: PineconeDocumentStore does not support headers.
|
||||
- `labels`: Tells us whether these records are labels or not. Defaults to False.
|
||||
|
||||
**Raises**:
|
||||
|
||||
@ -4824,12 +4863,55 @@ operation.
|
||||
- `batch_size`: Number of documents to process at a time. When working with large number of documents,
|
||||
batching can help reduce memory footprint.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.get_all_documents"></a>
|
||||
|
||||
#### PineconeDocumentStore.get\_all\_documents
|
||||
|
||||
```python
|
||||
def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, namespace: Optional[str] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Retrieves all documents in the index.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Optional index name to retrieve all documents from.
|
||||
- `filters`: Optional filters to narrow down the documents that will be retrieved.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
- `return_embedding`: Optional flag to return the embedding of the document.
|
||||
- `batch_size`: Number of documents to process at a time. When working with large number of documents,
|
||||
batching can help reduce memory footprint.
|
||||
- `headers`: Pinecone does not support headers.
|
||||
- `namespace`: Optional namespace to retrieve documents from.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.get_all_documents_generator"></a>
|
||||
|
||||
#### PineconeDocumentStore.get\_all\_documents\_generator
|
||||
|
||||
```python
|
||||
def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
|
||||
def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, namespace: Optional[str] = None) -> Generator[Document, None, None]
|
||||
```
|
||||
|
||||
Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
|
||||
@ -4868,6 +4950,45 @@ operation.
|
||||
- `return_embedding`: Whether to return the document embeddings.
|
||||
- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
|
||||
- `headers`: PineconeDocumentStore does not support headers.
|
||||
- `namespace`: Optional namespace to retrieve documents from.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.get_documents_by_id"></a>
|
||||
|
||||
#### PineconeDocumentStore.get\_documents\_by\_id
|
||||
|
||||
```python
|
||||
def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, return_embedding: Optional[bool] = None, namespace: str = None) -> List[Document]
|
||||
```
|
||||
|
||||
Retrieves all documents in the index using their IDs.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `ids`: List of IDs to retrieve.
|
||||
- `index`: Optional index name to retrieve all documents from.
|
||||
- `batch_size`: Number of documents to retrieve at a time. When working with large number of documents,
|
||||
batching can help reduce memory footprint.
|
||||
- `headers`: Pinecone does not support headers.
|
||||
- `return_embedding`: Optional flag to return the embedding of the document.
|
||||
- `namespace`: Optional namespace to retrieve documents from.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.get_document_by_id"></a>
|
||||
|
||||
#### PineconeDocumentStore.get\_document\_by\_id
|
||||
|
||||
```python
|
||||
def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, return_embedding: Optional[bool] = None, namespace: str = None) -> Document
|
||||
```
|
||||
|
||||
Returns a single Document retrieved using an ID.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `id`: ID string to retrieve.
|
||||
- `index`: Optional index name to retrieve the document from.
|
||||
- `headers`: Pinecone does not support headers.
|
||||
- `return_embedding`: Optional flag to return the embedding of the document.
|
||||
- `namespace`: Optional namespace to retrieve documents from.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.get_embedding_count"></a>
|
||||
|
||||
@ -4879,22 +5000,35 @@ def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str,
|
||||
|
||||
Return the count of embeddings in the document store.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Optional index name to count embeddings in.
|
||||
- `filters`: Filters are not supported for `get_embedding_count` in Pinecone.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.update_document_meta"></a>
|
||||
|
||||
#### PineconeDocumentStore.update\_document\_meta
|
||||
|
||||
```python
|
||||
def update_document_meta(id: str, meta: Dict[str, str], index: str = None)
|
||||
def update_document_meta(id: str, meta: Dict[str, str], namespace: str = None, index: str = None)
|
||||
```
|
||||
|
||||
Update the metadata dictionary of a document by specifying its string id
|
||||
Update the metadata dictionary of a document by specifying its string ID.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `id`: ID of the Document to update.
|
||||
- `meta`: Dictionary of new metadata.
|
||||
- `namespace`: Optional namespace to update documents from. If not specified, defaults to the embedding
|
||||
namespace (vectors) if it exists, otherwise the document namespace (no-vectors).
|
||||
- `index`: Optional index name to update documents from.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.delete_documents"></a>
|
||||
|
||||
#### PineconeDocumentStore.delete\_documents
|
||||
|
||||
```python
|
||||
def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
|
||||
def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, drop_ids: Optional[bool] = True, namespace: Optional[str] = None)
|
||||
```
|
||||
|
||||
Delete documents from the document store.
|
||||
@ -4904,6 +5038,8 @@ Delete documents from the document store.
|
||||
- `index`: Index name to delete the documents from. If `None`, the DocumentStore's default index
|
||||
(`self.index`) will be used.
|
||||
- `ids`: Optional list of IDs to narrow down the documents to be deleted.
|
||||
- `namespace`: Optional namespace string. By default, it deletes vectors from the embeddings namespace
|
||||
unless the namespace is empty, in which case it deletes from the documents namespace.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
@ -4929,6 +5065,14 @@ operation.
|
||||
}
|
||||
```
|
||||
- `headers`: PineconeDocumentStore does not support headers.
|
||||
- `drop_ids`: Specifies if the locally stored IDs should be deleted. The default
|
||||
is True.
|
||||
- `namespace`: Optional namespace to delete documents from. If not specified, defaults to the embedding
|
||||
namespace (vectors) if it exists, otherwise the document namespace (no-vectors).
|
||||
|
||||
**Returns**:
|
||||
|
||||
`None`:
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.delete_index"></a>
|
||||
|
||||
@ -4953,7 +5097,7 @@ None
|
||||
#### PineconeDocumentStore.query\_by\_embedding
|
||||
|
||||
```python
|
||||
def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document]
|
||||
def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True, namespace: Optional[str] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
|
||||
@ -5038,7 +5182,47 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
|
||||
def load(cls)
|
||||
```
|
||||
|
||||
Default class method used for loading indexes. Not applicable to the PineconeDocumentStore.
|
||||
Default class method used for loading indexes. Not applicable to PineconeDocumentStore.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.delete_labels"></a>
|
||||
|
||||
#### PineconeDocumentStore.delete\_labels
|
||||
|
||||
```python
|
||||
def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 32)
|
||||
```
|
||||
|
||||
Default class method used for deleting labels. Not supported by PineconeDocumentStore.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.get_all_labels"></a>
|
||||
|
||||
#### PineconeDocumentStore.get\_all\_labels
|
||||
|
||||
```python
|
||||
def get_all_labels(index=None, filters: Optional[dict] = None, headers: Optional[Dict[str, str]] = None)
|
||||
```
|
||||
|
||||
Default class method used for getting all labels.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.get_label_count"></a>
|
||||
|
||||
#### PineconeDocumentStore.get\_label\_count
|
||||
|
||||
```python
|
||||
def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
|
||||
```
|
||||
|
||||
Default class method used for counting labels. Not supported by PineconeDocumentStore.
|
||||
|
||||
<a id="pinecone.PineconeDocumentStore.write_labels"></a>
|
||||
|
||||
#### PineconeDocumentStore.write\_labels
|
||||
|
||||
```python
|
||||
def write_labels(labels, index=None, headers: Optional[Dict[str, str]] = None)
|
||||
```
|
||||
|
||||
Default class method used for writing labels.
|
||||
|
||||
<a id="utils"></a>
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ from sqlalchemy.sql import select
|
||||
from sqlalchemy import and_, or_
|
||||
|
||||
from haystack.document_stores.utils import convert_date_to_rfc3339
|
||||
from haystack.errors import FilterError
|
||||
|
||||
|
||||
def nested_defaultdict() -> defaultdict:
|
||||
@ -460,7 +461,8 @@ class InOperation(ComparisonOperation):
|
||||
# is only initialized with lists, but changing the type annotation would mean duplicating __init__
|
||||
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, List]]:
|
||||
assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
|
||||
if not isinstance(self.comparison_value, list):
|
||||
raise FilterError("'$in' operation requires comparison value to be a list.")
|
||||
return {"terms": {self.field_name: self.comparison_value}}
|
||||
|
||||
def convert_to_sql(self, meta_document_orm):
|
||||
@ -470,7 +472,8 @@ class InOperation(ComparisonOperation):
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
|
||||
filter_dict: Dict[str, Union[str, List[Dict]]] = {"operator": "Or", "operands": []}
|
||||
assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
|
||||
if not isinstance(self.comparison_value, list):
|
||||
raise FilterError("'$in' operation requires comparison value to be a list.")
|
||||
for value in self.comparison_value:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype(value)
|
||||
assert isinstance(filter_dict["operands"], list) # Necessary for mypy
|
||||
@ -481,7 +484,8 @@ class InOperation(ComparisonOperation):
|
||||
return filter_dict
|
||||
|
||||
def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]:
|
||||
assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
|
||||
if not isinstance(self.comparison_value, list):
|
||||
raise FilterError("'$in' operation requires comparison value to be a list.")
|
||||
return {self.field_name: {"$in": self.comparison_value}}
|
||||
|
||||
def invert(self) -> "NinOperation":
|
||||
@ -499,7 +503,8 @@ class NeOperation(ComparisonOperation):
|
||||
return fields[self.field_name] != self.comparison_value
|
||||
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, Union[str, int, float, bool]]]]]:
|
||||
assert not isinstance(self.comparison_value, list), "Use '$nin' operation for lists as comparison values."
|
||||
if isinstance(self.comparison_value, list):
|
||||
raise FilterError("Use '$nin' operation for lists as comparison values.")
|
||||
return {"bool": {"must_not": {"term": {self.field_name: self.comparison_value}}}}
|
||||
|
||||
def convert_to_sql(self, meta_document_orm):
|
||||
@ -530,7 +535,8 @@ class NinOperation(ComparisonOperation):
|
||||
# is only initialized with lists, but changing the type annotation would mean duplicating __init__
|
||||
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, List]]]]:
|
||||
assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list."
|
||||
if not isinstance(self.comparison_value, list):
|
||||
raise FilterError("'$nin' operation requires comparison value to be a list.")
|
||||
return {"bool": {"must_not": {"terms": {self.field_name: self.comparison_value}}}}
|
||||
|
||||
def convert_to_sql(self, meta_document_orm):
|
||||
@ -540,7 +546,8 @@ class NinOperation(ComparisonOperation):
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
|
||||
filter_dict: Dict[str, Union[str, List[Dict]]] = {"operator": "And", "operands": []}
|
||||
assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list."
|
||||
if not isinstance(self.comparison_value, list):
|
||||
raise FilterError("'$nin' operation requires comparison value to be a list.")
|
||||
for value in self.comparison_value:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype(value)
|
||||
assert isinstance(filter_dict["operands"], list) # Necessary for mypy
|
||||
@ -551,7 +558,8 @@ class NinOperation(ComparisonOperation):
|
||||
return filter_dict
|
||||
|
||||
def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]:
    """
    Convert this `$nin` operation into a Pinecone metadata filter.

    :return: A dictionary of the form `{field_name: {"$nin": comparison_value}}`.
    :raises FilterError: If the comparison value is not a list.
    """
    if not isinstance(self.comparison_value, list):
        # The message names '$nin'; it previously said '$in', which pointed users
        # at the wrong operator when their filter was rejected.
        raise FilterError("'$nin' operation requires comparison value to be a list.")
    return {self.field_name: {"$nin": self.comparison_value}}
|
||||
|
||||
def invert(self) -> "InOperation":
|
||||
@ -569,7 +577,8 @@ class GtOperation(ComparisonOperation):
|
||||
return fields[self.field_name] > self.comparison_value
|
||||
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
|
||||
assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list."
|
||||
if isinstance(self.comparison_value, list):
|
||||
raise FilterError("Comparison value for '$gt' operation must not be a list.")
|
||||
return {"range": {self.field_name: {"gt": self.comparison_value}}}
|
||||
|
||||
def convert_to_sql(self, meta_document_orm):
|
||||
@ -579,13 +588,13 @@ class GtOperation(ComparisonOperation):
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype()
|
||||
assert not isinstance(comp_value, list), "Comparison value for '$gt' operation must not be a list."
|
||||
if isinstance(comp_value, list):
|
||||
raise FilterError("Comparison value for '$gt' operation must not be a list.")
|
||||
return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value}
|
||||
|
||||
def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]:
|
||||
assert not isinstance(
|
||||
self.comparison_value, (list, str)
|
||||
), "Comparison value for '$gt' operation must be a float or int."
|
||||
if not isinstance(self.comparison_value, (float, int)):
|
||||
raise FilterError("Comparison value for '$gt' operation must be a float or int.")
|
||||
return {self.field_name: {"$gt": self.comparison_value}}
|
||||
|
||||
def invert(self) -> "LteOperation":
|
||||
@ -603,7 +612,8 @@ class GteOperation(ComparisonOperation):
|
||||
return fields[self.field_name] >= self.comparison_value
|
||||
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
|
||||
assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list."
|
||||
if isinstance(self.comparison_value, list):
|
||||
raise FilterError("Comparison value for '$gte' operation must not be a list.")
|
||||
return {"range": {self.field_name: {"gte": self.comparison_value}}}
|
||||
|
||||
def convert_to_sql(self, meta_document_orm):
|
||||
@ -613,13 +623,13 @@ class GteOperation(ComparisonOperation):
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype()
|
||||
assert not isinstance(comp_value, list), "Comparison value for '$gte' operation must not be a list."
|
||||
if isinstance(comp_value, list):
|
||||
raise FilterError("Comparison value for '$gte' operation must not be a list.")
|
||||
return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value}
|
||||
|
||||
def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]:
|
||||
assert not isinstance(
|
||||
self.comparison_value, (list, str)
|
||||
), "Comparison value for '$gte' operation must be a float or int."
|
||||
if not isinstance(self.comparison_value, (float, int)):
|
||||
raise FilterError("Comparison value for '$gte' operation must be a float or int.")
|
||||
return {self.field_name: {"$gte": self.comparison_value}}
|
||||
|
||||
def invert(self) -> "LtOperation":
|
||||
@ -637,7 +647,8 @@ class LtOperation(ComparisonOperation):
|
||||
return fields[self.field_name] < self.comparison_value
|
||||
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
|
||||
assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list."
|
||||
if isinstance(self.comparison_value, list):
|
||||
raise FilterError("Comparison value for '$lt' operation must not be a list.")
|
||||
return {"range": {self.field_name: {"lt": self.comparison_value}}}
|
||||
|
||||
def convert_to_sql(self, meta_document_orm):
|
||||
@ -647,13 +658,13 @@ class LtOperation(ComparisonOperation):
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype()
|
||||
assert not isinstance(comp_value, list), "Comparison value for '$lt' operation must not be a list."
|
||||
if isinstance(comp_value, list):
|
||||
raise FilterError("Comparison value for '$lt' operation must not be a list.")
|
||||
return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value}
|
||||
|
||||
def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]:
|
||||
assert not isinstance(
|
||||
self.comparison_value, (list, str)
|
||||
), "Comparison value for '$lt' operation must be a float or int."
|
||||
if not isinstance(self.comparison_value, (float, int)):
|
||||
raise FilterError("Comparison value for '$lt' operation must be a float or int.")
|
||||
return {self.field_name: {"$lt": self.comparison_value}}
|
||||
|
||||
def invert(self) -> "GteOperation":
|
||||
@ -671,7 +682,8 @@ class LteOperation(ComparisonOperation):
|
||||
return fields[self.field_name] <= self.comparison_value
|
||||
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
|
||||
assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list."
|
||||
if isinstance(self.comparison_value, list):
|
||||
raise FilterError("Comparison value for '$lte' operation must not be a list.")
|
||||
return {"range": {self.field_name: {"lte": self.comparison_value}}}
|
||||
|
||||
def convert_to_sql(self, meta_document_orm):
|
||||
@ -681,13 +693,13 @@ class LteOperation(ComparisonOperation):
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype()
|
||||
assert not isinstance(comp_value, list), "Comparison value for '$lte' operation must not be a list."
|
||||
if isinstance(comp_value, list):
|
||||
raise FilterError("Comparison value for '$lte' operation must not be a list.")
|
||||
return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value}
|
||||
|
||||
def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]:
    """
    Build the Pinecone metadata filter for this '$lte' comparison.

    :return: A dict of the form `{field_name: {"$lte": comparison_value}}`.
    :raises FilterError: If the comparison value is not a float or an int.
    """
    # Validate with an explicit raise instead of `assert`: an assert placed in front of this check
    # turns invalid input into an AssertionError (and disappears entirely under `python -O`),
    # while callers are expected to catch FilterError.
    if not isinstance(self.comparison_value, (float, int)):
        raise FilterError("Comparison value for '$lte' operation must be a float or int.")
    return {self.field_name: {"$lte": self.comparison_value}}
|
||||
|
||||
def invert(self) -> "GtOperation":
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -76,6 +76,20 @@ class DocumentStoreError(HaystackError):
|
||||
super().__init__(message=message)
|
||||
|
||||
|
||||
class FilterError(DocumentStoreError):
    """
    Exception for issues that occur building complex filters.

    :param message: Optional description of the problem, forwarded unchanged to the parent exception.
    """

    def __init__(self, message: Optional[str] = None) -> None:
        super().__init__(message=message)
|
||||
|
||||
|
||||
class PineconeDocumentStoreError(DocumentStoreError):
    """
    Exception for issues that occur in a Pinecone document store.

    :param message: Optional description of the problem, forwarded unchanged to the parent exception.
    """

    def __init__(self, message: Optional[str] = None) -> None:
        super().__init__(message=message)
|
||||
|
||||
|
||||
class DuplicateDocumentError(DocumentStoreError, ValueError):
|
||||
"""Exception for Duplicate document"""
|
||||
|
||||
|
||||
@ -1700,11 +1700,6 @@
|
||||
"default": "us-west1-gcp",
|
||||
"type": "string"
|
||||
},
|
||||
"sql_url": {
|
||||
"title": "Sql Url",
|
||||
"default": "sqlite:///pinecone_document_store.db",
|
||||
"type": "string"
|
||||
},
|
||||
"pinecone_index": {
|
||||
"title": "Pinecone Index",
|
||||
"default": null,
|
||||
|
||||
@ -128,7 +128,7 @@ weaviate = [
|
||||
"weaviate-client==3.6.0",
|
||||
]
|
||||
only-pinecone = [
|
||||
"pinecone-client",
|
||||
"pinecone-client>=2.0.11,<3",
|
||||
]
|
||||
pinecone = [
|
||||
"farm-haystack[sql,only-pinecone]",
|
||||
|
||||
@ -171,7 +171,7 @@ def pytest_collection_modifyitems(config, items):
|
||||
"pinecone",
|
||||
"opensearch",
|
||||
]:
|
||||
if cur_doc_store in keywords and cur_doc_store not in document_store_types_to_run:
|
||||
if keywords and cur_doc_store in keywords and cur_doc_store not in document_store_types_to_run:
|
||||
skip_docstore = pytest.mark.skip(
|
||||
reason=f'{cur_doc_store} is disabled. Enable via pytest --document_store_type="{cur_doc_store}"'
|
||||
)
|
||||
@ -180,15 +180,11 @@ def pytest_collection_modifyitems(config, items):
|
||||
if "milvus1" in keywords and not milvus1:
|
||||
skip_milvus1 = pytest.mark.skip(reason="Skipping Tests for 'milvus1', as Milvus2 seems to be installed.")
|
||||
item.add_marker(skip_milvus1)
|
||||
|
||||
elif "milvus" in keywords and milvus1:
|
||||
skip_milvus = pytest.mark.skip(reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed.")
|
||||
item.add_marker(skip_milvus)
|
||||
|
||||
# Skip PineconeDocumentStore if PINECONE_API_KEY not in environment variables
|
||||
# if not os.environ.get("PINECONE_API_KEY", False) and "pinecone" in keywords:
|
||||
# skip_pinecone = pytest.mark.skip(reason="PINECONE_API_KEY not in environment variables.")
|
||||
# item.add_marker(skip_pinecone)
|
||||
|
||||
|
||||
#
|
||||
# Empty mocks, as a base for unit tests.
|
||||
@ -987,7 +983,7 @@ def get_document_store(
|
||||
|
||||
elif document_store_type == "pinecone":
|
||||
document_store = PineconeDocumentStore(
|
||||
api_key=os.environ.get("PINECONE_API_KEY"),
|
||||
api_key=os.environ.get("PINECONE_API_KEY") or "fake-haystack-test-key",
|
||||
embedding_dim=embedding_dim,
|
||||
embedding_field=embedding_field,
|
||||
index=index,
|
||||
|
||||
@ -209,25 +209,25 @@ def test_get_all_documents_large_quantities(document_store: BaseDocumentStore):
|
||||
|
||||
def test_get_all_document_filter_duplicate_text_value(document_store: BaseDocumentStore):
    """
    Check that metadata filters select the right document when several documents share the same content.

    Only the `meta_field` / `name` metadata keys are used. The stale `f1` / `f3` / `meta_id` variants of
    these documents and assertions are dropped, since they would write extra documents and assert on keys
    the remaining documents do not carry.
    """
    documents = [
        Document(content="Doc1", meta={"meta_field": "0"}, id_hash_keys=["meta"]),
        Document(content="Doc1", meta={"meta_field": "1", "name": "file.txt"}, id_hash_keys=["meta"]),
        Document(content="Doc2", meta={"name": "file_2.txt"}, id_hash_keys=["meta"]),
    ]
    document_store.write_documents(documents)

    # Length is asserted before indexing so that an empty result fails with a readable message
    # instead of an IndexError.
    documents = document_store.get_all_documents(filters={"meta_field": ["1"]})
    assert len(documents) == 1
    assert documents[0].content == "Doc1"
    assert {d.meta["name"] for d in documents} == {"file.txt"}

    documents = document_store.get_all_documents(filters={"meta_field": ["0"]})
    assert len(documents) == 1
    assert documents[0].content == "Doc1"
    assert documents[0].meta.get("name") is None

    documents = document_store.get_all_documents(filters={"name": ["file_2.txt"]})
    assert len(documents) == 1
    assert documents[0].content == "Doc2"
    assert documents[0].meta.get("meta_field") is None
|
||||
|
||||
|
||||
def test_get_all_documents_with_correct_filters(document_store_with_docs):
|
||||
@ -266,9 +266,8 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs)
|
||||
assert len(documents) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"document_store_with_docs", ["elasticsearch", "sql", "weaviate", "memory", "pinecone"], indirect=True
|
||||
)
|
||||
# See test_pinecone.py
|
||||
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "sql", "weaviate", "memory"], indirect=True)
|
||||
def test_extended_filter(document_store_with_docs):
|
||||
# Test comparison operators individually
|
||||
documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}})
|
||||
|
||||
337
test/document_stores/test_pinecone.py
Normal file
337
test/document_stores/test_pinecone.py
Normal file
@ -0,0 +1,337 @@
|
||||
from typing import List, Union, Dict, Any
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
from inspect import getmembers, isclass, isfunction
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack.document_stores.pinecone import PineconeDocumentStore
|
||||
from haystack.schema import Document
|
||||
from haystack.errors import FilterError
|
||||
|
||||
|
||||
from ..mocks import pinecone as pinecone_mock
|
||||
from ..conftest import SAMPLES_PATH
|
||||
|
||||
|
||||
# Set metadata fields used during testing for PineconeDocumentStore meta_config
|
||||
META_FIELDS = ["meta_field", "name", "date", "numeric_field", "odd_document"]
|
||||
|
||||
|
||||
#
|
||||
# FIXME This class should extend the base Document Store test class once it exists.
|
||||
# At that point some of the fixtures will be duplicate, so review them.
|
||||
#
|
||||
class TestPineconeDocumentStore:
    """
    Pinecone-specific document store tests.

    NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release
    introduces this feature, the entire family of test_get_all_documents_extended_filter_* tests will become
    identical to the one present in the base document store suite, and can be removed from here.
    """

    # Patterns for the FilterError message raised when a range operator receives a non-numeric value
    # (here: date strings). They are raw strings so that `\$` is a regex escape and not an invalid
    # Python string escape, and `[lg]` is used instead of `[l|g]`, which also matched a literal `|`.
    LTE_GTE_ERROR = r"Comparison value for '\$[lg]te' operation must be a float or int."
    LT_GT_ERROR = r"Comparison value for '\$[lg]t' operation must be a float or int."

    # Fixtures

    @pytest.fixture
    def doc_store(self, monkeypatch, request) -> PineconeDocumentStore:
        """
        This fixture provides an empty document store.

        No teardown runs here: the store is created with `recreate_index=True`, which presumably gives
        every test a fresh index (confirm against PineconeDocumentStore).
        """
        # If it's a unit test, mock Pinecone: every function and class of the mock module
        # replaces the attribute of the same name on the real `pinecone` package.
        if "integration" not in request.keywords:
            for fname, function in getmembers(pinecone_mock, isfunction):
                monkeypatch.setattr(f"pinecone.{fname}", function, raising=False)
            for cname, class_ in getmembers(pinecone_mock, isclass):
                monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False)

        return PineconeDocumentStore(
            api_key=os.environ.get("PINECONE_API_KEY") or "fake-pinecone-test-key",
            embedding_dim=768,
            embedding_field="embedding",
            index="haystack_tests",
            similarity="cosine",
            recreate_index=True,
            metadata_config={"indexed": META_FIELDS},
        )

    @pytest.fixture
    def doc_store_with_docs(self, doc_store: PineconeDocumentStore, docs: List[Document]) -> PineconeDocumentStore:
        """
        This fixture provides a document store pre-populated with the documents of the `docs` fixture.
        """
        doc_store.write_documents(docs)
        return doc_store

    @pytest.fixture
    def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]:
        """
        Sample documents in every accepted input format: flat dict, dict with "meta", and Document objects.
        """
        return [
            # metafield at the top level for backward compatibility
            {
                "content": "My name is Paul and I live in New York",
                "meta_field": "test-1",
                "name": "file_1.txt",
                "date": "2019-10-01",
                "numeric_field": 5.0,
                "odd_document": True,
            },
            # "dict" format
            {
                "content": "My name is Carla and I live in Berlin",
                "meta": {
                    "meta_field": "test-2",
                    "name": "file_2.txt",
                    "date": "2020-03-01",
                    "numeric_field": 5.5,
                    "odd_document": False,
                },
            },
            # Document object
            Document(
                content="My name is Christelle and I live in Paris",
                meta={
                    "meta_field": "test-3",
                    "name": "file_3.txt",
                    "date": "2018-10-01",
                    "numeric_field": 4.5,
                    "odd_document": True,
                },
            ),
            Document(
                content="My name is Camila and I live in Madrid",
                meta={
                    "meta_field": "test-4",
                    "name": "file_4.txt",
                    "date": "2021-02-01",
                    "numeric_field": 3.0,
                    "odd_document": False,
                },
            ),
            Document(
                content="My name is Matteo and I live in Rome",
                meta={
                    "meta_field": "test-5",
                    "name": "file_5.txt",
                    "date": "2019-01-01",
                    "numeric_field": 0.0,
                    "odd_document": True,
                },
            ),
            # Without meta
            Document(content="My name is Ahmed and I live in Cairo"),
        ]

    @pytest.fixture
    def docs(self, docs_all_formats: List[Union[Document, Dict[str, Any]]]) -> List[Document]:
        """
        The documents of `docs_all_formats`, with the dict entries converted to Document objects.
        """
        return [Document.from_dict(doc) if isinstance(doc, dict) else doc for doc in docs_all_formats]

    #
    # Tests
    #

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_eq(self, doc_store_with_docs: PineconeDocumentStore):
        eq_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test-1"}})
        normal_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": "test-1"})
        assert eq_docs == normal_docs

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_in(self, doc_store_with_docs: PineconeDocumentStore):
        in_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$in": ["test-1", "test-2", "n.a."]}})
        normal_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": ["test-1", "test-2", "n.a."]})
        assert in_docs == normal_docs

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_ne(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$ne": "test-1"}})
        assert all("test-1" != d.meta.get("meta_field", None) for d in retrieved_docs)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_nin(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(
            filters={"meta_field": {"$nin": ["test-1", "test-2", "n.a."]}}
        )
        assert {"test-1", "test-2"}.isdisjoint({d.meta.get("meta_field", None) for d in retrieved_docs})

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_gt(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})
        assert all(d.meta["numeric_field"] > 3.0 for d in retrieved_docs)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_gte(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})
        assert all(d.meta["numeric_field"] >= 3.0 for d in retrieved_docs)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_lt(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})
        assert all(d.meta["numeric_field"] < 3.0 for d in retrieved_docs)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_lte(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})
        assert all(d.meta["numeric_field"] <= 3.0 for d in retrieved_docs)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_dates(self, doc_store_with_docs: PineconeDocumentStore):
        filters = {"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"}}

        with pytest.raises(FilterError, match=self.LTE_GTE_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_dates_and_other_field_explicit(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters = {
            "$and": {
                "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
                "name": {"$in": ["file_5.txt", "file_3.txt"]},
            }
        }

        with pytest.raises(FilterError, match=self.LTE_GTE_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_dates_and_other_field_simplified(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters_simplified = {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "name": ["file_5.txt", "file_3.txt"],
        }

        with pytest.raises(FilterError, match=self.LTE_GTE_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters_simplified)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_dates_and_or_explicit(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters = {
            "$and": {
                "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
                "$or": {"name": {"$in": ["file_5.txt", "file_3.txt"]}, "numeric_field": {"$lte": 5.0}},
            }
        }

        with pytest.raises(FilterError, match=self.LTE_GTE_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_dates_and_or_simplified(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters_simplified = {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "$or": {"name": ["file_5.txt", "file_3.txt"], "numeric_field": {"$lte": 5.0}},
        }

        with pytest.raises(FilterError, match=self.LTE_GTE_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters_simplified)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_dates_and_or_and_not_explicit(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters = {
            "$and": {
                "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
                "$or": {
                    "name": {"$in": ["file_5.txt", "file_3.txt"]},
                    "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test-2"}}},
                },
            }
        }
        with pytest.raises(FilterError, match=self.LTE_GTE_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_dates_and_or_and_not_simplified(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters_simplified = {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "$or": {
                "name": ["file_5.txt", "file_3.txt"],
                "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test-2"}},
            },
        }
        with pytest.raises(FilterError, match=self.LTE_GTE_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters_simplified)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_nested_not(self, doc_store_with_docs: PineconeDocumentStore):
        # Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentStore
        filters = {
            "$not": {
                "$or": {
                    "$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test-3"}},
                    "$not": {"date": {"$lt": "2020-01-01"}},
                }
            }
        }
        with pytest.raises(FilterError, match=self.LT_GT_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.pinecone
    def test_get_all_documents_extended_filter_compound_same_level_not(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        # Test same logical operator twice on same level, important as we apply De Morgan's laws in WeaviateDocumentStore
        filters = {
            "$or": [
                {"$and": {"meta_field": {"$in": ["test-1", "test-2"]}, "date": {"$gte": "2020-01-01"}}},
                {"$and": {"meta_field": {"$in": ["test-3", "test-4"]}, "date": {"$lt": "2020-01-01"}}},
            ]
        }

        with pytest.raises(FilterError, match=self.LTE_GTE_ERROR):
            doc_store_with_docs.get_all_documents(filters=filters)
|
||||
@ -2,8 +2,10 @@ from typing import Optional, List, Dict, Union
|
||||
|
||||
import logging
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Mock Pinecone instance
|
||||
CONFIG: dict = {"api_key": None, "environment": None, "indexes": {}}
|
||||
|
||||
@ -84,6 +86,24 @@ class Index:
|
||||
include_values: bool = False,
|
||||
include_metadata: bool = False,
|
||||
filter: Optional[dict] = None,
|
||||
):
|
||||
return self.query_filter(
|
||||
vector=vector,
|
||||
top_k=top_k,
|
||||
namespace=namespace,
|
||||
include_values=include_values,
|
||||
include_metadata=include_metadata,
|
||||
filter=filter,
|
||||
)
|
||||
|
||||
def query_filter(
|
||||
self,
|
||||
vector: List[float],
|
||||
top_k: int,
|
||||
namespace: str = "",
|
||||
include_values: bool = False,
|
||||
include_metadata: bool = False,
|
||||
filter: Optional[dict] = None,
|
||||
):
|
||||
assert len(vector) == self.index_config.dimension
|
||||
response: dict = {"matches": []}
|
||||
@ -92,6 +112,7 @@ class Index:
|
||||
else:
|
||||
records = self.index_config.namespaces[namespace]
|
||||
namespace_ids = list(records.keys())[:top_k]
|
||||
|
||||
for _id in namespace_ids:
|
||||
match = {"id": _id}
|
||||
if include_values:
|
||||
@ -99,6 +120,7 @@ class Index:
|
||||
if include_metadata:
|
||||
match["metadata"] = records[_id]["metadata"].copy()
|
||||
match["score"] = 0.0
|
||||
|
||||
if filter is None or (
|
||||
filter is not None and self._filter(records[_id]["metadata"], filter, top_level=True)
|
||||
):
|
||||
@ -258,7 +280,7 @@ class Index:
|
||||
# We find the intersect between the IDs and filtered IDs
|
||||
id_list = set(id_list).intersection(filter_ids)
|
||||
records = self.index_config.namespaces[namespace]
|
||||
for _id in records.keys():
|
||||
for _id in list(records.keys()): # list() is needed to be able to del below
|
||||
if _id in id_list:
|
||||
del records[_id]
|
||||
else:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user