mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-02 04:46:31 +00:00
143 lines
4.5 KiB
Python
143 lines
4.5 KiB
Python
from typing import Protocol, Optional, Dict, Any, List
|
|
import logging
|
|
from enum import Enum
|
|
|
|
from haystack.preview.dataclasses import Document
|
|
|
|
|
|
# Ellipses are needed for the type checker, so it's safe to disable this check module-wide
|
|
# pylint: disable=unnecessary-ellipsis
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DuplicatePolicy(Enum):
    """
    Strategy applied by a DocumentStore when a document being written has the same ID as a stored one.
    """

    # Keep the existing document and ignore the new one.
    SKIP = "skip"
    # Remove the old document and write the new one.
    OVERWRITE = "overwrite"
    # Raise an error.
    FAIL = "fail"
|
|
|
|
|
|
class DocumentStore(Protocol):
    """
    Stores Documents to be used by the components of a Pipeline.

    Classes implementing this protocol often store the documents permanently and allow specialized components to
    perform retrieval on them, either by embedding, by keyword, hybrid, and so on, depending on the backend used.

    In order to retrieve documents, consider using a Retriever that supports the DocumentStore implementation that
    you're using.
    """

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes this store to a dictionary.

        :return: a dictionary representation of this store.
        """
        ...

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentStore":
        """
        Deserializes the store from a dictionary.

        :param data: the dictionary to deserialize the store from.
        :return: the deserialized DocumentStore.
        """
        ...

    def count_documents(self) -> int:
        """
        Returns the number of documents stored.

        :return: the number of documents stored.
        """
        ...

    def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """
        Returns the documents that match the filters provided.

        Filters are defined as nested dictionaries that can be of two types:
        - Comparison
        - Logic

        Comparison dictionaries must contain the keys:

        - `field`
        - `operator`
        - `value`

        Logic dictionaries must contain the keys:

        - `operator`
        - `conditions`

        The `conditions` key must be a list of dictionaries, either of type Comparison or Logic.

        The `operator` value in Comparison dictionaries must be one of:

        - `==`
        - `!=`
        - `>`
        - `>=`
        - `<`
        - `<=`
        - `in`
        - `not in`

        The `operator` value in Logic dictionaries must be one of:

        - `NOT`
        - `OR`
        - `AND`


        A simple filter:
        ```python
        filters = {"field": "meta.type", "operator": "==", "value": "article"}
        ```

        A more complex filter:
        ```python
        filters = {
            "operator": "AND",
            "conditions": [
                {"field": "meta.type", "operator": "==", "value": "article"},
                {"field": "meta.date", "operator": ">=", "value": 1420066800},
                {"field": "meta.date", "operator": "<", "value": 1609455600},
                {"field": "meta.rating", "operator": ">=", "value": 3},
                {
                    "operator": "OR",
                    "conditions": [
                        {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
                        {"field": "meta.publisher", "operator": "==", "value": "nytimes"},
                    ],
                },
            ],
        }
        ```

        :param filters: the filters to apply to the document list.
        :return: a list of Documents that match the given filters.
        """
        ...

    def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int:
        """
        Writes (or overwrites) documents into the DocumentStore.

        :param documents: a list of documents.
        :param policy: documents with the same ID count as duplicates. When duplicates are met,
            the DocumentStore can:
             - skip: keep the existing document and ignore the new one.
             - overwrite: remove the old document and write the new one.
             - fail: an error is raised.
        :raises DuplicateError: Exception triggered on a duplicate document if `policy=DuplicatePolicy.FAIL`.
        :return: The number of documents that were written.
            If DuplicatePolicy.OVERWRITE is used, this number is always equal to the number of documents in input.
            If DuplicatePolicy.SKIP is used, this number can be lower than the number of documents in the input list.
        """
        ...

    def delete_documents(self, document_ids: List[str]) -> None:
        """
        Deletes all documents with matching document_ids from the DocumentStore.
        Fails with `MissingDocumentError` if no document with this id is present in the DocumentStore.

        :param document_ids: the IDs of the documents to delete.
        """
        ...
|