mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-31 20:03:38 +00:00
feat: initial implementation of MemoryDocumentStore
for new Pipelines (#4447)
* add stub implementation * reimplementation * test files * docstore tests * tests for document * better testing * remove mmh3 * readme * only store, no retrieval yet * linting * review feedback * initial filters implementation * working on filters * linters * filtering works and is isolated by document store * simplify filters * comments * improve filters matching code * review feedback * pylint * move logic into _create_id * mypy
This commit is contained in:
parent
db69141642
commit
f2106ab37b
1
haystack/preview/README.md
Normal file
1
haystack/preview/README.md
Normal file
@ -0,0 +1 @@
|
||||
# Haystack - Preview features
|
@ -1,2 +1,3 @@
|
||||
from canals import node
|
||||
from haystack.preview.dataclasses import Document
|
||||
from haystack.preview.pipeline import Pipeline, PipelineError, NoSuchStoreError, load_pipelines, save_pipelines
|
||||
|
1
haystack/preview/dataclasses/__init__.py
Normal file
1
haystack/preview/dataclasses/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from haystack.preview.dataclasses.document import Document
|
110
haystack/preview/dataclasses/document.py
Normal file
110
haystack/preview/dataclasses/document.py
Normal file
@ -0,0 +1,110 @@
|
||||
from typing import List, Any, Dict, Literal, Optional, TYPE_CHECKING
|
||||
|
||||
import json
|
||||
import hashlib
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from dataclasses import asdict, dataclass, field
|
||||
|
||||
from haystack.preview.utils.import_utils import optional_import
|
||||
|
||||
# We need to do this dance because ndarray is an optional dependency used as a type by dataclass.
# Type checkers see the real numpy class; at runtime the name is resolved through optional_import().
if TYPE_CHECKING:
    from numpy import ndarray
else:
    ndarray = optional_import("numpy", "ndarray", "You won't be able to use embeddings.", __name__)

# pandas is resolved through optional_import() as well.
DataFrame = optional_import("pandas", "DataFrame", "You won't be able to use table related features.", __name__)


logger = logging.getLogger(__name__)

# The content types a Document can declare.
ContentType = Literal["text", "table", "image", "audio"]

# Maps each content type to the Python type that Document.__post_init__ checks the content against.
PYTHON_TYPES_FOR_CONTENT: Dict[ContentType, type] = {"text": str, "table": DataFrame, "image": Path, "audio": Path}
|
||||
|
||||
|
||||
def _create_id(
|
||||
classname: str, content: Any, metadata: Optional[Dict[str, Any]] = None, id_hash_keys: Optional[List[str]] = None
|
||||
):
|
||||
"""
|
||||
Creates a hash of the content given that acts as the document's ID.
|
||||
"""
|
||||
content_to_hash = f"{classname}:{content}"
|
||||
if id_hash_keys:
|
||||
if not metadata:
|
||||
raise ValueError("If 'id_hash_keys' is provided, you must provide 'metadata' too.")
|
||||
content_to_hash = ":".join([content_to_hash, *[str(metadata.get(key, "")) for key in id_hash_keys]])
|
||||
return hashlib.sha256(str(content_to_hash).encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Document:
    """
    Base data class containing some data to be queried.
    Can contain text snippets, tables, file paths to files like images or audios.
    Documents can be serialized to/from dictionary and JSON, and are immutable.

    Immutability is due to the fact that the document's ID depends on its content, so upon changing the content, also
    the ID should change. To avoid keeping IDs in sync with the content by using properties, and asking docstores to
    be aware of this corner case, we decide to make Documents immutable and remove the issue. If you need to modify a
    Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using
    `Document.from_dict()`.

    Note that `id_hash_keys` are referring to keys in the metadata. `content` is always included in the id hash.
    In case of file-based documents (images, audios), the content that is hashed is the file paths,
    so if the file is moved, the hash is different, but if the file is modified without renaming it, the hash will
    not differ.
    """

    # Always recomputed by __post_init__: a value passed to the constructor is overwritten.
    id: str = field(default_factory=str)
    # The data itself. Its Python type must agree with `content_type` (see PYTHON_TYPES_FOR_CONTENT).
    content: Any = field(default_factory=lambda: None)
    content_type: ContentType = "text"
    metadata: Dict[str, Any] = field(default_factory=dict, hash=False)
    # Metadata keys whose values are hashed into the ID together with the content.
    id_hash_keys: List[str] = field(default_factory=lambda: [], hash=False)
    score: Optional[float] = field(default=None, compare=True)
    # Left out of repr().
    embedding: Optional[ndarray] = field(default=None, repr=False)

    def __str__(self):
        return f"{self.__class__.__name__}('{self.content}')"

    def __post_init__(self):
        """
        Generate the ID based on the init parameters and make sure that content_type
        matches the actual type of content.

        :raises ValueError: if the content is not an instance of the type expected for `content_type`, or if one of
            the `id_hash_keys` is not a key of the metadata.
        """
        # Validate content_type
        if not isinstance(self.content, PYTHON_TYPES_FOR_CONTENT[self.content_type]):
            raise ValueError(
                f"The type of content ({type(self.content)}) does not match the "
                f"content type: '{self.content_type}' expects '{PYTHON_TYPES_FOR_CONTENT[self.content_type]}'."
            )
        # Check if id_hash_keys are all present in the meta
        for key in self.id_hash_keys:
            if key not in self.metadata:
                raise ValueError(
                    f"'{key}' must be present in the metadata of the Document if you want to use it to generate the ID."
                )
        # Generate the ID
        hashed_content = _create_id(
            classname=self.__class__.__name__,
            content=str(self.content),
            metadata=self.metadata,
            id_hash_keys=self.id_hash_keys,
        )

        # Note: we need to set the id this way because the dataclass is frozen. See the docstring.
        object.__setattr__(self, "id", hashed_content)

    def to_dict(self):
        """
        Returns the fields of the document as a dictionary, as produced by `dataclasses.asdict`.
        """
        return asdict(self)

    def to_json(self, **json_kwargs):
        """
        Serializes the dictionary returned by `to_dict()` with `json.dumps`.

        :param json_kwargs: keyword arguments forwarded to `json.dumps` (for example `indent` or `default`).
        :return: the JSON string.
        """
        # The options must be forwarded as keywords: json.dumps accepts only the object positionally.
        return json.dumps(self.to_dict(), **json_kwargs)

    @classmethod
    def from_dict(cls, dictionary):
        """
        Creates a Document passing the items of the dictionary as keyword arguments to the constructor.
        """
        return cls(**dictionary)

    @classmethod
    def from_json(cls, data, **json_kwargs):
        """
        Creates a Document from a JSON string.

        :param data: the JSON text to parse.
        :param json_kwargs: keyword arguments forwarded to `json.loads`.
        """
        dictionary = json.loads(data, **json_kwargs)
        return cls.from_dict(dictionary=dictionary)
|
2
haystack/preview/document_stores/__init__.py
Normal file
2
haystack/preview/document_stores/__init__.py
Normal file
@ -0,0 +1,2 @@
|
||||
from haystack.preview.document_stores.memory.document_store import MemoryDocumentStore
|
||||
from haystack.preview.document_stores.errors import StoreError, DuplicateDocumentError, MissingDocumentError
|
10
haystack/preview/document_stores/errors.py
Normal file
10
haystack/preview/document_stores/errors.py
Normal file
@ -0,0 +1,10 @@
|
||||
class StoreError(Exception):
    """
    Base class of the document store errors defined in this module.
    """
|
||||
|
||||
|
||||
class DuplicateDocumentError(StoreError):
    """
    Signals that a document with the same ID is already present in the store.
    """
|
||||
|
||||
|
||||
class MissingDocumentError(StoreError):
    """
    Signals that no document with the requested ID is present in the store.
    """
|
1
haystack/preview/document_stores/memory/__init__.py
Normal file
1
haystack/preview/document_stores/memory/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from haystack.preview.document_stores.memory.document_store import MemoryDocumentStore
|
255
haystack/preview/document_stores/memory/_filters.py
Normal file
255
haystack/preview/document_stores/memory/_filters.py
Normal file
@ -0,0 +1,255 @@
|
||||
from typing import List, Any
|
||||
|
||||
from haystack.preview.dataclasses import Document
|
||||
|
||||
|
||||
def not_operation(conditions: List[Any], document: Document, _current_key: str):
    """
    Negates the AND of all the nested conditions.

    :param conditions: the list of nested conditions.
    :param document: the document to test.
    :param _current_key: internal, don't use.
    :return: True if the document matches the negated filters, False otherwise
    """
    all_conditions_hold = and_operation(conditions=conditions, document=document, _current_key=_current_key)
    if all_conditions_hold:
        return False
    return True
|
||||
|
||||
|
||||
def and_operation(conditions: List[Any], document: Document, _current_key: str):
    """
    Joins all the nested conditions with an AND. Evaluation stops at the first condition that does not match.

    :param conditions: the list of nested conditions.
    :param document: the document to test.
    :param _current_key: internal, don't use.
    :return: True if the document matches all the filters, False otherwise
    """
    return all(
        _match(conditions=condition, document=document, _current_key=_current_key) for condition in conditions
    )
|
||||
|
||||
|
||||
def or_operation(conditions: List[Any], document: Document, _current_key: str):
    """
    Joins all the nested conditions with an OR. Evaluation stops at the first condition that matches.

    :param conditions: the list of nested conditions.
    :param document: the document to test.
    :param _current_key: internal, don't use.
    :return: True if the document matches any of the filters, False otherwise
    """
    return any(
        _match(conditions=condition, document=document, _current_key=_current_key) for condition in conditions
    )
|
||||
|
||||
|
||||
def eq_operation(fields, field_name, value):
    """
    Tests whether the document's metadata value equals a fixed value.

    :param fields: all the document's metadata
    :param field_name: the field to test
    :param value: the fixed value to compare against
    :return: True if the values are equal, False otherwise (also when the field is missing)
    """
    if field_name in fields:
        return fields[field_name] == value
    return False
|
||||
|
||||
|
||||
def in_operation(fields, field_name, value):
    """
    Tests whether the document's metadata value is one of the given values.

    :param fields: all the document's metadata
    :param field_name: the field to test
    :param value: the collection of accepted values
    :return: True if the document's value is included in the given list, False otherwise (also when the field
        is missing)
    """
    return fields[field_name] in value if field_name in fields else False
|
||||
|
||||
|
||||
def ne_operation(fields, field_name, value):
    """
    Tests whether the document's metadata value differs from a fixed value.

    :param fields: all the document's metadata
    :param field_name: the field to test
    :param value: the fixed value to compare against
    :return: True if the values are different or the field is missing, False otherwise
    """
    return field_name not in fields or fields[field_name] != value
|
||||
|
||||
|
||||
def nin_operation(fields, field_name, value):
    """
    Tests whether the document's metadata value is none of the given values.

    :param fields: all the document's metadata
    :param field_name: the field to test
    :param value: the collection of rejected values
    :return: True if the document's value is not included in the given list or the field is missing,
        False otherwise
    """
    if field_name in fields:
        stored = fields[field_name]
        return stored not in value
    return True
|
||||
|
||||
|
||||
def gt_operation(fields, field_name, value):
    """
    Tests whether the document's metadata value is (strictly) larger than the given value.

    :param fields: all the document's metadata
    :param field_name: the field to test
    :param value: the fixed value to compare against
    :return: True if the document's value is strictly larger than the fixed value, False otherwise (also when
        the field is missing)
    """
    is_present = field_name in fields
    return is_present and fields[field_name] > value
|
||||
|
||||
|
||||
def gte_operation(fields, field_name, value):
    """
    Tests whether the document's metadata value is larger than or equal to the given value.

    :param fields: all the document's metadata
    :param field_name: the field to test
    :param value: the fixed value to compare against
    :return: True if the document's value is larger than or equal to the fixed value, False otherwise (also
        when the field is missing)
    """
    return fields[field_name] >= value if field_name in fields else False
|
||||
|
||||
|
||||
def lt_operation(fields, field_name, value):
    """
    Tests whether the document's metadata value is (strictly) smaller than the given value.

    :param fields: all the document's metadata
    :param field_name: the field to test
    :param value: the fixed value to compare against
    :return: True if the document's value is strictly smaller than the fixed value, False otherwise (also when
        the field is missing)
    """
    return field_name in fields and fields[field_name] < value
|
||||
|
||||
|
||||
def lte_operation(fields, field_name, value):
    """
    Tests whether the document's metadata value is smaller than or equal to the given value.

    :param fields: all the document's metadata
    :param field_name: the field to test
    :param value: the fixed value to compare against
    :return: True if the document's value is smaller than or equal to the fixed value, False otherwise (also
        when the field is missing)
    """
    if field_name in fields:
        return fields[field_name] <= value
    return False
|
||||
|
||||
|
||||
# Filter keys that combine nested conditions.
LOGICAL_STATEMENTS = {
    "$not": not_operation,
    "$and": and_operation,
    "$or": or_operation,
}

# Filter keys that compare one metadata field against a value.
OPERATORS = {
    "$eq": eq_operation,
    "$in": in_operation,
    "$ne": ne_operation,
    "$nin": nin_operation,
    "$gt": gt_operation,
    "$gte": gte_operation,
    "$lt": lt_operation,
    "$lte": lte_operation,
}

# Every key that has a special meaning in a filter and so is not read as a metadata field name.
RESERVED_KEYS = list(LOGICAL_STATEMENTS) + list(OPERATORS)
|
||||
|
||||
|
||||
def match(conditions: Any, document: Document):
    """
    This method applies the filters to any given document and returns True when the documents
    metadata matches the filters, False otherwise.

    :param conditions: the filters dictionary (or a list of filters, which are joined with `$and`).
    :param document: the document to test.
    :return: True if the document matches the filters, False otherwise
    :raises ValueError: if the filters are neither a dictionary nor a list, or if they are malformed.
    """
    if not isinstance(conditions, (list, dict)):
        raise ValueError("Filters must be dictionaries or lists. See the examples in the documentation.")

    # The default operation for top-level sibling conditions is $and. The whole structure is handed over, without
    # unwrapping its first key, so that a leading logical statement ($and, $or, $not) is really applied.
    return _match(conditions=conditions, document=document, _current_key="$and")


def _match(conditions: Any, document: Document, _current_key: str):
    """
    Recursive implementation of match().

    :param conditions: the (sub)filter to evaluate: a list, a dictionary or a plain value.
    :param document: the document to test.
    :param _current_key: the key under which `conditions` was found: a metadata field name, or `"$and"` as long as
        no field name has been met yet.
    :return: True if the document matches the conditions, False otherwise
    :raises ValueError: if a field is compared with a dictionary, or if a comparison operator is used before any
        field name.
    """
    if isinstance(conditions, list):
        is_field = _current_key not in RESERVED_KEYS
        if is_field and not any(isinstance(item, (dict, list)) for item in conditions):
            # The default operator for a {key: [value1, value2]} filter is $in
            return in_operation(fields=document.metadata, field_name=_current_key, value=conditions)

        # The default operation for a list of sibling conditions is $and
        return _match(conditions={"$and": conditions}, document=document, _current_key=_current_key)

    if isinstance(conditions, dict):
        # Check for malformed filters, like {"name": {"year": "2020"}}
        if _current_key not in RESERVED_KEYS and any(key not in RESERVED_KEYS for key in conditions.keys()):
            raise ValueError(
                f"This filter ({_current_key}, {conditions}) seems to be malformed. Comparisons with dictionaries are "
                "not currently supported. Check the documentation to learn more about filters syntax."
            )

        # No condition at all: nothing can fail, exactly like an empty list of conditions.
        if not conditions:
            return True

        # The default operation for a list of sibling conditions is $and
        if len(conditions.keys()) > 1:
            return and_operation(
                conditions=_conditions_as_list(conditions), document=document, _current_key=_current_key
            )

        field_key, field_value = list(conditions.items())[0]

        if field_key in LOGICAL_STATEMENTS.keys():
            # It's a nested logical statement ($and, $or, $not)
            return LOGICAL_STATEMENTS[field_key](
                conditions=_conditions_as_list(field_value), document=document, _current_key=_current_key
            )
        if field_key in OPERATORS.keys():
            # It's a comparison operator ($eq, $in, $gte, ...)
            # A reserved current key means that no field name has been met on the way down to this operator.
            if not _current_key or _current_key in RESERVED_KEYS:
                raise ValueError(
                    "Filters can't start with an operator like $eq and $in. You have to specify the field name first. "
                    "See the examples in the documentation."
                )
            return OPERATORS[field_key](fields=document.metadata, field_name=_current_key, value=field_value)

        # It's a metadata field name: its value is evaluated against that field. The recursion picks the default
        # operator: $in for a list of values, $eq for a single value.
        return _match(conditions=field_value, document=document, _current_key=field_key)

    # The default operator for a {key: value} filter is $eq
    return eq_operation(fields=document.metadata, field_name=_current_key, value=conditions)
|
||||
|
||||
|
||||
def _conditions_as_list(conditions: Any) -> List[Any]:
|
||||
"""
|
||||
Make sure all nested conditions are not dictionaries or single values, but always lists.
|
||||
|
||||
:param conditions: the conditions to transform into a list
|
||||
:returns: a list of filters
|
||||
"""
|
||||
if isinstance(conditions, list):
|
||||
return conditions
|
||||
if isinstance(conditions, dict):
|
||||
return [{key: value} for key, value in conditions.items()]
|
||||
return [conditions]
|
144
haystack/preview/document_stores/memory/document_store.py
Normal file
144
haystack/preview/document_stores/memory/document_store.py
Normal file
@ -0,0 +1,144 @@
|
||||
from typing import Literal, Any, Dict, List, Optional, Iterable
|
||||
|
||||
import logging
|
||||
|
||||
from haystack.preview.dataclasses import Document
|
||||
from haystack.preview.document_stores.memory._filters import match
|
||||
from haystack.preview.document_stores.errors import DuplicateDocumentError, MissingDocumentError
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)

# The values accepted by the `duplicates` parameter of MemoryDocumentStore.write_documents().
DuplicatePolicy = Literal["skip", "overwrite", "fail"]
|
||||
|
||||
|
||||
class MemoryDocumentStore:
    """
    Stores data in-memory. It's ephemeral and cannot be saved to disk.
    """

    def __init__(self):
        """
        Initializes the store.
        """
        # Maps each document ID to the stored Document.
        self.storage = {}

    def count_documents(self) -> int:
        """
        Returns the number of how many documents are present in the document store.
        """
        return len(self.storage)

    def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """
        Returns the documents that match the filters provided.

        Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
        `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
        `"$lte"`) or a metadata field name.

        Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
        field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
        (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
        operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
        as default operation.

        Example:

        ```python
        filters = {
            "$and": {
                "type": {"$eq": "article"},
                "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                "rating": {"$gte": 3},
                "$or": {
                    "genre": {"$in": ["economy", "politics"]},
                    "publisher": {"$eq": "nytimes"}
                }
            }
        }
        # or simpler using default operators
        filters = {
            "type": "article",
            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
            "rating": {"$gte": 3},
            "$or": {
                "genre": ["economy", "politics"],
                "publisher": "nytimes"
            }
        }
        ```

        To use the same logical operator multiple times on the same level, logical operators can take a list of
        dictionaries as value.

        Example:

        ```python
        filters = {
            "$or": [
                {
                    "$and": {
                        "Type": "News Paper",
                        "Date": {
                            "$lt": "2019-01-01"
                        }
                    }
                },
                {
                    "$and": {
                        "Type": "Blog Post",
                        "Date": {
                            "$gte": "2019-01-01"
                        }
                    }
                }
            ]
        }
        ```

        :param filters: the filters to apply to the document list. If empty or None, all documents are returned.
        :return: a list of Documents that match the given filters.
        """
        if filters:
            return [doc for doc in self.storage.values() if match(conditions=filters, document=doc)]
        return list(self.storage.values())

    def write_documents(self, documents: List[Document], duplicates: DuplicatePolicy = "fail") -> None:
        """
        Writes (or overwrites) documents into the store.

        :param documents: a list of documents.
        :param duplicates: documents with the same ID count as duplicates. When duplicates are met,
            the store can:
             - skip: keep the existing document and ignore the new one.
             - overwrite: remove the old document and write the new one.
             - fail: an error is raised
        :raises DuplicateDocumentError: Exception trigger on duplicate document if `duplicates="fail"`
        :raises ValueError: if `documents` is not an iterable of Documents (a string is rejected too).
        :return: None
        """
        if (
            not isinstance(documents, Iterable)
            or isinstance(documents, str)
            or any(not isinstance(doc, Document) for doc in documents)
        ):
            raise ValueError("Please provide a list of Documents.")

        for document in documents:
            if document.id in self.storage:
                if duplicates == "fail":
                    raise DuplicateDocumentError(f"ID '{document.id}' already exists.")
                if duplicates == "skip":
                    logger.warning("ID '%s' already exists", document.id)
                    # Keep the stored document: without this the new one would replace it, as with "overwrite".
                    continue
            self.storage[document.id] = document

    def delete_documents(self, document_ids: List[str]) -> None:
        """
        Deletes all documents with a matching document_ids from the document store.
        Fails with `MissingDocumentError` if no document with this id is present in the store. The IDs are processed
        in order, so the documents listed before the missing one are already deleted when the error is raised.

        :param document_ids: the IDs of the documents to delete
        :raises MissingDocumentError: if one of the IDs is not present in the store.
        """
        for doc_id in document_ids:
            if doc_id not in self.storage:
                raise MissingDocumentError(f"ID '{doc_id}' not found, cannot delete it.")
            del self.storage[doc_id]
|
@ -21,7 +21,7 @@ class Pipeline(CanalsPipeline):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.stores: Dict[str, object] = {}
|
||||
self.stores = {}
|
||||
|
||||
def add_store(self, name: str, store: object) -> None:
|
||||
"""
|
||||
|
24
haystack/preview/utils/import_utils.py
Normal file
24
haystack/preview/utils/import_utils.py
Normal file
@ -0,0 +1,24 @@
|
||||
from typing import Optional, Any
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
|
||||
def optional_import(import_path: str, import_target: Optional[str], error_msg: str, importer_module: str) -> Any:
    """
    Imports an optional dependency. Emits a DEBUG log if the dependency is missing.

    :param import_path: the dotted path of the module to import.
    :param import_target: the name of the attribute to take from the module. If empty or None, the module itself
        is returned.
    :param error_msg: a message explaining what is lost without the dependency. It is included in the log.
    :param importer_module: the name of the logger that emits the DEBUG message.
    :return: the imported module or attribute, or None if an ImportError was raised.
    """
    try:
        module = importlib.import_module(import_path)
        if import_target:
            return getattr(module, import_target)
        return module
    except ImportError as exc:
        logging.getLogger(importer_module).debug(
            "%s%s%s can't be imported: %s Error raised: %s",
            import_path,
            "." if import_target else "",
            # Without a target only the module path is logged: formatting None would append the text "None" to it.
            import_target or "",
            error_msg,
            exc,
        )
        return None
|
152
test/preview/dataclasses/test_dataclasses.py
Normal file
152
test/preview/dataclasses/test_dataclasses.py
Normal file
@ -0,0 +1,152 @@
|
||||
from pathlib import Path
|
||||
import hashlib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from haystack.preview import Document
|
||||
from haystack.preview.dataclasses.document import _create_id
|
||||
|
||||
|
||||
def test_default_text_document_to_dict():
    """
    A text document built with defaults serializes to its content, the generated ID and the default fields.
    """
    expected = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert Document(content="test content").to_dict() == expected
|
||||
|
||||
|
||||
def test_default_text_document_from_dict():
    """
    A text document rebuilt from its dictionary form equals the one created directly.
    """
    serialized = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert Document.from_dict(serialized) == Document(content="test content")
|
||||
|
||||
|
||||
def test_default_table_document_to_dict():
    """
    A table document serializes to its dataframe, the generated ID and the default fields.
    """
    table = pd.DataFrame([1, 2])
    serialized = Document(content=table, content_type="table").to_dict()

    # The dataframe is checked on its own, with the pandas comparison, and left out of the dictionary comparison.
    stored_table = serialized.pop("content")
    assert stored_table.equals(table)

    expected = {
        "id": _create_id(classname=Document.__name__, content=table),
        "content_type": "table",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert serialized == expected
|
||||
|
||||
|
||||
def test_default_table_document_from_dict():
    """
    A table document rebuilt from its dictionary form equals the one created directly from the same dataframe.
    """
    table = pd.DataFrame([1, 2])
    serialized = {
        "id": _create_id(classname=Document.__name__, content=table),
        "content": table,
        "content_type": "table",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert Document.from_dict(serialized) == Document(content=table, content_type="table")
|
||||
|
||||
|
||||
def test_default_image_document_to_dict():
    """
    An image document serializes to its path, the generated ID and the default fields.
    """
    image_path = Path(__file__).parent / "test_files" / "apple.jpg"
    expected = {
        "id": _create_id(classname=Document.__name__, content=image_path),
        "content": image_path,
        "content_type": "image",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert Document(content=image_path, content_type="image").to_dict() == expected
|
||||
|
||||
|
||||
def test_default_image_document_from_dict():
    """
    An image document rebuilt from its dictionary form equals the one created directly from the same path.
    """
    image_path = Path(__file__).parent / "test_files" / "apple.jpg"
    serialized = {
        "id": _create_id(classname=Document.__name__, content=image_path),
        "content": image_path,
        "content_type": "image",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert Document.from_dict(serialized) == Document(content=image_path, content_type="image")
|
||||
|
||||
|
||||
def test_document_with_most_attributes_to_dict():
    """
    This tests also id_hash_keys
    """
    meta = {"some": "values", "test": 10}
    document = Document(
        content="test content",
        content_type="text",
        metadata={"some": "values", "test": 10},
        id_hash_keys=["test"],
        score=0.99,
        embedding=np.zeros([10, 10]),
    )
    serialized = document.to_dict()

    # The embedding is compared element by element and left out of the dictionary comparison.
    stored_embedding = serialized.pop("embedding")
    assert (stored_embedding == np.zeros([10, 10])).all()

    expected_id = _create_id(
        classname=Document.__name__,
        content="test content",
        id_hash_keys=["test"],
        metadata=meta,
    )
    assert serialized == {
        "id": expected_id,
        "content": "test content",
        "content_type": "text",
        "metadata": meta,
        "id_hash_keys": ["test"],
        "score": 0.99,
    }
|
||||
|
||||
|
||||
def test_document_with_most_attributes_from_dict():
    """
    A document with metadata, id_hash_keys, score and embedding rebuilt from its dictionary form equals the one
    created directly with the same values.
    """
    embedding = np.zeros([10, 10])
    serialized = {
        "id": _create_id(
            classname=Document.__name__,
            content="test content",
            id_hash_keys=["test"],
            metadata={"some": "values", "test": 10},
        ),
        "content": "test content",
        "content_type": "text",
        "metadata": {"some": "values", "test": 10},
        "id_hash_keys": ["test"],
        "score": 0.99,
        "embedding": embedding,
    }
    expected = Document(
        content="test content",
        content_type="text",
        metadata={"some": "values", "test": 10},
        id_hash_keys=["test"],
        score=0.99,
        embedding=embedding,
    )
    assert Document.from_dict(serialized) == expected
|
286
test/preview/document_stores/_base.py
Normal file
286
test/preview/document_stores/_base.py
Normal file
@ -0,0 +1,286 @@
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from haystack.preview.dataclasses import Document
|
||||
from haystack.preview.document_stores import MissingDocumentError, DuplicateDocumentError
|
||||
|
||||
|
||||
class DocumentStoreBaseTests:
|
||||
@pytest.fixture
|
||||
def docstore(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
@pytest.fixture
|
||||
def filterable_docs(self):
|
||||
documents = []
|
||||
for i in range(3):
|
||||
documents.append(
|
||||
Document(
|
||||
content=f"A Foo Document {i}",
|
||||
metadata={"name": f"name_{i}", "year": "2020", "month": "01", "number": 2},
|
||||
embedding=np.random.rand(768).astype(np.float32),
|
||||
)
|
||||
)
|
||||
documents.append(
|
||||
Document(
|
||||
content=f"A Bar Document {i}",
|
||||
metadata={"name": f"name_{i}", "year": "2021", "month": "02", "number": -2},
|
||||
embedding=np.random.rand(768).astype(np.float32),
|
||||
)
|
||||
)
|
||||
documents.append(
|
||||
Document(
|
||||
content=f"A Foobar Document {i}",
|
||||
metadata={"name": f"name_{i}", "year": "2000", "month": "03", "number": -10},
|
||||
embedding=np.random.rand(768).astype(np.float32),
|
||||
)
|
||||
)
|
||||
documents.append(
|
||||
Document(
|
||||
content=f"Document {i} without embedding",
|
||||
metadata={"name": f"name_{i}", "no_embedding": True, "month": "03"},
|
||||
)
|
||||
)
|
||||
|
||||
return documents
|
||||
|
||||
def test_count_empty(self, docstore):
|
||||
assert docstore.count_documents() == 0
|
||||
|
||||
def test_count_not_empty(self, docstore):
|
||||
self.direct_write(
|
||||
docstore, [Document(content="test doc 1"), Document(content="test doc 2"), Document(content="test doc 3")]
|
||||
)
|
||||
assert docstore.count_documents() == 3
|
||||
|
||||
def test_no_filter_empty(self, docstore):
|
||||
assert docstore.filter_documents() == []
|
||||
assert docstore.filter_documents(filters={}) == []
|
||||
|
||||
def test_no_filter_not_empty(self, docstore):
|
||||
docs = [Document(content="test doc")]
|
||||
self.direct_write(docstore, docs)
|
||||
assert docstore.filter_documents() == docs
|
||||
assert docstore.filter_documents(filters={}) == docs
|
||||
|
||||
def test_filter_simple_value(self, docstore, filterable_docs):
|
||||
self.direct_write(docstore, filterable_docs)
|
||||
result = docstore.filter_documents(filters={"year": "2020"})
|
||||
assert len(result) == 3
|
||||
|
||||
def test_filter_simple_list(self, docstore, filterable_docs):
|
||||
self.direct_write(docstore, filterable_docs)
|
||||
result = docstore.filter_documents(filters={"year": ["2020"]})
|
||||
assert all(doc.metadata["year"] == "2020" for doc in result)
|
||||
result = docstore.filter_documents(filters={"year": ["2020", "2021"]})
|
||||
assert all(doc.metadata["year"] in ["2020", "2021"] for doc in result)
|
||||
|
||||
def test_incorrect_filter_name(self, docstore, filterable_docs):
|
||||
self.direct_write(docstore, filterable_docs)
|
||||
result = docstore.filter_documents(filters={"non_existing_meta_field": ["whatever"]})
|
||||
assert len(result) == 0
|
||||
|
||||
def test_incorrect_filter_type(self, docstore, filterable_docs):
    """Passing a plain string as `filters` is rejected with a ValueError."""
    self.direct_write(docstore, filterable_docs)

    not_a_filter = "something odd"
    with pytest.raises(ValueError, match="dictionaries or lists"):
        docstore.filter_documents(filters=not_a_filter)
|
||||
|
||||
def test_incorrect_filter_value(self, docstore, filterable_docs):
    """A value that no document carries for an existing field returns nothing."""
    self.direct_write(docstore, filterable_docs)

    unmatched_value_filter = {"year": ["nope"]}
    matches = docstore.filter_documents(filters=unmatched_value_filter)

    assert len(matches) == 0
|
||||
|
||||
def test_incorrect_filter_nesting(self, docstore, filterable_docs):
    """Nesting a field name inside another field name is reported as a malformed filter."""
    self.direct_write(docstore, filterable_docs)

    malformed_filters = (
        {"number": {"year": "2020"}},
        {"number": {"year": {"month": "01"}}},
    )
    for bad_filter in malformed_filters:
        with pytest.raises(ValueError, match="malformed"):
            docstore.filter_documents(filters=bad_filter)
|
||||
|
||||
def test_eq_filter(self, docstore, filterable_docs):
    """
    `$eq` and its bare-value shorthand both return exactly the documents with a matching year.

    The result is compared by document ID against the expectation computed from the
    fixture. The previous `all(...)` check was vacuously true for an empty result, so
    it could not detect a store that returns no matches.
    """
    self.direct_write(docstore, filterable_docs)
    expected_ids = {doc.id for doc in filterable_docs if doc.metadata.get("year") == "2020"}

    # Explicit operator form first, then the shorthand that must behave the same way.
    for filters in ({"year": {"$eq": "2020"}}, {"year": "2020"}):
        result = docstore.filter_documents(filters=filters)
        assert {doc.id for doc in result} == expected_ids
|
||||
|
||||
def test_in_filter(self, docstore, filterable_docs):
    """
    `$in` and its bare-list shorthand return exactly the documents whose year is listed.

    The result is compared by document ID against the expectation computed from the
    fixture. The previous `all(...)` check was vacuously true for an empty result, so
    it could not detect a store that returns no matches.
    """
    self.direct_write(docstore, filterable_docs)
    accepted_years = ["2020", "2021", "n.a."]
    expected_ids = {doc.id for doc in filterable_docs if doc.metadata.get("year") in accepted_years}

    # Explicit operator form first, then the shorthand that must behave the same way.
    for filters in ({"year": {"$in": ["2020", "2021", "n.a."]}}, {"year": ["2020", "2021", "n.a."]}):
        result = docstore.filter_documents(filters=filters)
        assert {doc.id for doc in result} == expected_ids
|
||||
|
||||
def test_ne_filter(self, docstore, filterable_docs):
    """`$ne` never returns a document whose year equals the excluded value."""
    self.direct_write(docstore, filterable_docs)

    matches = docstore.filter_documents(filters={"year": {"$ne": "2020"}})

    # `.get` is used because a returned document may have no "year" field at all.
    for doc in matches:
        assert doc.metadata.get("year", None) != "2020"
|
||||
|
||||
def test_nin_filter(self, docstore, filterable_docs):
    """`$nin` never returns a document whose year is 2020 or 2021."""
    self.direct_write(docstore, filterable_docs)

    matches = docstore.filter_documents(filters={"year": {"$nin": ["2020", "2021", "n.a."]}})

    # `.get` is used because a returned document may have no "year" field at all.
    for doc in matches:
        assert doc.metadata.get("year", None) not in ["2020", "2021"]
|
||||
|
||||
def test_gt_filter(self, docstore, filterable_docs):
    """
    `$gt` returns exactly the documents whose `number` is strictly greater than the bound.

    The result is compared by document ID against the expectation computed from the
    fixture, because an `all(...)` check over the result passes vacuously when nothing
    is returned. Documents without a `number` field are excluded from the expectation:
    the previous assertion indexed `metadata["number"]` directly, so returning such a
    document already failed the test.
    """
    self.direct_write(docstore, filterable_docs)

    result = docstore.filter_documents(filters={"number": {"$gt": 0.0}})

    expected_ids = {
        doc.id for doc in filterable_docs if "number" in doc.metadata and doc.metadata["number"] > 0
    }
    assert {doc.id for doc in result} == expected_ids
|
||||
|
||||
def test_gte_filter(self, docstore, filterable_docs):
    """
    `$gte` returns exactly the documents whose `number` is greater than or equal to the bound.

    The result is compared by document ID against the expectation computed from the
    fixture, because an `all(...)` check over the result passes vacuously when nothing
    is returned. Documents without a `number` field are excluded from the expectation:
    the previous assertion indexed `metadata["number"]` directly, so returning such a
    document already failed the test.
    """
    self.direct_write(docstore, filterable_docs)

    result = docstore.filter_documents(filters={"number": {"$gte": -2.0}})

    expected_ids = {
        doc.id for doc in filterable_docs if "number" in doc.metadata and doc.metadata["number"] >= -2.0
    }
    assert {doc.id for doc in result} == expected_ids
|
||||
|
||||
def test_lt_filter(self, docstore, filterable_docs):
    """
    `$lt` returns exactly the documents whose `number` is strictly less than the bound.

    The result is compared by document ID against the expectation computed from the
    fixture, because an `all(...)` check over the result passes vacuously when nothing
    is returned. Documents without a `number` field are excluded from the expectation:
    the previous assertion indexed `metadata["number"]` directly, so returning such a
    document already failed the test.
    """
    self.direct_write(docstore, filterable_docs)

    result = docstore.filter_documents(filters={"number": {"$lt": 0.0}})

    expected_ids = {
        doc.id for doc in filterable_docs if "number" in doc.metadata and doc.metadata["number"] < 0
    }
    assert {doc.id for doc in result} == expected_ids
|
||||
|
||||
def test_lte_filter(self, docstore, filterable_docs):
    """
    `$lte` returns exactly the documents whose `number` is less than or equal to the bound.

    The result is compared by document ID against the expectation computed from the
    fixture, because an `all(...)` check over the result passes vacuously when nothing
    is returned. Documents without a `number` field are excluded from the expectation:
    the previous assertion indexed `metadata["number"]` directly, so returning such a
    document already failed the test.
    """
    self.direct_write(docstore, filterable_docs)

    result = docstore.filter_documents(filters={"number": {"$lte": 2.0}})

    expected_ids = {
        doc.id for doc in filterable_docs if "number" in doc.metadata and doc.metadata["number"] <= 2.0
    }
    assert {doc.id for doc in result} == expected_ids
|
||||
|
||||
def test_filter_simple_explicit_and(self, docstore, filterable_docs):
    """An explicit `$and` on one field works with both the dict and the list spelling."""
    self.direct_write(docstore, filterable_docs)

    # Same condition twice: operators grouped in a dict, then as a list of single-operator dicts.
    dict_form = {"year": {"$and": {"$lte": "2021", "$gte": "2020"}}}
    matches = docstore.filter_documents(filters=dict_form)
    assert all(2020 <= int(doc.metadata["year"]) <= 2021 for doc in matches)

    list_form = {"year": {"$and": [{"$lte": "2021"}, {"$gte": "2020"}]}}
    matches = docstore.filter_documents(filters=list_form)
    assert all(2020 <= int(doc.metadata["year"]) <= 2021 for doc in matches)
|
||||
|
||||
def test_filter_simple_implicit_and(self, docstore, filterable_docs):
    """Several operators under one field are combined as an implicit AND."""
    self.direct_write(docstore, filterable_docs)

    year_range = {"year": {"$lte": "2021", "$gte": "2020"}}
    matches = docstore.filter_documents(filters=year_range)

    assert all(2020 <= int(doc.metadata["year"]) <= 2021 for doc in matches)
|
||||
|
||||
def test_filter_nested_explicit_and(self, docstore, filterable_docs):
    """A top-level `$and` combining a year range with a name list only returns documents meeting both."""
    self.direct_write(docstore, filterable_docs)

    filters = {"$and": {"year": {"$and": {"$lte": "2021", "$gte": "2020"}}, "name": {"$in": ["name_0", "name_1"]}}}
    matches = docstore.filter_documents(filters=filters)

    def satisfies(doc):
        # Mirrors the filter: year within [2020, 2021] AND name among the two allowed names.
        year_ok = 2020 <= int(doc.metadata["year"]) <= 2021
        return year_ok and doc.metadata["name"] in ["name_0", "name_1"]

    assert all(satisfies(doc) for doc in matches)
|
||||
|
||||
def test_filter_nested_implicit_and(self, docstore, filterable_docs):
    """Sibling field conditions without an explicit operator are combined as an implicit AND."""
    self.direct_write(docstore, filterable_docs)

    filters_simplified = {"year": {"$lte": "2021", "$gte": "2020"}, "name": ["name_0", "name_1"]}
    matches = docstore.filter_documents(filters=filters_simplified)

    def satisfies(doc):
        # Mirrors the filter: year within [2020, 2021] AND name among the two allowed names.
        year_ok = 2020 <= int(doc.metadata["year"]) <= 2021
        return year_ok and doc.metadata["name"] in ["name_0", "name_1"]

    assert all(satisfies(doc) for doc in matches)
|
||||
|
||||
def test_filter_simple_or(self, docstore, filterable_docs):
    """A top-level `$or` returns only documents that meet at least one of its conditions."""
    self.direct_write(docstore, filterable_docs)

    either_condition = {"$or": {"name": {"$in": ["name_0", "name_1"]}, "number": {"$lt": 1.0}}}
    matches = docstore.filter_documents(filters=either_condition)

    for doc in matches:
        # Short-circuit order matters: "number" is only read when the name did not match.
        assert doc.metadata["name"] in ["name_0", "name_1"] or doc.metadata["number"] < 1.0
|
||||
|
||||
def test_filter_nested_or(self, docstore, filterable_docs):
    """An `$or` nested inside a field condition of an outer `$or` is honoured."""
    self.direct_write(docstore, filterable_docs)

    nested_or = {"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}}
    matches = docstore.filter_documents(filters=nested_or)

    for doc in matches:
        # Short-circuit order matters: "number" is only read when the name did not match.
        assert doc.metadata["name"] in ["name_0", "name_1"] or doc.metadata["number"] < 1.0
|
||||
|
||||
def test_filter_nested_and_or(self, docstore, filterable_docs):
    """An implicit AND between a year range and an `$or` group only returns documents meeting both."""
    self.direct_write(docstore, filterable_docs)

    filters_simplified = {
        "year": {"$lte": "2021", "$gte": "2020"},
        "$or": {"name": {"$in": ["name_0", "name_1"]}, "number": {"$lt": 1.0}},
    }
    matches = docstore.filter_documents(filters=filters_simplified)

    def satisfies(doc):
        # Year range first; the OR group is only evaluated when the year is in range.
        in_year_range = 2020 <= int(doc.metadata["year"]) <= 2021
        return in_year_range and (doc.metadata["name"] in ["name_0", "name_1"] or doc.metadata["number"] < 1.0)

    assert all(satisfies(doc) for doc in matches)
|
||||
|
||||
def test_filter_nested_or_and(self, docstore, filterable_docs):
    """An `$or` between a number bound and an `$and` group containing `$not` is honoured."""
    self.direct_write(docstore, filterable_docs)

    filters_simplified = {
        "$or": {
            "number": {"$lt": 1.0},
            "$and": {"name": {"$in": ["name_0", "name_1"]}, "$not": {"month": {"$eq": "01"}}},
        }
    }
    matches = docstore.filter_documents(filters=filters_simplified)

    def satisfies(doc):
        # A missing "number" defaults to 2, which fails the "< 1.0" branch and
        # forces the document to qualify through the AND group instead.
        if doc.metadata.get("number", 2) < 1.0:
            return True
        return doc.metadata["name"] in ["name_0", "name_1"] and doc.metadata["month"] != "01"

    assert all(satisfies(doc) for doc in matches)
|
||||
|
||||
def test_filter_nested_multiple_identical_operators_same_level(self, docstore, filterable_docs):
    """Two `$and` groups listed side by side under one `$or` are both accepted."""
    self.direct_write(docstore, filterable_docs)

    # A list is needed under "$or": a dict could not hold the "$and" key twice.
    recent_and_named = {"$and": {"name": {"$in": ["name_0", "name_1"]}, "year": {"$gte": "2020"}}}
    older_and_named = {"$and": {"name": {"$in": ["name_0", "name_1"]}, "year": {"$lt": "2021"}}}
    filters = {"$or": [recent_and_named, older_and_named]}

    matches = docstore.filter_documents(filters=filters)

    # Both branches require the name condition, so every match must satisfy it.
    for doc in matches:
        assert doc.metadata["name"] in ["name_0", "name_1"]
|
||||
|
||||
def test_write(self, docstore):
    """A document written through `write_documents()` can be read back by its ID."""
    written = Document(content="test doc")
    docstore.write_documents(documents=[written])

    stored = self.direct_access(docstore, doc_id=written.id)
    assert stored == written
|
||||
|
||||
def test_write_duplicate_fail(self, docstore):
    """
    Writing a document whose ID is already stored raises DuplicateDocumentError by default.

    The stored document must be left untouched by the failed write.
    """
    import re  # local import: the file's top-level import block is outside this block

    doc = Document(content="test doc")
    self.direct_write(docstore, [doc])

    # `match` is treated as a regular expression, so the interpolated ID is escaped
    # to keep any regex metacharacters in it from changing what the pattern means.
    with pytest.raises(DuplicateDocumentError, match=f"ID '{re.escape(doc.id)}' already exists."):
        docstore.write_documents(documents=[doc])

    assert self.direct_access(docstore, doc_id=doc.id) == doc
|
||||
|
||||
def test_write_duplicate_skip(self, docstore):
    """With `duplicates="skip"`, re-writing a stored document does not raise and the document stays in place."""
    existing = Document(content="test doc")
    self.direct_write(docstore, [existing])

    docstore.write_documents(documents=[existing], duplicates="skip")

    stored = self.direct_access(docstore, doc_id=existing.id)
    assert stored == existing
|
||||
|
||||
def test_write_duplicate_overwrite(self, docstore):
    """With `duplicates="overwrite"`, a write replaces the stored document that shares its ID."""
    replacement = Document(content="test doc 1")
    initially_stored = Document(content="test doc 2")
    # Force both documents onto one ID while keeping different content;
    # object.__setattr__ is used instead of a plain attribute assignment.
    object.__setattr__(initially_stored, "id", replacement.id)

    self.direct_write(docstore, [initially_stored])
    assert self.direct_access(docstore, doc_id=replacement.id) == initially_stored

    docstore.write_documents(documents=[replacement], duplicates="overwrite")
    assert self.direct_access(docstore, doc_id=replacement.id) == replacement
|
||||
|
||||
def test_write_not_docs(self, docstore):
    """A list whose items are not Documents is rejected with a ValueError."""
    not_documents = ["not a document for sure"]
    with pytest.raises(ValueError, match="Please provide a list of Documents"):
        docstore.write_documents(not_documents)
|
||||
|
||||
def test_write_not_list(self, docstore):
    """A plain string passed in place of the document list is rejected with a ValueError."""
    not_a_list = "not a list actually"
    with pytest.raises(ValueError, match="Please provide a list of Documents"):
        docstore.write_documents(not_a_list)
|
||||
|
||||
def test_delete_empty(self, docstore):
    """Deleting an ID from a store that holds nothing raises MissingDocumentError."""
    unknown_ids = ["test"]
    with pytest.raises(MissingDocumentError):
        docstore.delete_documents(unknown_ids)
|
||||
|
||||
def test_delete_not_empty(self, docstore):
    """After deleting a stored document, it can no longer be fetched by its ID."""
    target = Document(content="test doc")
    self.direct_write(docstore, [target])

    docstore.delete_documents([target.id])

    # Passes in two cases: `direct_access` raises for the missing ID, or it returns
    # a falsy value and the inner `assert` raises AssertionError instead.
    with pytest.raises(Exception):
        assert self.direct_access(docstore, doc_id=target.id)
|
||||
|
||||
def test_delete_not_empty_nonexisting(self, docstore):
    """Deleting an unknown ID raises MissingDocumentError and leaves stored documents in place."""
    survivor = Document(content="test doc")
    self.direct_write(docstore, [survivor])

    with pytest.raises(MissingDocumentError):
        docstore.delete_documents(["non_existing"])

    stored = self.direct_access(docstore, doc_id=survivor.id)
    assert stored == survivor
|
38
test/preview/document_stores/test_memory.py
Normal file
38
test/preview/document_stores/test_memory.py
Normal file
@ -0,0 +1,38 @@
|
||||
import pytest
|
||||
from haystack.preview.document_stores import MemoryDocumentStore
|
||||
|
||||
from test.preview.document_stores._base import DocumentStoreBaseTests
|
||||
|
||||
|
||||
class TestMemoryDocumentStore(DocumentStoreBaseTests):
    """
    Test MemoryDocumentStore's specific features.

    Inherits from DocumentStoreBaseTests and supplies the store fixture plus the
    `direct_*` helpers, which operate on the store's `storage` attribute directly.
    """

    @pytest.fixture
    def docstore(self) -> MemoryDocumentStore:
        """Return a newly constructed MemoryDocumentStore."""
        return MemoryDocumentStore()

    def direct_access(self, docstore, doc_id):
        """
        Fetch a document from `docstore.storage` by ID, bypassing `filter_documents()`.
        """
        stored_document = docstore.storage[doc_id]
        return stored_document

    def direct_write(self, docstore, documents):
        """
        Put each document into `docstore.storage` under its ID, bypassing `write_documents()`.
        """
        for document in documents:
            docstore.storage[document.id] = document

    def direct_delete(self, docstore, ids):
        """
        Remove each given ID from `docstore.storage`, bypassing `delete_documents()`.
        """
        for identifier in ids:
            del docstore.storage[identifier]

    #
    # Test retrieval
    #
|
BIN
test/preview/test_files/images/apple.jpg
Normal file
BIN
test/preview/test_files/images/apple.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 68 KiB |
Loading…
x
Reference in New Issue
Block a user