from typing import List

import pytest
import numpy as np
import pandas as pd

from haystack.preview.dataclasses import Document
from haystack.preview.document_stores import Store, StoreError, DuplicatePolicy
from haystack.preview.document_stores import MissingDocumentError, DuplicateDocumentError


class DocumentStoreBaseTests:
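    """
    Common test suite for `Store` implementations.

    Concrete document stores subclass this suite and override the `docstore`
    fixture to return an instance of the store under test; the inherited tests
    then exercise counting, filtering, writing, and deleting documents.
    """
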
    @pytest.fixture
    def docstore(self) -> Store:
        raise NotImplementedError()
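
    # A minimal sketch of a concrete subclass (MemoryDocumentStore and its import
    # path are assumed here, they are not shown in this file):
    #
    #     class TestMemoryDocumentStore(DocumentStoreBaseTests):
    #         @pytest.fixture
    #         def docstore(self) -> MemoryDocumentStore:
    #             return MemoryDocumentStore()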

    @pytest.fixture
    def filterable_docs(self) -> List[Document]:
        embedding_zero = np.zeros([768, 1]).astype(np.float32)
        embedding_one = np.ones([768, 1]).astype(np.float32)

        documents = []
        for i in range(3):
            documents.append(
                Document(
                    content=f"A Foo Document {i}",
                    metadata={"name": f"name_{i}", "page": "100", "chapter": "intro", "number": 2},
                    embedding=np.random.rand(768).astype(np.float32),
                )
            )
            documents.append(
                Document(
                    content=f"A Bar Document {i}",
                    metadata={"name": f"name_{i}", "page": "123", "chapter": "abstract", "number": -2},
                    embedding=np.random.rand(768).astype(np.float32),
                )
            )
            documents.append(
                Document(
                    content=f"A Foobar Document {i}",
                    metadata={"name": f"name_{i}", "page": "90", "chapter": "conclusion", "number": -10},
                    embedding=np.random.rand(768).astype(np.float32),
                )
            )
            documents.append(
                Document(
                    content=f"Document {i} without embedding",
                    metadata={"name": f"name_{i}", "no_embedding": True, "chapter": "conclusion"},
                )
            )
            documents.append(
                Document(content=pd.DataFrame([i]), content_type="table", metadata={"name": f"table_doc_{i}"})
            )
            documents.append(
                Document(content=f"Doc {i} with zeros emb", metadata={"name": "zeros_doc"}, embedding=embedding_zero)
            )
            documents.append(
                Document(content=f"Doc {i} with ones emb", metadata={"name": "ones_doc"}, embedding=embedding_one)
            )
        return documents
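
    # The `filterable_docs` fixture yields 21 documents per call: for each i in 0..2
    # it creates "Foo"/"Bar"/"Foobar" text documents with page/chapter/number metadata,
    # a document without an embedding, a table document, and documents whose embeddings
    # are all zeros and all ones (used by the tensor filter tests below).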

    def contains_same_docs(self, first_list: List[Document], second_list: List[Document]) -> bool:
        """
        Utility to compare two lists of documents for equality regardless of the order of the documents.
        """
        # Note: list.sort() sorts in place and returns None, so comparing its return
        # values would always pass; sorted() returns the ordered copies we need.
        return (
            len(first_list) > 0
            and len(second_list) > 0
            and sorted(first_list, key=lambda d: d.id) == sorted(second_list, key=lambda d: d.id)
        )
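
    # The tests below exercise the document stores' metadata filtering DSL: a plain
    # value is an implicit "$eq", a list is an implicit "$in", and "$and"/"$or"/"$not"
    # plus the comparison operators ("$eq", "$ne", "$in", "$nin", "$gt", "$gte",
    # "$lt", "$lte") can be nested explicitly. For example, the implicit filter
    # {"page": ["100", "123"]} is expected to behave like {"page": {"$in": ["100", "123"]}}.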

    @pytest.mark.unit
    def test_count_empty(self, docstore: Store):
        assert docstore.count_documents() == 0

    @pytest.mark.unit
    def test_count_not_empty(self, docstore: Store):
        docstore.write_documents(
            [Document(content="test doc 1"), Document(content="test doc 2"), Document(content="test doc 3")]
        )
        assert docstore.count_documents() == 3

    @pytest.mark.unit
    def test_no_filter_empty(self, docstore: Store):
        assert docstore.filter_documents() == []
        assert docstore.filter_documents(filters={}) == []

    @pytest.mark.unit
    def test_no_filter_not_empty(self, docstore: Store):
        docs = [Document(content="test doc")]
        docstore.write_documents(docs)
        assert docstore.filter_documents() == docs
        assert docstore.filter_documents(filters={}) == docs

    @pytest.mark.unit
    def test_filter_simple_metadata_value(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": "100"})
        assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.metadata.get("page") == "100"])

    @pytest.mark.unit
    def test_filter_simple_list_single_element(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": ["100"]})
        assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.metadata.get("page") == "100"])

    @pytest.mark.unit
    def test_filter_document_content(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"content": "A Foo Document 1"})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if doc.content_type == "text" and doc.content == "A Foo Document 1"]
        )

    @pytest.mark.unit
    def test_filter_document_type(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"content_type": "table"})
        assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.content_type == "table"])

    @pytest.mark.unit
    def test_filter_simple_list_one_value(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": ["100"]})
        assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.metadata.get("page") in ["100"]])

    @pytest.mark.unit
    def test_filter_simple_list(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": ["100", "123"]})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if doc.metadata.get("page") in ["100", "123"]]
        )

    @pytest.mark.unit
    def test_incorrect_filter_name(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"non_existing_meta_field": ["whatever"]})
        assert len(result) == 0

    @pytest.mark.unit
    def test_incorrect_filter_type(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(ValueError, match="dictionaries or lists"):
            docstore.filter_documents(filters="something odd")

    @pytest.mark.unit
    def test_incorrect_filter_value(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": ["nope"]})
        assert len(result) == 0

    @pytest.mark.unit
    def test_incorrect_filter_nesting(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(ValueError, match="malformed"):
            docstore.filter_documents(filters={"number": {"page": "100"}})

    @pytest.mark.unit
    def test_deeper_incorrect_filter_nesting(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(ValueError, match="malformed"):
            docstore.filter_documents(filters={"number": {"page": {"chapter": "intro"}}})

    @pytest.mark.unit
    def test_eq_filter_explicit(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": {"$eq": "100"}})
        assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.metadata.get("page") == "100"])

    @pytest.mark.unit
    def test_eq_filter_implicit(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": "100"})
        assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.metadata.get("page") == "100"])

    @pytest.mark.unit
    def test_eq_filter_table(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"content": pd.DataFrame([1])})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if isinstance(doc.content, pd.DataFrame) and doc.content.equals(pd.DataFrame([1]))
            ],
        )

    @pytest.mark.unit
    def test_eq_filter_tensor(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        embedding = np.zeros([768, 1]).astype(np.float32)
        result = docstore.filter_documents(filters={"embedding": embedding})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if np.array_equal(embedding, doc.embedding)]
        )
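
    # Note: DataFrame contents and numpy embeddings cannot be compared with a plain
    # `==` in a boolean context, which is why the expected results in the table and
    # tensor tests are built with `DataFrame.equals` and `np.array_equal`.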

    @pytest.mark.unit
    def test_in_filter_explicit(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": {"$in": ["100", "123", "n.a."]}})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if doc.metadata.get("page") in ["100", "123"]]
        )

    @pytest.mark.unit
    def test_in_filter_implicit(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": ["100", "123", "n.a."]})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if doc.metadata.get("page") in ["100", "123"]]
        )

    @pytest.mark.unit
    def test_in_filter_table(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"content": {"$in": [pd.DataFrame([1]), pd.DataFrame([2])]}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if isinstance(doc.content, pd.DataFrame)
                and (doc.content.equals(pd.DataFrame([1])) or doc.content.equals(pd.DataFrame([2])))
            ],
        )

    @pytest.mark.unit
    def test_in_filter_tensor(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        embedding_zero = np.zeros([768, 1]).astype(np.float32)
        embedding_one = np.ones([768, 1]).astype(np.float32)
        result = docstore.filter_documents(filters={"embedding": {"$in": [embedding_zero, embedding_one]}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if isinstance(doc.embedding, np.ndarray)
                and (np.array_equal(embedding_zero, doc.embedding) or np.array_equal(embedding_one, doc.embedding))
            ],
        )

    @pytest.mark.unit
    def test_ne_filter(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": {"$ne": "100"}})
        assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.metadata.get("page") != "100"])

    @pytest.mark.unit
    def test_ne_filter_table(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"content": {"$ne": pd.DataFrame([1])}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if not isinstance(doc.content, pd.DataFrame) or not doc.content.equals(pd.DataFrame([1]))
            ],
        )

    @pytest.mark.unit
    def test_ne_filter_tensor(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        embedding = np.zeros([768, 1]).astype(np.float32)
        result = docstore.filter_documents(filters={"embedding": {"$ne": embedding}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                # Check the embedding, not the content: documents without an ndarray
                # embedding trivially differ from the filter value.
                if not isinstance(doc.embedding, np.ndarray) or not np.array_equal(embedding, doc.embedding)
            ],
        )

    @pytest.mark.unit
    def test_nin_filter(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"page": {"$nin": ["100", "123", "n.a."]}})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if doc.metadata.get("page", None) not in ["100", "123"]]
        )

    @pytest.mark.unit
    def test_nin_filter_table(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"content": {"$nin": [pd.DataFrame([1]), pd.DataFrame([0])]}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if not isinstance(doc.content, pd.DataFrame)
                or (not doc.content.equals(pd.DataFrame([1])) and not doc.content.equals(pd.DataFrame([0])))
            ],
        )

    @pytest.mark.unit
    def test_nin_filter_tensor(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        embedding_zeros = np.zeros([768, 1]).astype(np.float32)
        embedding_ones = np.ones([768, 1]).astype(np.float32)
        result = docstore.filter_documents(filters={"embedding": {"$nin": [embedding_ones, embedding_zeros]}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                # Compare embeddings, not content, and keep only documents whose
                # embedding matches neither filter value.
                if not isinstance(doc.embedding, np.ndarray)
                or (
                    not np.array_equal(embedding_zeros, doc.embedding)
                    and not np.array_equal(embedding_ones, doc.embedding)
                )
            ],
        )

    @pytest.mark.unit
    def test_gt_filter(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"number": {"$gt": 0.0}})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if "number" in doc.metadata and doc.metadata["number"] > 0]
        )

    @pytest.mark.unit
    def test_gt_filter_non_numeric(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"page": {"$gt": "100"}})

    @pytest.mark.unit
    def test_gt_filter_table(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"content": {"$gt": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}})

    @pytest.mark.unit
    def test_gt_filter_tensor(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        embedding_zeros = np.zeros([768, 1]).astype(np.float32)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"embedding": {"$gt": embedding_zeros}})

    @pytest.mark.unit
    def test_gte_filter(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"number": {"$gte": -2.0}})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if "number" in doc.metadata and doc.metadata["number"] >= -2.0]
        )

    @pytest.mark.unit
    def test_gte_filter_non_numeric(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"page": {"$gte": "100"}})

    @pytest.mark.unit
    def test_gte_filter_table(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"content": {"$gte": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}})

    @pytest.mark.unit
    def test_gte_filter_tensor(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        embedding_zeros = np.zeros([768, 1]).astype(np.float32)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"embedding": {"$gte": embedding_zeros}})

    @pytest.mark.unit
    def test_lt_filter(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"number": {"$lt": 0.0}})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if "number" in doc.metadata and doc.metadata["number"] < 0]
        )

    @pytest.mark.unit
    def test_lt_filter_non_numeric(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"page": {"$lt": "100"}})

    @pytest.mark.unit
    def test_lt_filter_table(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"content": {"$lt": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}})

    @pytest.mark.unit
    def test_lt_filter_tensor(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        embedding_ones = np.ones([768, 1]).astype(np.float32)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"embedding": {"$lt": embedding_ones}})

    @pytest.mark.unit
    def test_lte_filter(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"number": {"$lte": 2.0}})
        assert self.contains_same_docs(
            result, [doc for doc in filterable_docs if "number" in doc.metadata and doc.metadata["number"] <= 2.0]
        )

    @pytest.mark.unit
    def test_lte_filter_non_numeric(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"page": {"$lte": "100"}})

    @pytest.mark.unit
    def test_lte_filter_table(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"content": {"$lte": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}})

    @pytest.mark.unit
    def test_lte_filter_tensor(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        embedding_ones = np.ones([768, 1]).astype(np.float32)
        with pytest.raises(StoreError, match="Can't evaluate"):
            docstore.filter_documents(filters={"embedding": {"$lte": embedding_ones}})

    @pytest.mark.unit
    def test_filter_simple_implicit_and_with_multi_key_dict(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"number": {"$lte": 2.0, "$gte": 0.0}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if "number" in doc.metadata and doc.metadata["number"] >= 0.0 and doc.metadata["number"] <= 2.0
            ],
        )
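
    # The next tests exercise different spellings of range filters on "number":
    # an implicit "$and" via a multi-key dict, an explicit "$and" with a dict of
    # operators, and an explicit "$and" with a list of clauses.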

    @pytest.mark.unit
    def test_filter_simple_explicit_and_with_multikey_dict(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"number": {"$and": {"$lte": 0, "$gte": -2}}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                # The expected set mirrors the filter above: -2 <= number <= 0.
                if "number" in doc.metadata and doc.metadata["number"] >= -2 and doc.metadata["number"] <= 0
            ],
        )

    @pytest.mark.unit
    def test_filter_simple_explicit_and_with_list(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if "number" in doc.metadata and doc.metadata["number"] <= 2.0 and doc.metadata["number"] >= 0.0
            ],
        )

    @pytest.mark.unit
    def test_filter_simple_implicit_and(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        result = docstore.filter_documents(filters={"number": {"$lte": 2.0, "$gte": 0}})
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if "number" in doc.metadata and doc.metadata["number"] <= 2.0 and doc.metadata["number"] >= 0.0
            ],
        )

    @pytest.mark.unit
    def test_filter_nested_explicit_and(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        filters = {"$and": {"number": {"$and": {"$lte": 2, "$gte": 0}}, "name": {"$in": ["name_0", "name_1"]}}}
        result = docstore.filter_documents(filters=filters)
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if (
                    "number" in doc.metadata
                    and doc.metadata["number"] >= 0
                    and doc.metadata["number"] <= 2
                    and doc.metadata["name"] in ["name_0", "name_1"]
                )
            ],
        )

    @pytest.mark.unit
    def test_filter_nested_implicit_and(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        filters_simplified = {"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]}
        result = docstore.filter_documents(filters=filters_simplified)
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if (
                    "number" in doc.metadata
                    and doc.metadata["number"] <= 2
                    and doc.metadata["number"] >= 0
                    and doc.metadata.get("name") in ["name_0", "name_1"]
                )
            ],
        )

    @pytest.mark.unit
    def test_filter_simple_or(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        filters = {"$or": {"name": {"$in": ["name_0", "name_1"]}, "number": {"$lt": 1.0}}}
        result = docstore.filter_documents(filters=filters)
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if (
                    ("number" in doc.metadata and doc.metadata["number"] < 1)
                    or doc.metadata.get("name") in ["name_0", "name_1"]
                )
            ],
        )

    @pytest.mark.unit
    def test_filter_nested_or(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        filters = {"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}}
        result = docstore.filter_documents(filters=filters)
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if (
                    doc.metadata.get("name") in ["name_0", "name_1"]
                    or ("number" in doc.metadata and doc.metadata["number"] < 1)
                )
            ],
        )

    @pytest.mark.unit
    def test_filter_nested_and_or_explicit(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        filters_simplified = {
            "$and": {"page": {"$eq": "123"}, "$or": {"name": {"$in": ["name_0", "name_1"]}, "number": {"$lt": 1.0}}}
        }
        result = docstore.filter_documents(filters=filters_simplified)
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if (
                    doc.metadata.get("page") in ["123"]
                    and (
                        doc.metadata.get("name") in ["name_0", "name_1"]
                        or ("number" in doc.metadata and doc.metadata["number"] < 1)
                    )
                )
            ],
        )

    @pytest.mark.unit
    def test_filter_nested_and_or_implicit(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        filters_simplified = {
            "page": {"$eq": "123"},
            "$or": {"name": {"$in": ["name_0", "name_1"]}, "number": {"$lt": 1.0}},
        }
        result = docstore.filter_documents(filters=filters_simplified)
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if (
                    doc.metadata.get("page") in ["123"]
                    and (
                        doc.metadata.get("name") in ["name_0", "name_1"]
                        or ("number" in doc.metadata and doc.metadata["number"] < 1)
                    )
                )
            ],
        )

    @pytest.mark.unit
    def test_filter_nested_or_and(self, docstore: Store, filterable_docs: List[Document]):
        docstore.write_documents(filterable_docs)
        filters_simplified = {
            "$or": {
                "number": {"$lt": 1.0},
                "$and": {"name": {"$in": ["name_0", "name_1"]}, "$not": {"chapter": {"$eq": "intro"}}},
            }
        }
        result = docstore.filter_documents(filters=filters_simplified)
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if (
                    ("number" in doc.metadata and doc.metadata["number"] < 1)
                    # The second branch of the "$or" is an "$and", so both of its
                    # conditions must hold for a document to match it.
                    or (
                        doc.metadata.get("name") in ["name_0", "name_1"]
                        and ("chapter" in doc.metadata and doc.metadata["chapter"] != "intro")
                    )
                )
            ],
        )

    @pytest.mark.unit
    def test_filter_nested_multiple_identical_operators_same_level(
        self, docstore: Store, filterable_docs: List[Document]
    ):
        docstore.write_documents(filterable_docs)
        filters = {
            "$or": [
                {"$and": {"name": {"$in": ["name_0", "name_1"]}, "page": "100"}},
                {"$and": {"chapter": {"$in": ["intro", "abstract"]}, "page": "123"}},
            ]
        }
        result = docstore.filter_documents(filters=filters)
        assert self.contains_same_docs(
            result,
            [
                doc
                for doc in filterable_docs
                if (
                    (doc.metadata.get("name") in ["name_0", "name_1"] and doc.metadata.get("page") == "100")
                    or (doc.metadata.get("chapter") in ["intro", "abstract"] and doc.metadata.get("page") == "123")
                )
            ],
        )

    @pytest.mark.unit
    def test_write(self, docstore: Store):
        doc = Document(content="test doc")
        docstore.write_documents([doc])
        assert docstore.filter_documents(filters={"id": doc.id}) == [doc]
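
    # DuplicatePolicy controls how write_documents() handles IDs that already exist
    # in the store: FAIL raises DuplicateDocumentError, SKIP keeps the stored
    # document, and OVERWRITE replaces it. The three tests below cover each policy.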

    @pytest.mark.unit
    def test_write_duplicate_fail(self, docstore: Store):
        doc = Document(content="test doc")
        docstore.write_documents([doc])
        with pytest.raises(DuplicateDocumentError, match=f"ID '{doc.id}' already exists."):
            docstore.write_documents(documents=[doc], policy=DuplicatePolicy.FAIL)
        assert docstore.filter_documents(filters={"id": doc.id}) == [doc]

    @pytest.mark.unit
    def test_write_duplicate_skip(self, docstore: Store):
        doc = Document(content="test doc")
        docstore.write_documents([doc])
        docstore.write_documents(documents=[doc], policy=DuplicatePolicy.SKIP)
        assert docstore.filter_documents(filters={"id": doc.id}) == [doc]

    @pytest.mark.unit
    def test_write_duplicate_overwrite(self, docstore: Store):
        doc1 = Document(content="test doc 1")
        doc2 = Document(content="test doc 2")
        object.__setattr__(doc2, "id", doc1.id)  # Make two docs with different content but same ID

        docstore.write_documents([doc2])
        assert docstore.filter_documents(filters={"id": doc1.id}) == [doc2]
        docstore.write_documents(documents=[doc1], policy=DuplicatePolicy.OVERWRITE)
        assert docstore.filter_documents(filters={"id": doc1.id}) == [doc1]

    @pytest.mark.unit
    def test_write_not_docs(self, docstore: Store):
        with pytest.raises(ValueError, match="Please provide a list of Documents"):
            docstore.write_documents(["not a document for sure"])

    @pytest.mark.unit
    def test_write_not_list(self, docstore: Store):
        with pytest.raises(ValueError, match="Please provide a list of Documents"):
            docstore.write_documents("not a list actually")

    @pytest.mark.unit
    def test_delete_empty(self, docstore: Store):
        with pytest.raises(MissingDocumentError):
            docstore.delete_documents(["test"])

    @pytest.mark.unit
    def test_delete_not_empty(self, docstore: Store):
        doc = Document(content="test doc")
        docstore.write_documents([doc])

        docstore.delete_documents([doc.id])

        with pytest.raises(Exception):
            assert docstore.filter_documents(filters={"id": doc.id})

    @pytest.mark.unit
    def test_delete_not_empty_nonexisting(self, docstore: Store):
        doc = Document(content="test doc")
        docstore.write_documents([doc])

        with pytest.raises(MissingDocumentError):
            docstore.delete_documents(["non_existing"])

        assert docstore.filter_documents(filters={"id": doc.id}) == [doc]