haystack/test/document_stores/test_weaviate.py

254 lines
8.8 KiB
Python
Raw Normal View History

import pytest
from haystack.document_stores.weaviate import WeaviateDocumentStore
from haystack.schema import Document
from .test_base import DocumentStoreBaseTestAbstract
import uuid
from unittest.mock import MagicMock
import numpy as np
import pytest
Refactoring of the `haystack` package (#1624) * Files moved, imports all broken * Fix most imports and docstrings into * Fix the paths to the modules in the API docs * Add latest docstring and tutorial changes * Add a few pipelines that were lost in the imports * Fix a bunch of mypy warnings * Add latest docstring and tutorial changes * Create a file_classifier module * Add docs for file_classifier * Fixed most circular imports, now the REST API can start * Add latest docstring and tutorial changes * Tackling more mypy issues * Reintroduce from FARM and fix last mypy issues hopefully * Re-enable old-style imports * Fix some more imports from the top-level package in an attempt to sort out circular imports * Fix some imports in tests to new-style to prevent failed class equalities from breaking tests * Change document_store into document_stores * Update imports in tutorials * Add latest docstring and tutorial changes * Probably fixes summarizer tests * Improve the old-style import allowing module imports (should work) * Try to fix the docs * Remove dedicated KnowledgeGraph page from autodocs * Remove dedicated GraphRetriever page from autodocs * Fix generate_docstrings.sh with an updated list of yaml files to look for * Fix some more modules in the docs * Fix the document stores docs too * Fix a small issue on Tutorial14 * Add latest docstring and tutorial changes * Add deprecation warning to old-style imports * Remove stray folder and import Dict into dense.py * Change import path for MLFlowLogger * Add old loggers path to the import path aliases * Fix debug output of convert_ipynb.py * Fix circular import on BaseRetriever * Missed one merge block * re-run tutorial 5 * Fix imports in tutorial 5 * Re-enable squad_to_dpr CLI from the root package and move get_batches_from_generator into document_stores.base * Add latest docstring and tutorial changes * Fix typo in utils __init__ * Fix a few more imports * Fix benchmarks too * New-style imports in test_knowledge_graph 
* Rollback setup.py * Rollback squad_to_dpr too Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2021-10-25 15:50:23 +02:00
from haystack.schema import Document
# Length of the random embedding vectors that the tests below pass to `query_by_embedding`.
embedding_dim = 768
def get_uuid() -> str:
    """Return a newly generated random (version 4) UUID in its string form."""
    return str(uuid.uuid4())
class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
    """
    Run the shared document store test suite against `WeaviateDocumentStore`.

    The class provides Weaviate-backed `ds` and `documents` fixtures, skips the
    label tests (marked here as unsupported by Weaviate), overrides a few base
    tests whose expectations differ for Weaviate, and adds Weaviate-specific tests.
    """

    # Constants

    index_name = "DocumentsTest"

    @pytest.fixture
    def ds(self):
        """Return a `WeaviateDocumentStore` on the test index, recreated for every test."""
        return WeaviateDocumentStore(index=self.index_name, recreate_index=True, return_embedding=True)

    @pytest.fixture(scope="class")
    def documents(self):
        """
        Return nine documents: one "Foo", one "Bar" and one "Baz" document per name.

        The "Baz" documents deliberately carry neither a "year" nor a "numbers"
        meta field. Every document gets a random float32 embedding of length
        `embedding_dim`, the same length the query vectors in this module use.
        """
        documents = []
        for i in range(3):
            documents.append(
                Document(
                    id=get_uuid(),
                    content=f"A Foo Document {i}",
                    meta={"name": f"name_{i}", "year": "2020", "month": "01", "numbers": [2.0, 4.0]},
                    embedding=np.random.rand(embedding_dim).astype(np.float32),
                )
            )

            documents.append(
                Document(
                    id=get_uuid(),
                    content=f"A Bar Document {i}",
                    meta={"name": f"name_{i}", "year": "2021", "month": "02", "numbers": [-2.0, -4.0]},
                    embedding=np.random.rand(embedding_dim).astype(np.float32),
                )
            )

            documents.append(
                Document(
                    id=get_uuid(),
                    content=f"A Baz Document {i}",
                    meta={"name": f"name_{i}", "month": "03"},
                    embedding=np.random.rand(embedding_dim).astype(np.float32),
                )
            )

        return documents

    # The label tests below are overridden with skipped no-op bodies so that
    # they are reported as skipped for this document store.

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_write_labels(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_delete_labels(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_delete_labels_by_id(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_delete_labels_by_filter(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_delete_labels_by_filter_id(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_get_label_count(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_write_labels_duplicate(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_write_get_all_labels(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_labels_with_long_texts(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_multilabel(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_multilabel_no_answer(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_multilabel_filter_aggregations(self):
        pass

    @pytest.mark.skip(reason="Weaviate does not support labels")
    @pytest.mark.integration
    def test_multilabel_meta_aggregations(self):
        pass

    @pytest.mark.integration
    def test_ne_filters(self, ds, documents):
        """
        Weaviate doesn't include documents if the field is missing,
        so we customize this test
        """
        ds.write_documents(documents)
        result = ds.get_all_documents(filters={"year": {"$ne": "2020"}})
        # Only the three "Bar" documents (year 2021) are expected; the "Baz"
        # documents have no "year" field at all.
        assert len(result) == 3

    @pytest.mark.integration
    def test_nin_filters(self, ds, documents):
        """
        Weaviate doesn't include documents if the field is missing,
        so we customize this test
        """
        ds.write_documents(documents)
        result = ds.get_all_documents(filters={"year": {"$nin": ["2020", "2021", "n.a."]}})
        # Every document that has a "year" is excluded by the filter, and the
        # documents without one are not expected to be returned either.
        assert len(result) == 0

    @pytest.mark.integration
    def test_delete_index(self, ds, documents):
        """Contrary to other Document Stores, this doesn't raise if the index is empty"""
        ds.write_documents(documents, index="custom_index")
        assert ds.get_document_count(index="custom_index") == len(documents)
        ds.delete_index(index="custom_index")
        assert ds.get_document_count(index="custom_index") == 0

    @pytest.mark.integration
    def test_query_by_embedding(self, ds, documents):
        """Check result counts of `query_by_embedding` without limits, with `top_k` and with filters."""
        ds.write_documents(documents)

        docs = ds.query_by_embedding(np.random.rand(embedding_dim).astype(np.float32))
        assert len(docs) == 9

        docs = ds.query_by_embedding(np.random.rand(embedding_dim).astype(np.float32), top_k=1)
        assert len(docs) == 1

        docs = ds.query_by_embedding(np.random.rand(embedding_dim).astype(np.float32), filters={"name": ["name_1"]})
        assert len(docs) == 3

    @pytest.mark.integration
    def test_query(self, ds, documents):
        """Check result counts of `query` for a text query and for filter-only queries."""
        ds.write_documents(documents)

        query_text = "Foo"
        docs = ds.query(query_text)
        assert len(docs) == 3

        # BM25 retrieval WITH filters is not yet supported as of Weaviate v1.14.1
        # Should be from 1.18: https://github.com/semi-technologies/weaviate/issues/2393
        # docs = ds.query(query_text, filters={"name": ["name_1"]})
        # assert len(docs) == 1

        docs = ds.query(query=None, filters={"name": ["name_0"]})
        assert len(docs) == 3

        docs = ds.query(query=None, filters={"content": [query_text.lower()]})
        assert len(docs) == 3

        docs = ds.query(query=None, filters={"content": ["baz"]})
        assert len(docs) == 3

    @pytest.mark.integration
    def test_get_all_documents_unaffected_by_QUERY_MAXIMUM_RESULTS(self, ds, documents, monkeypatch):
        """
        Ensure `get_all_documents` works no matter the value of QUERY_MAXIMUM_RESULTS
        see https://github.com/deepset-ai/haystack/issues/2517
        """
        ds.write_documents(documents)
        # Make the store report far more documents than were actually written.
        monkeypatch.setattr(ds, "get_document_count", lambda **kwargs: 13_000)
        docs = ds.get_all_documents()
        assert len(docs) == 9

    @pytest.mark.integration
    def test_deleting_by_id_or_by_filters(self, ds, documents):
        """Check that deleting by ID skips `get_all_documents` while deleting by filters uses it."""
        ds.write_documents(documents)
        # This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
        # a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
        ds.get_all_documents = MagicMock(wraps=ds.get_all_documents)

        assert ds.get_document_count() == 9

        # Delete a document by its ID. This should bypass the get_all_documents() call
        ds.delete_documents(ids=[documents[0].id])
        ds.get_all_documents.assert_not_called()
        assert ds.get_document_count() == 8

        ds.get_all_documents.reset_mock()
        # Delete a document with filters. Prove that using the filters will go through get_all_documents()
        ds.delete_documents(filters={"name": ["name_0"]})
        ds.get_all_documents.assert_called()
        assert ds.get_document_count() == 6

    @pytest.mark.integration
    @pytest.mark.parametrize("similarity", ["cosine", "l2", "dot_product"])
    def test_similarity_existing_index(self, similarity):
        """Testing non-matching similarity"""
        index = f"test_similarity_existing_index_{similarity}"

        # create the document_store
        document_store = WeaviateDocumentStore(similarity=similarity, index=index, recreate_index=True)

        # try to connect to the same document store but using the wrong similarity
        non_matching_similarity = "l2" if similarity == "cosine" else "cosine"
        with pytest.raises(ValueError, match=r"This index already exists in Weaviate with similarity .*"):
            # The constructor is expected to raise, so its result is not bound to a name.
            WeaviateDocumentStore(similarity=non_matching_similarity, index=index, recreate_index=False)

    @pytest.mark.integration
    def test_cant_write_id_in_meta(self, ds):
        """Writing a document whose meta contains the key "id" must raise a `ValueError`."""
        with pytest.raises(ValueError, match='"meta" info contains duplicate key "id"'):
            ds.write_documents([Document(content="test", meta={"id": "test-id"})])

    @pytest.mark.integration
    def test_cant_write_top_level_fields_in_meta(self, ds):
        """Writing a document whose meta contains the key "content" must raise a `ValueError`."""
        with pytest.raises(ValueError, match='"meta" info contains duplicate key "content"'):
            ds.write_documents([Document(content="test", meta={"content": "test-id"})])