mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-29 11:50:34 +00:00
180 lines
6.6 KiB
Python
180 lines
6.6 KiB
Python
import json
|
|
import logging
|
|
|
|
import pytest
|
|
|
|
from haystack.document_stores.sql import LabelORM, SQLDocumentStore
|
|
from haystack.schema import Document
|
|
from haystack.testing import DocumentStoreBaseTestAbstract
|
|
|
|
|
|
class TestSQLDocumentStore(DocumentStoreBaseTestAbstract):
    """Tests for ``SQLDocumentStore``, built on ``DocumentStoreBaseTestAbstract``.

    The class provides its own ``ds`` fixture (a SQLite-backed store created
    under pytest's ``tmp_path``), adds SQL-specific tests, and defines a number
    of skipped, empty tests for features the SQL store does not support
    (presumably shadowing same-named tests inherited from the base class).
    """

    # Constants

    index_name = __name__

    @pytest.fixture
    def ds(self, tmp_path):
        """Return a ``SQLDocumentStore`` backed by a SQLite file in ``tmp_path``."""
        db_url = f"sqlite:///{tmp_path}/haystack_test.db"
        return SQLDocumentStore(url=db_url, index=self.index_name, isolation_level="AUTOCOMMIT")

    @pytest.mark.integration
    def test_delete_index(self, ds, documents):
        """Contrary to other Document Stores, SQLDocumentStore doesn't raise if the index is empty

        After ``delete_index`` the document count of the deleted index is
        expected to be 0.
        """
        ds.write_documents(documents, index="custom_index")
        assert ds.get_document_count(index="custom_index") == len(documents)
        ds.delete_index(index="custom_index")
        assert ds.get_document_count(index="custom_index") == 0

    @pytest.mark.integration
    def test_sql_write_different_documents_same_vector_id(self, ds):
        """Check how a shared ``vector_id`` is handled within and across indexes.

        Two different documents carrying the same ``vector_id`` can be written
        to two different indexes, but writing both to the same index must raise
        an exception whose message mentions "unique" (case-insensitive).
        """
        doc1 = {"content": "content 1", "name": "doc1", "id": "1", "vector_id": "vector_id"}
        doc2 = {"content": "content 2", "name": "doc2", "id": "2", "vector_id": "vector_id"}

        # Same vector_id, different indexes: both writes succeed.
        ds.write_documents([doc1], index="index1")
        documents_in_index1 = ds.get_all_documents(index="index1")
        assert len(documents_in_index1) == 1
        ds.write_documents([doc2], index="index2")
        documents_in_index2 = ds.get_all_documents(index="index2")
        assert len(documents_in_index2) == 1

        # Same vector_id, same index: the second write must be rejected.
        ds.write_documents([doc1], index="index3")
        with pytest.raises(Exception, match=r"(?i)unique"):
            ds.write_documents([doc2], index="index3")

    @pytest.mark.integration
    def test_sql_get_documents_using_nested_filters_about_classification(self, ds):
        """Filter documents on dotted keys of a nested ``classification`` meta dict.

        Two documents are written, then ``classification.score`` (``$gt``) and
        ``classification.label`` (list membership) filters are checked for
        matching both, exactly one, and none of the documents.
        """
        documents = [
            Document(
                content="That's good. I like it.",
                id="1",
                meta={
                    "classification": {
                        "label": "LABEL_1",
                        "score": 0.694,
                        "details": {"LABEL_1": 0.694, "LABEL_0": 0.306},
                    }
                },
            ),
            Document(
                content="That's bad. I don't like it.",
                id="2",
                meta={
                    "classification": {
                        "label": "LABEL_0",
                        "score": 0.898,
                        "details": {"LABEL_0": 0.898, "LABEL_1": 0.102},
                    }
                },
            ),
        ]
        ds.write_documents(documents)

        assert ds.get_document_count() == 2
        assert len(ds.get_all_documents(filters={"classification.score": {"$gt": 0.1}})) == 2
        assert len(ds.get_all_documents(filters={"classification.label": ["LABEL_1", "LABEL_0"]})) == 2
        assert len(ds.get_all_documents(filters={"classification.score": {"$gt": 0.8}})) == 1
        assert len(ds.get_all_documents(filters={"classification.label": ["LABEL_1"]})) == 1
        assert len(ds.get_all_documents(filters={"classification.score": {"$gt": 0.95}})) == 0
        assert len(ds.get_all_documents(filters={"classification.label": ["LABEL_100"]})) == 0

    # NOTE: the SQLDocumentStore marshals metadata values with JSON so querying
    # using filters doesn't always work. While this should be considered a bug,
    # the affected tests are either customized or skipped while we work on a fix.

    @pytest.mark.integration
    def test_ne_filters(self, ds, caplog):
        """Check that a ``$ne`` metadata filter logs a warning about filter support."""
        with caplog.at_level(logging.WARNING):
            ds.get_all_documents(filters={"year": {"$ne": "2020"}})
            assert "filters won't work on metadata fields" in caplog.text

    @pytest.mark.integration
    def test_get_all_labels_legacy_document_id(self, ds):
        """Check that a stored answer with a legacy ``document_id`` key is still readable.

        A ``LabelORM`` row is added straight to the store's session with an
        answer JSON that uses the singular ``document_id`` key; the label
        returned by ``get_all_labels`` must expose it as a one-element
        ``answer.document_ids`` list.
        """
        ds.session.add(
            LabelORM(
                id="123",
                no_answer=False,
                document=json.dumps(
                    {
                        "content": "Some content",
                        "content_type": "text",
                        "score": None,
                        "id": "fc18c987a8312e72a47fb1524f230bb0",
                        "meta": {},
                        "embedding": [0.1, 0.2, 0.3],
                    }
                ),
                origin="user-feedback",
                query="Who made the PDF specification?",
                is_correct_answer=True,
                is_correct_document=True,
                answer=json.dumps(
                    {
                        "answer": "Adobe Systems",
                        "type": "extractive",
                        "context": "Some content",
                        "offsets_in_context": [{"start": 60, "end": 73}],
                        "offsets_in_document": [{"start": 60, "end": 73}],
                        # legacy document_id answer
                        "document_id": "fc18c987a8312e72a47fb1524f230bb0",
                        "meta": {},
                        "score": None,
                    }
                ),
                pipeline_id="some-123",
                index=ds.label_index,
            )
        )
        labels = ds.get_all_labels()
        assert labels[0].answer.document_ids == ["fc18c987a8312e72a47fb1524f230bb0"]

    @pytest.mark.skip(reason="metadata is marshalled as JSON, filters don't always work")
    @pytest.mark.integration
    def test_nin_filters(self, ds, documents):
        pass

    @pytest.mark.skip(reason="metadata is marshalled as JSON, filters don't always work")
    @pytest.mark.integration
    def test_comparison_filters(self, ds, documents):
        pass

    @pytest.mark.skip(reason="metadata is marshalled as JSON, filters don't always work")
    @pytest.mark.integration
    def test_nested_condition_filters(self, ds, documents):
        pass

    @pytest.mark.skip(reason="metadata is marshalled as JSON, filters don't always work")
    @pytest.mark.integration
    def test_nested_condition_not_filters(self, ds, documents):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_delete_labels_by_filter(self, ds, labels):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_delete_labels_by_filter_id(self, ds, labels):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_multilabel_filter_aggregations(self):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_multilabel_meta_aggregations(self):
        pass

    @pytest.mark.skip(reason="embeddings are not supported")
    @pytest.mark.integration
    def test_get_embedding_count(self):
        pass

    @pytest.mark.skip(reason="embeddings are not supported")
    @pytest.mark.integration
    def test_custom_embedding_field(self, ds):
        pass
|