haystack/test/test_weaviate.py

import uuid

import numpy as np
import pytest

from haystack import Document
from conftest import get_document_store

embedding_dim = 768


def get_uuid():
    return str(uuid.uuid4())
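

# Weaviate stores each object under a UUID, so these tests assign explicit
# UUID-string ids (via get_uuid) to every document instead of relying on
# Haystack's auto-generated ids.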
DOCUMENTS = [
    {"content": "text1", "id": get_uuid(), "key": "a", "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    {"content": "text2", "id": get_uuid(), "key": "b", "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    {"content": "text3", "id": get_uuid(), "key": "b", "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    {"content": "text4", "id": get_uuid(), "key": "b", "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    {"content": "text5", "id": get_uuid(), "key": "b", "embedding": np.random.rand(embedding_dim).astype(np.float32)},
]

DOCUMENTS_XS = [
    # current "dict" format for a document
    {"content": "My name is Carla and I live in Berlin", "id": get_uuid(), "meta": {"metafield": "test1", "name": "filename1"}, "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    # meta_field at the top level for backward compatibility
    {"content": "My name is Paul and I live in New York", "id": get_uuid(), "metafield": "test2", "name": "filename2", "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    # Document object for a doc
    Document(content="My name is Christelle and I live in Paris", id=get_uuid(), meta={"metafield": "test3", "name": "filename3"}, embedding=np.random.rand(embedding_dim).astype(np.float32)),
]


@pytest.fixture(params=["weaviate"])
def document_store_with_docs(request):
    document_store = get_document_store(request.param)
    document_store.write_documents(DOCUMENTS_XS)
    yield document_store
    document_store.delete_documents()


@pytest.fixture(params=["weaviate"])
def document_store(request):
    document_store = get_document_store(request.param)
    yield document_store
    document_store.delete_documents()
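

# Every test below carries the `weaviate` marker so the whole module can be
# selected or skipped as a group (e.g. `pytest -m weaviate`) depending on
# whether a Weaviate instance is available.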
@pytest.mark.weaviate
@pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True)
def test_get_all_documents_without_filters(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents()
    assert all(isinstance(d, Document) for d in documents)
    assert len(documents) == 3
    assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3"}
    assert {d.meta["metafield"] for d in documents} == {"test1", "test2", "test3"}


@pytest.mark.weaviate
def test_get_all_documents_with_correct_filters(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents(filters={"metafield": ["test2"]})
    assert len(documents) == 1
    assert documents[0].meta["name"] == "filename2"

    documents = document_store_with_docs.get_all_documents(filters={"metafield": ["test1", "test3"]})
    assert len(documents) == 2
    assert {d.meta["name"] for d in documents} == {"filename1", "filename3"}
    assert {d.meta["metafield"] for d in documents} == {"test1", "test3"}


@pytest.mark.weaviate
def test_get_all_documents_with_incorrect_filter_name(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents(filters={"incorrectmetafield": ["test2"]})
    assert len(documents) == 0


@pytest.mark.weaviate
def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents(filters={"metafield": ["incorrect_value"]})
    assert len(documents) == 0


@pytest.mark.weaviate
def test_get_documents_by_id(document_store_with_docs):
    documents = document_store_with_docs.get_all_documents()
    doc = document_store_with_docs.get_document_by_id(documents[0].id)
    assert doc.id == documents[0].id
    assert doc.content == documents[0].content


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
def test_get_document_count(document_store):
    document_store.write_documents(DOCUMENTS)
    assert document_store.get_document_count() == 5
    assert document_store.get_document_count(filters={"key": ["a"]}) == 1
    assert document_store.get_document_count(filters={"key": ["b"]}) == 4


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
@pytest.mark.parametrize("batch_size", [2])
def test_weaviate_write_docs(document_store, batch_size):
    # Write in small batches
    for i in range(0, len(DOCUMENTS), batch_size):
        document_store.write_documents(DOCUMENTS[i: i + batch_size])

    documents_indexed = document_store.get_all_documents()
    assert len(documents_indexed) == len(DOCUMENTS)

    documents_indexed = document_store.get_all_documents(batch_size=batch_size)
    assert len(documents_indexed) == len(DOCUMENTS)


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
def test_get_all_document_filter_duplicate_value(document_store):
    documents = [
        Document(
            content="Doc1",
            meta={"fone": "f0"},
            id=get_uuid(),
            embedding=np.random.rand(embedding_dim).astype(np.float32)
        ),
        Document(
            content="Doc1",
            meta={"fone": "f1", "metaid": "0"},
            id=get_uuid(),
            embedding=np.random.rand(embedding_dim).astype(np.float32)
        ),
        Document(
            content="Doc2",
            meta={"fthree": "f0"},
            id=get_uuid(),
            embedding=np.random.rand(embedding_dim).astype(np.float32)
        )
    ]
    document_store.write_documents(documents)
    documents = document_store.get_all_documents(filters={"fone": ["f1"]})
    assert documents[0].content == "Doc1"
    assert len(documents) == 1
    assert {d.meta["metaid"] for d in documents} == {"0"}


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
def test_get_all_documents_generator(document_store):
    document_store.write_documents(DOCUMENTS)
    assert len(list(document_store.get_all_documents_generator(batch_size=2))) == 5
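

# write_documents() takes a duplicate_documents policy: "skip" ignores documents
# whose id already exists, "overwrite" replaces them, and "fail" raises an error.
# The next two tests exercise these policies.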
@pytest.mark.weaviate
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
def test_write_with_duplicate_doc_ids(document_store):
    duplicate_id = get_uuid()
    documents = [
        Document(
            content="Doc1",
            id=duplicate_id,
            embedding=np.random.rand(embedding_dim).astype(np.float32)
        ),
        Document(
            content="Doc2",
            id=duplicate_id,
            embedding=np.random.rand(embedding_dim).astype(np.float32)
        )
    ]
    document_store.write_documents(documents, duplicate_documents="skip")
    with pytest.raises(Exception):
        document_store.write_documents(documents, duplicate_documents="fail")


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
@pytest.mark.parametrize("update_existing_documents", [True, False])
def test_update_existing_documents(document_store, update_existing_documents):
    doc_id = get_uuid()
    original_docs = [
        {"content": "text1_orig", "id": doc_id, "metafieldforcount": "a", "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    ]
    updated_docs = [
        {"content": "text1_new", "id": doc_id, "metafieldforcount": "a", "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    ]

    document_store.update_existing_documents = update_existing_documents
    document_store.write_documents(original_docs)
    assert document_store.get_document_count() == 1

    if update_existing_documents:
        document_store.write_documents(updated_docs, duplicate_documents="overwrite")
    else:
        with pytest.raises(Exception):
            document_store.write_documents(updated_docs, duplicate_documents="fail")

    stored_docs = document_store.get_all_documents()
    assert len(stored_docs) == 1
    if update_existing_documents:
        assert stored_docs[0].content == updated_docs[0]["content"]
    else:
        assert stored_docs[0].content == original_docs[0]["content"]


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
def test_write_document_meta(document_store):
    uid1 = get_uuid()
    uid2 = get_uuid()
    uid3 = get_uuid()
    uid4 = get_uuid()
    documents = [
        {"content": "dict_without_meta", "id": uid1, "embedding": np.random.rand(embedding_dim).astype(np.float32)},
        {"content": "dict_with_meta", "metafield": "test2", "name": "filename2", "id": uid2, "embedding": np.random.rand(embedding_dim).astype(np.float32)},
        Document(content="document_object_without_meta", id=uid3, embedding=np.random.rand(embedding_dim).astype(np.float32)),
        Document(content="document_object_with_meta", meta={"metafield": "test4", "name": "filename3"}, id=uid4, embedding=np.random.rand(embedding_dim).astype(np.float32)),
    ]
    document_store.write_documents(documents)
    documents_in_store = document_store.get_all_documents()
    assert len(documents_in_store) == 4

    assert not document_store.get_document_by_id(uid1).meta
    assert document_store.get_document_by_id(uid2).meta["metafield"] == "test2"
    assert not document_store.get_document_by_id(uid3).meta
    assert document_store.get_document_by_id(uid4).meta["metafield"] == "test4"
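

# Weaviate class names (which back Haystack indexes) are capitalized, hence
# index names such as "Haystackone" and "Haystacktwo" in the test below.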
@pytest.mark.weaviate
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
def test_write_document_index(document_store):
    documents = [
        {"content": "text1", "id": get_uuid(), "embedding": np.random.rand(embedding_dim).astype(np.float32)},
        {"content": "text2", "id": get_uuid(), "embedding": np.random.rand(embedding_dim).astype(np.float32)},
    ]
    document_store.write_documents([documents[0]], index="Haystackone")
    assert len(document_store.get_all_documents(index="Haystackone")) == 1

    document_store.write_documents([documents[1]], index="Haystacktwo")
    assert len(document_store.get_all_documents(index="Haystacktwo")) == 1
    assert len(document_store.get_all_documents(index="Haystackone")) == 1
    assert len(document_store.get_all_documents()) == 0
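

# update_embeddings() recomputes document embeddings with the given retriever.
# The test below checks that it respects batch_size and filters, and that
# update_existing_embeddings=True also refreshes embeddings that already exist.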
@pytest.mark.weaviate
@pytest.mark.parametrize("retriever", ["dpr", "embedding"], indirect=True)
@pytest.mark.parametrize("document_store", ["weaviate"], indirect=True)
def test_update_embeddings(document_store, retriever):
    documents = []
    for i in range(6):
        documents.append({"content": f"text_{i}", "id": str(uuid.uuid4()), "metafield": f"value_{i}", "embedding": np.random.rand(embedding_dim).astype(np.float32)})
    documents.append({"content": "text_0", "id": str(uuid.uuid4()), "metafield": "value_0", "embedding": np.random.rand(embedding_dim).astype(np.float32)})
    document_store.write_documents(documents, index="HaystackTestOne")

    document_store.update_embeddings(retriever, index="HaystackTestOne", batch_size=3)
    documents = document_store.get_all_documents(index="HaystackTestOne", return_embedding=True)
    assert len(documents) == 7
    for doc in documents:
        assert type(doc.embedding) is np.ndarray

    documents = document_store.get_all_documents(
        index="HaystackTestOne",
        filters={"metafield": ["value_0"]},
        return_embedding=True,
    )
    assert len(documents) == 2
    for doc in documents:
        assert doc.meta["metafield"] == "value_0"
    np.testing.assert_array_almost_equal(documents[0].embedding, documents[1].embedding, decimal=4)

    documents = document_store.get_all_documents(
        index="HaystackTestOne",
        filters={"metafield": ["value_1", "value_5"]},
        return_embedding=True,
    )
    np.testing.assert_raises(
        AssertionError,
        np.testing.assert_array_equal,
        documents[0].embedding,
        documents[1].embedding
    )

    doc = {"content": "text_7", "id": str(uuid.uuid4()), "metafield": "value_7",
           "embedding": retriever.embed_queries(texts=["a random string"])[0]}
    document_store.write_documents([doc], index="HaystackTestOne")

    doc_before_update = document_store.get_all_documents(index="HaystackTestOne", filters={"metafield": ["value_7"]})[0]
    embedding_before_update = doc_before_update.embedding

    document_store.update_embeddings(
        retriever, index="HaystackTestOne", batch_size=3, filters={"metafield": ["value_0", "value_1"]}
    )
    doc_after_update = document_store.get_all_documents(index="HaystackTestOne", filters={"metafield": ["value_7"]})[0]
    embedding_after_update = doc_after_update.embedding
    np.testing.assert_array_equal(embedding_before_update, embedding_after_update)

    # test update all embeddings
    document_store.update_embeddings(retriever, index="HaystackTestOne", batch_size=3, update_existing_embeddings=True)
    assert document_store.get_document_count(index="HaystackTestOne") == 8
    doc_after_update = document_store.get_all_documents(index="HaystackTestOne", filters={"metafield": ["value_7"]})[0]
    embedding_after_update = doc_after_update.embedding
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, embedding_before_update, embedding_after_update)


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True)
def test_query_by_embedding(document_store_with_docs):
    docs = document_store_with_docs.query_by_embedding(np.random.rand(embedding_dim).astype(np.float32))
    assert len(docs) == 3

    docs = document_store_with_docs.query_by_embedding(np.random.rand(embedding_dim).astype(np.float32), top_k=1)
    assert len(docs) == 1

    docs = document_store_with_docs.query_by_embedding(np.random.rand(embedding_dim).astype(np.float32), filters={"name": ["filename2"]})
    assert len(docs) == 1
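

# query() with a bare query string is expected to raise for the Weaviate store;
# retrieval via filters (including word-level filters on the content field)
# works, as asserted below.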
@pytest.mark.weaviate
@pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True)
def test_query(document_store_with_docs):
    query_text = "My name is Carla and I live in Berlin"
    with pytest.raises(Exception):
        document_store_with_docs.query(query_text)

    docs = document_store_with_docs.query(filters={"name": ["filename2"]})
    assert len(docs) == 1

    docs = document_store_with_docs.query(filters={"content": [query_text.lower()]})
    assert len(docs) == 1

    docs = document_store_with_docs.query(filters={"content": ["live"]})
    assert len(docs) == 3


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True)
def test_delete_all_documents(document_store_with_docs):
    assert len(document_store_with_docs.get_all_documents()) == 3

    document_store_with_docs.delete_documents()
    documents = document_store_with_docs.get_all_documents()
    assert len(documents) == 0


@pytest.mark.weaviate
@pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True)
def test_delete_documents_with_filters(document_store_with_docs):
    document_store_with_docs.delete_all_documents(filters={"metafield": ["test1", "test2"]})
    documents = document_store_with_docs.get_all_documents()
    assert len(documents) == 1
    assert documents[0].meta["metafield"] == "test3"