haystack/test/test_schema.py
Malte Pietsch 4a6c9302b3
Redesign primitives - Document, Answer, Label (#1398)
* first draft / notes on new primitives

* wip label / feedback refactor

* rename doc.text -> doc.content. add doc.content_type

* add datatype for content

* remove faq_question_field from ES and weaviate. rename text_field -> content_field in docstores. update tutorials for content field

* update converters for . Add warning for empty

* rename label.question -> label.query. Allow sorting of Answers.

* WIP primitives

* update ui/reader for new Answer format

* Improve Label. First refactoring of MultiLabel. Adjust eval code

* fixed workflow conflict with introducing new one (#1472)

* Add latest docstring and tutorial changes

* make add_eval_data() work again

* fix reader formats. WIP fix _extract_docs_and_labels_from_dict

* fix test reader

* Add latest docstring and tutorial changes

* fix another test case for reader

* fix mypy in farm reader.eval()

* fix mypy in farm reader.eval()

* WIP ORM refactor

* Add latest docstring and tutorial changes

* fix mypy weaviate

* make label and multilabel dataclasses

* bump mypy env in CI to python 3.8

* WIP refactor Label ORM

* WIP refactor Label ORM

* simplify tests for individual doc stores

* WIP refactoring markers of tests

* test alternative approach for tests with existing parametrization

* WIP refactor ORMs

* fix skip logic of already parametrized tests

* fix weaviate behaviour in tests - not parametrizing it in our general test cases.

* Add latest docstring and tutorial changes

* fix some tests

* remove sql from document_store_types

* fix markers for generator and pipeline test

* remove inmemory marker

* remove unneeded elasticsearch markers

* add dataclasses-json dependency. adjust ORM to just store JSON repr

* ignore type as dataclasses_json seems to miss functionality here

* update readme and contributing.md

* update contributing

* adjust example

* fix duplicate doc handling for custom index

* Add latest docstring and tutorial changes

* fix some ORM issues. fix get_all_labels_aggregated.

* update drop flags where get_all_labels_aggregated() was used before

* Add latest docstring and tutorial changes

* add to_json(). add + fix tests

* fix no_answer handling in label / multilabel

* fix duplicate docs in memory doc store. change primary key for sql doc table

* fix mypy issues

* fix mypy issues

* haystack/retriever/base.py

* fix test_write_document_meta[elastic]

* fix test_elasticsearch_custom_fields

* fix test_labels[elastic]

* fix crawler

* fix converter

* fix docx converter

* fix preprocessor

* fix test_utils

* fix tfidf retriever. fix selection of docstore in tests with multiple fixtures / parameterizations

* Add latest docstring and tutorial changes

* fix crawler test. fix ocrconverter attribute

* fix test_elasticsearch_custom_query

* fix generator pipeline

* fix ocr converter

* fix ragenerator

* Add latest docstring and tutorial changes

* fix test_load_and_save_yaml for elasticsearch

* fixes for pipeline tests

* fix faq pipeline

* fix pipeline tests

* Add latest docstring and tutorial changes

* fix weaviate

* Add latest docstring and tutorial changes

* trigger CI

* satisfy mypy

* Add latest docstring and tutorial changes

* satisfy mypy

* Add latest docstring and tutorial changes

* trigger CI

* fix question generation test

* fix ray. fix Q-generation

* fix translator test

* satisfy mypy

* wip refactor feedback rest api

* fix rest api feedback endpoint

* fix doc classifier

* remove relation of Labels -> Docs in SQL ORM

* fix faiss/milvus tests

* fix doc classifier test

* fix eval test

* fixing eval issues

* Add latest docstring and tutorial changes

* fix mypy

* WIP replace dataclasses-json with manual serialization

* Add latest docstring and tutorial changes

* revert to dataclass-json serialization for now. remove debug prints.

* update docstrings

* fix extractor. fix Answer Span init

* fix api test

* keep meta data of answers in reader.run()

* fix meta handling

* address review feedback

* Add latest docstring and tutorial changes

* make document=None for open domain labels

* add import

* fix print utils

* fix rest api

* address review feedback

* Add latest docstring and tutorial changes

* fix mypy

Co-authored-by: Markus Paff <markuspaff.mp@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2021-10-13 14:23:23 +02:00

133 lines
4.4 KiB
Python

from haystack import Document, Label, Answer, Span
import numpy as np
# Shared label fixtures. Entries 0 and 2 are built from the same answer text,
# entry 1 from a different one; test_equal_label relies on exactly that.
LABELS = [
    Label(
        query="some",
        answer=Answer(answer=answer_text, type="extractive", score=0.1, document_id="123"),
        document=Document(content="some text", content_type="text"),
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
    )
    for answer_text in ("an answer", "annother answer", "an answer")
]
def test_no_answer_label():
    """Check ``Label.no_answer`` for empty and non-empty answer strings.

    Four labels are built: an empty and a non-empty answer, each once without
    and once with an explicit ``no_answer`` argument.
    """

    def gold_label(answer_text, **extra):
        # The four labels differ only in the answer text and in whether
        # ``no_answer`` is passed explicitly.
        return Label(
            query="question",
            answer=Answer(answer=answer_text),
            is_correct_answer=True,
            is_correct_document=True,
            document=Document(content="some", id="777"),
            origin="gold-label",
            **extra
        )

    labels = [
        gold_label(""),
        gold_label("", no_answer=True),
        gold_label("some"),
        gold_label("some", no_answer=False),
    ]

    # List equality compares element-wise with ``==``, so this is the same
    # check as comparing each flag against its expected bool individually.
    observed = [label.no_answer for label in labels]
    assert observed == [True, True, False, False]
def test_equal_label():
    """Labels built from the same values compare equal; a different answer does not."""
    reference = LABELS[0]
    different_answer = LABELS[1]
    same_values = LABELS[2]

    assert same_values == reference
    assert different_answer != reference
def test_answer_to_json():
    """Round-trip an ``Answer`` through ``to_json`` / ``from_json`` and expect equality."""
    original = Answer(
        answer="an answer",
        type="extractive",
        score=0.1,
        context="abc",
        offsets_in_document=[Span(start=1, end=10)],
        offsets_in_context=[Span(start=3, end=5)],
        document_id="123",
    )

    serialized = original.to_json()
    restored = Answer.from_json(serialized)

    assert restored == original
def test_label_to_json():
    """Round-trip the first shared label through ``to_json`` / ``from_json``."""
    label = LABELS[0]
    restored = Label.from_json(label.to_json())
    assert restored == label
def test_doc_to_json():
    """Round-trip ``Document`` objects through ``to_json`` / ``from_json``.

    The round trip runs twice: once with a float32 embedding vector and once
    with ``embedding=None``. Each restored document must equal its source.
    """
    # A seeded generator keeps the embedding identical on every run, so a
    # failure can be reproduced exactly (the global, unseeded RNG cannot).
    rng = np.random.default_rng(seed=0)
    with_embedding = rng.random(768).astype(np.float32)

    for embedding in (with_embedding, None):
        doc = Document(
            content="some text",
            content_type="text",
            score=0.99988,
            meta={"name": "doc1"},
            embedding=embedding,
        )
        restored = Document.from_json(doc.to_json())
        assert doc == restored
def test_answer_postinit():
    """Check an ``Answer`` built with a plain-dict offset and no ``meta`` argument.

    Expects ``meta`` to equal an empty dict and the offset to be a ``Span``.
    """
    raw_offset = {"start": 10, "end": 20}
    answer = Answer(answer="test", offsets_in_document=[raw_offset])

    assert answer.meta == {}
    first_offset = answer.offsets_in_document[0]
    assert isinstance(first_offset, Span)
def test_generate_doc_id_using_text():
    """Equal content must give equal ids; different content must give different ids.

    All three documents carry different ``meta``, so the first assertion also
    expects the id not to depend on it.
    """
    same_content_a = Document(content="text1", meta={"name": "doc1"})
    same_content_b = Document(content="text1", meta={"name": "doc2"})
    other_content = Document(content="text2", meta={"name": "doc3"})

    assert same_content_a.id == same_content_b.id
    assert same_content_a.id != other_content.id
def test_generate_doc_id_using_custom_list():
    """Equal ``id_hash_keys`` must give equal ids; different keys must give different ids."""

    def make_doc(text, name):
        # A fresh key list is built per document, so no list is shared.
        return Document(content=text, meta={"name": name}, id_hash_keys=["key1", text])

    same_keys_a = make_doc("text1", "doc1")
    same_keys_b = make_doc("text1", "doc2")
    other_keys = make_doc("text2", "doc3")

    assert same_keys_a.id == same_keys_b.id
    assert same_keys_a.id != other_keys.id