import numpy as np
import pytest

from haystack.schema import Document, Label, Answer, Span, MultiLabel, SpeechDocument, SpeechAnswer

from ..conftest import SAMPLES_PATH

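# Shared Label fixtures: LABELS[0] and LABELS[2] are built identically, while LABELS[1] differs
# only in its answer text (see test_equal_label below).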
LABELS = [
    Label(
        query="some",
        answer=Answer(
            answer="an answer",
            type="extractive",
            score=0.1,
            document_id="123",
            offsets_in_document=[Span(start=1, end=3)],
        ),
        document=Document(content="some text", content_type="text"),
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
    ),
    Label(
        query="some",
        answer=Answer(answer="another answer", type="extractive", score=0.1, document_id="123"),
        document=Document(content="some text", content_type="text"),
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
    ),
    Label(
        query="some",
        answer=Answer(
            answer="an answer",
            type="extractive",
            score=0.1,
            document_id="123",
            offsets_in_document=[Span(start=1, end=3)],
        ),
        document=Document(content="some text", content_type="text"),
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
    ),
]


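# no_answer is derived from an empty answer string whenever it is not passed explicitly.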
def test_no_answer_label():
    labels = [
        Label(
            query="question",
            answer=Answer(answer=""),
            is_correct_answer=True,
            is_correct_document=True,
            document=Document(content="some", id="777"),
            origin="gold-label",
        ),
        Label(
            query="question",
            answer=Answer(answer=""),
            is_correct_answer=True,
            is_correct_document=True,
            document=Document(content="some", id="777"),
            no_answer=True,
            origin="gold-label",
        ),
        Label(
            query="question",
            answer=Answer(answer="some"),
            is_correct_answer=True,
            is_correct_document=True,
            document=Document(content="some", id="777"),
            origin="gold-label",
        ),
        Label(
            query="question",
            answer=Answer(answer="some"),
            is_correct_answer=True,
            is_correct_document=True,
            document=Document(content="some", id="777"),
            no_answer=False,
            origin="gold-label",
        ),
    ]

    assert labels[0].no_answer == True
    assert labels[1].no_answer == True
    assert labels[2].no_answer == False
    assert labels[3].no_answer == False


def test_equal_label():
    assert LABELS[2] == LABELS[0]
    assert LABELS[1] != LABELS[0]


def test_answer_to_json():
    a = Answer(
        answer="an answer",
        type="extractive",
        score=0.1,
        context="abc",
        offsets_in_document=[Span(start=1, end=10)],
        offsets_in_context=[Span(start=3, end=5)],
        document_id="123",
    )
    j = a.to_json()
    assert type(j) == str
    assert len(j) > 30
    a_new = Answer.from_json(j)
    assert type(a_new.offsets_in_document[0]) == Span
    assert a_new == a


def test_answer_to_dict():
    a = Answer(
        answer="an answer",
        type="extractive",
        score=0.1,
        context="abc",
        offsets_in_document=[Span(start=1, end=10)],
        offsets_in_context=[Span(start=3, end=5)],
        document_id="123",
    )
    j = a.to_dict()
    assert type(j) == dict
    a_new = Answer.from_dict(j)
    assert type(a_new.offsets_in_document[0]) == Span
    assert a_new == a


def test_label_to_json():
    j0 = LABELS[0].to_json()
    l_new = Label.from_json(j0)
    assert l_new == LABELS[0]
    assert l_new.answer.offsets_in_document[0].start == 1


def test_label_to_dict():
    j0 = LABELS[0].to_dict()
    l_new = Label.from_dict(j0)
    assert l_new == LABELS[0]
    assert l_new.answer.offsets_in_document[0].start == 1


def test_doc_to_json():
    # With embedding
    d = Document(
        content="some text",
        content_type="text",
        score=0.99988,
        meta={"name": "doc1"},
        embedding=np.random.rand(768).astype(np.float32),
    )
    j0 = d.to_json()
    d_new = Document.from_json(j0)
    assert d == d_new

    # No embedding
    d = Document(content="some text", content_type="text", score=0.99988, meta={"name": "doc1"}, embedding=None)
    j0 = d.to_json()
    d_new = Document.from_json(j0)
    assert d == d_new


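# Post-init behaviour: meta defaults to {} and raw offset dicts are coerced into Span objects.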
def test_answer_postinit():
    a = Answer(answer="test", offsets_in_document=[{"start": 10, "end": 20}])
    assert a.meta == {}
    assert isinstance(a.offsets_in_document[0], Span)


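# Document ids are content hashes: identical content yields identical ids, and id_hash_keys
# controls which fields feed the hash.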
def test_generate_doc_id_using_text():
    text1 = "text1"
    text2 = "text2"
    doc1_text1 = Document(content=text1, meta={"name": "doc1"})
    doc2_text1 = Document(content=text1, meta={"name": "doc2"})
    doc3_text2 = Document(content=text2, meta={"name": "doc3"})

    assert doc1_text1.id == doc2_text1.id
    assert doc1_text1.id != doc3_text2.id


def test_generate_doc_id_using_custom_list():
    text1 = "text1"
    text2 = "text2"

    doc1_meta1_id_by_content = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"])
    doc1_meta2_id_by_content = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content"])
    assert doc1_meta1_id_by_content.id == doc1_meta2_id_by_content.id

    doc1_meta1_id_by_content_and_meta = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "meta"])
    doc1_meta2_id_by_content_and_meta = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content", "meta"])
    assert doc1_meta1_id_by_content_and_meta.id != doc1_meta2_id_by_content_and_meta.id

    doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"])
    doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["content"])
    assert doc1_text1.id != doc3_text2.id

    with pytest.raises(ValueError):
        _ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "non_existing_field"])


def test_aggregate_labels_with_labels():
    label1_with_filter1 = Label(
        query="question",
        answer=Answer(answer="1"),
        is_correct_answer=True,
        is_correct_document=True,
        document=Document(content="some", id="777"),
        origin="gold-label",
        filters={"name": ["filename1"]},
    )
    label2_with_filter1 = Label(
        query="question",
        answer=Answer(answer="2"),
        is_correct_answer=True,
        is_correct_document=True,
        document=Document(content="some", id="777"),
        origin="gold-label",
        filters={"name": ["filename1"]},
    )
    label3_with_filter2 = Label(
        query="question",
        answer=Answer(answer="2"),
        is_correct_answer=True,
        is_correct_document=True,
        document=Document(content="some", id="777"),
        origin="gold-label",
        filters={"name": ["filename2"]},
    )
    label = MultiLabel(labels=[label1_with_filter1, label2_with_filter1])
    assert label.filters == {"name": ["filename1"]}
    with pytest.raises(ValueError):
        label = MultiLabel(labels=[label1_with_filter1, label3_with_filter2])


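# SpeechDocument / SpeechAnswer carry their audio as file paths and serialize them
# as absolute path strings.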
def test_serialize_speech_document():
    speech_doc = SpeechDocument(
        id=12345,
        content_type="audio",
        content="this is the content of the document",
        content_audio=SAMPLES_PATH / "audio" / "this is the content of the document.wav",
        meta={"some": "meta"},
    )
    speech_doc_dict = speech_doc.to_dict()

    assert speech_doc_dict["content"] == "this is the content of the document"
    assert speech_doc_dict["content_audio"] == str(
        (SAMPLES_PATH / "audio" / "this is the content of the document.wav").absolute()
    )


def test_deserialize_speech_document():
    speech_doc = SpeechDocument(
        id=12345,
        content_type="audio",
        content="this is the content of the document",
        content_audio=SAMPLES_PATH / "audio" / "this is the content of the document.wav",
        meta={"some": "meta"},
    )
    assert speech_doc == SpeechDocument.from_dict(speech_doc.to_dict())


def test_serialize_speech_answer():
    speech_answer = SpeechAnswer(
        answer="answer",
        answer_audio=SAMPLES_PATH / "audio" / "answer.wav",
        context="the context for this answer is here",
        context_audio=SAMPLES_PATH / "audio" / "the context for this answer is here.wav",
    )
    speech_answer_dict = speech_answer.to_dict()

    assert speech_answer_dict["answer"] == "answer"
    assert speech_answer_dict["answer_audio"] == str((SAMPLES_PATH / "audio" / "answer.wav").absolute())
    assert speech_answer_dict["context"] == "the context for this answer is here"
    assert speech_answer_dict["context_audio"] == str(
        (SAMPLES_PATH / "audio" / "the context for this answer is here.wav").absolute()
    )


def test_deserialize_speech_answer():
    speech_answer = SpeechAnswer(
        answer="answer",
        answer_audio=SAMPLES_PATH / "audio" / "answer.wav",
        context="the context for this answer is here",
        context_audio=SAMPLES_PATH / "audio" / "the context for this answer is here.wav",
    )
    assert speech_answer == SpeechAnswer.from_dict(speech_answer.to_dict())