haystack/test/nodes/test_audio.py
Sara Zan 584e046642
AnswerToSpeech (#2584)
* Add new audio answer primitives

* Add AnswerToSpeech

* Add dependency group

* Update Documentation & Code Style

* Extract TextToSpeech in a helper class, create DocumentToSpeech and primitives

* Add tests

* Update Documentation & Code Style

* Add ability to compress audio and more tests

* Add audio group to test, all and all-gpu

* fix pylint

* Update Documentation & Code Style

* Accidental git tag

* Try pleasing mypy

* Update Documentation & Code Style

* fix pylint

* Add warning for missing OS library and support in CI

* Try fixing mypy

* Update Documentation & Code Style

* Add docs, simplify args for audio nodes and add tutorials

* Fix mypy

* Fix run_batch

* Feedback on tutorials

* fix mypy and pylint

* Fix mypy again

* Fix mypy yet again

* Fix the ci

* Fix dicts merge and install ffmpeg on CI

* Make the audio nodes import safe

* Trying to increase tolerance in audio test

* Fix import paths

* fix linter

* Update Documentation & Code Style

* Add audio libs in unit tests

* Update _text_to_speech.py

* Update answer_to_speech.py

* Use dedicated dataset & update telemetry

* Remove  and use distilled roberta

* Revert special primitives so that the nodes run in indexing

* Improve tutorials and fix smaller bugs

* Update Documentation & Code Style

* Fix serialization issue

* Update Documentation & Code Style

* Improve tutorial

* Update Documentation & Code Style

* Update _text_to_speech.py

* Minor lg updates

* Minor lg updates to tutorial

* Making indexing work in tutorials

* Update Documentation & Code Style

* Improve docstrings

* Try to use GPU when available

* Update Documentation & Code Style

* Fix mypy and pylint

* Try to pass the device correctly

* Update Documentation & Code Style

* Use type of device

* use .cpu()

* Improve .ipynb

* update apt index to be able to download libsndfile1

* Fix SpeechDocument.from_dict()

* Change pip URL

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
2022-06-15 10:13:18 +02:00


import os

import numpy as np
import soundfile as sf

from haystack.schema import Span, Answer, SpeechAnswer, Document, SpeechDocument
from haystack.nodes.audio import AnswerToSpeech, DocumentToSpeech
from haystack.nodes.audio._text_to_speech import TextToSpeech

from ..conftest import SAMPLES_PATH
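
# Note on prerequisites (an assumption drawn from the commit history above): these
# tests exercise the text-to-speech nodes end to end, so they are expected to need
# the optional audio dependency group plus the system libraries mentioned in the
# commits (libsndfile1 for soundfile, ffmpeg for mp3 compression).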


def test_text_to_speech_audio_data():
    # Fix the seed so the generated waveform is reproducible and comparable
    # to the reference sample.
    text2speech = TextToSpeech(
        model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
        transformers_params={"seed": 777, "always_fix_seed": True},
    )
    expected_audio_data, _ = sf.read(SAMPLES_PATH / "audio" / "answer.wav")
    audio_data = text2speech.text_to_audio_data(text="answer")
    assert np.allclose(expected_audio_data, audio_data, atol=0.001)


def test_text_to_speech_audio_file(tmp_path):
    text2speech = TextToSpeech(
        model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
        transformers_params={"seed": 777, "always_fix_seed": True},
    )
    expected_audio_data, _ = sf.read(SAMPLES_PATH / "audio" / "answer.wav")
    audio_file = text2speech.text_to_audio_file(text="answer", generated_audio_dir=tmp_path / "test_audio")
    assert os.path.exists(audio_file)
    assert np.allclose(expected_audio_data, sf.read(audio_file)[0], atol=0.001)


def test_text_to_speech_compress_audio(tmp_path):
    text2speech = TextToSpeech(
        model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
        transformers_params={"seed": 777, "always_fix_seed": True},
    )
    # Currently unused because of the FIXME below; kept as the reference for the check.
    expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"
    audio_file = text2speech.text_to_audio_file(
        text="answer", generated_audio_dir=tmp_path / "test_audio", audio_format="mp3"
    )
    assert os.path.exists(audio_file)
    assert audio_file.suffix == ".mp3"
    # FIXME find a way to make sure the compressed audio is similar enough to the wav version.
    # At a manual inspection, the code seems to be working well.
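

# One possible way to close the FIXME above, sketched here rather than asserted:
# decode the compressed file back to raw samples and compare them against the wav
# reference. This assumes pydub (and an ffmpeg binary) are available, which is not
# a declared dependency of this module; the helper below is hypothetical.
def _mp3_close_to_wav(mp3_path, wav_path, atol=0.01):
    from pydub import AudioSegment  # assumed extra dependency

    decoded = AudioSegment.from_file(mp3_path, format="mp3")
    # Normalize the integer samples to [-1, 1] floats; assumes mono audio.
    samples = np.array(decoded.get_array_of_samples(), dtype=np.float32)
    samples /= float(1 << (8 * decoded.sample_width - 1))
    reference, _ = sf.read(wav_path)
    # mp3 is lossy and its encoder may pad or shift frames, so trim to the shorter
    # signal; a robust check may still need cross-correlation to align the signals.
    length = min(len(samples), len(reference))
    return np.allclose(samples[:length], reference[:length], atol=atol)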


def test_text_to_speech_naming_function(tmp_path):
    text2speech = TextToSpeech(
        model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
        transformers_params={"seed": 777, "always_fix_seed": True},
    )
    expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"
    # Name the generated file after the input text instead of the default generated name.
    audio_file = text2speech.text_to_audio_file(
        text="answer", generated_audio_dir=tmp_path / "test_audio", audio_naming_function=lambda text: text
    )
    assert os.path.exists(audio_file)
    assert audio_file.name == expected_audio_file.name
    assert np.allclose(sf.read(expected_audio_file)[0], sf.read(audio_file)[0], atol=0.001)


def test_answer_to_speech(tmp_path):
    text_answer = Answer(
        answer="answer",
        type="extractive",
        context="the context for this answer is here",
        offsets_in_document=[Span(31, 37)],
        offsets_in_context=[Span(21, 27)],
        meta={"some_meta": "some_value"},
    )
    expected_audio_answer = SAMPLES_PATH / "audio" / "answer.wav"
    expected_audio_context = SAMPLES_PATH / "audio" / "the context for this answer is here.wav"

    answer2speech = AnswerToSpeech(
        generated_audio_dir=tmp_path / "test_audio",
        audio_params={"audio_naming_function": lambda text: text},
        transformers_params={"seed": 777, "always_fix_seed": True},
    )
    results, _ = answer2speech.run(answers=[text_answer])

    # The node converts both the answer string and its context to audio files,
    # while preserving the original text fields, offsets, and metadata.
    audio_answer: SpeechAnswer = results["answers"][0]
    assert isinstance(audio_answer, SpeechAnswer)
    assert audio_answer.type == "generative"
    assert audio_answer.answer_audio.name == expected_audio_answer.name
    assert audio_answer.context_audio.name == expected_audio_context.name
    assert audio_answer.answer == "answer"
    assert audio_answer.context == "the context for this answer is here"
    assert audio_answer.offsets_in_document == [Span(31, 37)]
    assert audio_answer.offsets_in_context == [Span(21, 27)]
    assert audio_answer.meta["some_meta"] == "some_value"
    assert audio_answer.meta["audio_format"] == "wav"
    assert np.allclose(sf.read(audio_answer.answer_audio)[0], sf.read(expected_audio_answer)[0], atol=0.001)
    assert np.allclose(sf.read(audio_answer.context_audio)[0], sf.read(expected_audio_context)[0], atol=0.001)


def test_document_to_speech(tmp_path):
    text_doc = Document(
        content="this is the content of the document", content_type="text", meta={"name": "test_document.txt"}
    )
    expected_audio_content = SAMPLES_PATH / "audio" / "this is the content of the document.wav"

    doc2speech = DocumentToSpeech(
        generated_audio_dir=tmp_path / "test_audio",
        audio_params={"audio_naming_function": lambda text: text},
        transformers_params={"seed": 777, "always_fix_seed": True},
    )
    results, _ = doc2speech.run(documents=[text_doc])

    # The node replaces the textual content with generated audio while keeping
    # the original text and metadata alongside it.
    audio_doc: SpeechDocument = results["documents"][0]
    assert isinstance(audio_doc, SpeechDocument)
    assert audio_doc.content_type == "audio"
    assert audio_doc.content_audio.name == expected_audio_content.name
    assert audio_doc.content == "this is the content of the document"
    assert audio_doc.meta["name"] == "test_document.txt"
    assert audio_doc.meta["audio_format"] == "wav"
    assert np.allclose(sf.read(audio_doc.content_audio)[0], sf.read(expected_audio_content)[0], atol=0.001)