haystack/test/nodes/test_audio.py

import os

import pytest
import numpy as np

try:
    import soundfile as sf
    import ffmpeg

    soundfile_not_found = False
except Exception:  # not only ImportError: soundfile raises OSError when libsndfile is missing
    soundfile_not_found = True

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from haystack.schema import Span, Answer, SpeechAnswer, Document, SpeechDocument
from haystack.nodes.audio import AnswerToSpeech, DocumentToSpeech
from haystack.nodes.audio._text_to_speech import TextToSpeech
from ..conftest import SAMPLES_PATH
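
# These tests do not compare raw audio samples: TTS output can vary slightly across
# environments even with a fixed seed. Instead, each generated file is transcribed
# back to text with Whisper and compared against the transcription of a reference
# recording from the samples folder.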


class WhisperHelper:
    """Thin wrapper around Whisper, used to transcribe generated audio for comparison."""

    def __init__(self, model: str):
        self._processor = WhisperProcessor.from_pretrained(model)
        self._model = WhisperForConditionalGeneration.from_pretrained(model)
        self._model.config.forced_decoder_ids = None

    def transcribe(self, media_file: str):
        # Decode the file to raw 16 kHz mono 16-bit PCM, the input format Whisper expects
        output, _ = (
            ffmpeg.input(media_file)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
        # Normalize int16 samples to float32 in [-1.0, 1.0]
        data = np.frombuffer(output, np.int16).flatten().astype(np.float32) / 32768.0
        features = self._processor(data, sampling_rate=16000, return_tensors="pt").input_features
        tokens = self._model.generate(features)
        return self._processor.batch_decode(tokens, skip_special_tokens=True)
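

# Whisper is expensive to load, so a single session-scoped helper is shared by all tests.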
@pytest.fixture(scope="session", autouse=True)
def whisper_helper():
    return WhisperHelper("openai/whisper-medium")


@pytest.mark.skipif(soundfile_not_found, reason="soundfile/ffmpeg not found")
class TestTextToSpeech:
    def test_text_to_speech_audio_data(self, tmp_path, whisper_helper: WhisperHelper):
        text2speech = TextToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        audio_data = text2speech.text_to_audio_data(text="answer")
        sf.write(
            data=audio_data,
            file=str(tmp_path / "audio1.wav"),
            format="wav",
            subtype="PCM_16",
            samplerate=text2speech.model.fs,
        )

        expected_doc = whisper_helper.transcribe(str(SAMPLES_PATH / "audio" / "answer.wav"))
        generated_doc = whisper_helper.transcribe(str(tmp_path / "audio1.wav"))

        assert expected_doc == generated_doc

    def test_text_to_speech_audio_file(self, tmp_path, whisper_helper: WhisperHelper):
        text2speech = TextToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        audio_file = text2speech.text_to_audio_file(text="answer", generated_audio_dir=tmp_path / "test_audio")
        assert os.path.exists(audio_file)

        expected_doc = whisper_helper.transcribe(str(SAMPLES_PATH / "audio" / "answer.wav"))
        generated_doc = whisper_helper.transcribe(str(audio_file))

        assert expected_doc == generated_doc

    def test_text_to_speech_compress_audio(self, tmp_path, whisper_helper: WhisperHelper):
        text2speech = TextToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"
        audio_file = text2speech.text_to_audio_file(
            text="answer", generated_audio_dir=tmp_path / "test_audio", audio_format="mp3"
        )
        assert os.path.exists(audio_file)
        assert audio_file.suffix == ".mp3"

        expected_doc = whisper_helper.transcribe(str(expected_audio_file))
        generated_doc = whisper_helper.transcribe(str(audio_file))

        assert expected_doc == generated_doc

    def test_text_to_speech_naming_function(self, tmp_path, whisper_helper: WhisperHelper):
        text2speech = TextToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"
        audio_file = text2speech.text_to_audio_file(
            text="answer", generated_audio_dir=tmp_path / "test_audio", audio_naming_function=lambda text: text
        )
        assert os.path.exists(audio_file)
        assert audio_file.name == expected_audio_file.name

        expected_doc = whisper_helper.transcribe(str(expected_audio_file))
        generated_doc = whisper_helper.transcribe(str(audio_file))

        assert expected_doc == generated_doc
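
    # The remaining tests exercise the pipeline nodes built on top of the TextToSpeech
    # helper: AnswerToSpeech and DocumentToSpeech attach generated audio to Answer and
    # Document objects respectively.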

    def test_answer_to_speech(self, tmp_path, whisper_helper: WhisperHelper):
        text_answer = Answer(
            answer="answer",
            type="extractive",
            context="the context for this answer is here",
            offsets_in_document=[Span(31, 37)],
            offsets_in_context=[Span(21, 27)],
            meta={"some_meta": "some_value"},
        )
        expected_audio_answer = SAMPLES_PATH / "audio" / "answer.wav"
        expected_audio_context = SAMPLES_PATH / "audio" / "the context for this answer is here.wav"

        answer2speech = AnswerToSpeech(
            generated_audio_dir=tmp_path / "test_audio",
            audio_params={"audio_naming_function": lambda text: text},
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        results, _ = answer2speech.run(answers=[text_answer])

        audio_answer: SpeechAnswer = results["answers"][0]
        assert isinstance(audio_answer, SpeechAnswer)
        assert audio_answer.type == "generative"
        assert audio_answer.answer_audio.name == expected_audio_answer.name
        assert audio_answer.context_audio.name == expected_audio_context.name
        assert audio_answer.answer == "answer"
        assert audio_answer.context == "the context for this answer is here"
        assert audio_answer.offsets_in_document == [Span(31, 37)]
        assert audio_answer.offsets_in_context == [Span(21, 27)]
        assert audio_answer.meta["some_meta"] == "some_value"
        assert audio_answer.meta["audio_format"] == "wav"

        expected_doc = whisper_helper.transcribe(str(expected_audio_answer))
        generated_doc = whisper_helper.transcribe(str(audio_answer.answer_audio))

        assert expected_doc == generated_doc

    def test_document_to_speech(self, tmp_path, whisper_helper: WhisperHelper):
        text_doc = Document(
            content="this is the content of the document", content_type="text", meta={"name": "test_document.txt"}
        )
        expected_audio_content = SAMPLES_PATH / "audio" / "this is the content of the document.wav"

        doc2speech = DocumentToSpeech(
            generated_audio_dir=tmp_path / "test_audio",
            audio_params={"audio_naming_function": lambda text: text},
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        results, _ = doc2speech.run(documents=[text_doc])

        audio_doc: SpeechDocument = results["documents"][0]
        assert isinstance(audio_doc, SpeechDocument)
        assert audio_doc.content_type == "audio"
        assert audio_doc.content_audio.name == expected_audio_content.name
        assert audio_doc.content == "this is the content of the document"
        assert audio_doc.meta["name"] == "test_document.txt"
        assert audio_doc.meta["audio_format"] == "wav"

        expected_doc = whisper_helper.transcribe(str(expected_audio_content))
        generated_doc = whisper_helper.transcribe(str(audio_doc.content_audio))

        assert expected_doc == generated_doc