haystack/test/nodes/test_audio.py

import os

import pytest
import numpy as np

# `soundfile` is an optional dependency: record whether it is usable so the audio
# tests below can be skipped instead of breaking test collection.
try:
    import soundfile as sf

    soundfile_not_found = False
except (ImportError, OSError):
    # ImportError: the `soundfile` package is not installed.
    # OSError: the package is installed, but the native libsndfile library could not be loaded.
    soundfile_not_found = True

from haystack.schema import Span, Answer, SpeechAnswer, Document, SpeechDocument
from haystack.nodes.audio import AnswerToSpeech, DocumentToSpeech
from haystack.nodes.audio._text_to_speech import TextToSpeech

from ..conftest import SAMPLES_PATH


@pytest.mark.skipif(soundfile_not_found, reason="soundfile not found")
class TestTextToSpeech:
    """
    Tests for `TextToSpeech` and for the `AnswerToSpeech` / `DocumentToSpeech` nodes.

    The generated audio is compared, with an absolute tolerance of 0.001, against the reference
    recordings stored under `SAMPLES_PATH / "audio"`. All models are created with `seed=777` and
    `always_fix_seed=True`. The whole class is skipped when `soundfile` could not be imported.
    """

    # Model used by the tests that instantiate `TextToSpeech` directly.
    _MODEL_NAME = "espnet/kan-bayashi_ljspeech_vits"

    @staticmethod
    def _seeded_params():
        """Return a fresh dict of seeding parameters, so that no dict instance is shared between models."""
        return {"seed": 777, "always_fix_seed": True}

    @classmethod
    def _build_text_to_speech(cls):
        """Create the seeded `TextToSpeech` instance shared by the `test_text_to_speech_*` tests."""
        return TextToSpeech(model_name_or_path=cls._MODEL_NAME, transformers_params=cls._seeded_params())

    def test_text_to_speech_audio_data(self):
        """The raw audio data generated for "answer" matches the reference `answer.wav`."""
        text2speech = self._build_text_to_speech()
        expected_audio_data, _ = sf.read(SAMPLES_PATH / "audio" / "answer.wav")
        audio_data = text2speech.text_to_audio_data(text="answer")

        assert np.allclose(expected_audio_data, audio_data, atol=0.001)

    def test_text_to_speech_audio_file(self, tmp_path):
        """The audio file generated for "answer" exists and matches the reference `answer.wav`."""
        text2speech = self._build_text_to_speech()
        expected_audio_data, _ = sf.read(SAMPLES_PATH / "audio" / "answer.wav")
        audio_file = text2speech.text_to_audio_file(text="answer", generated_audio_dir=tmp_path / "test_audio")

        assert os.path.exists(audio_file)
        assert np.allclose(expected_audio_data, sf.read(audio_file)[0], atol=0.001)

    def test_text_to_speech_compress_audio(self, tmp_path):
        """Requesting `audio_format="mp3"` produces an existing file with the `.mp3` suffix."""
        text2speech = self._build_text_to_speech()
        audio_file = text2speech.text_to_audio_file(
            text="answer", generated_audio_dir=tmp_path / "test_audio", audio_format="mp3"
        )

        assert os.path.exists(audio_file)
        assert audio_file.suffix == ".mp3"
        # FIXME find a way to make sure the compressed audio is similar enough to the wav version.
        # At a manual inspection, the code seems to be working well.

    def test_text_to_speech_naming_function(self, tmp_path):
        """With an identity `audio_naming_function`, the generated file is named like the reference file."""
        text2speech = self._build_text_to_speech()
        expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"
        audio_file = text2speech.text_to_audio_file(
            text="answer", generated_audio_dir=tmp_path / "test_audio", audio_naming_function=lambda text: text
        )

        assert os.path.exists(audio_file)
        assert audio_file.name == expected_audio_file.name
        assert np.allclose(sf.read(expected_audio_file)[0], sf.read(audio_file)[0], atol=0.001)

    def test_answer_to_speech(self, tmp_path):
        """`AnswerToSpeech.run` turns an `Answer` into a `SpeechAnswer` with audio for answer and context."""
        text_answer = Answer(
            answer="answer",
            type="extractive",
            context="the context for this answer is here",
            offsets_in_document=[Span(31, 37)],
            offsets_in_context=[Span(21, 27)],
            meta={"some_meta": "some_value"},
        )
        expected_audio_answer = SAMPLES_PATH / "audio" / "answer.wav"
        expected_audio_context = SAMPLES_PATH / "audio" / "the context for this answer is here.wav"

        answer2speech = AnswerToSpeech(
            generated_audio_dir=tmp_path / "test_audio",
            # Identity naming, so the generated file names can be compared with the reference file names.
            audio_params={"audio_naming_function": lambda text: text},
            transformers_params=self._seeded_params(),
        )
        results, _ = answer2speech.run(answers=[text_answer])

        audio_answer: SpeechAnswer = results["answers"][0]
        assert isinstance(audio_answer, SpeechAnswer)
        # The input answer is "extractive"; the converted answer is expected to be "generative".
        assert audio_answer.type == "generative"
        assert audio_answer.answer_audio.name == expected_audio_answer.name
        assert audio_answer.context_audio.name == expected_audio_context.name
        # The textual fields and the original metadata must be carried over unchanged.
        assert audio_answer.answer == "answer"
        assert audio_answer.context == "the context for this answer is here"
        assert audio_answer.offsets_in_document == [Span(31, 37)]
        assert audio_answer.offsets_in_context == [Span(21, 27)]
        assert audio_answer.meta["some_meta"] == "some_value"
        assert audio_answer.meta["audio_format"] == "wav"

        assert np.allclose(sf.read(audio_answer.answer_audio)[0], sf.read(expected_audio_answer)[0], atol=0.001)
        assert np.allclose(sf.read(audio_answer.context_audio)[0], sf.read(expected_audio_context)[0], atol=0.001)

    def test_document_to_speech(self, tmp_path):
        """`DocumentToSpeech.run` turns a text `Document` into a `SpeechDocument` with audio for its content."""
        text_doc = Document(
            content="this is the content of the document", content_type="text", meta={"name": "test_document.txt"}
        )
        expected_audio_content = SAMPLES_PATH / "audio" / "this is the content of the document.wav"

        doc2speech = DocumentToSpeech(
            generated_audio_dir=tmp_path / "test_audio",
            # Identity naming, so the generated file name can be compared with the reference file name.
            audio_params={"audio_naming_function": lambda text: text},
            transformers_params=self._seeded_params(),
        )
        results, _ = doc2speech.run(documents=[text_doc])

        audio_doc: SpeechDocument = results["documents"][0]
        assert isinstance(audio_doc, SpeechDocument)
        assert audio_doc.content_type == "audio"
        assert audio_doc.content_audio.name == expected_audio_content.name
        # The textual content and the original metadata must be carried over unchanged.
        assert audio_doc.content == "this is the content of the document"
        assert audio_doc.meta["name"] == "test_document.txt"
        assert audio_doc.meta["audio_format"] == "wav"

        assert np.allclose(sf.read(audio_doc.content_audio)[0], sf.read(expected_audio_content)[0], atol=0.001)
`AnswerToSpeech` (#2584) * Add new audio answer primitives * Add AnswerToSpeech * Add dependency group * Update Documentation & Code Style * Extract TextToSpeech in a helper class, create DocumentToSpeech and primitives * Add tests * Update Documentation & Code Style * Add ability to compress audio and more tests * Add audio group to test, all and all-gpu * fix pylint * Update Documentation & Code Style * Accidental git tag * Try pleasing mypy * Update Documentation & Code Style * fix pylint * Add warning for missing OS library and support in CI * Try fixing mypy * Update Documentation & Code Style * Add docs, simplify args for audio nodes and add tutorials * Fix mypy * Fix run_batch * Feedback on tutorials * fix mypy and pylint * Fix mypy again * Fix mypy yet again * Fix the ci * Fix dicts merge and install ffmpeg on CI * Make the audio nodes import safe * Trying to increase tolerance in audio test * Fix import paths * fix linter * Update Documentation & Code Style * Add audio libs in unit tests * Update _text_to_speech.py * Update answer_to_speech.py * Use dedicated dataset & update telemetry * Remove and use distilled roberta * Revert special primitives so that the nodes run in indexing * Improve tutorials and fix smaller bugs * Update Documentation & Code Style * Fix serialization issue * Update Documentation & Code Style * Improve tutorial * Update Documentation & Code Style * Update _text_to_speech.py * Minor lg updates * Minor lg updates to tutorial * Making indexing work in tutorials * Update Documentation & Code Style * Improve docstrings * Try to use GPU when available * Update Documentation & Code Style * Fixi mypy and pylint * Try to pass the device correctly * Update Documentation & Code Style * Use type of device * use .cpu() * Improve .ipynb * update apt index to be able to download libsndfile1 * Fix SpeechDocument.from_dict() * Change pip URL Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: 
Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> 2022-06-15 10:13:18 +02:00			`import os`

Enable Opensearch unit tests in Windows CI (#2936) * enable Opensearch unit tests under Win * move unit tests into a dedicated job * skip audio tests on missing dependencies * avoid failing test collection when soundfile is not available * Update .github/workflows/tests.yml Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> 2022-08-03 19:19:07 +02:00			`import pytest`
`AnswerToSpeech` (#2584) * Add new audio answer primitives * Add AnswerToSpeech * Add dependency group * Update Documentation & Code Style * Extract TextToSpeech in a helper class, create DocumentToSpeech and primitives * Add tests * Update Documentation & Code Style * Add ability to compress audio and more tests * Add audio group to test, all and all-gpu * fix pylint * Update Documentation & Code Style * Accidental git tag * Try pleasing mypy * Update Documentation & Code Style * fix pylint * Add warning for missing OS library and support in CI * Try fixing mypy * Update Documentation & Code Style * Add docs, simplify args for audio nodes and add tutorials * Fix mypy * Fix run_batch * Feedback on tutorials * fix mypy and pylint * Fix mypy again * Fix mypy yet again * Fix the ci * Fix dicts merge and install ffmpeg on CI * Make the audio nodes import safe * Trying to increase tolerance in audio test * Fix import paths * fix linter * Update Documentation & Code Style * Add audio libs in unit tests * Update _text_to_speech.py * Update answer_to_speech.py * Use dedicated dataset & update telemetry * Remove and use distilled roberta * Revert special primitives so that the nodes run in indexing * Improve tutorials and fix smaller bugs * Update Documentation & Code Style * Fix serialization issue * Update Documentation & Code Style * Improve tutorial * Update Documentation & Code Style * Update _text_to_speech.py * Minor lg updates * Minor lg updates to tutorial * Making indexing work in tutorials * Update Documentation & Code Style * Improve docstrings * Try to use GPU when available * Update Documentation & Code Style * Fixi mypy and pylint * Try to pass the device correctly * Update Documentation & Code Style * Use type of device * use .cpu() * Improve .ipynb * update apt index to be able to download libsndfile1 * Fix SpeechDocument.from_dict() * Change pip URL Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: 
Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> 2022-06-15 10:13:18 +02:00			`import numpy as np`
Enable Opensearch unit tests in Windows CI (#2936) * enable Opensearch unit tests under Win * move unit tests into a dedicated job * skip audio tests on missing dependencies * avoid failing test collection when soundfile is not available * Update .github/workflows/tests.yml Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> 2022-08-03 19:19:07 +02:00
			`try:`
			`import soundfile as sf`

			`soundfile_not_found = False`
			`except:`
			`soundfile_not_found = True`
`AnswerToSpeech` (#2584) * Add new audio answer primitives * Add AnswerToSpeech * Add dependency group * Update Documentation & Code Style * Extract TextToSpeech in a helper class, create DocumentToSpeech and primitives * Add tests * Update Documentation & Code Style * Add ability to compress audio and more tests * Add audio group to test, all and all-gpu * fix pylint * Update Documentation & Code Style * Accidental git tag * Try pleasing mypy * Update Documentation & Code Style * fix pylint * Add warning for missing OS library and support in CI * Try fixing mypy * Update Documentation & Code Style * Add docs, simplify args for audio nodes and add tutorials * Fix mypy * Fix run_batch * Feedback on tutorials * fix mypy and pylint * Fix mypy again * Fix mypy yet again * Fix the ci * Fix dicts merge and install ffmpeg on CI * Make the audio nodes import safe * Trying to increase tolerance in audio test * Fix import paths * fix linter * Update Documentation & Code Style * Add audio libs in unit tests * Update _text_to_speech.py * Update answer_to_speech.py * Use dedicated dataset & update telemetry * Remove and use distilled roberta * Revert special primitives so that the nodes run in indexing * Improve tutorials and fix smaller bugs * Update Documentation & Code Style * Fix serialization issue * Update Documentation & Code Style * Improve tutorial * Update Documentation & Code Style * Update _text_to_speech.py * Minor lg updates * Minor lg updates to tutorial * Making indexing work in tutorials * Update Documentation & Code Style * Improve docstrings * Try to use GPU when available * Update Documentation & Code Style * Fixi mypy and pylint * Try to pass the device correctly * Update Documentation & Code Style * Use type of device * use .cpu() * Improve .ipynb * update apt index to be able to download libsndfile1 * Fix SpeechDocument.from_dict() * Change pip URL Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: 
Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> 2022-06-15 10:13:18 +02:00
			`from haystack.schema import Span, Answer, SpeechAnswer, Document, SpeechDocument`
			`from haystack.nodes.audio import AnswerToSpeech, DocumentToSpeech`
			`from haystack.nodes.audio._text_to_speech import TextToSpeech`

			`from ..conftest import SAMPLES_PATH`


Enable Opensearch unit tests in Windows CI (#2936) * enable Opensearch unit tests under Win * move unit tests into a dedicated job * skip audio tests on missing dependencies * avoid failing test collection when soundfile is not available * Update .github/workflows/tests.yml Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> 2022-08-03 19:19:07 +02:00			`@pytest.mark.skipif(soundfile_not_found, reason="soundfile not found")`
			`class TestTextToSpeech:`
			`def test_text_to_speech_audio_data(self):`
			`text2speech = TextToSpeech(`
			`model_name_or_path="espnet/kan-bayashi_ljspeech_vits",`
			`transformers_params={"seed": 777, "always_fix_seed": True},`
			`)`
			`expected_audio_data, _ = sf.read(SAMPLES_PATH / "audio" / "answer.wav")`
			`audio_data = text2speech.text_to_audio_data(text="answer")`

			`assert np.allclose(expected_audio_data, audio_data, atol=0.001)`

			`def test_text_to_speech_audio_file(self, tmp_path):`
			`text2speech = TextToSpeech(`
			`model_name_or_path="espnet/kan-bayashi_ljspeech_vits",`
			`transformers_params={"seed": 777, "always_fix_seed": True},`
			`)`
			`expected_audio_data, _ = sf.read(SAMPLES_PATH / "audio" / "answer.wav")`
			`audio_file = text2speech.text_to_audio_file(text="answer", generated_audio_dir=tmp_path / "test_audio")`
			`assert os.path.exists(audio_file)`
			`assert np.allclose(expected_audio_data, sf.read(audio_file)[0], atol=0.001)`

			`def test_text_to_speech_compress_audio(self, tmp_path):`
			`text2speech = TextToSpeech(`
			`model_name_or_path="espnet/kan-bayashi_ljspeech_vits",`
			`transformers_params={"seed": 777, "always_fix_seed": True},`
			`)`
			`expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"`
			`audio_file = text2speech.text_to_audio_file(`
			`text="answer", generated_audio_dir=tmp_path / "test_audio", audio_format="mp3"`
			`)`
			`assert os.path.exists(audio_file)`
			`assert audio_file.suffix == ".mp3"`
			`# FIXME find a way to make sure the compressed audio is similar enough to the wav version.`
			`# At a manual inspection, the code seems to be working well.`

			`def test_text_to_speech_naming_function(self, tmp_path):`
			`text2speech = TextToSpeech(`
			`model_name_or_path="espnet/kan-bayashi_ljspeech_vits",`
			`transformers_params={"seed": 777, "always_fix_seed": True},`
			`)`
			`expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"`
			`audio_file = text2speech.text_to_audio_file(`
			`text="answer", generated_audio_dir=tmp_path / "test_audio", audio_naming_function=lambda text: text`
			`)`
			`assert os.path.exists(audio_file)`
			`assert audio_file.name == expected_audio_file.name`
			`assert np.allclose(sf.read(expected_audio_file)[0], sf.read(audio_file)[0], atol=0.001)`

			`def test_answer_to_speech(self, tmp_path):`
			`text_answer = Answer(`
			`answer="answer",`
			`type="extractive",`
			`context="the context for this answer is here",`
			`offsets_in_document=[Span(31, 37)],`
			`offsets_in_context=[Span(21, 27)],`
			`meta={"some_meta": "some_value"},`
			`)`
			`expected_audio_answer = SAMPLES_PATH / "audio" / "answer.wav"`
			`expected_audio_context = SAMPLES_PATH / "audio" / "the context for this answer is here.wav"`

			`answer2speech = AnswerToSpeech(`
			`generated_audio_dir=tmp_path / "test_audio",`
			`audio_params={"audio_naming_function": lambda text: text},`
			`transformers_params={"seed": 777, "always_fix_seed": True},`
			`)`
			`results, _ = answer2speech.run(answers=[text_answer])`

			`audio_answer: SpeechAnswer = results["answers"][0]`
			`assert isinstance(audio_answer, SpeechAnswer)`
			`assert audio_answer.type == "generative"`
			`assert audio_answer.answer_audio.name == expected_audio_answer.name`
			`assert audio_answer.context_audio.name == expected_audio_context.name`
			`assert audio_answer.answer == "answer"`
			`assert audio_answer.context == "the context for this answer is here"`
			`assert audio_answer.offsets_in_document == [Span(31, 37)]`
			`assert audio_answer.offsets_in_context == [Span(21, 27)]`
			`assert audio_answer.meta["some_meta"] == "some_value"`
			`assert audio_answer.meta["audio_format"] == "wav"`

			`assert np.allclose(sf.read(audio_answer.answer_audio)[0], sf.read(expected_audio_answer)[0], atol=0.001)`
			`assert np.allclose(sf.read(audio_answer.context_audio)[0], sf.read(expected_audio_context)[0], atol=0.001)`

			`def test_document_to_speech(self, tmp_path):`
			`text_doc = Document(`
			`content="this is the content of the document", content_type="text", meta={"name": "test_document.txt"}`
			`)`
			`expected_audio_content = SAMPLES_PATH / "audio" / "this is the content of the document.wav"`

			`doc2speech = DocumentToSpeech(`
			`generated_audio_dir=tmp_path / "test_audio",`
			`audio_params={"audio_naming_function": lambda text: text},`
			`transformers_params={"seed": 777, "always_fix_seed": True},`
			`)`
			`results, _ = doc2speech.run(documents=[text_doc])`

			`audio_doc: SpeechDocument = results["documents"][0]`
			`assert isinstance(audio_doc, SpeechDocument)`
			`assert audio_doc.content_type == "audio"`
			`assert audio_doc.content_audio.name == expected_audio_content.name`
			`assert audio_doc.content == "this is the content of the document"`
			`assert audio_doc.meta["name"] == "test_document.txt"`
			`assert audio_doc.meta["audio_format"] == "wav"`

			`assert np.allclose(sf.read(audio_doc.content_audio)[0], sf.read(expected_audio_content)[0], atol=0.001)`