haystack/test/nodes/test_audio.py

import os

import pytest
import numpy as np

# Optional audio dependencies: the tests below are skipped when either one is
# unavailable, so a failed import is recorded in a flag instead of propagating.
try:
    import soundfile as sf
    import ffmpeg

    soundfile_not_found = False
except (ImportError, OSError):
    # ImportError: one of the Python packages is not installed.
    # OSError: soundfile is installed but cannot locate the libsndfile shared
    # library. Anything else (e.g. KeyboardInterrupt) is deliberately not
    # swallowed, unlike a bare ``except:``.
    soundfile_not_found = True

from transformers import WhisperProcessor, WhisperForConditionalGeneration

from haystack.schema import Span, Answer, SpeechAnswer, Document, SpeechDocument
from haystack.nodes.audio import AnswerToSpeech, DocumentToSpeech
from haystack.nodes.audio._text_to_speech import TextToSpeech

from ..conftest import SAMPLES_PATH


class WhisperHelper:
    """Small wrapper that transcribes media files with a pretrained Whisper model.

    The tests use it to compare audio by its transcription rather than by raw
    sample values.
    """

    def __init__(self, model):
        """Load the Whisper processor and generation model identified by ``model``.

        :param model: identifier passed to ``from_pretrained`` for both the
            ``WhisperProcessor`` and ``WhisperForConditionalGeneration``.
        """
        self._processor = WhisperProcessor.from_pretrained(model)
        whisper = WhisperForConditionalGeneration.from_pretrained(model)
        # Clear any decoder ids forced by the model configuration.
        whisper.config.forced_decoder_ids = None
        self._model = whisper

    def transcribe(self, media_file: str):
        """Transcribe ``media_file`` and return the decoded text.

        :param media_file: path of the media file handed to ffmpeg.
        :return: the output of the processor's ``batch_decode`` with special
            tokens skipped.
        """
        # Let ffmpeg decode the file to raw mono, 16 kHz, signed 16-bit
        # little-endian PCM on stdout.
        pcm_stream = ffmpeg.input(media_file).output(
            "-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000
        )
        raw_pcm, _ = pcm_stream.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)

        # Convert int16 samples to float32 scaled by 1 / 32768.
        samples = np.frombuffer(raw_pcm, np.int16).flatten().astype(np.float32) / 32768.0

        model_inputs = self._processor(samples, sampling_rate=16000, return_tensors="pt")
        token_ids = self._model.generate(model_inputs.input_features)

        return self._processor.batch_decode(token_ids, skip_special_tokens=True)


@pytest.fixture(scope="session", autouse=True)
def whisper_helper():
    """Provide a session-scoped ``WhisperHelper`` built on ``openai/whisper-medium``."""
    helper = WhisperHelper("openai/whisper-medium")
    return helper


@pytest.mark.skipif(soundfile_not_found, reason="soundfile/ffmpeg not found")
class TestTextToSpeech:
    """Tests for ``TextToSpeech``, ``AnswerToSpeech`` and ``DocumentToSpeech``.

    Generated audio is validated by transcribing it with the ``whisper_helper``
    fixture and comparing the result with the transcription of a reference
    sample stored under ``SAMPLES_PATH / "audio"``.
    """

    def test_text_to_speech_audio_data(self, tmp_path, whisper_helper: WhisperHelper):
        """Audio data written to a WAV file transcribes like the reference sample."""
        reference_wav = SAMPLES_PATH / "audio" / "answer.wav"
        generated_wav = tmp_path / "audio1.wav"

        tts = TextToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        waveform = tts.text_to_audio_data(text="answer")

        # Persist the generated samples so they can be fed to the transcriber.
        sf.write(
            file=str(generated_wav),
            data=waveform,
            samplerate=tts.model.fs,
            format="wav",
            subtype="PCM_16",
        )

        expected_transcript = whisper_helper.transcribe(str(reference_wav))
        generated_transcript = whisper_helper.transcribe(str(generated_wav))

        assert expected_transcript == generated_transcript

    def test_text_to_speech_audio_file(self, tmp_path, whisper_helper: WhisperHelper):
        """``text_to_audio_file`` creates a file that transcribes like the reference."""
        reference_wav = SAMPLES_PATH / "audio" / "answer.wav"
        output_dir = tmp_path / "test_audio"

        tts = TextToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        generated_file = tts.text_to_audio_file(text="answer", generated_audio_dir=output_dir)
        assert os.path.exists(generated_file)

        expected_transcript = whisper_helper.transcribe(str(reference_wav))
        generated_transcript = whisper_helper.transcribe(str(generated_file))

        assert expected_transcript == generated_transcript

    def test_text_to_speech_compress_audio(self, tmp_path, whisper_helper: WhisperHelper):
        """Requesting ``audio_format="mp3"`` yields an ``.mp3`` file with the same transcription."""
        reference_wav = SAMPLES_PATH / "audio" / "answer.wav"
        output_dir = tmp_path / "test_audio"

        tts = TextToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        generated_file = tts.text_to_audio_file(text="answer", generated_audio_dir=output_dir, audio_format="mp3")
        assert os.path.exists(generated_file)
        assert generated_file.suffix == ".mp3"

        expected_transcript = whisper_helper.transcribe(str(reference_wav))
        generated_transcript = whisper_helper.transcribe(str(generated_file))

        assert expected_transcript == generated_transcript

    def test_text_to_speech_naming_function(self, tmp_path, whisper_helper: WhisperHelper):
        """A custom ``audio_naming_function`` determines the generated file's name."""
        reference_wav = SAMPLES_PATH / "audio" / "answer.wav"
        output_dir = tmp_path / "test_audio"

        tts = TextToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        # The identity naming function should make the file name match the
        # reference sample's name ("answer.wav").
        generated_file = tts.text_to_audio_file(
            text="answer", generated_audio_dir=output_dir, audio_naming_function=lambda text: text
        )
        assert os.path.exists(generated_file)
        assert generated_file.name == reference_wav.name

        expected_transcript = whisper_helper.transcribe(str(reference_wav))
        generated_transcript = whisper_helper.transcribe(str(generated_file))

        assert expected_transcript == generated_transcript

    def test_answer_to_speech(self, tmp_path, whisper_helper: WhisperHelper):
        """``AnswerToSpeech`` returns a ``SpeechAnswer`` carrying the text fields plus audio paths."""
        reference_answer_wav = SAMPLES_PATH / "audio" / "answer.wav"
        reference_context_wav = SAMPLES_PATH / "audio" / "the context for this answer is here.wav"

        source_answer = Answer(
            answer="answer",
            type="extractive",
            context="the context for this answer is here",
            offsets_in_document=[Span(31, 37)],
            offsets_in_context=[Span(21, 27)],
            meta={"some_meta": "some_value"},
        )

        node = AnswerToSpeech(
            generated_audio_dir=tmp_path / "test_audio",
            audio_params={"audio_naming_function": lambda text: text},
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        output, _ = node.run(answers=[source_answer])

        spoken_answer: SpeechAnswer = output["answers"][0]
        assert isinstance(spoken_answer, SpeechAnswer)
        assert spoken_answer.type == "generative"
        assert spoken_answer.answer_audio.name == reference_answer_wav.name
        assert spoken_answer.context_audio.name == reference_context_wav.name
        assert spoken_answer.answer == "answer"
        assert spoken_answer.context == "the context for this answer is here"
        assert spoken_answer.offsets_in_document == [Span(31, 37)]
        assert spoken_answer.offsets_in_context == [Span(21, 27)]
        assert spoken_answer.meta["some_meta"] == "some_value"
        assert spoken_answer.meta["audio_format"] == "wav"

        expected_transcript = whisper_helper.transcribe(str(reference_answer_wav))
        generated_transcript = whisper_helper.transcribe(str(spoken_answer.answer_audio))

        assert expected_transcript == generated_transcript

    def test_document_to_speech(self, tmp_path, whisper_helper: WhisperHelper):
        """``DocumentToSpeech`` returns a ``SpeechDocument`` carrying the text content plus an audio path."""
        reference_content_wav = SAMPLES_PATH / "audio" / "this is the content of the document.wav"

        source_doc = Document(
            content="this is the content of the document", content_type="text", meta={"name": "test_document.txt"}
        )

        node = DocumentToSpeech(
            generated_audio_dir=tmp_path / "test_audio",
            audio_params={"audio_naming_function": lambda text: text},
            transformers_params={"seed": 4535, "always_fix_seed": True},
        )
        output, _ = node.run(documents=[source_doc])

        spoken_doc: SpeechDocument = output["documents"][0]
        assert isinstance(spoken_doc, SpeechDocument)
        assert spoken_doc.content_type == "audio"
        assert spoken_doc.content_audio.name == reference_content_wav.name
        assert spoken_doc.content == "this is the content of the document"
        assert spoken_doc.meta["name"] == "test_document.txt"
        assert spoken_doc.meta["audio_format"] == "wav"

        expected_transcript = whisper_helper.transcribe(str(reference_content_wav))
        generated_transcript = whisper_helper.transcribe(str(spoken_doc.content_audio))

        assert expected_transcript == generated_transcript