refactor: Remove the pin from the espnet module and fix the audio node tests. (#4128)

* fix: fix audio tests + unpin some dependencies

* fix: update for Python 3.8

* refactor: change numpy assertion

* feat: add voice recognition support to audio tests

* fix: fix var assignment

* chore: dummy commit

* fix: fix sndfile error

* refactor: change skip reason

* refactor: hardcode variable

* refactor: unpin numpy

* fix: pin numpy only for audio
This commit is contained in:
Daniel Bichuetti 2023-02-16 13:42:17 -03:00 committed by GitHub
parent e7c32da8d7
commit 5187cc1801
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 98 additions and 28 deletions

View File

@ -11,6 +11,7 @@ import torch
try: try:
import soundfile as sf import soundfile as sf
from espnet2.bin.tts_inference import Text2Speech as _Text2SpeechModel from espnet2.bin.tts_inference import Text2Speech as _Text2SpeechModel
except OSError as ose: except OSError as ose:
logging.exception( logging.exception(
"`libsndfile` not found, it's probably not installed. The node will most likely crash. " "`libsndfile` not found, it's probably not installed. The node will most likely crash. "
@ -58,7 +59,7 @@ class TextToSpeech:
) )
self.model = _Text2SpeechModel.from_pretrained( self.model = _Text2SpeechModel.from_pretrained(
model_name_or_path, device=resolved_devices[0].type, **(transformers_params or {}) str(model_name_or_path), device=resolved_devices[0].type, **(transformers_params or {})
) )
def text_to_audio_file( def text_to_audio_file(

View File

@ -59,7 +59,6 @@ class DocumentToSpeech(BaseComponent):
content_audio = self.converter.text_to_audio_file( content_audio = self.converter.text_to_audio_file(
text=doc.content, generated_audio_dir=self.generated_audio_dir, **self.params text=doc.content, generated_audio_dir=self.generated_audio_dir, **self.params
) )
audio_document = SpeechDocument.from_text_document( audio_document = SpeechDocument.from_text_document(
document_object=doc, document_object=doc,
audio_content=content_audio, audio_content=content_audio,

View File

@ -2,7 +2,12 @@ import json
import logging import logging
import os import os
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, Literal from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal # type: ignore
import numpy as np import numpy as np
import requests import requests

View File

@ -1,5 +1,10 @@
from abc import abstractmethod from abc import abstractmethod
from typing import List, Dict, Union, Optional, Any, Literal from typing import List, Dict, Union, Optional, Any
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal # type: ignore
import logging import logging
from pathlib import Path from pathlib import Path

View File

@ -148,9 +148,13 @@ docstores-gpu = [
audio = [ audio = [
"pyworld>=0.3.1; python_version >= '3.8'", "pyworld>=0.3.1; python_version >= '3.8'",
"pyworld<0.3.1; python_version < '3.8'", "pyworld<0.3.1; python_version < '3.8'",
"espnet==202209", # https://github.com/deepset-ai/haystack/pull/3693 "ffmpeg-python==0.2.0",
"espnet",
"espnet-model-zoo", "espnet-model-zoo",
"pydub", "pydub",
"protobuf<=3.20.1",
"soundfile< 0.12.0",
"numpy<1.24", # Keep compatibility with latest numba
] ]
beir = [ beir = [
"beir; platform_system != 'Windows'", "beir; platform_system != 'Windows'",

View File

@ -5,11 +5,14 @@ import numpy as np
try: try:
import soundfile as sf import soundfile as sf
import ffmpeg
soundfile_not_found = False soundfile_not_found = False
except: except:
soundfile_not_found = True soundfile_not_found = True
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from haystack.schema import Span, Answer, SpeechAnswer, Document, SpeechDocument from haystack.schema import Span, Answer, SpeechAnswer, Document, SpeechDocument
from haystack.nodes.audio import AnswerToSpeech, DocumentToSpeech from haystack.nodes.audio import AnswerToSpeech, DocumentToSpeech
from haystack.nodes.audio._text_to_speech import TextToSpeech from haystack.nodes.audio._text_to_speech import TextToSpeech
@ -17,32 +20,72 @@ from haystack.nodes.audio._text_to_speech import TextToSpeech
from ..conftest import SAMPLES_PATH from ..conftest import SAMPLES_PATH
@pytest.mark.skipif(soundfile_not_found, reason="soundfile not found") class WhisperHelper:
def __init__(self, model):
self._processor = WhisperProcessor.from_pretrained(model)
self._model = WhisperForConditionalGeneration.from_pretrained(model)
self._model.config.forced_decoder_ids = None
def transcribe(self, media_file: str):
output, _ = (
ffmpeg.input(media_file)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
data = np.frombuffer(output, np.int16).flatten().astype(np.float32) / 32768.0
features = self._processor(data, sampling_rate=16000, return_tensors="pt").input_features
tokens = self._model.generate(features)
return self._processor.batch_decode(tokens, skip_special_tokens=True)
@pytest.fixture(scope="session", autouse=True)
def whisper_helper():
return WhisperHelper("openai/whisper-medium")
@pytest.mark.skipif(soundfile_not_found, reason="soundfile/ffmpeg not found")
class TestTextToSpeech: class TestTextToSpeech:
def test_text_to_speech_audio_data(self): def test_text_to_speech_audio_data(self, tmp_path, whisper_helper: WhisperHelper):
text2speech = TextToSpeech( text2speech = TextToSpeech(
model_name_or_path="espnet/kan-bayashi_ljspeech_vits", model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
transformers_params={"seed": 777, "always_fix_seed": True}, transformers_params={"seed": 4535, "always_fix_seed": True},
) )
expected_audio_data, _ = sf.read(SAMPLES_PATH / "audio" / "answer.wav")
audio_data = text2speech.text_to_audio_data(text="answer") audio_data = text2speech.text_to_audio_data(text="answer")
assert np.allclose(expected_audio_data, audio_data, atol=0.001) sf.write(
data=audio_data,
file=str(tmp_path / "audio1.wav"),
format="wav",
subtype="PCM_16",
samplerate=text2speech.model.fs,
)
def test_text_to_speech_audio_file(self, tmp_path): expedtec_doc = whisper_helper.transcribe(str(SAMPLES_PATH / "audio" / "answer.wav"))
generated_doc = whisper_helper.transcribe(str(tmp_path / "audio1.wav"))
assert expedtec_doc == generated_doc
def test_text_to_speech_audio_file(self, tmp_path, whisper_helper: WhisperHelper):
text2speech = TextToSpeech( text2speech = TextToSpeech(
model_name_or_path="espnet/kan-bayashi_ljspeech_vits", model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
transformers_params={"seed": 777, "always_fix_seed": True}, transformers_params={"seed": 4535, "always_fix_seed": True},
) )
expected_audio_data, _ = sf.read(SAMPLES_PATH / "audio" / "answer.wav")
audio_file = text2speech.text_to_audio_file(text="answer", generated_audio_dir=tmp_path / "test_audio") audio_file = text2speech.text_to_audio_file(text="answer", generated_audio_dir=tmp_path / "test_audio")
assert os.path.exists(audio_file) assert os.path.exists(audio_file)
assert np.allclose(expected_audio_data, sf.read(audio_file)[0], atol=0.001)
def test_text_to_speech_compress_audio(self, tmp_path): expected_doc = whisper_helper.transcribe(str(SAMPLES_PATH / "audio" / "answer.wav"))
generated_doc = whisper_helper.transcribe(str(audio_file))
assert expected_doc == generated_doc
def test_text_to_speech_compress_audio(self, tmp_path, whisper_helper: WhisperHelper):
text2speech = TextToSpeech( text2speech = TextToSpeech(
model_name_or_path="espnet/kan-bayashi_ljspeech_vits", model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
transformers_params={"seed": 777, "always_fix_seed": True}, transformers_params={"seed": 4535, "always_fix_seed": True},
) )
expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav" expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"
audio_file = text2speech.text_to_audio_file( audio_file = text2speech.text_to_audio_file(
@ -50,13 +93,16 @@ class TestTextToSpeech:
) )
assert os.path.exists(audio_file) assert os.path.exists(audio_file)
assert audio_file.suffix == ".mp3" assert audio_file.suffix == ".mp3"
# FIXME find a way to make sure the compressed audio is similar enough to the wav version.
# At a manual inspection, the code seems to be working well.
def test_text_to_speech_naming_function(self, tmp_path): expected_doc = whisper_helper.transcribe(str(expected_audio_file))
generated_doc = whisper_helper.transcribe(str(audio_file))
assert expected_doc == generated_doc
def test_text_to_speech_naming_function(self, tmp_path, whisper_helper: WhisperHelper):
text2speech = TextToSpeech( text2speech = TextToSpeech(
model_name_or_path="espnet/kan-bayashi_ljspeech_vits", model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
transformers_params={"seed": 777, "always_fix_seed": True}, transformers_params={"seed": 4535, "always_fix_seed": True},
) )
expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav" expected_audio_file = SAMPLES_PATH / "audio" / "answer.wav"
audio_file = text2speech.text_to_audio_file( audio_file = text2speech.text_to_audio_file(
@ -64,9 +110,13 @@ class TestTextToSpeech:
) )
assert os.path.exists(audio_file) assert os.path.exists(audio_file)
assert audio_file.name == expected_audio_file.name assert audio_file.name == expected_audio_file.name
assert np.allclose(sf.read(expected_audio_file)[0], sf.read(audio_file)[0], atol=0.001)
def test_answer_to_speech(self, tmp_path): expected_doc = whisper_helper.transcribe(str(expected_audio_file))
generated_doc = whisper_helper.transcribe(str(audio_file))
assert expected_doc == generated_doc
def test_answer_to_speech(self, tmp_path, whisper_helper: WhisperHelper):
text_answer = Answer( text_answer = Answer(
answer="answer", answer="answer",
type="extractive", type="extractive",
@ -81,7 +131,7 @@ class TestTextToSpeech:
answer2speech = AnswerToSpeech( answer2speech = AnswerToSpeech(
generated_audio_dir=tmp_path / "test_audio", generated_audio_dir=tmp_path / "test_audio",
audio_params={"audio_naming_function": lambda text: text}, audio_params={"audio_naming_function": lambda text: text},
transformers_params={"seed": 777, "always_fix_seed": True}, transformers_params={"seed": 4535, "always_fix_seed": True},
) )
results, _ = answer2speech.run(answers=[text_answer]) results, _ = answer2speech.run(answers=[text_answer])
@ -97,10 +147,12 @@ class TestTextToSpeech:
assert audio_answer.meta["some_meta"] == "some_value" assert audio_answer.meta["some_meta"] == "some_value"
assert audio_answer.meta["audio_format"] == "wav" assert audio_answer.meta["audio_format"] == "wav"
assert np.allclose(sf.read(audio_answer.answer_audio)[0], sf.read(expected_audio_answer)[0], atol=0.001) expected_doc = whisper_helper.transcribe(str(expected_audio_answer))
assert np.allclose(sf.read(audio_answer.context_audio)[0], sf.read(expected_audio_context)[0], atol=0.001) generated_doc = whisper_helper.transcribe(str(audio_answer.answer_audio))
def test_document_to_speech(self, tmp_path): assert expected_doc == generated_doc
def test_document_to_speech(self, tmp_path, whisper_helper: WhisperHelper):
text_doc = Document( text_doc = Document(
content="this is the content of the document", content_type="text", meta={"name": "test_document.txt"} content="this is the content of the document", content_type="text", meta={"name": "test_document.txt"}
) )
@ -109,8 +161,9 @@ class TestTextToSpeech:
doc2speech = DocumentToSpeech( doc2speech = DocumentToSpeech(
generated_audio_dir=tmp_path / "test_audio", generated_audio_dir=tmp_path / "test_audio",
audio_params={"audio_naming_function": lambda text: text}, audio_params={"audio_naming_function": lambda text: text},
transformers_params={"seed": 777, "always_fix_seed": True}, transformers_params={"seed": 4535, "always_fix_seed": True},
) )
results, _ = doc2speech.run(documents=[text_doc]) results, _ = doc2speech.run(documents=[text_doc])
audio_doc: SpeechDocument = results["documents"][0] audio_doc: SpeechDocument = results["documents"][0]
@ -121,4 +174,7 @@ class TestTextToSpeech:
assert audio_doc.meta["name"] == "test_document.txt" assert audio_doc.meta["name"] == "test_document.txt"
assert audio_doc.meta["audio_format"] == "wav" assert audio_doc.meta["audio_format"] == "wav"
assert np.allclose(sf.read(audio_doc.content_audio)[0], sf.read(expected_audio_content)[0], atol=0.001) expected_doc = whisper_helper.transcribe(str(expected_audio_content))
generated_doc = whisper_helper.transcribe(str(audio_doc.content_audio))
assert expected_doc == generated_doc

Binary file not shown.