Add Whisper node (#4335)

* Add Whisper node

* Add support for audio path, improve tests

* Add docs

* Improve tests
Vladimir Blagojevic 2023-03-13 16:17:07 +01:00 committed by GitHub
parent 28724e2e25
commit 98256ecf57
6 changed files with 250 additions and 0 deletions
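
For context, a minimal usage sketch of the new node; the OPENAI_API_KEY environment variable and the audio path are illustrative placeholders, not part of this commit:

import os
from haystack.nodes.audio import WhisperTranscriber

# API mode: pass an OpenAI API key; the API backend always uses the "whisper-1" model.
transcriber = WhisperTranscriber(api_key=os.environ["OPENAI_API_KEY"])

# Local mode: install openai-whisper, omit api_key, and pick a model size instead:
# transcriber = WhisperTranscriber(model_name_or_path="medium")

# transcribe() accepts a file path or an open binary file object.
transcript = transcriber.transcribe("audio/sample.wav", language="en")
print(transcript["text"])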

haystack/nodes/__init__.py

@@ -48,6 +48,8 @@ from haystack.nodes.retriever import (
from haystack.nodes.summarizer import BaseSummarizer, TransformersSummarizer
from haystack.nodes.translator import BaseTranslator, TransformersTranslator
from haystack.nodes.audio import WhisperTranscriber, WhisperModel
Crawler = safe_import("haystack.nodes.connector.crawler", "Crawler", "crawler") # Has optional dependencies
AnswerToSpeech = safe_import(
"haystack.nodes.audio.answer_to_speech", "AnswerToSpeech", "audio"

haystack/nodes/audio/__init__.py

@@ -1,4 +1,5 @@
from haystack.utils.import_utils import safe_import
from haystack.nodes.audio.whisper_transcriber import WhisperTranscriber, WhisperModel
AnswerToSpeech = safe_import(
"haystack.nodes.audio.answer_to_speech", "AnswerToSpeech", "audio"

haystack/nodes/audio/whisper_transcriber.py

@@ -0,0 +1,187 @@
import json
from typing import List, Optional, Dict, Any, Union, BinaryIO, Literal
import requests
import torch
from requests import PreparedRequest
from haystack.errors import OpenAIError, OpenAIRateLimitError
from haystack.nodes.base import BaseComponent
from haystack.utils.import_utils import is_whisper_available
WhisperModel = Literal["tiny", "small", "medium", "large", "large-v2"]
class WhisperTranscriber(BaseComponent):
"""
Transcribes audio files using OpenAI's Whisper. This class supports two underlying implementations:
- API (default): Uses the OpenAI API and requires an API key. See the
[OpenAI documentation](https://beta.openai.com/docs/api-reference/whisper) for more details.
- Local (requires a local installation of whisper): Uses a local installation
of [whisper](https://github.com/openai/whisper).
To use the local implementation, install whisper following the instructions on
the Whisper [GitHub repo](https://github.com/openai/whisper) and omit the api_key parameter.
To use the API implementation, you need to provide an api_key. You can get one by signing up
for an OpenAI account [here](https://beta.openai.com/).
For the supported audio formats, languages, and other parameters, see the Whisper API
[documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[GitHub repo](https://github.com/openai/whisper).
"""
# If it's not a decision component, there is only one outgoing edge
outgoing_edges = 1
def __init__(
self,
api_key: Optional[str] = None,
model_name_or_path: WhisperModel = "medium",
device: Optional[Union[str, torch.device]] = None,
) -> None:
"""
Creates a WhisperTranscriber instance.
:param api_key: OpenAI API key. If None, a local installation of whisper is used.
:param model_name_or_path: Name of the model to use. With a local installation of whisper, this
value has to be one of the following: "tiny", "small", "medium" (default), "large", "large-v2".
With the API, this parameter is ignored and the "whisper-1" model is always used.
:param device: Device to use for inference. This parameter is only used with a local
installation of whisper. If None, the device is selected automatically.
"""
super().__init__()
self.api_key = api_key
self.use_local_whisper = is_whisper_available() and self.api_key is None
if self.use_local_whisper:
import whisper
self._model = whisper.load_model(model_name_or_path, device=device)
else:
if api_key is None:
raise ValueError(
"Please provide a valid api_key for OpenAI API. Alternatively, "
"install OpenAI whisper (see https://github.com/openai/whisper for more details)."
)
def transcribe(
self,
audio_file: Union[str, BinaryIO],
language: Optional[str] = None,
return_segments: bool = False,
translate: bool = False,
**kwargs,
) -> Dict[str, Any]:
"""
Transcribe an audio file.
:param audio_file: Path to the audio file or a binary file-like object.
:param language: Language of the audio file. If None, the language is detected automatically.
:param return_segments: If True, also returns segment-level transcriptions (supported only with a local installation of whisper).
:param translate: If True, translates the transcription to English.
:return: A dictionary with the transcription; the full text is available under the "text" key.
"""
transcript: Dict[str, Any] = {}
new_kwargs = {k: v for k, v in kwargs.items() if v is not None}
if language is not None:
new_kwargs["language"] = language
if self.use_local_whisper:
new_kwargs["return_segments"] = return_segments
transcript = self._invoke_local(audio_file, translate, **new_kwargs)
elif self.api_key:
transcript = self._invoke_api(audio_file, translate, **new_kwargs)
return transcript
def _invoke_api(
self, audio_file: Union[str, BinaryIO], translate: Optional[bool] = False, **kwargs
) -> Dict[str, Any]:
if isinstance(audio_file, str):
with open(audio_file, "rb") as f:
return self._invoke_api(f, translate, **kwargs)
else:
headers = {"Authorization": f"Bearer {self.api_key}"}
request = PreparedRequest()
url: str = (
"https://api.openai.com/v1/audio/transcriptions"
if not translate
else "https://api.openai.com/v1/audio/translations"
)
request.prepare(
method="POST",
url=url,
headers=headers,
data={"model": "whisper-1", **kwargs},
files=[("file", (audio_file.name, audio_file, "application/octet-stream"))],
)
response = requests.post(url, data=request.body, headers=request.headers, timeout=600)
if response.status_code != 200:
openai_error: OpenAIError
if response.status_code == 429:
openai_error = OpenAIRateLimitError(f"API rate limit exceeded: {response.text}")
else:
openai_error = OpenAIError(
f"OpenAI returned an error.\n"
f"Status code: {response.status_code}\n"
f"Response body: {response.text}",
status_code=response.status_code,
)
raise openai_error
return json.loads(response.content)
def _invoke_local(
self, audio_file: Union[str, BinaryIO], translate: Optional[bool] = False, **kwargs
) -> Dict[str, Any]:
if isinstance(audio_file, str):
with open(audio_file, "rb") as f:
return self._invoke_local(f, translate, **kwargs)
else:
return_segments = kwargs.pop("return_segments", None)
kwargs["task"] = "translate" if translate else "transcribe"
transcription = self._model.transcribe(audio_file.name, **kwargs)
if not return_segments:
transcription.pop("segments", None)
return transcription
def run(self, audio_file: Union[str, BinaryIO], language: Optional[str] = None, return_segments: bool = False, translate: bool = False): # type: ignore
"""
Transcribe an audio file.
:param audio_file: Path to the audio file or a binary file-like object.
:param language: Language of the audio file. If None, the language is detected automatically.
:param return_segments: If True, also returns segment-level transcriptions (supported only with a local installation of whisper).
:param translate: If True, translates the transcription to English.
"""
document = self.transcribe(audio_file, language, return_segments, translate)
output = {"documents": [document]}
return output, "output_1"
def run_batch(self, audio_files: List[Union[str, BinaryIO]], language: Optional[str] = None, return_segments: bool = False, translate: bool = False): # type: ignore
"""
Transcribe audio files.
:param audio_files: List of paths to audio files or binary file-like objects.
:param language: Language of the audio files. If None, the language is detected automatically.
:param return_segments: If True, also returns segment-level transcriptions (supported only with a local installation of whisper).
:param translate: If True, translates the transcriptions to English.
"""
documents = []
for audio in audio_files:
document = self.transcribe(audio, language, return_segments, translate)
documents.append(document)
output = {"documents": documents}
return output, "output_1"
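
Called outside a pipeline, run() and run_batch() are thin wrappers around transcribe(); a short sketch of their contract (API key and file paths illustrative):

node = WhisperTranscriber(api_key="sk-...")  # or a local-mode instance

# run() wraps the transcription dict in a "documents" list and routes it to "output_1".
output, edge = node.run(audio_file="audio/sample.wav")
assert edge == "output_1"
print(output["documents"][0]["text"])

# run_batch() returns one transcription dict per input file.
batch_output, _ = node.run_batch(audio_files=["audio/a.wav", "audio/b.wav"])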

haystack/utils/import_utils.py

@@ -6,6 +6,7 @@ import tarfile
import zipfile
import logging
import importlib
import importlib.util
from pathlib import Path
import requests
@@ -118,3 +119,7 @@ def fetch_archive_from_http(
)
return True
def is_whisper_available():
return importlib.util.find_spec("whisper") is not None
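
The helper only probes whether the whisper package is importable; the transcriber's constructor uses it to choose between the local and API backends. The same guard pattern works anywhere:

from haystack.utils.import_utils import is_whisper_available

if is_whisper_available():
    import whisper
    model = whisper.load_model("tiny")  # mirrors what WhisperTranscriber's constructor does
else:
    print("whisper is not installed; fall back to the OpenAI API")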

pyproject.toml

@@ -156,6 +156,7 @@ audio = [
"protobuf<=3.20.1",
"soundfile< 0.12.0",
"numpy<1.24", # Keep compatibility with latest numba
"openai-whisper"
]
beir = [
"beir; platform_system != 'Windows'",

test/nodes/test_whisper.py

@@ -0,0 +1,54 @@
import os
import pytest
from haystack.nodes.audio import WhisperTranscriber
from haystack.utils.import_utils import is_whisper_available
from ..conftest import SAMPLES_PATH
@pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OpenAI API key not found")
@pytest.mark.integration
def test_whisper_api_transcribe():
w = WhisperTranscriber(api_key=os.environ.get("OPENAI_API_KEY"))
audio_object_transcript, audio_path_transcript = transcribe_test_helper(w)
assert "segments" not in audio_object_transcript and "segments" not in audio_path_transcript
@pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OpenAI API key not found")
@pytest.mark.integration
def test_whisper_api_transcribe_with_params():
w = WhisperTranscriber(api_key=os.environ.get("OPENAI_API_KEY"))
audio_object_transcript, audio_path_transcript = transcribe_test_helper(w, language="en")
assert "segments" not in audio_object_transcript and "segments" not in audio_path_transcript
@pytest.mark.integration
@pytest.mark.skipif(not is_whisper_available(), reason="Whisper is not installed")
def test_whisper_local_transcribe():
w = WhisperTranscriber()
audio_object_transcript, audio_path_transcript = transcribe_test_helper(w, language="en")
assert "segments" not in audio_object_transcript and "segments" not in audio_path_transcript
@pytest.mark.integration
@pytest.mark.skipif(not is_whisper_available(), reason="Whisper is not installed")
def test_whisper_local_transcribe_with_params():
w = WhisperTranscriber()
audio_object, audio_path = transcribe_test_helper(w, language="en", return_segments=True)
assert len(audio_object["segments"]) == 1 and len(audio_path["segments"]) == 1
def transcribe_test_helper(whisper, **kwargs):
# this file is 1 second long and contains the word "answer"
file_path = str(SAMPLES_PATH / "audio" / "answer.wav")
# using audio object
with open(file_path, mode="rb") as audio_file:
audio_object_transcript = whisper.transcribe(audio_file=audio_file, **kwargs)
assert "answer" in audio_object_transcript["text"].lower()
# using path to audio file
audio_path_transcript = whisper.transcribe(audio_file=file_path, **kwargs)
assert "answer" in audio_path_transcript["text"].lower()
return audio_object_transcript, audio_path_transcript
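
Note that all four tests are integration tests: they can be selected with the marker used in the file (for example pytest -m integration -k whisper), and the two API variants additionally require OPENAI_API_KEY to be set in the environment.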