chore: adjust docstrings in the audio package (#7246)

* adjust docstrings in the audio package

* Apply suggestions from code review

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update haystack/components/audio/whisper_remote.py

* black complaining for apparently no reason

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Massimiliano Pippi 2024-02-28 18:22:03 +01:00 committed by GitHub
parent e5f0e248b6
commit 890c613a2c
2 changed files with 86 additions and 40 deletions

haystack/components/audio/whisper_local.py

@@ -20,11 +20,20 @@ WhisperLocalModel = Literal["tiny", "small", "medium", "large", "large-v2"]
@component
class LocalWhisperTranscriber:
"""
Transcribes audio files using OpenAI's Whisper's model on your local machine.
Transcribes audio files using OpenAI's Whisper model in your local machine.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
[github repository](https://github.com/openai/whisper).
Usage example:
```python
from haystack.components.audio import LocalWhisperTranscriber
whisper = LocalWhisperTranscriber(model="small")
whisper.warm_up()
transcription = whisper.run(audio_files=["path/to/audio/file"])
```
"""
def __init__(
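The usage example added above runs end to end once `haystack-ai` and `openai-whisper` are installed. A slightly fuller hedged sketch follows; the audio path is a placeholder, and it uses the `sources` keyword from the `run()` signature shown later in this diff (the docstring example itself writes `audio_files`):

```python
from haystack.components.audio import LocalWhisperTranscriber

whisper = LocalWhisperTranscriber(model="small")
whisper.warm_up()  # loads the Whisper weights into memory

# run() accepts `sources` per the signature in this file
result = whisper.run(sources=["audio/sample.wav"])  # placeholder path
for doc in result["documents"]:
    print(doc.content)  # the transcription text
    print(doc.meta)     # alignment data plus the source audio file
```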
@@ -34,10 +43,14 @@ class LocalWhisperTranscriber:
whisper_params: Optional[Dict[str, Any]] = None,
):
"""
:param model: Name of the model to use. Set it to one of the following values:
:type model: Literal["tiny", "small", "medium", "large", "large-v2"]
:param device: The device on which the model is loaded. If `None`, the default device is automatically
selected.
Creates an instance of the LocalWhisperTranscriber component.
:param model:
Name of the model to use. Set it to one of the following values:
:type model:
Literal["tiny", "small", "medium", "large", "large-v2"]
:param device:
The device on which the model is loaded. If `None`, the default device is automatically selected.
"""
whisper_import.check()
if model not in get_args(WhisperLocalModel):
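The allow-list check above is a pattern worth spelling out: the `Literal` type serves both as the static annotation and, via `typing.get_args`, as the runtime set of valid model names. A minimal standalone sketch of that pattern:

```python
from typing import Literal, get_args

WhisperLocalModel = Literal["tiny", "small", "medium", "large", "large-v2"]

def check_model(model: str) -> None:
    # Reuse the Literal's members as the runtime allow-list
    if model not in get_args(WhisperLocalModel):
        raise ValueError(
            f"Model '{model}' not recognized; choose one of {get_args(WhisperLocalModel)}"
        )

check_model("small")   # passes silently
# check_model("huge")  # would raise ValueError
```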
@@ -51,21 +64,29 @@ class LocalWhisperTranscriber:
def warm_up(self) -> None:
"""
Loads the model.
Loads the model in memory.
"""
if not self._model:
self._model = whisper.load_model(self.model, device=self.device.to_torch())
def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, model=self.model, device=self.device.to_dict(), whisper_params=self.whisper_params)
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "LocalWhisperTranscriber":
"""
Create a `LocalWhisperTranscriber` instance from a dictionary.
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
serialized_device = data["init_parameters"]["device"]
data["init_parameters"]["device"] = ComponentDevice.from_dict(serialized_device)
@@ -75,17 +96,19 @@ class LocalWhisperTranscriber:
@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], whisper_params: Optional[Dict[str, Any]] = None):
"""
Transcribe the audio files into a list of Documents, one for each input file.
Transcribes the audio files into a list of Documents, one for each input file.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
:param audio_files: A list of paths or binary streams to transcribe.
:returns: A list of Documents, one for each file. The content of the document is the transcription text,
while the document's metadata contains all the other values returned by the Whisper model, such as the
alignment data. Another key called `audio_file` contains the path to the audio file used for the
transcription.
:param audio_files:
A list of paths or binary streams to transcribe.
:returns: A dictionary with the following keys:
- `documents`: A list of Documents, one for each file. The content of the document is the transcription text,
while the document's metadata contains the values returned by the Whisper model, such as the
alignment data and the path to the audio file used for the transcription.
"""
if self._model is None:
raise ComponentError("The component was not warmed up. Run 'warm_up()' before calling 'run()'.")
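The guard above means `run()` fails fast when `warm_up()` was skipped. Once warmed up, per-call options ride along in `whisper_params`; a sketch, where the `language` key follows openai-whisper's `transcribe()` options (an assumption, not shown in this diff):

```python
from haystack.components.audio import LocalWhisperTranscriber

whisper = LocalWhisperTranscriber(model="tiny")
whisper.warm_up()  # required, or run() raises ComponentError
result = whisper.run(
    sources=["audio/sample.wav"],       # placeholder path
    whisper_params={"language": "en"},  # forwarded to the Whisper model
)
print(result["documents"][0].content)
```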
@@ -98,17 +121,16 @@ class LocalWhisperTranscriber:
def transcribe(self, sources: List[Union[str, Path, ByteStream]], **kwargs) -> List[Document]:
"""
Transcribe the audio files into a list of Documents, one for each input file.
Transcribes the audio files into a list of Documents, one for each input file.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
:param audio_files: A list of paths or binary streams to transcribe.
:returns: A list of Documents, one for each file. The content of the document is the transcription text,
while the document's metadata contains all the other values returned by the Whisper model, such as the
alignment data. Another key called `audio_file` contains the path to the audio file used for the
transcription.
:param audio_files:
A list of paths or binary streams to transcribe.
:returns:
A list of Documents, one for each file.
"""
transcriptions = self._raw_transcribe(sources, **kwargs)
documents = []
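The difference from `run()` is only the return shape: `transcribe()` hands back the `Document` list directly, while `run()` wraps the same list as `{"documents": [...]}` for pipeline wiring. A quick sketch (placeholder path again):

```python
from haystack.components.audio import LocalWhisperTranscriber

whisper = LocalWhisperTranscriber(model="tiny")
whisper.warm_up()
docs = whisper.transcribe(sources=["audio/sample.wav"])  # List[Document]
print(docs[0].content)
```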
@@ -120,14 +142,16 @@ class LocalWhisperTranscriber:
def _raw_transcribe(self, sources: List[Union[str, Path, ByteStream]], **kwargs) -> Dict[Path, Any]:
"""
Transcribe the given audio files. Returns the output of the model, a dictionary, for each input file.
Transcribes the given audio files. Returns the output of the model, a dictionary, for each input file.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
:param audio_files: A list of paths or binary streams to transcribe.
:returns: A dictionary of file_path -> transcription.
:param audio_files:
A list of paths or binary streams to transcribe.
:returns:
A dictionary mapping 'file_path' to 'transcription'.
"""
if self._model is None:
raise ComponentError("Model is not loaded, please run 'warm_up()' before calling 'run()'")
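For reference, the per-file value in the returned mapping is openai-whisper's raw output. Roughly what `_raw_transcribe` computes for each source, sketched with openai-whisper directly (the key names come from that library, not from this diff):

```python
import whisper  # openai-whisper

model = whisper.load_model("tiny")
raw = model.transcribe("audio/sample.wav")  # placeholder path
# raw is a dict with keys such as "text", "segments" (timestamped
# alignment data), and "language"
print(raw["text"])
```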

haystack/components/audio/whisper_remote.py

@@ -15,12 +15,20 @@ logger = logging.getLogger(__name__)
@component
class RemoteWhisperTranscriber:
"""
Transcribes audio files using OpenAI's Whisper using OpenAI API. Requires an API key. See the
[OpenAI blog post](https://beta.openai.com/docs/api-reference/whisper) for more details.
You can get one by signing up for an [OpenAI account](https://beta.openai.com/).
Transcribes audio files using the Whisper API from OpenAI.
The component requires an API key, see the relative
[OpenAI documentation](https://platform.openai.com/docs/api-reference/authentication) for more details.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text)
Usage example:
```python
from haystack.components.audio import RemoteWhisperTranscriber
whisper = RemoteWhisperTranscriber(api_key=Secret.from_token("<your-api-key>"), model="tiny")
transcription = whisper.run(sources=["path/to/audio/file"])
```
"""
def __init__(
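As a variant of the usage example above, the API key can come from the environment rather than a hard-coded token; `Secret.from_env_var` lives in `haystack.utils` in Haystack 2.x. A hedged sketch with a placeholder audio path:

```python
from haystack.components.audio import RemoteWhisperTranscriber
from haystack.utils import Secret

# Reads the key from the OPENAI_API_KEY environment variable at run time
whisper = RemoteWhisperTranscriber(api_key=Secret.from_env_var("OPENAI_API_KEY"))
result = whisper.run(sources=["audio/sample.wav"])
print(result["documents"][0].content)
```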
@@ -32,14 +40,19 @@ class RemoteWhisperTranscriber:
**kwargs,
):
"""
Transcribes a list of audio files into a list of Documents.
Creates an instance of the RemoteWhisperTranscriber component.
:param api_key: OpenAI API key.
:param model: Name of the model to use. It now accepts only `whisper-1`.
:param organization: The Organization ID, defaults to `None`. See
:param api_key:
OpenAI API key.
:param model:
Name of the model to use. It now accepts only `whisper-1`.
:param organization:
The Organization ID. See
[production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization).
:param api_base: An optional URL to use as the API base. Defaults to `None`. See OpenAI [docs](https://platform.openai.com/docs/api-reference/audio).
:param kwargs: Other parameters to use for the model. These parameters are all sent directly to the OpenAI
:param api_base:
An optional URL to use as the API base. See OpenAI [docs](https://platform.openai.com/docs/api-reference/audio).
:param kwargs:
Other parameters to use for the model. These parameters are all sent directly to the OpenAI
endpoint. See OpenAI [documentation](https://platform.openai.com/docs/api-reference/audio) for more details.
Some of the supported parameters:
- `language`: The language of the input audio.
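A sketch of passing such parameters through `**kwargs` at construction time; `language` and `response_format` are parameter names from the OpenAI audio API, not from this diff:

```python
from haystack.components.audio import RemoteWhisperTranscriber
from haystack.utils import Secret

whisper = RemoteWhisperTranscriber(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="whisper-1",
    language="en",           # forwarded verbatim to the endpoint
    response_format="json",  # see the OpenAI docs linked above
)
```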
@@ -77,9 +90,10 @@ class RemoteWhisperTranscriber:
def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
This method overrides the default serializer in order to
avoid leaking the `api_key` value passed to the constructor.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
@@ -93,7 +107,12 @@ class RemoteWhisperTranscriber:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "RemoteWhisperTranscriber":
"""
Deserialize this component from a dictionary.
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)
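The point of the secret handling is that the serialized form never contains the raw key, only a descriptor that `deserialize_secrets_inplace` can resolve again. A hedged round-trip sketch; the exact descriptor shape is an assumption based on `Secret`'s serialization in Haystack 2.x:

```python
from haystack.components.audio import RemoteWhisperTranscriber
from haystack.utils import Secret

whisper = RemoteWhisperTranscriber(api_key=Secret.from_env_var("OPENAI_API_KEY"))
data = whisper.to_dict()
# Something like {"type": "env_var", "env_vars": ["OPENAI_API_KEY"], ...}
# rather than the key itself:
print(data["init_parameters"]["api_key"])

restored = RemoteWhisperTranscriber.from_dict(data)
```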
@@ -101,10 +120,13 @@ class RemoteWhisperTranscriber:
@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
"""
Transcribe the audio files into a list of Documents, one for each input file.
Transcribes the audio files into a list of Documents, one for each input file.
:param sources: A list of file paths or ByteStreams containing the audio files to transcribe.
:returns: A list of Documents, one for each file. The content of the document is the transcription text.
:param sources:
A list of file paths or ByteStreams containing the audio files to transcribe.
:returns: A dictionary with the following keys:
- `documents`: A list of Documents, one for each file. The content of the document is the transcribed text.
"""
documents = []
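Because `run()` emits `{"documents": [...]}`, the component slots straight into a `Pipeline`. A hedged end-to-end sketch writing transcriptions into an in-memory store; component and import names follow Haystack 2.x, and the audio path is a placeholder:

```python
from haystack import Pipeline
from haystack.components.audio import RemoteWhisperTranscriber
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.utils import Secret

pipe = Pipeline()
pipe.add_component("whisper", RemoteWhisperTranscriber(api_key=Secret.from_env_var("OPENAI_API_KEY")))
pipe.add_component("writer", DocumentWriter(document_store=InMemoryDocumentStore()))
pipe.connect("whisper.documents", "writer.documents")

pipe.run({"whisper": {"sources": ["audio/sample.wav"]}})
```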