chore: adjust docstrings in the audio package (#7246)

* adjust docstrings in the audio package

* Apply suggestions from code review

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update haystack/components/audio/whisper_remote.py

* black complaining for apparently no reason

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Massimiliano Pippi 2024-02-28 18:22:03 +01:00 committed by GitHub
parent e5f0e248b6
commit 890c613a2c
2 changed files with 86 additions and 40 deletions

haystack/components/audio/whisper_local.py

@@ -20,11 +20,20 @@ WhisperLocalModel = Literal["tiny", "small", "medium", "large", "large-v2"]
@component
class LocalWhisperTranscriber:
"""
Transcribes audio files using OpenAI's Whisper's model on your local machine.
Transcribes audio files using OpenAI's Whisper model in your local machine.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
[github repository](https://github.com/openai/whisper).
Usage example:
```python
from haystack.components.audio import LocalWhisperTranscriber
whisper = LocalWhisperTranscriber(model="small")
whisper.warm_up()
transcription = whisper.run(audio_files=["path/to/audio/file"])
```
"""
def __init__(
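The usage example added above runs end to end once `haystack-ai` and `openai-whisper` are installed. A slightly fuller hedged sketch follows; the audio path is a placeholder, and it uses the `sources` keyword from the `run()` signature shown later in this diff (the docstring example itself writes `audio_files`):

```python
from haystack.components.audio import LocalWhisperTranscriber

whisper = LocalWhisperTranscriber(model="small")
whisper.warm_up()  # loads the Whisper weights into memory

# run() accepts `sources` per the signature in this file
result = whisper.run(sources=["audio/sample.wav"])  # placeholder path
for doc in result["documents"]:
    print(doc.content)  # the transcription text
    print(doc.meta)     # alignment data plus the source audio file
```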
@@ -34,10 +43,14 @@ class LocalWhisperTranscriber:
whisper_params: Optional[Dict[str, Any]] = None,
):
"""
:param model: Name of the model to use. Set it to one of the following values:
:type model: Literal["tiny", "small", "medium", "large", "large-v2"]
:param device: The device on which the model is loaded. If `None`, the default device is automatically
selected.
Creates an instance of the LocalWhisperTranscriber component.
:param model:
Name of the model to use. Set it to one of the following values:
:type model:
Literal["tiny", "small", "medium", "large", "large-v2"]
:param device:
The device on which the model is loaded. If `None`, the default device is automatically selected.
"""
whisper_import.check()
if model not in get_args(WhisperLocalModel):
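The allow-list check above is a pattern worth spelling out: the `Literal` type serves both as the static annotation and, via `typing.get_args`, as the runtime set of valid model names. A minimal standalone sketch of that pattern:

```python
from typing import Literal, get_args

WhisperLocalModel = Literal["tiny", "small", "medium", "large", "large-v2"]

def check_model(model: str) -> None:
    # Reuse the Literal's members as the runtime allow-list
    if model not in get_args(WhisperLocalModel):
        raise ValueError(
            f"Model '{model}' not recognized; choose one of {get_args(WhisperLocalModel)}"
        )

check_model("small")   # passes silently
# check_model("huge")  # would raise ValueError
```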
@@ -51,21 +64,29 @@ class LocalWhisperTranscriber:
def warm_up(self) -> None:
"""
Loads the model.
Loads the model in memory.
"""
if not self._model:
self._model = whisper.load_model(self.model, device=self.device.to_torch())
def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, model=self.model, device=self.device.to_dict(), whisper_params=self.whisper_params)
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "LocalWhisperTranscriber":
"""
Create a `LocalWhisperTranscriber` instance from a dictionary.
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
serialized_device = data["init_parameters"]["device"]
data["init_parameters"]["device"] = ComponentDevice.from_dict(serialized_device)
@@ -75,17 +96,19 @@ class LocalWhisperTranscriber:
@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], whisper_params: Optional[Dict[str, Any]] = None):
"""
Transcribe the audio files into a list of Documents, one for each input file.
Transcribes the audio files into a list of Documents, one for each input file.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
:param audio_files: A list of paths or binary streams to transcribe.
:returns: A list of Documents, one for each file. The content of the document is the transcription text,
while the document's metadata contains all the other values returned by the Whisper model, such as the
alignment data. Another key called `audio_file` contains the path to the audio file used for the
transcription.
:param audio_files:
A list of paths or binary streams to transcribe.
:returns: A dictionary with the following keys:
- `documents`: A list of Documents, one for each file. The content of the document is the transcription text,
while the document's metadata contains the values returned by the Whisper model, such as the
alignment data and the path to the audio file used for the transcription.
"""
if self._model is None:
raise ComponentError("The component was not warmed up. Run 'warm_up()' before calling 'run()'.")
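The guard above means `run()` fails fast when `warm_up()` was skipped. Once warmed up, per-call options ride along in `whisper_params`; a sketch, where the `language` key follows openai-whisper's `transcribe()` options (an assumption, not shown in this diff):

```python
from haystack.components.audio import LocalWhisperTranscriber

whisper = LocalWhisperTranscriber(model="tiny")
whisper.warm_up()  # required, or run() raises ComponentError
result = whisper.run(
    sources=["audio/sample.wav"],       # placeholder path
    whisper_params={"language": "en"},  # forwarded to the Whisper model
)
print(result["documents"][0].content)
```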
@@ -98,17 +121,16 @@ class LocalWhisperTranscriber:
def transcribe(self, sources: List[Union[str, Path, ByteStream]], **kwargs) -> List[Document]:
"""
Transcribe the audio files into a list of Documents, one for each input file.
Transcribes the audio files into a list of Documents, one for each input file.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
:param audio_files: A list of paths or binary streams to transcribe.
:returns: A list of Documents, one for each file. The content of the document is the transcription text,
while the document's metadata contains all the other values returned by the Whisper model, such as the
alignment data. Another key called `audio_file` contains the path to the audio file used for the
transcription.
:param audio_files:
A list of paths or binary streams to transcribe.
:returns:
A list of Documents, one for each file.
"""
transcriptions = self._raw_transcribe(sources, **kwargs)
documents = []
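The difference from `run()` is only the return shape: `transcribe()` hands back the `Document` list directly, while `run()` wraps the same list as `{"documents": [...]}` for pipeline wiring. A quick sketch (placeholder path again):

```python
from haystack.components.audio import LocalWhisperTranscriber

whisper = LocalWhisperTranscriber(model="tiny")
whisper.warm_up()
docs = whisper.transcribe(sources=["audio/sample.wav"])  # List[Document]
print(docs[0].content)
```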
@@ -120,14 +142,16 @@ class LocalWhisperTranscriber:
def _raw_transcribe(self, sources: List[Union[str, Path, ByteStream]], **kwargs) -> Dict[Path, Any]:
"""
Transcribe the given audio files. Returns the output of the model, a dictionary, for each input file.
Transcribes the given audio files. Returns the output of the model, a dictionary, for each input file.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
[github repo](https://github.com/openai/whisper).
:param audio_files: A list of paths or binary streams to transcribe.
:returns: A dictionary of file_path -> transcription.
:param audio_files:
A list of paths or binary streams to transcribe.
:returns:
A dictionary mapping 'file_path' to 'transcription'.
"""
if self._model is None:
raise ComponentError("Model is not loaded, please run 'warm_up()' before calling 'run()'")
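For reference, the per-file value in the returned mapping is openai-whisper's raw output. Roughly what `_raw_transcribe` computes for each source, sketched with openai-whisper directly (the key names come from that library, not from this diff):

```python
import whisper  # openai-whisper

model = whisper.load_model("tiny")
raw = model.transcribe("audio/sample.wav")  # placeholder path
# raw is a dict with keys such as "text", "segments" (timestamped
# alignment data), and "language"
print(raw["text"])
```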

haystack/components/audio/whisper_remote.py

@@ -15,12 +15,20 @@ logger = logging.getLogger(__name__)
@component
class RemoteWhisperTranscriber:
"""
Transcribes audio files using OpenAI's Whisper using OpenAI API. Requires an API key. See the
[OpenAI blog post](https://beta.openai.com/docs/api-reference/whisper) for more details.
You can get one by signing up for an [OpenAI account](https://beta.openai.com/).
Transcribes audio files using the Whisper API from OpenAI.
The component requires an API key, see the relative
[OpenAI documentation](https://platform.openai.com/docs/api-reference/authentication) for more details.
For the supported audio formats, languages, and other parameters, see the
[Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text)
Usage example:
```python
from haystack.components.audio import RemoteWhisperTranscriber
whisper = RemoteWhisperTranscriber(api_key=Secret.from_token("<your-api-key>"), model="tiny")
transcription = whisper.run(sources=["path/to/audio/file"])
```
"""
def __init__(
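As a variant of the usage example above, the API key can come from the environment rather than a hard-coded token; `Secret.from_env_var` lives in `haystack.utils` in Haystack 2.x. A hedged sketch with a placeholder audio path:

```python
from haystack.components.audio import RemoteWhisperTranscriber
from haystack.utils import Secret

# Reads the key from the OPENAI_API_KEY environment variable at run time
whisper = RemoteWhisperTranscriber(api_key=Secret.from_env_var("OPENAI_API_KEY"))
result = whisper.run(sources=["audio/sample.wav"])
print(result["documents"][0].content)
```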
@@ -32,14 +40,19 @@ class RemoteWhisperTranscriber:
**kwargs,
):
"""
Transcribes a list of audio files into a list of Documents.
Creates an instance of the RemoteWhisperTranscriber component.
:param api_key: OpenAI API key.
:param model: Name of the model to use. It now accepts only `whisper-1`.
:param organization: The Organization ID, defaults to `None`. See
:param api_key:
OpenAI API key.
:param model:
Name of the model to use. It now accepts only `whisper-1`.
:param organization:
The Organization ID. See
[production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization).
:param api_base: An optional URL to use as the API base. Defaults to `None`. See OpenAI [docs](https://platform.openai.com/docs/api-reference/audio).
:param kwargs: Other parameters to use for the model. These parameters are all sent directly to the OpenAI
:param api_base:
An optional URL to use as the API base. See OpenAI [docs](https://platform.openai.com/docs/api-reference/audio).
:param kwargs:
Other parameters to use for the model. These parameters are all sent directly to the OpenAI
endpoint. See OpenAI [documentation](https://platform.openai.com/docs/api-reference/audio) for more details.
Some of the supported parameters:
- `language`: The language of the input audio.
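A sketch of passing such parameters through `**kwargs` at construction time; `language` and `response_format` are parameter names from the OpenAI audio API, not from this diff:

```python
from haystack.components.audio import RemoteWhisperTranscriber
from haystack.utils import Secret

whisper = RemoteWhisperTranscriber(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="whisper-1",
    language="en",           # forwarded verbatim to the endpoint
    response_format="json",  # see the OpenAI docs linked above
)
```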
@@ -77,9 +90,10 @@ class RemoteWhisperTranscriber:
def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
This method overrides the default serializer in order to
avoid leaking the `api_key` value passed to the constructor.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
@@ -93,7 +107,12 @@ class RemoteWhisperTranscriber:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "RemoteWhisperTranscriber":
"""
Deserialize this component from a dictionary.
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)
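The point of the secret handling is that the serialized form never contains the raw key, only a descriptor that `deserialize_secrets_inplace` can resolve again. A hedged round-trip sketch; the exact descriptor shape is an assumption based on `Secret`'s serialization in Haystack 2.x:

```python
from haystack.components.audio import RemoteWhisperTranscriber
from haystack.utils import Secret

whisper = RemoteWhisperTranscriber(api_key=Secret.from_env_var("OPENAI_API_KEY"))
data = whisper.to_dict()
# Something like {"type": "env_var", "env_vars": ["OPENAI_API_KEY"], ...}
# rather than the key itself:
print(data["init_parameters"]["api_key"])

restored = RemoteWhisperTranscriber.from_dict(data)
```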
@@ -101,10 +120,13 @@ class RemoteWhisperTranscriber:
@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
"""
Transcribe the audio files into a list of Documents, one for each input file.
Transcribes the audio files into a list of Documents, one for each input file.
:param sources: A list of file paths or ByteStreams containing the audio files to transcribe.
:returns: A list of Documents, one for each file. The content of the document is the transcription text.
:param sources:
A list of file paths or ByteStreams containing the audio files to transcribe.
:returns: A dictionary with the following keys:
- `documents`: A list of Documents, one for each file. The content of the document is the transcribed text.
"""
documents = []
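Because `run()` emits `{"documents": [...]}`, the component slots straight into a `Pipeline`. A hedged end-to-end sketch writing transcriptions into an in-memory store; component and import names follow Haystack 2.x, and the audio path is a placeholder:

```python
from haystack import Pipeline
from haystack.components.audio import RemoteWhisperTranscriber
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.utils import Secret

pipe = Pipeline()
pipe.add_component("whisper", RemoteWhisperTranscriber(api_key=Secret.from_env_var("OPENAI_API_KEY")))
pipe.add_component("writer", DocumentWriter(document_store=InMemoryDocumentStore()))
pipe.connect("whisper.documents", "writer.documents")

pipe.run({"whisper": {"sources": ["audio/sample.wav"]}})
```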