Update whisper notebook to use new function registration syntax (#1918)
* Update notebook to use new function registration syntax
* Update agentchat_video_transcript_translate_with_whisper.ipynb
* formatting
This commit is contained in:
parent
579c3cc466
commit
968483369c
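The substance of the change, visible in the diff below, is that the hand-written `llm_config["functions"]` schema plus the separate `user_proxy.register_function(function_map=...)` call are replaced by the decorator-based registration from pyautogen 0.2, where the JSON schema is derived from the `Annotated` type hints. A minimal sketch of the new pattern (the `echo` tool, agent names, and LLM config here are illustrative placeholders, not part of the commit):

```python
from typing import Annotated

import autogen

# Placeholder config; substitute a real model and API key to run this.
config_list = [{"model": "gpt-4", "api_key": "YOUR_OPENAI_API_KEY"}]

assistant = autogen.AssistantAgent(name="assistant", llm_config={"config_list": config_list})
user_proxy = autogen.UserProxyAgent(name="user_proxy", human_input_mode="NEVER", code_execution_config=False)


# register_for_llm advertises the function to the LLM (schema inferred from
# the Annotated hints); register_for_execution lets the user proxy actually
# run it when the LLM calls it. Together they replace both the manual
# "functions" schema block and the register_function call.
@user_proxy.register_for_execution()
@assistant.register_for_llm(description="Echo the given text back to the caller.")
def echo(text: Annotated[str, "Text to echo"]) -> str:
    return text
```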
@@ -1,20 +1,12 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e4fccaaa-fda5-4f99-a4c5-c463c5c890f5",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/microsoft/autogen/blob/main/notebook/agentchat_video_transcript_translate_with_whisper.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "a5b4540e-4987-4774-9305-764c3133e953",
"metadata": {},
"source": [
"<a id=\"toc\"></a>\n",
"# Auto Generated Agent Chat: Translating Video audio using Whisper and GPT-3.5-turbo\n",
"# Translating Video audio using Whisper and GPT-3.5-turbo\n",
"\n",
"In this notebook, we demonstrate how to use whisper and GPT-3.5-turbo with `AssistantAgent` and `UserProxyAgent` to recognize and translate\n",
"the speech sound from a video file and add the timestamp like a subtitle file based on [agentchat_function_call.ipynb](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_function_call.ipynb)\n"
]
@@ -25,27 +17,20 @@
"metadata": {},
"source": [
"## Requirements\n",
"AutoGen requires `Python>=3.8`. To run this notebook example, please install `openai`, `pyautogen`, `whisper`, and `moviepy`:\n",
"\n",
"``````\n",
"````{=mdx}\n",
":::info Requirements\n",
"Some extra dependencies are needed for this notebook, which can be installed via pip:\n",
"\n",
"```bash\n",
"pip install openai\n",
"pip install openai-whisper\n",
"pip install moviepy\n",
"pip install pyautogen\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc4600b8-c6df-49dd-945d-ce69f30a65cc",
"metadata": {},
"outputs": [],
"source": [
"%%capture --no-stderr\n",
"# %pip install moviepy~=1.0.3\n",
"# %pip install openai-whisper~=20230918\n",
"# %pip install openai~=1.3.5\n",
"# %pip install \"pyautogen>=0.2.3\""
"pip install pyautogen openai openai-whisper\n",
"```\n",
"\n",
"For more information, please refer to the [installation guide](/docs/installation/).\n",
":::\n",
"````\n",
"``````"
]
},
{
@@ -59,19 +44,13 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"id": "26d1ae87-f007-4286-a56a-dcf68abf9393",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import whisper\n",
"from moviepy.editor import VideoFileClip\n",
"from openai import OpenAI\n",
"\n",
"import autogen\n",
"\n",
"config_list = [\n",
"    {\n",
"        \"model\": \"gpt-4\",\n",
@@ -85,11 +64,164 @@
"id": "324fec65-ab23-45db-a7a8-0aaf753fe19c",
"metadata": {},
"source": [
"````{=mdx}\n",
":::tip\n",
"Learn more about configuring LLMs for agents [here](/docs/topics/llm_configuration).\n",
":::\n",
"````\n",
"\n",
"## Example and Output\n",
"Below is an example of speech recognition from a [Peppa Pig cartoon video clip](https://drive.google.com/file/d/1QY0naa2acHw2FuH7sY3c-g2sBLtC2Sv4/view?usp=drive_link) originally in English and translated into Chinese.\n",
"'FFmpeg' does not support online files. To run the code on the example video, you need to download the example video locally. You can change `your_file_path` to your local video file path."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3e691b8",
"metadata": {},
"outputs": [],
"source": [
"from typing import Annotated, List\n",
"\n",
"import whisper\n",
"from openai import OpenAI\n",
"import autogen\n",
"\n",
"\n",
"source_language = \"English\"\n",
"target_language = \"Chinese\"\n",
"key = os.getenv(\"OPENAI_API_KEY\")\n",
"target_video = \"your_file_path\"\n",
"\n",
"assistant = autogen.AssistantAgent(\n",
"    name=\"assistant\",\n",
"    system_message=\"For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.\",\n",
"    llm_config={\"config_list\": config_list, \"timeout\": 120},\n",
")\n",
"\n",
"user_proxy = autogen.UserProxyAgent(\n",
"    name=\"user_proxy\",\n",
"    is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n",
"    human_input_mode=\"NEVER\",\n",
"    max_consecutive_auto_reply=10,\n",
"    code_execution_config={},\n",
")\n",
"\n",
"\n",
"def translate_text(input_text, source_language, target_language):\n",
"    client = OpenAI(api_key=key)\n",
"\n",
"    response = client.chat.completions.create(\n",
"        model=\"gpt-3.5-turbo\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
"            {\n",
"                \"role\": \"user\",\n",
"                \"content\": f\"Directly translate the following {source_language} text to a pure {target_language} \"\n",
"                f\"video subtitle text without additional explanation.: '{input_text}'\",\n",
"            },\n",
"        ],\n",
"        max_tokens=1500,\n",
"    )\n",
"\n",
"    # Correctly accessing the response content\n",
"    translated_text = response.choices[0].message.content if response.choices else None\n",
"    return translated_text\n",
"\n",
"\n",
"@user_proxy.register_for_execution()\n",
"@assistant.register_for_llm(description=\"using translate_text function to translate the script\")\n",
"def translate_transcript(\n",
"    source_language: Annotated[str, \"Source language\"], target_language: Annotated[str, \"Target language\"]\n",
") -> str:\n",
"    with open(\"transcription.txt\", \"r\") as f:\n",
"        lines = f.readlines()\n",
"\n",
"    translated_transcript = []\n",
"\n",
"    for line in lines:\n",
"        # Split each line into timestamp and text parts\n",
"        parts = line.strip().split(\": \")\n",
"        if len(parts) == 2:\n",
"            timestamp, text = parts[0], parts[1]\n",
"            # Translate only the text part\n",
"            translated_text = translate_text(text, source_language, target_language)\n",
"            # Reconstruct the line with the translated text and the preserved timestamp\n",
"            translated_line = f\"{timestamp}: {translated_text}\"\n",
"            translated_transcript.append(translated_line)\n",
"        else:\n",
"            # If the line doesn't contain a timestamp, add it as is\n",
"            translated_transcript.append(line.strip())\n",
"\n",
"    return \"\\n\".join(translated_transcript)\n",
"\n",
"\n",
"@user_proxy.register_for_execution()\n",
"@assistant.register_for_llm(description=\"recognize the speech from video and transfer into a txt file\")\n",
"def recognize_transcript_from_video(filepath: Annotated[str, \"path of the video file\"]) -> List[dict]:\n",
"    try:\n",
"        # Load model\n",
"        model = whisper.load_model(\"small\")\n",
"\n",
"        # Transcribe audio with detailed timestamps\n",
"        result = model.transcribe(filepath, verbose=True)\n",
"\n",
"        # Initialize variables for transcript\n",
"        transcript = []\n",
"        sentence = \"\"\n",
"        start_time = 0\n",
"\n",
"        # Iterate through the segments in the result\n",
"        for segment in result[\"segments\"]:\n",
"            # If new sentence starts, save the previous one and reset variables\n",
"            if segment[\"start\"] != start_time and sentence:\n",
"                transcript.append(\n",
"                    {\n",
"                        \"sentence\": sentence.strip() + \".\",\n",
"                        \"timestamp_start\": start_time,\n",
"                        \"timestamp_end\": segment[\"start\"],\n",
"                    }\n",
"                )\n",
"                sentence = \"\"\n",
"                start_time = segment[\"start\"]\n",
"\n",
"            # Add the word to the current sentence\n",
"            sentence += segment[\"text\"] + \" \"\n",
"\n",
"        # Add the final sentence\n",
"        if sentence:\n",
"            transcript.append(\n",
"                {\n",
"                    \"sentence\": sentence.strip() + \".\",\n",
"                    \"timestamp_start\": start_time,\n",
"                    \"timestamp_end\": result[\"segments\"][-1][\"end\"],\n",
"                }\n",
"            )\n",
"\n",
"        # Save the transcript to a file\n",
"        with open(\"transcription.txt\", \"w\") as file:\n",
"            for item in transcript:\n",
"                sentence = item[\"sentence\"]\n",
"                start_time, end_time = item[\"timestamp_start\"], item[\"timestamp_end\"]\n",
"                file.write(f\"{start_time}s to {end_time}s: {sentence}\\n\")\n",
"\n",
"        return transcript\n",
"\n",
"    except FileNotFoundError:\n",
"        return \"The specified audio file could not be found.\"\n",
"    except Exception as e:\n",
"        return f\"An unexpected error occurred: {str(e)}\""
]
},
{
"cell_type": "markdown",
"id": "072de235",
"metadata": {},
"source": [
"Now, start the chat:"
]
},
{
"cell_type": "code",
"execution_count": 5,
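A side effect of the decorator form in the hunk above is that the executor side keeps an ordinary name-to-callable map, so registration can be sanity-checked before the chat starts. A small sketch, assuming the `assistant` and `user_proxy` agents defined above; the `function_map` property name follows pyautogen 0.2 and should be treated as an assumption:

```python
# Each @user_proxy.register_for_execution() call records the tool by name
# on the executing agent; verify both tools are present before the chat.
assert "recognize_transcript_from_video" in user_proxy.function_map
assert "translate_transcript" in user_proxy.function_map
```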
@@ -206,180 +338,30 @@
}
],
"source": [
"def recognize_transcript_from_video(audio_filepath):\n",
"    try:\n",
"        # Load model\n",
"        model = whisper.load_model(\"small\")\n",
"\n",
"        # Transcribe audio with detailed timestamps\n",
"        result = model.transcribe(audio_filepath, verbose=True)\n",
"\n",
"        # Initialize variables for transcript\n",
"        transcript = []\n",
"        sentence = \"\"\n",
"        start_time = 0\n",
"\n",
"        # Iterate through the segments in the result\n",
"        for segment in result[\"segments\"]:\n",
"            # If new sentence starts, save the previous one and reset variables\n",
"            if segment[\"start\"] != start_time and sentence:\n",
"                transcript.append(\n",
"                    {\n",
"                        \"sentence\": sentence.strip() + \".\",\n",
"                        \"timestamp_start\": start_time,\n",
"                        \"timestamp_end\": segment[\"start\"],\n",
"                    }\n",
"                )\n",
"                sentence = \"\"\n",
"                start_time = segment[\"start\"]\n",
"\n",
"            # Add the word to the current sentence\n",
"            sentence += segment[\"text\"] + \" \"\n",
"\n",
"        # Add the final sentence\n",
"        if sentence:\n",
"            transcript.append(\n",
"                {\n",
"                    \"sentence\": sentence.strip() + \".\",\n",
"                    \"timestamp_start\": start_time,\n",
"                    \"timestamp_end\": result[\"segments\"][-1][\"end\"],\n",
"                }\n",
"            )\n",
"\n",
"        # Save the transcript to a file\n",
"        with open(\"transcription.txt\", \"w\") as file:\n",
"            for item in transcript:\n",
"                sentence = item[\"sentence\"]\n",
"                start_time, end_time = item[\"timestamp_start\"], item[\"timestamp_end\"]\n",
"                file.write(f\"{start_time}s to {end_time}s: {sentence}\\n\")\n",
"\n",
"        return transcript\n",
"\n",
"    except FileNotFoundError:\n",
"        return \"The specified audio file could not be found.\"\n",
"    except Exception as e:\n",
"        return f\"An unexpected error occurred: {str(e)}\"\n",
"\n",
"\n",
"def translate_text(input_text, source_language, target_language):\n",
"    client = OpenAI(api_key=key)\n",
"\n",
"    response = client.chat.completions.create(\n",
"        model=\"gpt-3.5-turbo\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
"            {\n",
"                \"role\": \"user\",\n",
"                \"content\": f\"Directly translate the following {source_language} text to a pure {target_language} \"\n",
"                f\"video subtitle text without additional explanation.: '{input_text}'\",\n",
"            },\n",
"        ],\n",
"        max_tokens=1500,\n",
"    )\n",
"\n",
"    # Correctly accessing the response content\n",
"    translated_text = response.choices[0].message.content if response.choices else None\n",
"    return translated_text\n",
"\n",
"\n",
"def translate_transcript(source_language, target_language):\n",
"    with open(\"transcription.txt\", \"r\") as f:\n",
"        lines = f.readlines()\n",
"\n",
"    translated_transcript = []\n",
"\n",
"    for line in lines:\n",
"        # Split each line into timestamp and text parts\n",
"        parts = line.strip().split(\": \")\n",
"        if len(parts) == 2:\n",
"            timestamp, text = parts[0], parts[1]\n",
"            # Translate only the text part\n",
"            translated_text = translate_text(text, source_language, target_language)\n",
"            # Reconstruct the line with the translated text and the preserved timestamp\n",
"            translated_line = f\"{timestamp}: {translated_text}\"\n",
"            translated_transcript.append(translated_line)\n",
"        else:\n",
"            # If the line doesn't contain a timestamp, add it as is\n",
"            translated_transcript.append(line.strip())\n",
"\n",
"    return \"\\n\".join(translated_transcript)\n",
"\n",
"\n",
"llm_config = {\n",
"    \"functions\": [\n",
"        {\n",
"            \"name\": \"recognize_transcript_from_video\",\n",
"            \"description\": \"recognize the speech from video and transfer into a txt file\",\n",
"            \"parameters\": {\n",
"                \"type\": \"object\",\n",
"                \"properties\": {\n",
"                    \"audio_filepath\": {\n",
"                        \"type\": \"string\",\n",
"                        \"description\": \"path of the video file\",\n",
"                    }\n",
"                },\n",
"                \"required\": [\"audio_filepath\"],\n",
"            },\n",
"        },\n",
"        {\n",
"            \"name\": \"translate_transcript\",\n",
"            \"description\": \"using translate_text function to translate the script\",\n",
"            \"parameters\": {\n",
"                \"type\": \"object\",\n",
"                \"properties\": {\n",
"                    \"source_language\": {\n",
"                        \"type\": \"string\",\n",
"                        \"description\": \"source language\",\n",
"                    },\n",
"                    \"target_language\": {\n",
"                        \"type\": \"string\",\n",
"                        \"description\": \"target language\",\n",
"                    },\n",
"                },\n",
"                \"required\": [\"source_language\", \"target_language\"],\n",
"            },\n",
"        },\n",
"    ],\n",
"    \"config_list\": config_list,\n",
"    \"timeout\": 120,\n",
"}\n",
"source_language = \"English\"\n",
"target_language = \"Chinese\"\n",
"key = os.getenv(\"OPENAI_API_KEY\")\n",
"target_video = \"your_file_path\"\n",
"\n",
"chatbot = autogen.AssistantAgent(\n",
"    name=\"chatbot\",\n",
"    system_message=\"For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.\",\n",
"    llm_config=llm_config,\n",
")\n",
"\n",
"user_proxy = autogen.UserProxyAgent(\n",
"    name=\"user_proxy\",\n",
"    is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n",
"    human_input_mode=\"NEVER\",\n",
"    max_consecutive_auto_reply=10,\n",
"    code_execution_config={\n",
"        \"work_dir\": \"coding_2\",\n",
"        \"use_docker\": False,\n",
"    },  # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.\n",
")\n",
"\n",
"user_proxy.register_function(\n",
"    function_map={\n",
"        \"recognize_transcript_from_video\": recognize_transcript_from_video,\n",
"        \"translate_transcript\": translate_transcript,\n",
"    }\n",
")\n",
"user_proxy.initiate_chat(\n",
"    chatbot,\n",
"    assistant,\n",
"    message=f\"For the video located in {target_video}, recognize the speech and transfer it into a script file, \"\n",
"    f\"then translate from {source_language} text to a {target_language} video subtitle text. \",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aeea924a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"front_matter": {
"description": "Use tools to extract and translate the transcript of a video file.",
"tags": [
"whisper",
"function call"
]
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
@@ -395,7 +377,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.11.7"
}
},
"nbformat": 4,