2023-02-03 00:05:28 -08:00
|
|
|
"""Simple reader that reads the transcripts of YouTube videos."""
|
2023-02-16 17:49:54 +01:00
|
|
|
from typing import Any, List, Optional
|
2023-02-03 00:05:28 -08:00
|
|
|
|
2023-02-20 21:46:58 -08:00
|
|
|
from llama_index.readers.base import BaseReader
|
|
|
|
from llama_index.readers.schema.base import Document
|
2023-02-03 00:05:28 -08:00
|
|
|
|
|
|
|
|
|
|
|
class YoutubeTranscriptReader(BaseReader):
    """Reader that turns YouTube video transcripts into documents."""

    @staticmethod
    def _extract_video_id(link: str) -> str:
        """Return the video id contained in a YouTube link (or a bare id)."""
        # Imported locally so this class adds no new module-level dependency.
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(link)

        # Watch URLs carry the id in the ``v`` query parameter. Reading it from
        # the parsed query drops trailing parameters/fragments such as
        # ``&t=10s`` and works even when ``v`` is not the first parameter.
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            return query_ids[0]

        # Short links (``https://youtu.be/<id>``) carry the id as the path.
        if parsed.hostname in ("youtu.be", "www.youtu.be"):
            short_id = parsed.path.strip("/")
            if short_id:
                return short_id

        # Fallback: the historical behaviour, which also accepts a bare id.
        return link.split("?v=")[-1]

    def load_data(
        self,
        ytlinks: List[str],
        languages: Optional[List[str]] = None,
        **load_kwargs: Any,
    ) -> List[Document]:
        """Load the transcripts of the given YouTube videos.

        Args:
            ytlinks (List[str]): List of youtube links (or video ids)
                for which transcripts are to be read.
            languages (Optional[List[str]]): Language codes forwarded to
                ``YouTubeTranscriptApi.get_transcript``. Defaults to
                ``["en"]`` when omitted or ``None``.
            **load_kwargs (Any): Accepted for interface compatibility;
                currently unused by this reader.

        Returns:
            List[Document]: One document per link, in input order, whose text
            is the transcript with every caption chunk on its own line.
        """
        # Imported lazily so the optional dependency is only required when
        # transcripts are actually loaded.
        from youtube_transcript_api import YouTubeTranscriptApi

        # Resolve the default here instead of using a mutable default argument.
        if languages is None:
            languages = ["en"]

        results = []
        for link in ytlinks:
            video_id = self._extract_video_id(link)
            srt = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
            # Every chunk is newline-terminated, including the last one.
            transcript = "".join(chunk["text"] + "\n" for chunk in srt)
            results.append(Document(transcript))
        return results
|