Jerry Liu e97bb81915
swap out gpt_index imports for llama_index imports (#49)
* cr

* cr

* cr

---------

Co-authored-by: Jerry Liu <jerry@robustintelligence.com>
Co-authored-by: Jesse Zhang <jessetanzhang@gmail.com>
2023-02-20 21:46:58 -08:00

30 lines
1.0 KiB
Python

"""Simple reader that loads the transcripts of YouTube videos."""
from typing import Any, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class YoutubeTranscriptReader(BaseReader):
    """Reader that turns YouTube video transcripts into documents."""

    @staticmethod
    def _extract_video_id(link: str) -> str:
        """Return the video ID referenced by a YouTube link.

        Recognises ``watch?v=<id>`` URLs (with any extra query parameters
        or fragment), ``youtu.be/<id>`` short links, and ``/embed/<id>``,
        ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` paths. Anything
        else (for example a bare video ID) goes through the legacy rule of
        keeping whatever follows the last ``?v=``.

        Args:
            link (str): A YouTube URL or a bare video ID.

        Returns:
            str: The extracted video ID.
        """
        from urllib.parse import parse_qs, urlparse

        link = link.strip()
        try:
            # Prefix scheme-less links with "//" so urlparse still splits
            # host, path and query instead of lumping the host into the path.
            parsed = urlparse(link if "://" in link else "//" + link)
            host = parsed.hostname or ""
        except ValueError:
            # Malformed authority part: fall back to the legacy rule.
            return link.split("?v=")[-1]

        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            return query_ids[0]

        segments = [part for part in parsed.path.split("/") if part]
        if host in ("youtu.be", "www.youtu.be") and segments:
            return segments[0]
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

        # Legacy behaviour: bare IDs and unrecognised shapes pass through.
        return link.split("?v=")[-1]

    def load_data(
        self,
        ytlinks: List[str],
        languages: Optional[List[str]] = None,
        **load_kwargs: Any,
    ) -> List[Document]:
        """Load the transcripts of the given YouTube videos.

        Args:
            ytlinks (List[str]): List of youtube links \
                for which transcripts are to be read.
            languages (Optional[List[str]]): Language codes passed to
                ``YouTubeTranscriptApi.get_transcript``. Defaults to
                ``["en"]`` when omitted or ``None``.
            **load_kwargs (Any): Accepted for interface compatibility;
                currently ignored.

        Returns:
            List[Document]: One document per link, in input order. Each
            document's text is the transcript with one caption chunk per
            line.
        """
        # Imported lazily so the module can be imported without the
        # optional youtube_transcript_api dependency installed.
        from youtube_transcript_api import YouTubeTranscriptApi

        # A None default avoids sharing one mutable list across calls.
        if languages is None:
            languages = ["en"]

        results = []
        for link in ytlinks:
            video_id = self._extract_video_id(link)
            srt = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
            # Every chunk, including the last, is newline-terminated.
            transcript = "".join(chunk["text"] + "\n" for chunk in srt)
            results.append(Document(transcript))
        return results