Jesse Zhang a70e60c94b
Remote reader (#17)
* Small bug fixes

* Remote loader for pages/files

* Add to library
2023-02-09 17:27:20 -08:00

57 lines
1.9 KiB
Python

"""Remote file reader.
A loader that fetches an arbitrary remote page or file by URL and parses its contents.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from gpt_index import download_loader
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class RemoteReader(BaseReader):
    """General reader for any remote page or file.

    The resource at a URL is fetched with ``urllib``. ``text/html`` and
    ``text/plain`` responses are returned as a single text document; any
    other content type is saved to a temporary directory and parsed by the
    ``SimpleDirectoryReader`` loader.
    """

    def __init__(
        self,
        *args: Any,
        file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Args:
            *args: Positional arguments forwarded to ``BaseReader``.
            file_extractor: Optional mapping forwarded unchanged to
                ``SimpleDirectoryReader`` as its ``file_extractor`` argument
                (presumably keyed by file extension -- confirm against that
                loader's documentation).
            **kwargs: Keyword arguments forwarded to ``BaseReader``.
        """
        super().__init__(*args, **kwargs)
        self.file_extractor = file_extractor

    def load_data(self, url: str) -> List[Document]:
        """Parse whatever is at the URL.

        Args:
            url: Location of the remote page or file to fetch.

        Returns:
            The documents parsed from the resource. Each one carries
            ``{"Source": url}`` as its extra info / file metadata.
        """
        import shutil
        import tempfile
        from urllib.parse import urlparse
        from urllib.request import urlopen

        extra_info = {"Source": url}

        # Use the response as a context manager so the underlying connection
        # is always released, including when parsing raises.
        with urlopen(url) as response:
            content_type = response.info().get_content_type()

            if content_type in ("text/html", "text/plain"):
                # Iterating the response yields raw lines; each one is decoded
                # as UTF-8 (dropping a leading BOM) and the lines are joined
                # with a blank line between them.
                text = "\n\n".join(line.decode("utf-8-sig") for line in response)
                return [Document(text, extra_info=extra_info)]

            # Keep the URL's file extension on the temporary file; it is the
            # only hint about the file type that reaches the directory loader.
            suffix = Path(urlparse(url).path).suffix
            with tempfile.TemporaryDirectory() as temp_dir:
                filepath = Path(temp_dir) / f"temp{suffix}"
                with open(filepath, "wb") as output:
                    # Stream to disk in chunks instead of buffering the whole
                    # payload in memory first.
                    shutil.copyfileobj(response, output)

                SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
                loader = SimpleDirectoryReader(
                    temp_dir,
                    file_metadata=(lambda _: extra_info),
                    file_extractor=self.file_extractor,
                )
                # Load before leaving the block: the temporary directory and
                # its file are deleted on exit.
                return loader.load_data()