llama-hub/loader_hub/remote/base.py

"""Remote file reader.

A loader that fetches an arbitrary remote page or file by URL and parses its contents.

"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from gpt_index import download_loader
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document


class RemoteReader(BaseReader):
    """General reader for any remote page or file.

    Fetches a URL and, depending on the response's content type, either
    returns the body as a single text document or downloads it to a
    temporary directory and delegates parsing to ``SimpleDirectoryReader``.
    """

    def __init__(
        self,
        *args: Any,
        file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Args:
            *args: Positional arguments forwarded to ``BaseReader``.
            file_extractor: Optional mapping handed unchanged to
                ``SimpleDirectoryReader`` when a non-text resource is
                downloaded. Presumably keyed by file extension -- confirm
                against ``SimpleDirectoryReader``.
            **kwargs: Keyword arguments forwarded to ``BaseReader``.
        """
        super().__init__(*args, **kwargs)

        self.file_extractor = file_extractor

    def load_data(self, url: str) -> List[Document]:
        """Parse whatever is at the URL.

        Args:
            url: Location of the page or file to fetch.

        Returns:
            A single-element list holding the decoded body when the content
            type is ``text/html`` or ``text/plain``; otherwise whatever
            ``SimpleDirectoryReader.load_data()`` produces for the
            downloaded file. Every document is tagged with
            ``{"Source": url}``.

        Errors raised by ``urlopen`` or by decoding the body are not caught
        here and propagate to the caller.
        """
        import tempfile
        from urllib.parse import urlparse
        from urllib.request import urlopen

        extra_info = {"Source": url}

        # Use the response as a context manager so the underlying connection
        # is always released, including when decoding or parsing raises.
        with urlopen(url) as result:
            url_type = result.info().get_content_type()

            if url_type in ("text/html", "text/plain"):
                # Each line yielded by the response is decoded on its own and
                # the pieces are joined with a blank-line separator.
                text = "\n\n".join(
                    [line.decode("utf-8-sig") for line in result]
                )
                return [Document(text, extra_info=extra_info)]

            # Keep the URL's file suffix on the temp file name; presumably
            # the directory reader picks a parser from it -- confirm.
            suffix = Path(urlparse(url).path).suffix
            with tempfile.TemporaryDirectory() as temp_dir:
                filepath = f"{temp_dir}/temp{suffix}"
                with open(filepath, "wb") as output:
                    output.write(result.read())

                SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
                loader = SimpleDirectoryReader(
                    temp_dir,
                    # The callable ignores its argument and always returns
                    # the source-URL metadata.
                    file_metadata=(lambda _: extra_info),
                    file_extractor=self.file_extractor,
                )
                # Load before leaving the ``with`` block: the temporary
                # directory is deleted on exit.
                return loader.load_data()
Remote reader (#17) * Small bug fixes * Remote loader for pages/files * Add to library 2023-02-09 17:27:20 -08:00			`"""Remote file reader.`

			`A loader that fetches an arbitrary remote page or file by URL and parses its contents.`

			`"""`
			`from pathlib import Path`
			`from typing import Any, Dict, List, Optional, Union`

			`from gpt_index import download_loader`
			`from gpt_index.readers.base import BaseReader`
			`from gpt_index.readers.schema.base import Document`


			`class RemoteReader(BaseReader):`
			`"""General reader for any remote page or file."""`

			`def __init__(`
			`self,`
			`*args: Any,`
			`file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,`
			`**kwargs: Any,`
			`) -> None:`
			`"""Init params."""`
			`super().__init__(*args, **kwargs)`

			`self.file_extractor = file_extractor`

			`def load_data(self, url: str) -> List[Document]:`
			`"""Parse whatever is at the URL."""`
			`import tempfile`
			`from urllib.parse import urlparse`
			`from urllib.request import urlopen`

			`extra_info = {"Source": url}`

			`result = urlopen(url)`
			`url_type = result.info().get_content_type()`
			`documents = []`
			`if url_type == "text/html" or url_type == "text/plain":`
			`text = "\n\n".join([str(el.decode("utf-8-sig")) for el in result])`
			`documents = [Document(text, extra_info=extra_info)]`
			`else:`
			`suffix = Path(urlparse(url).path).suffix`
			`with tempfile.TemporaryDirectory() as temp_dir:`
			`filepath = f"{temp_dir}/temp{suffix}"`
			`with open(filepath, "wb") as output:`
			`output.write(result.read())`

			`SimpleDirectoryReader = download_loader("SimpleDirectoryReader")`
			`loader = SimpleDirectoryReader(`
			`temp_dir,`
			`file_metadata=(lambda _: extra_info),`
			`file_extractor=self.file_extractor,`
			`)`
			`documents = loader.load_data()`
			`return documents`