"""Remote file reader.
|
|
|
|
|
|
|
|
A loader that fetches any remote page or file by URL and retrieves child pages with certain constraints. The class also parses the contents of each page and provides access to the parsed data.
|
|
|
|
"""
from typing import Any, Dict, List, Optional, Union

import requests
from llama_index import download_loader
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class RemoteDepthReader(BaseReader):
    def __init__(
        self,
        *args: Any,
        file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
        depth: int = 1,
        domain_lock: bool = False,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self.file_extractor = file_extractor
        self.depth = depth
        self.domain_lock = domain_lock

    def load_data(self, url: str) -> List[Document]:
        """Parse whatever is at the URL, following child links up to `self.depth`."""
        from tqdm.auto import tqdm

        RemoteReader = download_loader("RemoteReader")
        remote_reader = RemoteReader(file_extractor=self.file_extractor)
        documents = []
        links = self.get_links(url)
        urls = {-1: [url]}  # -1 is the starting point
        links_visited = []
        # Breadth-first traversal: level i collects the links found at depth i.
        for i in range(self.depth + 1):
            urls[i] = []
            new_links = []
            print(f"Reading links at depth {i}...")
            for link in tqdm(links):
"""Checking if the link belongs the provided domain. """
|
|
|
|
if ((self.domain_lock and link.find(url)>-1) or (not self.domain_lock)):
|
|
|
|
print("Loading link: " + link)
|
|
|
|
if link in links_visited:
|
|
|
|
continue
|
|
|
|
if link:
|
|
|
|
urls[i].append(link)
|
|
|
|
new_links.extend(self.get_links(link))
|
|
|
|
links_visited.append(link)
|
|
|
|
else:
|
|
|
|
print("Link ignored: " +link)
|
2023-02-24 22:47:23 -08:00
|
|
|
new_links = list(set(new_links))
|
|
|
|
links = new_links
|
|
|
|
print(f"Found {len(urls)} links at depth {self.depth}.")
|
|
|
|
        for depth_i in urls:
            for url in urls[depth_i]:
                try:
                    documents.extend(remote_reader.load_data(url))
                except Exception as e:
                    print(f"Error reading {url} at depth {depth_i}: {e}")
                    continue

        return documents

    @staticmethod
    def is_url(href) -> bool:
        """Check if a link is a URL."""
        return href.startswith("http")

    def get_links(self, url) -> List[str]:
        """Get all links from a page."""
        from urllib.parse import urljoin, urlparse, urlunparse

        from bs4 import BeautifulSoup

        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        links = soup.find_all("a")
        result = []
        for link in links:
            if isinstance(link, str):
                href = link
            else:
                href = link.get("href")
            if href is not None:
                # Resolve relative links against the page URL.
                if not self.is_url(href):
                    href = urljoin(url, href)

                # Normalize by dropping the query string and fragment.
                url_parsed = urlparse(href)
                url_without_query_string = urlunparse(
                    (url_parsed.scheme, url_parsed.netloc, url_parsed.path, "", "", "")
                )

                if (
                    url_without_query_string not in result
                    and url_without_query_string
                    and url_without_query_string.startswith("http")
                ):
                    result.append(url_without_query_string)
        return result
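

# Minimal usage sketch (an assumption, not part of the original loader): run
# as a script with network access; the URL below is a placeholder. It crawls
# one level of child links from the starting page, locked to its domain.
if __name__ == "__main__":
    loader = RemoteDepthReader(depth=1, domain_lock=True)
    documents = loader.load_data(url="https://example.com/index.html")
    print(f"Loaded {len(documents)} documents.")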