"""Remote file reader.
|
|
|
|
|
|
|
|
A loader that fetches any remote page or file by URL and retrieves child pages with certain constraints. The class also parses the contents of each page and provides access to the parsed data.
|
|
|
|
"""
from typing import Any, Dict, List, Optional, Union

import requests
from llama_index import download_loader
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class RemoteDepthReader(BaseReader):
    def __init__(
        self,
        *args: Any,
        file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
        depth: int = 1,
        domain_lock: bool = False,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self.file_extractor = file_extractor
        self.depth = depth
        self.domain_lock = domain_lock

    def load_data(self, url: str) -> List[Document]:
        """Parse whatever is at the URL, following child links up to `self.depth`."""
        from tqdm.auto import tqdm

        RemoteReader = download_loader("RemoteReader")
        remote_reader = RemoteReader(file_extractor=self.file_extractor)
        documents = []
        links = self.get_links(url)
        urls = {-1: [url]}  # -1 is the starting point
        links_visited = []
        # Breadth-first traversal: level i collects the links found at depth i.
        for i in range(self.depth + 1):
            urls[i] = []
            new_links = []
            print(f"Reading links at depth {i}...")
            for link in tqdm(links):
"""Checking if the link belongs the provided domain. """
|
|
|
|
if ((self.domain_lock and link.find(url)>-1) or (not self.domain_lock)):
|
|
|
|
print("Loading link: " + link)
|
|
|
|
if link in links_visited:
|
|
|
|
continue
|
|
|
|
if link:
|
|
|
|
urls[i].append(link)
|
|
|
|
new_links.extend(self.get_links(link))
|
|
|
|
links_visited.append(link)
|
|
|
|
else:
|
|
|
|
print("Link ignored: " +link)
|
2023-02-24 22:47:23 -08:00
|
|
|
new_links = list(set(new_links))
|
|
|
|
links = new_links
|
|
|
|
print(f"Found {len(urls)} links at depth {self.depth}.")
|
|
|
|
        for depth_i in urls:
            for url in urls[depth_i]:
                try:
                    documents.extend(remote_reader.load_data(url))
                except Exception as e:
                    print(f"Error reading {url} at depth {depth_i}: {e}")
                    continue

        return documents

    @staticmethod
    def is_url(href) -> bool:
        """Check if a link is a URL."""
        return href.startswith("http")

    def get_links(self, url) -> List[str]:
        """Get all links from a page."""
        from urllib.parse import urljoin, urlparse, urlunparse

        from bs4 import BeautifulSoup

        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        links = soup.find_all("a")
        result = []
        for link in links:
            if isinstance(link, str):
                href = link
            else:
                href = link.get("href")
            if href is not None:
                # Resolve relative links against the page URL.
                if not self.is_url(href):
                    href = urljoin(url, href)

                # Normalize by dropping the query string and fragment.
                url_parsed = urlparse(href)
                url_without_query_string = urlunparse(
                    (url_parsed.scheme, url_parsed.netloc, url_parsed.path, "", "", "")
                )

                if (
                    url_without_query_string not in result
                    and url_without_query_string
                    and url_without_query_string.startswith("http")
                ):
                    result.append(url_without_query_string)
        return result
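

# Minimal usage sketch (an assumption, not part of the original loader): run
# as a script with network access; the URL below is a placeholder. It crawls
# one level of child links from the starting page, locked to its domain.
if __name__ == "__main__":
    loader = RemoteDepthReader(depth=1, domain_lock=True)
    documents = loader.load_data(url="https://example.com/index.html")
    print(f"Loaded {len(documents)} documents.")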