mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 11:41:56 +00:00
147 lines
4.6 KiB
Python
147 lines
4.6 KiB
Python
"""MangoppsGuides reader."""
|
|
import re
|
|
from typing import List
|
|
from urllib.parse import urlparse
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class MangoppsGuidesReader(BaseReader):
    """MangoppsGuides reader. Reads data from a MangoppsGuides workspace.

    The reader starts at ``<domain_url>/home/``, follows links that stay on
    the same host, and turns the main content of each discovered page into a
    ``Document``.

    The crawl parameters are passed to ``load_data``:

    Args:
        domain_url (str): MangoppsGuides domain url, including the scheme
            (e.g. ``https://guides.mangoapps.com``).
        limit (int): maximum number of pages to load; it also bounds how many
            pages are visited while discovering links.
    """

    # Seconds to wait for a server response. Without a timeout a single
    # unresponsive host would block the reader indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self) -> None:
        """Initialize MangoppsGuides reader."""

    def load_data(self, domain_url: str, limit: int) -> List[Document]:
        """Load data from the workspace.

        Args:
            domain_url (str): MangoppsGuides domain url, including the scheme.
            limit (int): maximum number of pages to turn into documents.

        Returns:
            List[Document]: List of documents. Each document holds the text
                of one page and carries the page ``url`` and ``title`` in
                ``extra_info``. Pages that fail to download or parse are
                reported on stdout and skipped.
        """
        import requests
        from bs4 import BeautifulSoup

        # Drop a trailing slash so URLs built from the domain do not contain
        # "//" (e.g. "https://host//home/").
        self.domain_url = domain_url.rstrip("/")
        self.limit = limit
        self.start_url = f"{self.domain_url}/home/"

        fetched_urls = self.crawl_urls()[: self.limit]

        results = []
        for url in fetched_urls:
            try:
                response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
                soup = BeautifulSoup(response.content, "html.parser")
                page_title, page_text = self._extract_page(soup)
            except Exception as e:
                # Best effort: one broken page must not abort the whole load.
                print(f"Failed for {url} => {e}")
                continue

            results.append(
                Document(
                    page_text,
                    extra_info={"url": url, "title": page_title},
                )
            )

        return results

    @staticmethod
    def _extract_page(soup):
        """Return ``(title, text)`` for a parsed page, stripping page chrome.

        ``soup`` is modified in place: the table of contents, header, footer,
        all links and all empty elements are removed before the text of the
        ``<main>`` element is collected.
        """
        # A page without <title> is still worth indexing; use an empty title
        # instead of failing on ``None.text``.
        title_tag = soup.find("title")
        page_title = title_tag.text if title_tag is not None else ""

        # Remove the div with aria-label="Table of contents"
        table_of_contents_div = soup.find(
            "div", {"aria-label": "Table of contents"}
        )
        if table_of_contents_div:
            table_of_contents_div.decompose()

        # Remove header and footer
        for tag_name in ("header", "footer"):
            chrome = soup.find(tag_name)
            if chrome:
                chrome.decompose()

        # Exclude links and their text content from the main content
        for link in soup.find_all("a"):
            link.decompose()

        # Remove empty elements from the main content
        for element in soup.find_all():
            if element.get_text(strip=True) == "":
                element.decompose()

        # The guide body is expected inside <main>; a page without it yields
        # an empty text.
        main_element = soup.find("main")
        if not main_element:
            return page_title, ""

        # Collapse runs of newlines into a single newline.
        page_text = re.sub(r"\n+", "\n", main_element.get_text("\n"))
        return page_title, page_text

    def crawl_urls(self) -> List[str]:
        """Crawl the workspace from ``self.start_url`` and return its urls.

        Returns:
            List[str]: the unique urls discovered, sorted so that the subset
                kept after truncating to ``limit`` is the same on every run
                (plain set ordering of strings varies between interpreter
                runs).
        """
        self.visited = []
        return sorted(set(self.fetch_url(self.start_url)))

    def fetch_url(self, url: str) -> List[str]:
        """Fetch ``url`` and recursively collect the same-site links under it.

        Only absolute http(s) links whose host equals the host of
        ``self.domain_url`` and root-relative links (``/path``) are kept.
        Links are followed recursively until more than ``self.limit`` pages
        have been visited.

        Args:
            url (str): page to download and scan for links.

        Returns:
            List[str]: unique urls found on this page and on the pages
                reached from it, in no particular order.
        """
        import requests
        from bs4 import BeautifulSoup

        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, "html.parser")

        self.visited.append(url)

        # Compare hosts, not "scheme://host" strings: ``netloc`` never
        # contains the scheme, so comparing it to the full domain url can
        # never match.
        domain_netloc = urlparse(self.domain_url).netloc

        newurls = []
        for link in soup.find_all("a"):
            href = link.get("href")
            if not href:
                continue

            parsed = urlparse(href)
            if parsed.netloc:
                # Absolute link: keep it only when it is fetchable over
                # http(s) and stays on the workspace host. This also rejects
                # protocol-relative links ("//host/path"), which have no
                # scheme and must not be treated as site paths.
                same_site = (
                    parsed.scheme in ("http", "https")
                    and parsed.netloc == domain_netloc
                )
                if same_site:
                    newurls.append(href)
            elif href.startswith("/"):
                # Root-relative link: resolve against the workspace domain.
                newurls.append(f"{self.domain_url}{href}")

        # Every collected url is already on the workspace host, so only the
        # visited check and the crawl budget decide whether to descend.
        discovered = list(newurls)
        for newurl in newurls:
            if newurl not in self.visited and len(self.visited) <= self.limit:
                discovered.extend(self.fetch_url(newurl))

        return list(set(discovered))
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: load a handful of pages from the
    # guides.mangoapps.com workspace and dump the resulting documents.
    guides_reader = MangoppsGuidesReader()
    print("Initialized MangoppsGuidesReader")
    documents = guides_reader.load_data(
        limit=5,
        domain_url="https://guides.mangoapps.com",
    )
    print(documents)
|