"""MangoppsGuides reader."""
import re
from typing import List
from urllib.parse import urlparse

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class MangoppsGuidesReader(BaseReader):
    """MangoppsGuides reader. Reads data from a MangoppsGuides workspace.

    Args:
        domain_url (str): MangoppsGuides domain url
        limit (int): maximum number of pages to crawl
    """

    def __init__(self) -> None:
        """Initialize MangoppsGuides reader."""

    def load_data(self, domain_url: str, limit: int) -> List[Document]:
        """Load data from the workspace.

        Returns:
            List[Document]: List of documents.
        """
        import requests
        from bs4 import BeautifulSoup

        self.domain_url = domain_url
        self.limit = limit
        self.start_url = f"{self.domain_url}/home/"

        # Crawl the workspace and cap the result at `limit` pages
        fetched_urls = self.crawl_urls()[: self.limit]

        results = []
        guides_pages = {}
        for url in fetched_urls:
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.content, "html.parser")

                page_title = soup.find("title").text

                # Remove the div with aria-label="Table of contents"
                table_of_contents_div = soup.find(
                    "div", {"aria-label": "Table of contents"}
                )
                if table_of_contents_div:
                    table_of_contents_div.decompose()

                # Remove header and footer
                header = soup.find("header")
                if header:
                    header.decompose()
                footer = soup.find("footer")
                if footer:
                    footer.decompose()

                # Exclude links and their text content from the main content
                for link in soup.find_all("a"):
                    link.decompose()

                # Remove empty elements from the main content
                for element in soup.find_all():
                    if element.get_text(strip=True) == "":
                        element.decompose()

                # Find the main element containing the desired content
                main_element = soup.find(
                    "main"
                )  # Replace "main" with the appropriate element tag or CSS class

                # Extract the text content from the main element
                if main_element:
                    text_content = main_element.get_text("\n")
                    # Remove multiple consecutive newlines and keep only one newline
                    text_content = re.sub(r"\n+", "\n", text_content)
                else:
                    text_content = ""

                page_text = text_content

                guides_page = {}
                guides_page["title"] = page_title
                guides_page["text"] = page_text
                guides_pages[url] = guides_page
            except Exception as e:
                print(f"Failed for {url} => {e}")
        for k, v in guides_pages.items():
            extra_info = {"url": k, "title": v["title"]}
            results.append(
                Document(
                    v["text"],
                    extra_info=extra_info,
                )
            )

        return results

    def crawl_urls(self) -> List[str]:
        """Crawl all the urls reachable from the given domain."""
        self.visited = []

        fetched_urls = self.fetch_url(self.start_url)
        fetched_urls = list(set(fetched_urls))

        return fetched_urls

    def fetch_url(self, url):
        """Fetch the urls linked from the given page."""
        import requests
        from bs4 import BeautifulSoup

        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        self.visited.append(url)

        newurls = []
        for link in soup.find_all("a"):
            href: str = link.get("href")
            # Keep absolute links that stay on this domain and resolve
            # relative links against the domain url
            if href and f"https://{urlparse(href).netloc}" == self.domain_url:
                newurls.append(href)
            elif href and href.startswith("/"):
                newurls.append(f"{self.domain_url}{href}")

        for newurl in newurls:
            if (
                newurl not in self.visited
                and not newurl.startswith("#")
                and f"https://{urlparse(newurl).netloc}" == self.domain_url
                and len(self.visited) <= self.limit
            ):
                newurls = newurls + self.fetch_url(newurl)

        newurls = list(set(newurls))

        return newurls
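
    # A minimal sketch of an iterative alternative to the recursive
    # fetch_url above: a breadth-first crawl that sidesteps Python's
    # recursion limit on large workspaces. The name crawl_urls_iterative
    # is hypothetical and not part of the original reader; it reuses the
    # same requests/BeautifulSoup dependencies and the same domain check.
    def crawl_urls_iterative(self) -> List[str]:
        """Breadth-first variant of crawl_urls (illustrative sketch)."""
        from collections import deque

        import requests
        from bs4 import BeautifulSoup

        seen = {self.start_url}
        queue = deque([self.start_url])
        while queue and len(seen) <= self.limit:
            url = queue.popleft()
            soup = BeautifulSoup(requests.get(url).content, "html.parser")
            for link in soup.find_all("a"):
                href = link.get("href")
                if href and href.startswith("/"):
                    # Resolve relative links against the domain url
                    href = f"{self.domain_url}{href}"
                if (
                    href
                    and href not in seen
                    and f"https://{urlparse(href).netloc}" == self.domain_url
                ):
                    seen.add(href)
                    queue.append(href)
        return list(seen)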


if __name__ == "__main__":
    reader = MangoppsGuidesReader()
    print("Initialized MangoppsGuidesReader")
    output = reader.load_data(domain_url="https://guides.mangoapps.com", limit=5)
    print(output)
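
    # The reader returns llama_index Document objects carrying the page
    # text plus the url/title metadata attached in load_data. A minimal
    # sketch for inspecting the crawl, assuming the `text` and
    # `extra_info` fields of this Document schema:
    for doc in output:
        print(doc.extra_info["url"], "-", doc.extra_info["title"])
        print(doc.text[:200])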