"""MangoppsGuides reader."""
import re
from typing import List
from urllib.parse import urlparse

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class MangoppsGuidesReader(BaseReader):
    """MangoppsGuides reader. Reads data from a MangoppsGuides workspace.

    Args:
        domain_url (str): MangoppsGuides domain url
        limit (int): maximum number of pages to crawl
    """

    def __init__(self) -> None:
        """Initialize MangoppsGuides reader."""

    def load_data(self, domain_url: str, limit: int) -> List[Document]:
        """Load data from the workspace.

        Returns:
            List[Document]: List of documents.
        """
        import requests
        from bs4 import BeautifulSoup

        self.domain_url = domain_url
        self.limit = limit
        self.start_url = f"{self.domain_url}/home/"

        # Crawl the workspace and cap the result at `limit` pages
        fetched_urls = self.crawl_urls()[: self.limit]

        results = []
        guides_pages = {}
        for url in fetched_urls:
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.content, "html.parser")

                page_title = soup.find("title").text

                # Remove the div with aria-label="Table of contents"
                table_of_contents_div = soup.find(
                    "div", {"aria-label": "Table of contents"}
                )
                if table_of_contents_div:
                    table_of_contents_div.decompose()

                # Remove header and footer
                header = soup.find("header")
                if header:
                    header.decompose()
                footer = soup.find("footer")
                if footer:
                    footer.decompose()

                # Exclude links and their text content from the main content
                for link in soup.find_all("a"):
                    link.decompose()

                # Remove empty elements from the main content
                for element in soup.find_all():
                    if element.get_text(strip=True) == "":
                        element.decompose()

                # Find the main element containing the desired content
                main_element = soup.find(
                    "main"
                )  # Replace "main" with the appropriate element tag or CSS class

                # Extract the text content from the main element
                if main_element:
                    text_content = main_element.get_text("\n")
                    # Remove multiple consecutive newlines and keep only one newline
                    text_content = re.sub(r"\n+", "\n", text_content)
                else:
                    text_content = ""

                page_text = text_content

                guides_page = {}
                guides_page["title"] = page_title
                guides_page["text"] = page_text
                guides_pages[url] = guides_page
            except Exception as e:
                print(f"Failed for {url} => {e}")
        for k, v in guides_pages.items():
            extra_info = {"url": k, "title": v["title"]}
            results.append(
                Document(
                    v["text"],
                    extra_info=extra_info,
                )
            )

        return results

    def crawl_urls(self) -> List[str]:
        """Crawl all the urls reachable from the given domain."""
        self.visited = []

        fetched_urls = self.fetch_url(self.start_url)
        fetched_urls = list(set(fetched_urls))

        return fetched_urls

    def fetch_url(self, url):
        """Fetch the urls linked from the given page."""
        import requests
        from bs4 import BeautifulSoup

        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        self.visited.append(url)

        newurls = []
        for link in soup.find_all("a"):
            href: str = link.get("href")
            # Keep absolute links that stay on this domain and resolve
            # relative links against the domain url
            if href and f"https://{urlparse(href).netloc}" == self.domain_url:
                newurls.append(href)
            elif href and href.startswith("/"):
                newurls.append(f"{self.domain_url}{href}")

        for newurl in newurls:
            if (
                newurl not in self.visited
                and not newurl.startswith("#")
                and f"https://{urlparse(newurl).netloc}" == self.domain_url
                and len(self.visited) <= self.limit
            ):
                newurls = newurls + self.fetch_url(newurl)

        newurls = list(set(newurls))

        return newurls
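
    # A minimal sketch of an iterative alternative to the recursive
    # fetch_url above: a breadth-first crawl that sidesteps Python's
    # recursion limit on large workspaces. The name crawl_urls_iterative
    # is hypothetical and not part of the original reader; it reuses the
    # same requests/BeautifulSoup dependencies and the same domain check.
    def crawl_urls_iterative(self) -> List[str]:
        """Breadth-first variant of crawl_urls (illustrative sketch)."""
        from collections import deque

        import requests
        from bs4 import BeautifulSoup

        seen = {self.start_url}
        queue = deque([self.start_url])
        while queue and len(seen) <= self.limit:
            url = queue.popleft()
            soup = BeautifulSoup(requests.get(url).content, "html.parser")
            for link in soup.find_all("a"):
                href = link.get("href")
                if href and href.startswith("/"):
                    # Resolve relative links against the domain url
                    href = f"{self.domain_url}{href}"
                if (
                    href
                    and href not in seen
                    and f"https://{urlparse(href).netloc}" == self.domain_url
                ):
                    seen.add(href)
                    queue.append(href)
        return list(seen)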


if __name__ == "__main__":
    reader = MangoppsGuidesReader()
    print("Initialized MangoppsGuidesReader")
    output = reader.load_data(domain_url="https://guides.mangoapps.com", limit=5)
    print(output)
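
    # The reader returns llama_index Document objects carrying the page
    # text plus the url/title metadata attached in load_data. A minimal
    # sketch for inspecting the crawl, assuming the `text` and
    # `extra_info` fields of this Document schema:
    for doc in output:
        print(doc.extra_info["url"], "-", doc.extra_info["title"])
        print(doc.text[:200])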