mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 19:51:25 +00:00
120 lines
3.3 KiB
Python
120 lines
3.3 KiB
Python
![]() |
"""Fully Rendered Web scraper."""
|
||
|
from typing import Dict, List, Literal, Optional
|
||
|
|
||
|
from llama_index.readers.base import BaseReader
|
||
|
from llama_index.readers.schema.base import Document
|
||
|
|
||
|
from playwright.sync_api._generated import Browser
|
||
|
|
||
|
from pathlib import Path
|
||
|
# Readability.js is shipped alongside this module; resolve it relative to
# this file so the loader works regardless of the process's CWD.
path = Path(__file__).parent / "Readability.js"

# Read the library once at import time with an explicit encoding —
# relying on the platform default could mis-decode the JS source.
readabilityjs = path.read_text(encoding="utf-8")

# JavaScript evaluated inside the rendered page: defines Readability from the
# bundled source, runs it against the live `document`, and returns the parsed
# article object (a plain dict on the Python side).
inject_readability = f"""
(function(){{
{readabilityjs}
  function executor() {{
    return new Readability({{}}, document).parse();
  }}
  return executor();
}}())
"""
|
||
|
|
||
|
|
||
|
class RenderedWebPageReader(BaseReader):
    """Fully Rendered Webpage Loader.

    Extracting relevant information from a fully rendered web page.
    During the processing, it is always assumed that web pages used as
    data sources contain textual content.

    1. Load the page and wait for it to be fully loaded. (playwright)
    2. Inject Readability.js to extract the main content.

    Args:
        proxy (Optional[str], optional): Proxy server. Defaults to None.
        wait_until (Optional[Literal], optional): Playwright load-state event
            to wait for before extraction. Defaults to "domcontentloaded".
    """

    def __init__(
        self,
        proxy: Optional[str] = None,
        wait_until: Optional[
            Literal["commit", "domcontentloaded", "load", "networkidle"]
        ] = "domcontentloaded",
    ) -> None:
        # Always launch headless; a proxy entry is added only when requested.
        self._launch_options = {
            "headless": True,
        }
        self._wait_until = wait_until
        if proxy:
            self._launch_options["proxy"] = {
                "server": proxy,
            }

    def load_data(self, url: str) -> List[Document]:
        """Render and load data content from url.

        Args:
            url (str): URL to scrape.

        Returns:
            List[Document]: List of documents.
        """
        # Imported lazily so merely importing this module does not require
        # playwright to be installed.
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(**self._launch_options)
            try:
                article = self.scrape_page(
                    browser,
                    url,
                )
            finally:
                # Close the browser even when scraping raises, so repeated
                # failures do not leak Chromium processes.
                browser.close()

        # .get() (rather than article[key]) tolerates pages where Readability
        # could not populate a metadata field: missing keys become None
        # instead of raising KeyError.
        extra_info = {
            key: article.get(key)
            for key in [
                "title",
                "length",
                "excerpt",
                "byline",
                "dir",
                "lang",
                "siteName",
            ]
        }

        return [Document(article["textContent"], extra_info=extra_info)]

    def scrape_page(
        self,
        browser: Browser,
        url: str,
    ) -> Dict[str, str]:
        """Scrape a single article url.

        Args:
            browser (Any): a Playwright Chromium browser.
            url (str): URL of the article to scrape.

        Returns:
            Ref: https://github.com/mozilla/readability
            title: article title;
            content: HTML string of processed article content;
            textContent: text content of the article, with all the HTML tags removed;
            length: length of an article, in characters;
            excerpt: article description, or short excerpt from the content;
            byline: author metadata;
            dir: content direction;
            siteName: name of the site.
            lang: content language
        """
        page = browser.new_page(ignore_https_errors=True)
        try:
            # Generous 60s timeout: fully rendered pages can be slow to settle.
            page.set_default_timeout(60000)
            page.goto(url, wait_until=self._wait_until)

            # Run Readability.js inside the page; the parsed article object
            # is marshalled back to Python as a plain dict.
            article = page.evaluate(inject_readability)
        finally:
            # Release the page even when navigation or evaluation raises.
            page.close()

        print("scraped:", url)
        return article
|