Update bs4 readme
commit 6fb47cf7f6 (parent 23ae4928cb)
@@ -79,8 +79,9 @@ output = agent_chain.run(input="What language is on this website?")
 ```
 
 ## Custom hostname example:
 
-To use a custom hostname like readme.co,substack.com or any other custom hostname, you can pass in the `custom_hostname` argument. Do not include a trailing slash for readme.com links.
+To use a custom hostname like readme.co, substack.com or any other commonly-used website template, you can pass in the `custom_hostname` argument to guarantee that a custom parser is used (if it exists). Check out the code to see which ones are currently implemented.
 
 ```python
 documents = loader.load_data(urls=["https://langchain.readthedocs.io/en/latest/"], custom_hostname="readthedocs.io")
 ```
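The README snippet above assumes a loader object already exists. A minimal end-to-end sketch, assuming the usual llama-hub `download_loader` pattern for obtaining this package's `BeautifulSoupWebReader` (the loader construction is not part of this diff):

```python
from llama_index import download_loader

# Fetch the loader class by name (the standard llama-hub usage pattern).
BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
loader = BeautifulSoupWebReader()

# custom_hostname forces the ReadTheDocs-specific parser even though the
# page is served from the langchain.readthedocs.io subdomain.
documents = loader.load_data(
    urls=["https://langchain.readthedocs.io/en/latest/"],
    custom_hostname="readthedocs.io",
)
```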
@@ -19,6 +19,9 @@ def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
 
 
 def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     """Extract text from a ReadTheDocs documentation site"""
+    import requests
+    from bs4 import BeautifulSoup
+
     links = soup.find_all("a", {"class": "reference internal"})
     rtd_links = []
@@ -27,10 +30,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     for i in range(len(rtd_links)):
         if not rtd_links[i].startswith("http"):
             rtd_links[i] = urljoin(url, rtd_links[i])
-    texts = []
-    import requests
-    from bs4 import BeautifulSoup
 
+    texts = []
     for doc_link in rtd_links:
         page_link = requests.get(doc_link)
         soup = BeautifulSoup(page_link.text, "html.parser")
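Read together, the two hunks above move the `requests` and `BeautifulSoup` imports from the middle of `_readthedocs_reader` to the top of the function, before their first use. A sketch of the whole function after this change follows; the lines between the hunks (filling `rtd_links` and extracting text from each fetched page) are outside the diff context, so those parts are assumptions and marked as such:

```python
from typing import Any, Dict, Tuple
from urllib.parse import urljoin


def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
    """Extract text from a ReadTheDocs documentation site"""
    import requests
    from bs4 import BeautifulSoup

    links = soup.find_all("a", {"class": "reference internal"})
    rtd_links = []

    # Assumed (not shown in the diff): collect each internal-reference href.
    for link in links:
        if link.get("href"):
            rtd_links.append(link["href"])
    for i in range(len(rtd_links)):
        if not rtd_links[i].startswith("http"):
            rtd_links[i] = urljoin(url, rtd_links[i])

    texts = []
    for doc_link in rtd_links:
        page_link = requests.get(doc_link)
        soup = BeautifulSoup(page_link.text, "html.parser")
        # Assumed (not shown in the diff): grab the main content region of
        # each page, skipping pages that lack one.
        main = soup.find(attrs={"role": "main"})
        if main is not None:
            texts.append(main.get_text())
    # The return value is likewise an assumption about the surrounding code.
    return "\n".join(texts), {"links": rtd_links}
```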
@@ -46,6 +47,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
 
 def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     """Extract text from a ReadMe documentation site"""
+    import requests
+    from bs4 import BeautifulSoup
 
     links = soup.find_all("a")
     docs_links = [link["href"] for link in links if "/docs/" in link["href"]]
@@ -55,9 +58,6 @@ def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
             docs_links[i] = urljoin(url, docs_links[i])
 
     texts = []
-    import requests
-    from bs4 import BeautifulSoup
-
     for doc_link in docs_links:
         page_link = requests.get(doc_link)
         soup = BeautifulSoup(page_link.text, "html.parser")
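The README change promises that a custom parser is used "if it exists" for a given hostname. The dispatch itself is not part of this diff; the sketch below shows one minimal way such a hostname-to-parser table could work. The table and function names here are hypothetical, and only the two reader functions come from this commit:

```python
# Hypothetical hostname-to-parser table; the mapping name and the fallback
# logic are assumptions, only the reader functions appear in this commit.
WEBSITE_EXTRACTORS = {
    "readthedocs.io": _readthedocs_reader,
    "readme.com": _readmedocs_reader,
}


def _extract(soup, url, hostname):
    reader = WEBSITE_EXTRACTORS.get(hostname)
    if reader is None:
        # No custom parser registered: fall back to plain text extraction.
        return soup.get_text(), {}
    return reader(soup, url)
```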