mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-12-02 18:10:19 +00:00
Update bs4 readme
This commit is contained in:
parent
23ae4928cb
commit
6fb47cf7f6
@ -79,8 +79,9 @@ output = agent_chain.run(input="What language is on this website?")
|
||||
```
|
||||
|
||||
## Custom hostname example:
|
||||
To use a custom hostname like readme.com, substack.com or any other custom hostname, you can pass in the `custom_hostname` argument. Do not include a trailing slash for readme.com links.
|
||||
|
||||
To use a custom hostname like readme.com, substack.com or any other commonly-used website template, you can pass in the `custom_hostname` argument to guarantee that a custom parser is used (if it exists). Check out the code to see which ones are currently implemented.
|
||||
|
||||
```python
|
||||
documents = loader.load_data(urls=["https://langchain.readthedocs.io/en/latest/"], custom_hostname="readthedocs.io")
|
||||
```
|
||||
|
||||
|
||||
@ -19,6 +19,9 @@ def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
|
||||
|
||||
def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Extract text from a ReadTheDocs documentation site"""
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
links = soup.find_all("a", {"class": "reference internal"})
|
||||
rtd_links = []
|
||||
|
||||
@ -27,10 +30,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
|
||||
for i in range(len(rtd_links)):
|
||||
if not rtd_links[i].startswith("http"):
|
||||
rtd_links[i] = urljoin(url, rtd_links[i])
|
||||
texts = []
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
texts = []
|
||||
for doc_link in rtd_links:
|
||||
page_link = requests.get(doc_link)
|
||||
soup = BeautifulSoup(page_link.text, "html.parser")
|
||||
@ -46,6 +47,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
|
||||
|
||||
def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
|
||||
"""Extract text from a ReadMe documentation site"""
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
links = soup.find_all("a")
|
||||
docs_links = [link["href"] for link in links if "/docs/" in link["href"]]
|
||||
@ -55,9 +58,6 @@ def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
|
||||
docs_links[i] = urljoin(url, docs_links[i])
|
||||
|
||||
texts = []
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
for doc_link in docs_links:
|
||||
page_link = requests.get(doc_link)
|
||||
soup = BeautifulSoup(page_link.text, "html.parser")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user