Update bs4 readme

EmptyCrown 2023-02-15 09:20:06 -08:00
parent 23ae4928cb
commit 6fb47cf7f6
2 changed files with 9 additions and 8 deletions


@@ -79,8 +79,9 @@ output = agent_chain.run(input="What language is on this website?")
 ```
 ## Custom hostname example:
-To use a custom hostname like readme.co,substack.com or any other custom hostname, you can pass in the `custom_hostname` argument. Do not include a trailing slash for readme.com links.
+To use a custom hostname like readme.co, substack.com or any other commonly-used website template, you can pass in the `custom_hostname` argument to guarantee that a custom parser is used (if it exists). Check out the code to see which ones are currently implemented.
 ```python
 documents = loader.load_data(urls=["https://langchain.readthedocs.io/en/latest/"], custom_hostname="readthedocs.io")
 ```
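For instance, a Substack newsletter could presumably be loaded the same way. A minimal sketch: the URL below is a placeholder, and `custom_hostname="substack.com"` assumes the Substack parser is keyed on that hostname (check the code to confirm):

```python
documents = loader.load_data(
    urls=["https://example.substack.com/"],  # placeholder newsletter URL
    custom_hostname="substack.com",  # assumed key for the Substack parser
)
```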


@@ -19,6 +19,9 @@ def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
 def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     """Extract text from a ReadTheDocs documentation site"""
+    import requests
+    from bs4 import BeautifulSoup
+
     links = soup.find_all("a", {"class": "reference internal"})
     rtd_links = []
@@ -27,10 +30,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     for i in range(len(rtd_links)):
         if not rtd_links[i].startswith("http"):
             rtd_links[i] = urljoin(url, rtd_links[i])
 
-    texts = []
-    import requests
-    from bs4 import BeautifulSoup
+    texts = []
     for doc_link in rtd_links:
         page_link = requests.get(doc_link)
         soup = BeautifulSoup(page_link.text, "html.parser")
@@ -46,6 +47,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
 def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     """Extract text from a ReadMe documentation site"""
+    import requests
+    from bs4 import BeautifulSoup
 
     links = soup.find_all("a")
     docs_links = [link["href"] for link in links if "/docs/" in link["href"]]
@@ -55,9 +58,6 @@ def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
             docs_links[i] = urljoin(url, docs_links[i])
 
     texts = []
-    import requests
-    from bs4 import BeautifulSoup
-
     for doc_link in docs_links:
         page_link = requests.get(doc_link)
         soup = BeautifulSoup(page_link.text, "html.parser")
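Across both readers the change is the same: the deferred `requests`/`bs4` imports move from the middle of the function up to the top of its body, ahead of the link collection. A minimal sketch of the resulting shape, using a hypothetical `_fetch_linked_pages` helper rather than code from this repo:

```python
from typing import Any, List
from urllib.parse import urljoin


def _fetch_linked_pages(soup: Any, url: str) -> List[str]:
    """Hypothetical helper showing the post-change layout of the readers."""
    # Deferred imports now sit at the top of the function body.
    import requests
    from bs4 import BeautifulSoup

    # Collect hrefs and resolve relative links against the page URL.
    links = [a["href"] for a in soup.find_all("a") if a.get("href")]
    links = [l if l.startswith("http") else urljoin(url, l) for l in links]

    # Fetch each linked page and extract its visible text.
    texts = []
    for doc_link in links:
        page = requests.get(doc_link)
        page_soup = BeautifulSoup(page.text, "html.parser")
        texts.append(page_soup.get_text())
    return texts
```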