diff --git a/loader_hub/web/beautiful_soup_web/README.md b/loader_hub/web/beautiful_soup_web/README.md index 1d0f8273..9c60132b 100644 --- a/loader_hub/web/beautiful_soup_web/README.md +++ b/loader_hub/web/beautiful_soup_web/README.md @@ -79,8 +79,9 @@ output = agent_chain.run(input="What language is on this website?") ``` ## Custom hostname example: -To use a custom hostname like readme.co,substack.com or any other custom hostname, you can pass in the `custom_hostname` argument. Do not include a trailing slash for readme.com links. + +To use a custom hostname like readme.com, substack.com or any other commonly-used website template, you can pass in the `custom_hostname` argument to guarantee that a custom parser is used (if it exists). Check out the code to see which ones are currently implemented. + ```python documents = loader.load_data(urls=["https://langchain.readthedocs.io/en/latest/"], custom_hostname="readthedocs.io") ``` - diff --git a/loader_hub/web/beautiful_soup_web/base.py b/loader_hub/web/beautiful_soup_web/base.py index 60778ea3..05634daf 100644 --- a/loader_hub/web/beautiful_soup_web/base.py +++ b/loader_hub/web/beautiful_soup_web/base.py @@ -19,6 +19,9 @@ def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]: def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]: """Extract text from a ReadTheDocs documentation site""" + import requests + from bs4 import BeautifulSoup + links = soup.find_all("a", {"class": "reference internal"}) rtd_links = [] @@ -27,10 +30,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]: for i in range(len(rtd_links)): if not rtd_links[i].startswith("http"): rtd_links[i] = urljoin(url, rtd_links[i]) - texts = [] - import requests - from bs4 import BeautifulSoup + texts = [] for doc_link in rtd_links: page_link = requests.get(doc_link) soup = BeautifulSoup(page_link.text, "html.parser") @@ -46,6 +47,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, 
Any]]: def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]: """Extract text from a ReadMe documentation site""" + import requests + from bs4 import BeautifulSoup links = soup.find_all("a") docs_links = [link["href"] for link in links if "/docs/" in link["href"]] @@ -55,9 +58,6 @@ def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]: docs_links[i] = urljoin(url, docs_links[i]) texts = [] - import requests - from bs4 import BeautifulSoup - for doc_link in docs_links: page_link = requests.get(doc_link) soup = BeautifulSoup(page_link.text, "html.parser")