Update bs4 readme

EmptyCrown 2023-02-15 09:20:06 -08:00
parent 23ae4928cb
commit 6fb47cf7f6
2 changed files with 9 additions and 8 deletions


@@ -79,8 +79,9 @@ output = agent_chain.run(input="What language is on this website?")
 ```
 ## Custom hostname example:
-To use a custom hostname like readme.co,substack.com or any other custom hostname, you can pass in the `custom_hostname` argument. Do not include a trailing slash for readme.com links.
+To use a custom hostname like readme.co, substack.com or any other commonly-used website template, you can pass in the `custom_hostname` argument to guarantee that a custom parser is used (if it exists). Check out the code to see which ones are currently implemented.
 ```python
 documents = loader.load_data(urls=["https://langchain.readthedocs.io/en/latest/"], custom_hostname="readthedocs.io")
 ```
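For instance, a Substack newsletter could presumably be loaded the same way. A minimal sketch: the URL below is a placeholder, and `custom_hostname="substack.com"` assumes the Substack parser is keyed on that hostname (check the code to confirm):

```python
documents = loader.load_data(
    urls=["https://example.substack.com/"],  # placeholder newsletter URL
    custom_hostname="substack.com",  # assumed key for the Substack parser
)
```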


@@ -19,6 +19,9 @@ def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
 def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     """Extract text from a ReadTheDocs documentation site"""
+    import requests
+    from bs4 import BeautifulSoup
+
     links = soup.find_all("a", {"class": "reference internal"})
     rtd_links = []
@@ -27,10 +30,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     for i in range(len(rtd_links)):
         if not rtd_links[i].startswith("http"):
             rtd_links[i] = urljoin(url, rtd_links[i])
 
-    texts = []
-    import requests
-    from bs4 import BeautifulSoup
+    texts = []
     for doc_link in rtd_links:
         page_link = requests.get(doc_link)
         soup = BeautifulSoup(page_link.text, "html.parser")
@@ -46,6 +47,8 @@ def _readthedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
 def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
     """Extract text from a ReadMe documentation site"""
+    import requests
+    from bs4 import BeautifulSoup
 
     links = soup.find_all("a")
     docs_links = [link["href"] for link in links if "/docs/" in link["href"]]
@@ -55,9 +58,6 @@ def _readmedocs_reader(soup: Any, url: str) -> Tuple[str, Dict[str, Any]]:
             docs_links[i] = urljoin(url, docs_links[i])
 
     texts = []
-    import requests
-    from bs4 import BeautifulSoup
-
     for doc_link in docs_links:
         page_link = requests.get(doc_link)
         soup = BeautifulSoup(page_link.text, "html.parser")
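Across both readers the change is the same: the deferred `requests`/`bs4` imports move from the middle of the function up to the top of its body, ahead of the link collection. A minimal sketch of the resulting shape, using a hypothetical `_fetch_linked_pages` helper rather than code from this repo:

```python
from typing import Any, List
from urllib.parse import urljoin


def _fetch_linked_pages(soup: Any, url: str) -> List[str]:
    """Hypothetical helper showing the post-change layout of the readers."""
    # Deferred imports now sit at the top of the function body.
    import requests
    from bs4 import BeautifulSoup

    # Collect hrefs and resolve relative links against the page URL.
    links = [a["href"] for a in soup.find_all("a") if a.get("href")]
    links = [l if l.startswith("http") else urljoin(url, l) for l in links]

    # Fetch each linked page and extract its visible text.
    texts = []
    for doc_link in links:
        page = requests.get(doc_link)
        page_soup = BeautifulSoup(page.text, "html.parser")
        texts.append(page_soup.get_text())
    return texts
```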