Replace deprecated Selenium methods (#2724)

* Fix crawler.py
* Fix test_connector.py
* unused import

Co-authored-by: danielbichuetti <daniel.bichuetti@gmail.com>
2025-11-02 18:59:28 +00:00 · 2022-06-24 12:05:32 +02:00 · 2022-06-24 12:05:32 +02:00 · e8546e2124
commit e8546e2124
parent 400d2cdf77
2 changed files with 8 additions and 5 deletions
--- a/haystack/nodes/connector/crawler.py
+++ b/haystack/nodes/connector/crawler.py
@@ -9,6 +9,8 @@ from urllib.parse import urlparse

 try:
    from webdriver_manager.chrome import ChromeDriverManager
+    from selenium.webdriver.chrome.service import Service
+    from selenium.webdriver.common.by import By
    from selenium import webdriver
 except (ImportError, ModuleNotFoundError) as ie:
    from haystack.utils.import_utils import _optional_component_not_installed
@@ -77,7 +79,7 @@ class Crawler(BaseComponent):
            try:
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-dev-shm-usage")
-                self.driver = webdriver.Chrome("chromedriver", options=options)
+                self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
            except:
                raise Exception(
                    """
@@ -89,7 +91,7 @@ class Crawler(BaseComponent):
                )
        else:
            logger.info("'chrome-driver' will be automatically installed.")
-            self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
+            self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        self.urls = urls
        self.output_dir = output_dir
        self.crawler_depth = crawler_depth
@@ -200,7 +202,7 @@ class Crawler(BaseComponent):
        for link in urls:
            logger.info(f"writing contents from `{link}`")
            self.driver.get(link)
-            el = self.driver.find_element_by_tag_name("body")
+            el = self.driver.find_element(by=By.TAG_NAME, value="body")
            if extract_hidden_text:
                text = el.get_attribute("textContent")
            else:
@@ -316,7 +318,7 @@ class Crawler(BaseComponent):
            filter_pattern = re.compile("|".join(filter_urls))

        self.driver.get(base_url)
-        a_elements = self.driver.find_elements_by_xpath("//a[@href]")
+        a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
        sub_links = set()

        for i in a_elements:
--- a/test/nodes/test_connector.py
+++ b/test/nodes/test_connector.py
@@ -4,6 +4,7 @@ import json
 from pathlib import Path

 import pytest
+from selenium.webdriver.common.by import By

 from haystack.nodes.connector import Crawler
 from haystack.schema import Document
@@ -23,7 +24,7 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
    :param crawled_page: the output of Crawler (one element of the paths list)
    """
    crawler.driver.get(url)
-    body = crawler.driver.find_element_by_tag_name("body")
+    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")

    if crawler.extract_hidden_text:
        expected_crawled_content = body.get_attribute("textContent")