From e8546e21243cbb8750ee5539e91bcc73f981d561 Mon Sep 17 00:00:00 2001 From: Sara Zan Date: Fri, 24 Jun 2022 12:05:32 +0200 Subject: [PATCH] Replace deprecated Selenium methods (#2724) * Fix crawler.py * Fix test_connector.py * unused import Co-authored-by: danielbichuetti --- haystack/nodes/connector/crawler.py | 10 ++++++---- test/nodes/test_connector.py | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index fcf442922..b4dd23f55 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -9,6 +9,8 @@ from urllib.parse import urlparse try: from webdriver_manager.chrome import ChromeDriverManager + from selenium.webdriver.chrome.service import Service + from selenium.webdriver.common.by import By from selenium import webdriver except (ImportError, ModuleNotFoundError) as ie: from haystack.utils.import_utils import _optional_component_not_installed @@ -77,7 +79,7 @@ class Crawler(BaseComponent): try: options.add_argument("--no-sandbox") options.add_argument("--disable-dev-shm-usage") - self.driver = webdriver.Chrome("chromedriver", options=options) + self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options) except: raise Exception( """ @@ -89,7 +91,7 @@ class Crawler(BaseComponent): ) else: logger.info("'chrome-driver' will be automatically installed.") - self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) + self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) self.urls = urls self.output_dir = output_dir self.crawler_depth = crawler_depth @@ -200,7 +202,7 @@ class Crawler(BaseComponent): for link in urls: logger.info(f"writing contents from `{link}`") self.driver.get(link) - el = self.driver.find_element_by_tag_name("body") + el = self.driver.find_element(by=By.TAG_NAME, value="body") if extract_hidden_text: text = el.get_attribute("textContent") else: @@ -316,7 +318,7 @@ class Crawler(BaseComponent): filter_pattern = re.compile("|".join(filter_urls)) self.driver.get(base_url) - a_elements = self.driver.find_elements_by_xpath("//a[@href]") + a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]") sub_links = set() for i in a_elements: diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py index 382c1e9be..e9c998320 100644 --- a/test/nodes/test_connector.py +++ b/test/nodes/test_connector.py @@ -4,6 +4,7 @@ import json from pathlib import Path import pytest +from selenium.webdriver.common.by import By from haystack.nodes.connector import Crawler from haystack.schema import Document @@ -23,7 +24,7 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path): :param crawled_page: the output of Crawler (one element of the paths list) """ crawler.driver.get(url) - body = crawler.driver.find_element_by_tag_name("body") + body = crawler.driver.find_element(by=By.TAG_NAME, value="body") if crawler.extract_hidden_text: expected_crawled_content = body.get_attribute("textContent")