Replace deprecated Selenium methods (#2724)

* Fix crawler.py

* Fix test_connector.py

* unused import

Co-authored-by: danielbichuetti <daniel.bichuetti@gmail.com>
Sara Zan 2022-06-24 12:05:32 +02:00 committed by GitHub
commit e8546e2124
2 changed files with 8 additions and 5 deletions

crawler.py

@@ -9,6 +9,8 @@ from urllib.parse import urlparse
 try:
     from webdriver_manager.chrome import ChromeDriverManager
+    from selenium.webdriver.chrome.service import Service
+    from selenium.webdriver.common.by import By
     from selenium import webdriver
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed
@@ -77,7 +79,7 @@ class Crawler(BaseComponent):
             try:
                 options.add_argument("--no-sandbox")
                 options.add_argument("--disable-dev-shm-usage")
-                self.driver = webdriver.Chrome("chromedriver", options=options)
+                self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
             except:
                 raise Exception(
                     """
@@ -89,7 +91,7 @@ class Crawler(BaseComponent):
                 )
         else:
             logger.info("'chrome-driver' will be automatically installed.")
-            self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
+            self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
         self.urls = urls
         self.output_dir = output_dir
         self.crawler_depth = crawler_depth
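
For context: Selenium 4 removed the deprecated positional executable-path argument to webdriver.Chrome(); the driver binary must now be wrapped in a Service object, which is what both hunks above switch to. A minimal standalone sketch of the new construction (the bare "chromedriver" path assumes the binary is on PATH, as in the patch):

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Selenium 3 style, removed in Selenium 4:
    #   webdriver.Chrome("chromedriver", options=options)
    driver = webdriver.Chrome(service=Service("chromedriver"), options=options)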
@@ -200,7 +202,7 @@ class Crawler(BaseComponent):
         for link in urls:
             logger.info(f"writing contents from `{link}`")
             self.driver.get(link)
-            el = self.driver.find_element_by_tag_name("body")
+            el = self.driver.find_element(by=By.TAG_NAME, value="body")
             if extract_hidden_text:
                 text = el.get_attribute("textContent")
             else:
@@ -316,7 +318,7 @@ class Crawler(BaseComponent):
             filter_pattern = re.compile("|".join(filter_urls))
         self.driver.get(base_url)
-        a_elements = self.driver.find_elements_by_xpath("//a[@href]")
+        a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
         sub_links = set()
         for i in a_elements:
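
Selenium 4 likewise removed the per-strategy find_element_by_* / find_elements_by_* helpers; the single find_element / find_elements entry point now takes a By locator, as the two hunks above show. A short sketch of the pattern (the URL is a placeholder, not from this patch):

    from selenium.webdriver.common.by import By

    driver.get("https://example.com")  # placeholder URL
    el = driver.find_element(by=By.TAG_NAME, value="body")  # was: find_element_by_tag_name("body")
    a_elements = driver.find_elements(by=By.XPATH, value="//a[@href]")  # was: find_elements_by_xpath("//a[@href]")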

test_connector.py

@@ -4,6 +4,7 @@ import json
 from pathlib import Path
 import pytest
+from selenium.webdriver.common.by import By
 from haystack.nodes.connector import Crawler
 from haystack.schema import Document
@@ -23,7 +24,7 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
     :param crawled_page: the output of Crawler (one element of the paths list)
     """
     crawler.driver.get(url)
-    body = crawler.driver.find_element_by_tag_name("body")
+    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")
     if crawler.extract_hidden_text:
         expected_crawled_content = body.get_attribute("textContent")
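
A side note on the test helper above: get_attribute("textContent") and the element's .text property return different things, which is why the extract_hidden_text flag switches between them. A small sketch, assuming a Crawler instance with a live driver:

    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")
    hidden_and_visible = body.get_attribute("textContent")  # full DOM text, including CSS-hidden elements
    visible_only = body.text  # only text rendered on screen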