Replace deprecated Selenium methods (#2724)

* Fix crawler.py

* Fix test_connector.py

* Remove unused import

Co-authored-by: danielbichuetti <daniel.bichuetti@gmail.com>
Sara Zan, 2022-06-24 12:05:32 +02:00, committed by GitHub
parent 400d2cdf77
commit e8546e2124
2 changed files with 8 additions and 5 deletions
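For context: Selenium 4 deprecated passing the driver path positionally to `webdriver.Chrome` in favor of a `Service` object, and the `find_element_by_*` / `find_elements_by_*` helpers were deprecated and, as of Selenium 4.3, removed in favor of the generic `find_element` / `find_elements` plus a `By` locator. A minimal before/after sketch of the pattern this commit applies (the URL is illustrative, not from the commit):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# New style: wrap the chromedriver path in a Service object
# (deprecated: webdriver.Chrome("chromedriver"))
driver = webdriver.Chrome(service=Service("chromedriver"))
driver.get("https://example.com")  # illustrative URL

# Deprecated: driver.find_element_by_tag_name("body")
body = driver.find_element(by=By.TAG_NAME, value="body")
# Deprecated: driver.find_elements_by_xpath("//a[@href]")
links = driver.find_elements(by=By.XPATH, value="//a[@href]")

driver.quit()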

haystack/nodes/connector/crawler.py

@@ -9,6 +9,8 @@ from urllib.parse import urlparse
 try:
     from webdriver_manager.chrome import ChromeDriverManager
+    from selenium.webdriver.chrome.service import Service
+    from selenium.webdriver.common.by import By
     from selenium import webdriver
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed
@@ -77,7 +79,7 @@ class Crawler(BaseComponent):
             try:
                 options.add_argument("--no-sandbox")
                 options.add_argument("--disable-dev-shm-usage")
-                self.driver = webdriver.Chrome("chromedriver", options=options)
+                self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
             except:
                 raise Exception(
                     """
@@ -89,7 +91,7 @@ class Crawler(BaseComponent):
                 )
         else:
             logger.info("'chrome-driver' will be automatically installed.")
-            self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
+            self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
         self.urls = urls
         self.output_dir = output_dir
         self.crawler_depth = crawler_depth
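Both constructor changes reduce to the same move: wrap whichever chromedriver path is in play (a preinstalled binary in Colab, or one fetched by webdriver-manager) in a `Service`. A condensed sketch of the resulting `__init__` logic, with the Colab detection simplified to a plain boolean:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

in_colab = False  # crawler.py detects Colab at import time; simplified here

options = Options()
options.add_argument("--headless")

if in_colab:
    # Colab ships a chromedriver binary on PATH but needs sandbox tweaks
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    service = Service("chromedriver")
else:
    # webdriver-manager downloads a driver matching the installed Chrome
    # and returns its filesystem path
    service = Service(ChromeDriverManager().install())

driver = webdriver.Chrome(service=service, options=options)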
@@ -200,7 +202,7 @@ class Crawler(BaseComponent):
         for link in urls:
             logger.info(f"writing contents from `{link}`")
             self.driver.get(link)
-            el = self.driver.find_element_by_tag_name("body")
+            el = self.driver.find_element(by=By.TAG_NAME, value="body")
             if extract_hidden_text:
                 text = el.get_attribute("textContent")
             else:
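The `extract_hidden_text` branch rests on a DOM distinction rather than anything Selenium-specific: a `WebElement`'s `.text` property returns only rendered, visible text, while the `textContent` attribute also includes text hidden via CSS. A small sketch of the migrated lookup, reusing the `driver` built in the previous sketch:

from selenium.webdriver.common.by import By

link = "https://example.com"  # illustrative; `driver` as constructed above
driver.get(link)

el = driver.find_element(by=By.TAG_NAME, value="body")
hidden_and_visible = el.get_attribute("textContent")  # includes CSS-hidden text
visible_only = el.text                                # rendered text only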
@@ -316,7 +318,7 @@ class Crawler(BaseComponent):
             filter_pattern = re.compile("|".join(filter_urls))
         self.driver.get(base_url)
-        a_elements = self.driver.find_elements_by_xpath("//a[@href]")
+        a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
         sub_links = set()
         for i in a_elements:
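On the sub-link side, the XPath `//a[@href]` matches only anchors that actually carry an `href`, and `find_elements` returns a (possibly empty) list of `WebElement`s. A sketch of how the loop following the hunk plausibly consumes them; the domain and `filter_pattern` checks are elided in the hunk, so this keeps just the link collection:

from selenium.webdriver.common.by import By

base_url = "https://example.com"  # illustrative; `driver` as above
driver.get(base_url)

a_elements = driver.find_elements(by=By.XPATH, value="//a[@href]")
sub_links = set()
for a in a_elements:
    # get_attribute("href") returns the resolved, absolute URL
    href = a.get_attribute("href")
    if href:
        sub_links.add(href)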

test_connector.py

@@ -4,6 +4,7 @@ import json
 from pathlib import Path
 
 import pytest
+from selenium.webdriver.common.by import By
 
 from haystack.nodes.connector import Crawler
 from haystack.schema import Document
@@ -23,7 +24,7 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
     :param crawled_page: the output of Crawler (one element of the paths list)
     """
     crawler.driver.get(url)
-    body = crawler.driver.find_element_by_tag_name("body")
+    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")
     if crawler.extract_hidden_text:
         expected_crawled_content = body.get_attribute("textContent")
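The hunk cuts off before the `else` branch and the comparison against the file on disk, so the tail of `content_match` sketched below is an assumption for illustration, not the commit's code: presumably the visible-text case uses `.text`, and the crawled JSON is compared on its `content` field.

import json
from pathlib import Path
from selenium.webdriver.common.by import By

def content_match_sketch(crawler, url: str, crawled_page: Path) -> bool:
    crawler.driver.get(url)
    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")
    if crawler.extract_hidden_text:
        expected = body.get_attribute("textContent")
    else:
        expected = body.text  # assumed: visible text when hidden text is off
    with open(crawled_page) as f:
        return json.load(f)["content"] == expected  # "content" key assumed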