mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-29 16:59:47 +00:00
Replace deprecated Selenium methods (#2724)
* Fix crawler.py
* Fix test_connector.py
* unused import

Co-authored-by: danielbichuetti <daniel.bichuetti@gmail.com>
This commit is contained in:
parent
400d2cdf77
commit
e8546e2124
@ -9,6 +9,8 @@ from urllib.parse import urlparse
|
||||
|
||||
try:
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium import webdriver
|
||||
except (ImportError, ModuleNotFoundError) as ie:
|
||||
from haystack.utils.import_utils import _optional_component_not_installed
|
||||
@ -77,7 +79,7 @@ class Crawler(BaseComponent):
|
||||
try:
|
||||
options.add_argument("--no-sandbox")
|
||||
options.add_argument("--disable-dev-shm-usage")
|
||||
self.driver = webdriver.Chrome("chromedriver", options=options)
|
||||
self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
|
||||
except:
|
||||
raise Exception(
|
||||
"""
|
||||
@ -89,7 +91,7 @@ class Crawler(BaseComponent):
|
||||
)
|
||||
else:
|
||||
logger.info("'chrome-driver' will be automatically installed.")
|
||||
self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
|
||||
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
||||
self.urls = urls
|
||||
self.output_dir = output_dir
|
||||
self.crawler_depth = crawler_depth
|
||||
@ -200,7 +202,7 @@ class Crawler(BaseComponent):
|
||||
for link in urls:
|
||||
logger.info(f"writing contents from `{link}`")
|
||||
self.driver.get(link)
|
||||
el = self.driver.find_element_by_tag_name("body")
|
||||
el = self.driver.find_element(by=By.TAG_NAME, value="body")
|
||||
if extract_hidden_text:
|
||||
text = el.get_attribute("textContent")
|
||||
else:
|
||||
@ -316,7 +318,7 @@ class Crawler(BaseComponent):
|
||||
filter_pattern = re.compile("|".join(filter_urls))
|
||||
|
||||
self.driver.get(base_url)
|
||||
a_elements = self.driver.find_elements_by_xpath("//a[@href]")
|
||||
a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
|
||||
sub_links = set()
|
||||
|
||||
for i in a_elements:
|
||||
|
||||
@ -4,6 +4,7 @@ import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
from haystack.nodes.connector import Crawler
|
||||
from haystack.schema import Document
|
||||
@ -23,7 +24,7 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
|
||||
:param crawled_page: the output of Crawler (one element of the paths list)
|
||||
"""
|
||||
crawler.driver.get(url)
|
||||
body = crawler.driver.find_element_by_tag_name("body")
|
||||
body = crawler.driver.find_element(by=By.TAG_NAME, value="body")
|
||||
|
||||
if crawler.extract_hidden_text:
|
||||
expected_crawled_content = body.get_attribute("textContent")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user