Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-10-30 01:09:43 +00:00
Replace deprecated Selenium methods (#2724)
* Fix crawler.py
* Fix test_connector.py
* Remove unused import

Co-authored-by: danielbichuetti <daniel.bichuetti@gmail.com>
parent 400d2cdf77
commit e8546e2124
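Background: Selenium 4 deprecated the find_element_by_* / find_elements_by_* helper methods (removed as of Selenium 4.3) and deprecated passing the chromedriver path as a positional argument to webdriver.Chrome. Element lookups now go through find_element(by=..., value=...) with a By locator, and the driver binary is wrapped in a Service object. A minimal before/after sketch of the migration (the URL and chromedriver path are illustrative placeholders, not taken from this commit):

# Requires: pip install "selenium>=4"
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

options = Options()
options.add_argument("--headless")

# Selenium 3 style, now deprecated/removed:
#   driver = webdriver.Chrome("chromedriver", options=options)
#   body = driver.find_element_by_tag_name("body")

# Selenium 4 style, as adopted by this commit:
driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
driver.get("https://example.com")  # placeholder URL
body = driver.find_element(by=By.TAG_NAME, value="body")
print(body.text[:80])
driver.quit()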
crawler.py
@@ -9,6 +9,8 @@ from urllib.parse import urlparse
 try:
     from webdriver_manager.chrome import ChromeDriverManager
+    from selenium.webdriver.chrome.service import Service
+    from selenium.webdriver.common.by import By
     from selenium import webdriver
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed
@@ -77,7 +79,7 @@ class Crawler(BaseComponent):
         try:
             options.add_argument("--no-sandbox")
             options.add_argument("--disable-dev-shm-usage")
-            self.driver = webdriver.Chrome("chromedriver", options=options)
+            self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
         except:
             raise Exception(
                 """
@@ -89,7 +91,7 @@ class Crawler(BaseComponent):
             )
         else:
             logger.info("'chrome-driver' will be automatically installed.")
-            self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
+            self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
         self.urls = urls
         self.output_dir = output_dir
         self.crawler_depth = crawler_depth
@@ -200,7 +202,7 @@ class Crawler(BaseComponent):
         for link in urls:
             logger.info(f"writing contents from `{link}`")
             self.driver.get(link)
-            el = self.driver.find_element_by_tag_name("body")
+            el = self.driver.find_element(by=By.TAG_NAME, value="body")
             if extract_hidden_text:
                 text = el.get_attribute("textContent")
             else:
@@ -316,7 +318,7 @@ class Crawler(BaseComponent):
             filter_pattern = re.compile("|".join(filter_urls))

         self.driver.get(base_url)
-        a_elements = self.driver.find_elements_by_xpath("//a[@href]")
+        a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
         sub_links = set()

         for i in a_elements:
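For reference, the Crawler's new driver setup resolves to roughly the following standalone sketch, assuming the optional webdriver-manager dependency is installed (option flags copied from the hunks above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# ChromeDriverManager().install() downloads a chromedriver matching the local
# Chrome and returns its filesystem path; Service wraps that path so it no
# longer has to be passed positionally.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)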
test_connector.py
@@ -4,6 +4,7 @@ import json
 from pathlib import Path

 import pytest
+from selenium.webdriver.common.by import By

 from haystack.nodes.connector import Crawler
 from haystack.schema import Document
@@ -23,7 +24,7 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
     :param crawled_page: the output of Crawler (one element of the paths list)
     """
     crawler.driver.get(url)
-    body = crawler.driver.find_element_by_tag_name("body")
+    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")

     if crawler.extract_hidden_text:
         expected_crawled_content = body.get_attribute("textContent")
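The same By-based lookup carries over to the test helper. A condensed sketch of the updated content_match function (the JSON field name and the visible-text branch are paraphrased from the surrounding test, not shown in this hunk):

import json
from pathlib import Path

from selenium.webdriver.common.by import By

from haystack.nodes.connector import Crawler


def content_match(crawler: Crawler, url: str, crawled_page: Path) -> bool:
    """Check that the file written by the Crawler matches the live page text."""
    crawler.driver.get(url)
    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")
    if crawler.extract_hidden_text:
        # textContent also includes text hidden via CSS
        expected_content = body.get_attribute("textContent")
    else:
        # .text returns only the visibly rendered text (assumed branch)
        expected_content = body.text
    with open(crawled_page, "r") as f:
        page_data = json.load(f)
    return page_data["content"] == expected_content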