mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-10 14:16:59 +00:00
refactor: update Crawler to support selenium>=4.11.0 and simplify it (#5515)
* refactor crawler
* rm unused imports
* release notes!
* rm outdated mock
This commit is contained in:
parent
37cf1fe49c
commit
3f472995bb
@ -9,19 +9,16 @@ from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from haystack.errors import NodeError
|
||||
from haystack.nodes.base import BaseComponent
|
||||
from haystack.schema import Document
|
||||
from haystack.lazy_imports import LazyImport
|
||||
|
||||
with LazyImport("Run 'pip install farm-haystack[crawler]'") as selenium_import:
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
|
||||
from selenium.common.exceptions import StaleElementReferenceException
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -109,8 +106,10 @@ class Crawler(BaseComponent):
|
||||
if webdriver_options is None:
|
||||
webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
|
||||
webdriver_options.append("--headless")
|
||||
if IS_ROOT or IN_WINDOWS or IN_COLAB:
|
||||
webdriver_options.append("--no-sandbox")
|
||||
if IS_ROOT or IN_WINDOWS:
|
||||
webdriver_options.extend(["--no-sandbox", "--remote-debugging-port=9222"])
|
||||
webdriver_options.append("--remote-debugging-port=9222")
|
||||
if IN_COLAB or IN_AZUREML:
|
||||
webdriver_options.append("--disable-dev-shm-usage")
|
||||
|
||||
@ -118,50 +117,7 @@ class Crawler(BaseComponent):
|
||||
for option in set(webdriver_options):
|
||||
options.add_argument(option)
|
||||
|
||||
if IN_COLAB:
|
||||
try:
|
||||
self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
|
||||
except WebDriverException as exc:
|
||||
raise NodeError(
|
||||
"""
|
||||
\'chromium-driver\' needs to be installed manually when running colab. Follow the below given commands:
|
||||
%%shell
|
||||
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
|
||||
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
|
||||
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
|
||||
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
|
||||
EOF
|
||||
|
||||
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
|
||||
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
|
||||
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
|
||||
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
|
||||
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
|
||||
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg
|
||||
|
||||
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
|
||||
Package: *
|
||||
Pin: release a=eoan
|
||||
Pin-Priority: 500
|
||||
|
||||
|
||||
Package: *
|
||||
Pin: origin "deb.debian.org"
|
||||
Pin-Priority: 300
|
||||
|
||||
|
||||
Package: chromium*
|
||||
Pin: origin "deb.debian.org"
|
||||
Pin-Priority: 700
|
||||
EOF
|
||||
|
||||
apt-get update
|
||||
apt-get install chromium chromium-driver
|
||||
If it has already been installed, please check if it has been copied to the right directory i.e. to \'/usr/bin\'"""
|
||||
) from exc
|
||||
else:
|
||||
logger.info("'chrome-driver' will be automatically installed.")
|
||||
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
||||
self.driver = webdriver.Chrome(service=Service(), options=options)
|
||||
self.urls = urls
|
||||
self.crawler_depth = crawler_depth
|
||||
self.filter_urls = filter_urls
|
||||
|
||||
@ -154,8 +154,7 @@ aws = [
|
||||
"botocore>=1.27",
|
||||
]
|
||||
crawler = [
|
||||
"selenium>=4.0.0,!=4.1.4", # Avoid 4.1.4 due to https://github.com/SeleniumHQ/selenium/issues/10612
|
||||
"webdriver-manager",
|
||||
"selenium>=4.11.0"
|
||||
]
|
||||
preprocessing = [
|
||||
"nltk",
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Make the Crawler work properly with Selenium>=4.11.0.
|
||||
Simplify the Crawler, as the new version of Selenium automatically finds or installs the necessary drivers.
|
||||
@ -54,9 +54,8 @@ def content_in_results(crawler: Crawler, url: str, results: List[Path], expected
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("haystack.nodes.connector.crawler.ChromeDriverManager")
|
||||
@patch("haystack.nodes.connector.crawler.webdriver")
|
||||
def test_crawler_url_none_exception(webdriver, manager):
|
||||
def test_crawler_url_none_exception(webdriver):
|
||||
crawler = Crawler()
|
||||
with pytest.raises(ValueError):
|
||||
crawler.crawl()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user