refactor: update Crawler to support selenium>=4.11.0 and simplify it (#5515)

* Refactor the Crawler

* Remove unused imports

* Add release notes

* Remove outdated mock
This commit is contained in:
Stefano Fiorucci 2023-08-08 15:13:22 +02:00 committed by GitHub
parent 37cf1fe49c
commit 3f472995bb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 12 additions and 53 deletions

View File

@ -9,19 +9,16 @@ from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set
from urllib.parse import urlparse
from haystack.errors import NodeError
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from haystack.lazy_imports import LazyImport
with LazyImport("Run 'pip install farm-haystack[crawler]'") as selenium_import:
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
logger = logging.getLogger(__name__)
@ -109,8 +106,10 @@ class Crawler(BaseComponent):
if webdriver_options is None:
webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
webdriver_options.append("--headless")
if IS_ROOT or IN_WINDOWS or IN_COLAB:
webdriver_options.append("--no-sandbox")
if IS_ROOT or IN_WINDOWS:
webdriver_options.extend(["--no-sandbox", "--remote-debugging-port=9222"])
webdriver_options.append("--remote-debugging-port=9222")
if IN_COLAB or IN_AZUREML:
webdriver_options.append("--disable-dev-shm-usage")
@ -118,50 +117,7 @@ class Crawler(BaseComponent):
for option in set(webdriver_options):
options.add_argument(option)
if IN_COLAB:
try:
self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
except WebDriverException as exc:
raise NodeError(
"""
\'chromium-driver\' needs to be installed manually when running colab. Follow the below given commands:
%%shell
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500
Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300
Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF
apt-get update
apt-get install chromium chromium-driver
If it has already been installed, please check if it has been copied to the right directory i.e. to \'/usr/bin\'"""
) from exc
else:
logger.info("'chrome-driver' will be automatically installed.")
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
self.driver = webdriver.Chrome(service=Service(), options=options)
self.urls = urls
self.crawler_depth = crawler_depth
self.filter_urls = filter_urls

View File

@ -154,8 +154,7 @@ aws = [
"botocore>=1.27",
]
crawler = [
"selenium>=4.0.0,!=4.1.4", # Avoid 4.1.4 due to https://github.com/SeleniumHQ/selenium/issues/10612
"webdriver-manager",
"selenium>=4.11.0"
]
preprocessing = [
"nltk",

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Make the Crawler work properly with Selenium>=4.11.0.
Simplify the Crawler: recent versions of Selenium automatically find or install the required browser driver, so `webdriver-manager` is no longer a dependency.

View File

@ -54,9 +54,8 @@ def content_in_results(crawler: Crawler, url: str, results: List[Path], expected
@pytest.mark.unit
@patch("haystack.nodes.connector.crawler.ChromeDriverManager")
@patch("haystack.nodes.connector.crawler.webdriver")
def test_crawler_url_none_exception(webdriver, manager):
def test_crawler_url_none_exception(webdriver):
crawler = Crawler()
with pytest.raises(ValueError):
crawler.crawl()