From 3f472995bb259ebfa9c76d9a0fe686f3101d90d0 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Tue, 8 Aug 2023 15:13:22 +0200 Subject: [PATCH] refactor: update Crawler to support selenium>=4.11.0 and simplify it (#5515) * refactor crawler * rm unused imports * release notes! * rm outdated mock --- haystack/nodes/connector/crawler.py | 54 ++----------------- pyproject.toml | 3 +- ...rawler-selenium-4.11-30fec9f6e345834f.yaml | 5 ++ test/nodes/test_connector.py | 3 +- 4 files changed, 12 insertions(+), 53 deletions(-) create mode 100644 releasenotes/notes/update-crawler-selenium-4.11-30fec9f6e345834f.yaml diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index 1e0fd1d7f..d30b5ec98 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -9,19 +9,16 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set from urllib.parse import urlparse -from haystack.errors import NodeError from haystack.nodes.base import BaseComponent from haystack.schema import Document from haystack.lazy_imports import LazyImport with LazyImport("Run 'pip install farm-haystack[crawler]'") as selenium_import: from selenium import webdriver - from selenium.common.exceptions import StaleElementReferenceException, WebDriverException + from selenium.common.exceptions import StaleElementReferenceException from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By - from webdriver_manager.chrome import ChromeDriverManager - logger = logging.getLogger(__name__) @@ -109,8 +106,10 @@ class Crawler(BaseComponent): if webdriver_options is None: webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"] webdriver_options.append("--headless") + if IS_ROOT or IN_WINDOWS or IN_COLAB: + webdriver_options.append("--no-sandbox") if IS_ROOT or IN_WINDOWS: - webdriver_options.extend(["--no-sandbox", "--remote-debugging-port=9222"]) + webdriver_options.append("--remote-debugging-port=9222") if IN_COLAB or IN_AZUREML: webdriver_options.append("--disable-dev-shm-usage") @@ -118,50 +117,7 @@ class Crawler(BaseComponent): for option in set(webdriver_options): options.add_argument(option) - if IN_COLAB: - try: - self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options) - except WebDriverException as exc: - raise NodeError( - """ - \'chromium-driver\' needs to be installed manually when running colab. Follow the below given commands: - %%shell - cat > /etc/apt/sources.list.d/debian.list <<'EOF' - deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main - deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main - deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main - EOF - - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517 - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138 - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A - apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg - apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg - apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg - - cat > /etc/apt/preferences.d/chromium.pref << 'EOF' - Package: * - Pin: release a=eoan - Pin-Priority: 500 - - - Package: * - Pin: origin "deb.debian.org" - Pin-Priority: 300 - - - Package: chromium* - Pin: origin "deb.debian.org" - Pin-Priority: 700 - EOF - - apt-get update - apt-get install chromium chromium-driver - If it has already been installed, please check if it has been copied to the right directory i.e. to \'/usr/bin\'""" - ) from exc - else: - logger.info("'chrome-driver' will be automatically installed.") - self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) + self.driver = webdriver.Chrome(service=Service(), options=options) self.urls = urls self.crawler_depth = crawler_depth self.filter_urls = filter_urls diff --git a/pyproject.toml b/pyproject.toml index fa310efb8..eb84dc3cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -154,8 +154,7 @@ aws = [ "botocore>=1.27", ] crawler = [ - "selenium>=4.0.0,!=4.1.4", # Avoid 4.1.4 due to https://github.com/SeleniumHQ/selenium/issues/10612 - "webdriver-manager", + "selenium>=4.11.0" ] preprocessing = [ "nltk", diff --git a/releasenotes/notes/update-crawler-selenium-4.11-30fec9f6e345834f.yaml b/releasenotes/notes/update-crawler-selenium-4.11-30fec9f6e345834f.yaml new file mode 100644 index 000000000..5db8222cc --- /dev/null +++ b/releasenotes/notes/update-crawler-selenium-4.11-30fec9f6e345834f.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Make the Crawler work properly with Selenium>=4.11.0. + Simplify the Crawler, as the new version of Selenium automatically finds or installs the necessary drivers. diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py index 9e91c8ea3..b346600c0 100644 --- a/test/nodes/test_connector.py +++ b/test/nodes/test_connector.py @@ -54,9 +54,8 @@ def content_in_results(crawler: Crawler, url: str, results: List[Path], expected @pytest.mark.unit -@patch("haystack.nodes.connector.crawler.ChromeDriverManager") @patch("haystack.nodes.connector.crawler.webdriver") -def test_crawler_url_none_exception(webdriver, manager): +def test_crawler_url_none_exception(webdriver): crawler = Crawler() with pytest.raises(ValueError): crawler.crawl()