diff --git a/docs/_src/api/api/crawler.md b/docs/_src/api/api/crawler.md index 494f97f1a..d2e9b08c9 100644 --- a/docs/_src/api/api/crawler.md +++ b/docs/_src/api/api/crawler.md @@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus #### Crawler.\_\_init\_\_ ```python -def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) +def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None) ``` Init object with basic params for crawling (can be overwritten later). @@ -57,6 +57,16 @@ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores. 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. +- `webdriver_options`: A list of options to send to Selenium webdriver. If none is provided, +Crawler uses, as a default option, a reasonable selection for operating locally, on restricted docker containers, +and avoids using GPU. +Crawler always appends the following option: "--headless" +For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"] + These are the default options which disable GPU, disable shared memory usage + and spawn a single process. + 2) ["--no-sandbox"] + This option disables the sandbox, which is required for running Chrome as root. +See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details. diff --git a/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json b/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json index b80b7f87c..2368565a5 100644 --- a/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json +++ b/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json @@ -2060,6 +2060,13 @@ "title": "Crawler Naming Function", "type": "string", "default": null + }, + "webdriver_options": { + "title": "Webdriver Options", + "type": "array", + "items": { + "type": "string" + } } }, "required": [ diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json index 35b27028e..1c238a988 100644 --- a/haystack/json-schemas/haystack-pipeline-master.schema.json +++ b/haystack/json-schemas/haystack-pipeline-master.schema.json @@ -2060,6 +2060,13 @@ "title": "Crawler Naming Function", "type": "string", "default": null + }, + "webdriver_options": { + "title": "Webdriver Options", + "type": "array", + "items": { + "type": "string" + } } }, "required": [ diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index f89fad0be..7c0ec384e 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -1,5 +1,6 @@ from typing import Callable, List, Optional, Dict, Tuple, Union, Any +import os import re import sys import json @@ -12,6 +13,7 @@ import hashlib try: from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.chrome.service import Service + from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.common.exceptions import StaleElementReferenceException, WebDriverException from selenium import webdriver @@ -56,6 +58,7 @@ class Crawler(BaseComponent): extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, + webdriver_options: Optional[List[str]] = None, ): """ Init object with basic params for crawling (can be overwritten later). @@ -83,17 +86,40 @@ class Crawler(BaseComponent): This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores. 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. + :param webdriver_options: A list of options to send to Selenium webdriver. If none is provided, + Crawler uses, as a default option, a reasonable selection for operating locally, on restricted docker containers, + and avoids using GPU. + Crawler always appends the following option: "--headless" + For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"] + These are the default options which disable GPU, disable shared memory usage + and spawn a single process. + 2) ["--no-sandbox"] + This option disables the sandbox, which is required for running Chrome as root. + See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details. """ super().__init__() IN_COLAB = "google.colab" in sys.modules + IN_AZUREML = True if os.environ.get("AZUREML_ENVIRONMENT_IMAGE", None) == "True" else False + IS_ROOT = True if os.geteuid() == 0 else False + + if webdriver_options is None: + webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"] + elif "--headless" not in webdriver_options: + webdriver_options.append("--headless") + + if IS_ROOT and "--no-sandbox" not in webdriver_options: + webdriver_options.append("--no-sandbox") + + if (IN_COLAB or IN_AZUREML) and "--disable-dev-shm-usage" not in webdriver_options: + webdriver_options.append("--disable-dev-shm-usage") + + options = Options() + for option in webdriver_options: + options.add_argument(option) - options = webdriver.chrome.options.Options() - options.add_argument("--headless") if IN_COLAB: try: - options.add_argument("--no-sandbox") - options.add_argument("--disable-dev-shm-usage") self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options) except WebDriverException as exc: raise NodeError(