fix: update ChromeDriver options on restricted environments and add ChromeDriver options as function parameter (#3043)

* Fix when env does nto exist * Fix missed line * Set conservative chromedriver options * Set default options based on environment * Fix removed line * Updated documentation * Generate new schemas manually * Add arguments via iterator and helper function * Pre-push doc format * Use imported Option vs full namespace access * Manually update schema * Manually add documentation and schema * Fix language and documentation * Fix typo * Auto generated docs * Updated documentation
2025-11-09 14:23:43 +00:00 · 2022-08-22 07:59:33 -03:00 · 2022-08-22 07:59:33 -03:00 · d715d0202d
commit d715d0202d
parent e715dee17d
4 changed files with 55 additions and 5 deletions
--- a/docs/_src/api/api/crawler.md
+++ b/docs/_src/api/api/crawler.md
@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
 #### Crawler.\_\_init\_\_
 ```python
-def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None)
+def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
 ```
 Init object with basic params for crawling (can be overwritten later).
@ -57,6 +57,16 @@ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0
        This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
     2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
        This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
 - `webdriver_options`: A list of options to send to Selenium webdriver. If none is provided,
 Crawler uses, as a default option, a reasonable selection for operating locally, on restricted docker containers,
 and avoids using GPU.
 Crawler always appends the following option: "--headless"
 For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
        These are the default options which disable GPU, disable shared memory usage
        and spawn a single process.
     2) ["--no-sandbox"]
        This option disables the sandbox, which is required for running Chrome as root.
 See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
 <a id="crawler.Crawler.crawl"></a>
--- a/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json
@ -2060,6 +2060,13 @@
              "title": "Crawler Naming Function",
              "type": "string",
              "default": null
            },
            "webdriver_options": {
              "title": "Webdriver Options",
              "type": "array",
              "items": {
                "type": "string"
              }
            }
          },
          "required": [
--- a/haystack/json-schemas/haystack-pipeline-master.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-master.schema.json
@ -2060,6 +2060,13 @@
              "title": "Crawler Naming Function",
              "type": "string",
              "default": null
            },
            "webdriver_options": {
              "title": "Webdriver Options",
              "type": "array",
              "items": {
                "type": "string"
              }
            }
          },
          "required": [
--- a/haystack/nodes/connector/crawler.py
+++ b/haystack/nodes/connector/crawler.py
@ -1,5 +1,6 @@
 from typing import Callable, List, Optional, Dict, Tuple, Union, Any
 import os
 import re
 import sys
 import json
@ -12,6 +13,7 @@ import hashlib
 try:
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
    from selenium import webdriver
@ -56,6 +58,7 @@ class Crawler(BaseComponent):
        extract_hidden_text=True,
        loading_wait_time: Optional[int] = None,
        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
        webdriver_options: Optional[List[str]] = None,
    ):
        """
        Init object with basic params for crawling (can be overwritten later).
@ -83,17 +86,40 @@ class Crawler(BaseComponent):
                    This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
                 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
                    This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
        :param webdriver_options: A list of options to send to Selenium webdriver. If none is provided,
            Crawler uses, as a default option, a reasonable selection for operating locally, on restricted docker containers,
            and avoids using GPU.
            Crawler always appends the following option: "--headless"
            For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
                    These are the default options which disable GPU, disable shared memory usage
                    and spawn a single process.
                 2) ["--no-sandbox"]
                    This option disables the sandbox, which is required for running Chrome as root.
            See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
        """
        super().__init__()
        IN_COLAB = "google.colab" in sys.modules
        IN_AZUREML = True if os.environ.get("AZUREML_ENVIRONMENT_IMAGE", None) == "True" else False
        IS_ROOT = True if os.geteuid() == 0 else False
        if webdriver_options is None:
            webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
        elif "--headless" not in webdriver_options:
            webdriver_options.append("--headless")
        if IS_ROOT and "--no-sandbox" not in webdriver_options:
            webdriver_options.append("--no-sandbox")
        if (IN_COLAB or IN_AZUREML) and "--disable-dev-shm-usage" not in webdriver_options:
            webdriver_options.append("--disable-dev-shm-usage")
        options = Options()
        for option in webdriver_options:
            options.add_argument(option)
        options = webdriver.chrome.options.Options()
        options.add_argument("--headless")
        if IN_COLAB:
            try:
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-dev-shm-usage")
                self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
            except WebDriverException as exc:
                raise NodeError(