fix: update ChromeDriver options on restricted environments and add ChromeDriver options as function parameter (#3043)

* Fix when env does not exist

* Fix missed line

* Set conservative chromedriver options

* Set default options based on environment

* Fix removed line

* Updated documentation

* Generate new schemas manually

* Add arguments via iterator and helper function

* Pre-push doc format

* Use imported Option vs full namespace access

* Manually update schema

* Manually add documentation and schema

* Fix language and documentation

* Fix typo

* Auto generated docs

* Updated documentation
This commit is contained in:
Daniel Bichuetti 2022-08-22 07:59:33 -03:00 committed by GitHub
parent e715dee17d
commit d715d0202d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 55 additions and 5 deletions

View File

@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### Crawler.\_\_init\_\_
```python
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None)
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
```
Init object with basic params for crawling (can be overwritten later).
@ -57,6 +57,16 @@ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
- `webdriver_options`: A list of options to send to the Selenium webdriver. If None is provided,
Crawler uses a default set of options suited to running locally or in restricted Docker containers,
and avoids using the GPU.
Crawler always adds the following option if it is not already in the list: "--headless"
For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
These options disable the GPU, disable the sandbox, disable shared memory usage
and spawn a single process. The default options are the same list without "--no-sandbox",
which Crawler adds automatically when it runs as root.
2) ["--no-sandbox"]
This option disables the sandbox, which is required for running Chrome as root.
See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
<a id="crawler.Crawler.crawl"></a>

View File

@ -2060,6 +2060,13 @@
"title": "Crawler Naming Function",
"type": "string",
"default": null
},
"webdriver_options": {
"title": "Webdriver Options",
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [

View File

@ -2060,6 +2060,13 @@
"title": "Crawler Naming Function",
"type": "string",
"default": null
},
"webdriver_options": {
"title": "Webdriver Options",
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [

View File

@ -1,5 +1,6 @@
from typing import Callable, List, Optional, Dict, Tuple, Union, Any
import os
import re
import sys
import json
@ -12,6 +13,7 @@ import hashlib
try:
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium import webdriver
@ -56,6 +58,7 @@ class Crawler(BaseComponent):
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
webdriver_options: Optional[List[str]] = None,
):
"""
Init object with basic params for crawling (can be overwritten later).
@ -83,17 +86,40 @@ class Crawler(BaseComponent):
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
:param webdriver_options: A list of options to send to the Selenium webdriver. If None is provided,
Crawler uses a default set of options suited to running locally or in restricted Docker containers,
and avoids using the GPU.
Crawler always adds the following option if it is not already in the list: "--headless"
For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
These options disable the GPU, disable the sandbox, disable shared memory usage
and spawn a single process. The default options are the same list without "--no-sandbox",
which Crawler adds automatically when it runs as root.
2) ["--no-sandbox"]
This option disables the sandbox, which is required for running Chrome as root.
See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
"""
super().__init__()
IN_COLAB = "google.colab" in sys.modules
IN_AZUREML = True if os.environ.get("AZUREML_ENVIRONMENT_IMAGE", None) == "True" else False
IS_ROOT = True if os.geteuid() == 0 else False
if webdriver_options is None:
webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
elif "--headless" not in webdriver_options:
webdriver_options.append("--headless")
if IS_ROOT and "--no-sandbox" not in webdriver_options:
webdriver_options.append("--no-sandbox")
if (IN_COLAB or IN_AZUREML) and "--disable-dev-shm-usage" not in webdriver_options:
webdriver_options.append("--disable-dev-shm-usage")
options = Options()
for option in webdriver_options:
options.add_argument(option)
options = webdriver.chrome.options.Options()
options.add_argument("--headless")
if IN_COLAB:
try:
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
except WebDriverException as exc:
raise NodeError(