fix: update ChromeDriver options on restricted environments and add ChromeDriver options as function parameter (#3043)

* Fix when env does not exist

* Fix missed line

* Set conservative chromedriver options

* Set default options based on environment

* Fix removed line

* Updated documentation

* Generate new schemas manually

* Add arguments via iterator and helper function

* Pre-push doc format

* Use imported Option vs full namespace access

* Manually update schema

* Manually add documentation and schema

* Fix language and documentation

* Fix typo

* Auto generated docs

* Updated documentation
This commit is contained in:
Daniel Bichuetti 2022-08-22 07:59:33 -03:00 committed by GitHub
parent e715dee17d
commit d715d0202d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 55 additions and 5 deletions

View File

@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### Crawler.\_\_init\_\_ #### Crawler.\_\_init\_\_
```python ```python
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
``` ```
Init object with basic params for crawling (can be overwritten later). Init object with basic params for crawling (can be overwritten later).
@ -57,6 +57,16 @@ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores. This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
- `webdriver_options`: A list of options to send to the Selenium webdriver. If none is provided,
Crawler uses a default selection of options suited to running locally or in restricted Docker containers,
and avoids using the GPU.
Crawler always appends the following option: "--headless"
For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
These are the default options which disable GPU, disable shared memory usage
and spawn a single process.
2) ["--no-sandbox"]
This option disables the sandbox, which is required for running Chrome as root.
See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
<a id="crawler.Crawler.crawl"></a> <a id="crawler.Crawler.crawl"></a>

View File

@ -2060,6 +2060,13 @@
"title": "Crawler Naming Function", "title": "Crawler Naming Function",
"type": "string", "type": "string",
"default": null "default": null
},
"webdriver_options": {
"title": "Webdriver Options",
"type": "array",
"items": {
"type": "string"
}
} }
}, },
"required": [ "required": [

View File

@ -2060,6 +2060,13 @@
"title": "Crawler Naming Function", "title": "Crawler Naming Function",
"type": "string", "type": "string",
"default": null "default": null
},
"webdriver_options": {
"title": "Webdriver Options",
"type": "array",
"items": {
"type": "string"
}
} }
}, },
"required": [ "required": [

View File

@ -1,5 +1,6 @@
from typing import Callable, List, Optional, Dict, Tuple, Union, Any from typing import Callable, List, Optional, Dict, Tuple, Union, Any
import os
import re import re
import sys import sys
import json import json
@ -12,6 +13,7 @@ import hashlib
try: try:
from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium import webdriver from selenium import webdriver
@ -56,6 +58,7 @@ class Crawler(BaseComponent):
extract_hidden_text=True, extract_hidden_text=True,
loading_wait_time: Optional[int] = None, loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None,
webdriver_options: Optional[List[str]] = None,
): ):
""" """
Init object with basic params for crawling (can be overwritten later). Init object with basic params for crawling (can be overwritten later).
@ -83,17 +86,40 @@ class Crawler(BaseComponent):
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores. This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
:param webdriver_options: A list of options to send to the Selenium webdriver. If none is provided,
Crawler uses a default selection of options suited to running locally or in restricted Docker containers,
and avoids using the GPU.
Crawler always appends the following option: "--headless"
For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
These are the default options which disable GPU, disable shared memory usage
and spawn a single process.
2) ["--no-sandbox"]
This option disables the sandbox, which is required for running Chrome as root.
See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
""" """
super().__init__() super().__init__()
IN_COLAB = "google.colab" in sys.modules IN_COLAB = "google.colab" in sys.modules
IN_AZUREML = True if os.environ.get("AZUREML_ENVIRONMENT_IMAGE", None) == "True" else False
IS_ROOT = True if os.geteuid() == 0 else False
if webdriver_options is None:
webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
elif "--headless" not in webdriver_options:
webdriver_options.append("--headless")
if IS_ROOT and "--no-sandbox" not in webdriver_options:
webdriver_options.append("--no-sandbox")
if (IN_COLAB or IN_AZUREML) and "--disable-dev-shm-usage" not in webdriver_options:
webdriver_options.append("--disable-dev-shm-usage")
options = Options()
for option in webdriver_options:
options.add_argument(option)
options = webdriver.chrome.options.Options()
options.add_argument("--headless")
if IN_COLAB: if IN_COLAB:
try: try:
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options) self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
except WebDriverException as exc: except WebDriverException as exc:
raise NodeError( raise NodeError(