mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-02 18:59:28 +00:00
fix: update ChromeDriver options on restricted environments and add ChromeDriver options as function parameter (#3043)
* Fix when env does not exist * Fix missed line * Set conservative chromedriver options * Set default options based on environment * Fix removed line * Updated documentation * Generate new schemas manually * Add arguments via iterator and helper function * Pre-push doc format * Use imported Option vs full namespace access * Manually update schema * Manually add documentation and schema * Fix language and documentation * Fix typo * Auto generated docs * Updated documentation
This commit is contained in:
parent
e715dee17d
commit
d715d0202d
@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
|
||||
#### Crawler.\_\_init\_\_
|
||||
|
||||
```python
|
||||
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None)
|
||||
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
|
||||
```
|
||||
|
||||
Init object with basic params for crawling (can be overwritten later).
|
||||
@ -57,6 +57,16 @@ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0
|
||||
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
|
||||
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
|
||||
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
|
||||
- `webdriver_options`: A list of options to send to Selenium webdriver. If none is provided,
|
||||
Crawler uses, as a default option, a reasonable selection for operating locally, on restricted docker containers,
|
||||
and avoids using GPU.
|
||||
Crawler always appends the following option: "--headless"
|
||||
For example: 1) ["--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
|
||||
These are the default options which disable GPU, disable shared memory usage
|
||||
and spawn a single process.
|
||||
2) ["--no-sandbox"]
|
||||
This option disables the sandbox. Disabling the sandbox is required when running Chrome as root.
|
||||
See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
|
||||
|
||||
<a id="crawler.Crawler.crawl"></a>
|
||||
|
||||
|
||||
@ -2060,6 +2060,13 @@
|
||||
"title": "Crawler Naming Function",
|
||||
"type": "string",
|
||||
"default": null
|
||||
},
|
||||
"webdriver_options": {
|
||||
"title": "Webdriver Options",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
|
||||
@ -2060,6 +2060,13 @@
|
||||
"title": "Crawler Naming Function",
|
||||
"type": "string",
|
||||
"default": null
|
||||
},
|
||||
"webdriver_options": {
|
||||
"title": "Webdriver Options",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
from typing import Callable, List, Optional, Dict, Tuple, Union, Any
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
@ -12,6 +13,7 @@ import hashlib
|
||||
try:
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
|
||||
from selenium import webdriver
|
||||
@ -56,6 +58,7 @@ class Crawler(BaseComponent):
|
||||
extract_hidden_text=True,
|
||||
loading_wait_time: Optional[int] = None,
|
||||
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
|
||||
webdriver_options: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Init object with basic params for crawling (can be overwritten later).
|
||||
@ -83,17 +86,40 @@ class Crawler(BaseComponent):
|
||||
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
|
||||
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
|
||||
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
|
||||
:param webdriver_options: A list of options to send to Selenium webdriver. If none is provided,
|
||||
Crawler uses, as a default option, a reasonable selection for operating locally, on restricted docker containers,
|
||||
and avoids using GPU.
|
||||
Crawler always appends the following option: "--headless"
|
||||
For example: 1) ["--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
|
||||
These are the default options which disable GPU, disable shared memory usage
|
||||
and spawn a single process.
|
||||
2) ["--no-sandbox"]
|
||||
This option disables the sandbox. Disabling the sandbox is required when running Chrome as root.
|
||||
See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
IN_COLAB = "google.colab" in sys.modules
|
||||
IN_AZUREML = True if os.environ.get("AZUREML_ENVIRONMENT_IMAGE", None) == "True" else False
|
||||
IS_ROOT = True if os.geteuid() == 0 else False
|
||||
|
||||
if webdriver_options is None:
|
||||
webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
|
||||
elif "--headless" not in webdriver_options:
|
||||
webdriver_options.append("--headless")
|
||||
|
||||
if IS_ROOT and "--no-sandbox" not in webdriver_options:
|
||||
webdriver_options.append("--no-sandbox")
|
||||
|
||||
if (IN_COLAB or IN_AZUREML) and "--disable-dev-shm-usage" not in webdriver_options:
|
||||
webdriver_options.append("--disable-dev-shm-usage")
|
||||
|
||||
options = Options()
|
||||
for option in webdriver_options:
|
||||
options.add_argument(option)
|
||||
|
||||
options = webdriver.chrome.options.Options()
|
||||
options.add_argument("--headless")
|
||||
if IN_COLAB:
|
||||
try:
|
||||
options.add_argument("--no-sandbox")
|
||||
options.add_argument("--disable-dev-shm-usage")
|
||||
self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
|
||||
except WebDriverException as exc:
|
||||
raise NodeError(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user