mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-08 22:03:54 +00:00
fix: update ChromeDriver options on restricted environments and add ChromeDriver options as function parameter (#3043)
* Fix when env does not exist * Fix missed line * Set conservative chromedriver options * Set default options based on environment * Fix removed line * Updated documentation * Generate new schemas manually * Add arguments via iterator and helper function * Pre-push doc format * Use imported Option vs full namespace access * Manually update schema * Manually add documentation and schema * Fix language and documentation * Fix typo * Auto generated docs * Updated documentation
This commit is contained in:
parent
e715dee17d
commit
d715d0202d
@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
|
|||||||
#### Crawler.\_\_init\_\_
|
#### Crawler.\_\_init\_\_
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None)
|
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
|
||||||
```
|
```
|
||||||
|
|
||||||
Init object with basic params for crawling (can be overwritten later).
|
Init object with basic params for crawling (can be overwritten later).
|
||||||
@ -57,6 +57,16 @@ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0
|
|||||||
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
|
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
|
||||||
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
|
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
|
||||||
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
|
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
|
||||||
|
- `webdriver_options`: A list of options to send to Selenium webdriver. If none is provided,
|
||||||
|
Crawler uses, by default, a reasonable selection of options for operating locally or in restricted Docker containers,
|
||||||
|
and avoids using GPU.
|
||||||
|
Crawler always appends the following option: "--headless"
|
||||||
|
For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
|
||||||
|
These are the default options ("--no-sandbox" is added only when running as root) which disable GPU, disable shared memory usage
|
||||||
|
and spawn a single process.
|
||||||
|
2) ["--no-sandbox"]
|
||||||
|
This option disables the sandbox, which is required for running Chrome as root.
|
||||||
|
See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
|
||||||
|
|
||||||
<a id="crawler.Crawler.crawl"></a>
|
<a id="crawler.Crawler.crawl"></a>
|
||||||
|
|
||||||
|
|||||||
@ -2060,6 +2060,13 @@
|
|||||||
"title": "Crawler Naming Function",
|
"title": "Crawler Naming Function",
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"default": null
|
"default": null
|
||||||
|
},
|
||||||
|
"webdriver_options": {
|
||||||
|
"title": "Webdriver Options",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
|
|||||||
@ -2060,6 +2060,13 @@
|
|||||||
"title": "Crawler Naming Function",
|
"title": "Crawler Naming Function",
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"default": null
|
"default": null
|
||||||
|
},
|
||||||
|
"webdriver_options": {
|
||||||
|
"title": "Webdriver Options",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
from typing import Callable, List, Optional, Dict, Tuple, Union, Any
|
from typing import Callable, List, Optional, Dict, Tuple, Union, Any
|
||||||
|
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
@ -12,6 +13,7 @@ import hashlib
|
|||||||
try:
|
try:
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
from selenium.webdriver.chrome.service import Service
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
|
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
@ -56,6 +58,7 @@ class Crawler(BaseComponent):
|
|||||||
extract_hidden_text=True,
|
extract_hidden_text=True,
|
||||||
loading_wait_time: Optional[int] = None,
|
loading_wait_time: Optional[int] = None,
|
||||||
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
|
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
|
||||||
|
webdriver_options: Optional[List[str]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Init object with basic params for crawling (can be overwritten later).
|
Init object with basic params for crawling (can be overwritten later).
|
||||||
@ -83,17 +86,40 @@ class Crawler(BaseComponent):
|
|||||||
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
|
This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
|
||||||
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
|
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
|
||||||
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
|
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
|
||||||
|
:param webdriver_options: A list of options to send to Selenium webdriver. If none is provided,
|
||||||
|
Crawler uses, by default, a reasonable selection of options for operating locally or in restricted Docker containers,
|
||||||
|
and avoids using GPU.
|
||||||
|
Crawler always appends the following option: "--headless"
|
||||||
|
For example: 1) ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--single-process"]
|
||||||
|
These are the default options ("--no-sandbox" is added only when running as root) which disable GPU, disable shared memory usage
|
||||||
|
and spawn a single process.
|
||||||
|
2) ["--no-sandbox"]
|
||||||
|
This option disables the sandbox, which is required for running Chrome as root.
|
||||||
|
See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details.
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
IN_COLAB = "google.colab" in sys.modules
|
IN_COLAB = "google.colab" in sys.modules
|
||||||
|
IN_AZUREML = True if os.environ.get("AZUREML_ENVIRONMENT_IMAGE", None) == "True" else False
|
||||||
|
IS_ROOT = True if os.geteuid() == 0 else False
|
||||||
|
|
||||||
|
if webdriver_options is None:
|
||||||
|
webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"]
|
||||||
|
elif "--headless" not in webdriver_options:
|
||||||
|
webdriver_options.append("--headless")
|
||||||
|
|
||||||
|
if IS_ROOT and "--no-sandbox" not in webdriver_options:
|
||||||
|
webdriver_options.append("--no-sandbox")
|
||||||
|
|
||||||
|
if (IN_COLAB or IN_AZUREML) and "--disable-dev-shm-usage" not in webdriver_options:
|
||||||
|
webdriver_options.append("--disable-dev-shm-usage")
|
||||||
|
|
||||||
|
options = Options()
|
||||||
|
for option in webdriver_options:
|
||||||
|
options.add_argument(option)
|
||||||
|
|
||||||
options = webdriver.chrome.options.Options()
|
|
||||||
options.add_argument("--headless")
|
|
||||||
if IN_COLAB:
|
if IN_COLAB:
|
||||||
try:
|
try:
|
||||||
options.add_argument("--no-sandbox")
|
|
||||||
options.add_argument("--disable-dev-shm-usage")
|
|
||||||
self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
|
self.driver = webdriver.Chrome(service=Service("chromedriver"), options=options)
|
||||||
except WebDriverException as exc:
|
except WebDriverException as exc:
|
||||||
raise NodeError(
|
raise NodeError(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user