From b9ab7b3ca2d548b3669038d837b83e6558fa8c6d Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Fri, 3 Feb 2023 16:43:18 +0100 Subject: [PATCH] fix: make the crawler more robust on Windows (#4049) * first try * simplify the code a bit * fix; better docstrings * add URL --- haystack/nodes/connector/crawler.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index 3d028a5cc..6f055122a 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -95,27 +95,28 @@ class Crawler(BaseComponent): and spawn a single process. 2) ["--no-sandbox"] This option disables the sandbox, which is required for running Chrome as root. - See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.options) for more details. + 3) ["--remote-debugging-port=9222"] + This option enables remote debugging over HTTP. + See [Chromium Command Line Switches](https://peter.sh/experiments/chromium-command-line-switches/) for more details on the available options. + If your crawler fails, raising a `selenium.WebDriverException`, this [Stack Overflow thread](https://stackoverflow.com/questions/50642308/webdriverexception-unknown-error-devtoolsactiveport-file-doesnt-exist-while-t) can be helpful. Contains useful suggestions for webdriver_options. """ super().__init__() IN_COLAB = "google.colab" in sys.modules IN_AZUREML = os.environ.get("AZUREML_ENVIRONMENT_IMAGE", None) == "True" - IS_ROOT = sys.platform not in ["win32", "cygwin"] and os.geteuid() == 0 # type: ignore # This is a mypy issue of sorts, that fails on Windows. + IN_WINDOWS = sys.platform in ["win32", "cygwin"] + IS_ROOT = not IN_WINDOWS and os.geteuid() == 0 # type: ignore # This is a mypy issue of sorts, that fails on Windows. 
if webdriver_options is None: webdriver_options = ["--headless", "--disable-gpu", "--disable-dev-shm-usage", "--single-process"] - elif "--headless" not in webdriver_options: - webdriver_options.append("--headless") - - if IS_ROOT and "--no-sandbox" not in webdriver_options: - webdriver_options.append("--no-sandbox") - - if (IN_COLAB or IN_AZUREML) and "--disable-dev-shm-usage" not in webdriver_options: + webdriver_options.append("--headless") + if IS_ROOT or IN_WINDOWS: + webdriver_options.extend(["--no-sandbox", "--remote-debugging-port=9222"]) + if IN_COLAB or IN_AZUREML: webdriver_options.append("--disable-dev-shm-usage") options = Options() - for option in webdriver_options: + for option in set(webdriver_options): options.add_argument(option) if IN_COLAB: