Fix crawler long file names (#2723)

* Change the name under which a crawled page is saved, to avoid long-file-name errors on some file systems

* Custom naming function for saving crawled files

* Update Documentation & Code Style

* Remove bad characters from the file name and prefix

* Add test for naming function

* Update Documentation & Code Style

* Fix expensive regex recalculation and linter warnings

* Check for exceptions on file dump

* Remove param_naming variable

* Fix file paths on Windows, Linux and Mac

* Update Documentation & Code Style

* Test using one of the docstring examples

* Change default naming function
Update docstrings

* Apply formatting rules

* Update Documentation & Code Style

* Fix mypy incompatible assignment error

* Remove unused type declaration

* Fix typo

* Update tests for naming function

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Daniel Augustus Bichuetti Silva 2022-07-11 07:16:32 -03:00 committed by GitHub
parent ba08fc86f5
commit 77a513fe49
5 changed files with 119 additions and 18 deletions


@@ -19,7 +19,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
| crawler = Crawler(output_dir="crawled_files")
| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
| filter_urls= ["haystack\.deepset\.ai\/overview\/"])
| filter_urls= ["haystack.deepset.ai/overview/"])
```
<a id="crawler.Crawler.__init__"></a>
@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### Crawler.\_\_init\_\_
```python
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None)
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None)
```
Init object with basic params for crawling (can be overwritten later).
@@ -51,13 +51,19 @@ E.g. the text can be inside a span with style="display: none"
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
- `crawler_naming_function`: A function mapping the crawled page to a file name (see the usage sketch below).
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
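A minimal usage sketch (illustrative; it assumes the standard `from haystack.nodes import Crawler` import):

```python
import re

from haystack.nodes import Crawler

# Replace characters that are not allowed in file names with underscores.
crawler = Crawler(
    output_dir="crawled_files",
    crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url),
)
```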
<a id="crawler.Crawler.crawl"></a>
#### Crawler.crawl
```python
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None) -> List[Path]
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path]
```
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -84,6 +90,12 @@ In this case the id will be generated by using the content and the defined metad
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
- `crawler_naming_function`: A function mapping the crawled page to a file name (a per-call sketch follows below).
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
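A per-call sketch, reusing the `crawler` instance from the sketch above; the MD5-only lambda is just one possible choice:

```python
import hashlib

# Override the naming function for this crawl only: name each file after the MD5 hash of its URL.
paths = crawler.crawl(
    urls=["https://haystack.deepset.ai/overview/get-started"],
    crawler_naming_function=lambda url, page_content: hashlib.md5(url.encode("utf-8")).hexdigest(),
)
```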
**Returns**:
@@ -94,7 +106,7 @@ List of paths where the crawled webpages got stored
#### Crawler.run
```python
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
```
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -119,6 +131,12 @@ E.g. the text can be inside a span with style="display: none"
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
- `crawler_naming_function`: A function mapping the crawled page to a file name (see the sketch below).
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
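A sketch of calling `run()` directly with the new parameter, again reusing the `crawler` instance from above:

```python
import hashlib

output, edge = crawler.run(
    urls=["https://haystack.deepset.ai/overview/get-started"],
    crawler_naming_function=lambda url, page_content: hashlib.md5(url.encode("utf-8")).hexdigest(),
)
file_paths = output["paths"]  # "paths" is the documented key when return_documents is left at False
```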
**Returns**:


@@ -2009,6 +2009,11 @@
"loading_wait_time": {
"title": "Loading Wait Time",
"type": "integer"
},
"crawler_naming_function": {
"title": "Crawler Naming Function",
"type": "string",
"default": null
}
},
"required": [


@@ -1,4 +1,4 @@
from typing import List, Optional, Dict, Tuple, Union
from typing import Callable, List, Optional, Dict, Tuple, Union, Any
import re
import sys
@@ -7,6 +7,7 @@ import time
import logging
from pathlib import Path
from urllib.parse import urlparse
import hashlib
try:
from webdriver_manager.chrome import ChromeDriverManager
@@ -37,7 +38,7 @@ class Crawler(BaseComponent):
| crawler = Crawler(output_dir="crawled_files")
| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
| filter_urls= ["haystack\.deepset\.ai\/overview\/"])
| filter_urls= ["haystack.deepset.ai/overview/"])
```
"""
@@ -53,6 +54,7 @@ class Crawler(BaseComponent):
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
):
"""
Init object with basic params for crawling (can be overwritten later).
@@ -74,6 +76,12 @@
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
"""
super().__init__()
@@ -106,6 +114,7 @@
self.id_hash_keys = id_hash_keys
self.extract_hidden_text = extract_hidden_text
self.loading_wait_time = loading_wait_time
self.crawler_naming_function = crawler_naming_function
def crawl(
self,
@@ -117,6 +126,7 @@
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> List[Path]:
"""
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -140,6 +150,12 @@
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
:return: List of paths where the crawled webpages got stored
"""
@@ -160,6 +176,8 @@
extract_hidden_text = self.extract_hidden_text
if loading_wait_time is None:
loading_wait_time = self.loading_wait_time
if crawler_naming_function is None:
crawler_naming_function = self.crawler_naming_function
output_dir = Path(output_dir)
if not output_dir.exists():
@@ -182,6 +200,7 @@
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
else:
file_paths += self._write_to_files(
@@ -189,6 +208,7 @@
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
# follow one level of sublinks if requested
if crawler_depth == 1:
@@ -211,6 +231,7 @@
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
return file_paths
@@ -220,9 +241,10 @@
urls: List[str],
output_dir: Path,
extract_hidden_text: bool,
base_url: str = None,
base_url: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> List[Path]:
paths = []
for link in urls:
@@ -236,18 +258,30 @@
else:
text = el.text
link_split_values = link.replace("https://", "").split("/")
file_name = f"{'_'.join(link_split_values)}.json"
file_path = output_dir / file_name
data = {}
data: Dict[str, Any] = {}
data["meta"] = {"url": link}
if base_url:
data["meta"]["base_url"] = base_url
data["content"] = text
document = Document.from_dict(data, id_hash_keys=id_hash_keys)
with open(file_path, "w", encoding="utf-8") as f:
json.dump(document.to_dict(), f)
if crawler_naming_function is not None:
file_name_prefix = crawler_naming_function(link, text)
else:
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
file_path = output_dir / f"{file_name_prefix}.json"
try:
with open(file_path, "w", encoding="utf-8") as f:
json.dump(document.to_dict(), f)
except Exception as e:
logging.exception(
f"Crawler can't save the content of '{link}' under '{file_path}'. This webpage will be skipped, but links from this page will still be crawled. Make sure the path above is accessible and the file name is valid. If the file name is invalid, consider setting 'crawler_naming_function' to another function."
)
paths.append(file_path)
return paths
@@ -263,6 +297,7 @@
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
"""
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -285,6 +320,12 @@
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
:return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
"""
@@ -297,6 +338,7 @@
overwrite_existing_files=overwrite_existing_files,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
results: Dict[str, Union[List[Document], List[Path]]] = {}
if return_documents:
@@ -321,6 +363,7 @@
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
):
return self.run(
output_dir=output_dir,
@@ -332,6 +375,7 @@
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
@staticmethod
@@ -350,11 +394,9 @@
self,
base_url: str,
filter_urls: Optional[List] = None,
already_found_links: List = None,
already_found_links: Optional[List] = None,
loading_wait_time: Optional[int] = None,
) -> set:
if filter_urls:
filter_pattern = re.compile("|".join(filter_urls))
self.driver.get(base_url)
if loading_wait_time is not None:
@@ -362,6 +404,8 @@
a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
sub_links = set()
filter_pattern = re.compile("|".join(filter_urls)) if filter_urls is not None else None
for i in a_elements:
try:
sub_link = i.get_attribute("href")
@@ -375,7 +419,8 @@
if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
):
if filter_urls:
if filter_pattern is not None:
if filter_pattern.search(sub_link):
sub_links.add(sub_link)
else:


@@ -2,6 +2,9 @@ from typing import List
import json
from pathlib import Path
import re
import hashlib
import os
import pytest
from selenium.webdriver.common.by import By
@@ -184,3 +187,32 @@ def test_crawler_loading_wait_time(test_url, tmp_path):
assert content_in_results(crawler, test_url + "/index.html", paths)
assert content_in_results(crawler, test_url + "/page1.html", paths)
assert content_in_results(crawler, test_url + "/page2.html", paths)
def test_crawler_default_naming_function(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html"
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json"
paths = crawler.crawl(urls=[link], crawler_depth=0)
assert os.path.exists(paths[0])
assert paths[0] == Path(expected_crawled_file_path)
def test_crawler_naming_function(test_url, tmp_path):
crawler = Crawler(
output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link)
)
link = f"{test_url}/page_dynamic.html"
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link)
expected_crawled_file_path = tmp_path / f"{file_name_link}.json"
paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
assert os.path.exists(paths[0])
assert paths[0] == expected_crawled_file_path
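For reference, a reusable naming function that could be passed instead of a lambda. It mirrors the approach of the new default (sanitize, truncate, append a short hash of the full URL) so that file names stay well below the roughly 255-character limit of common file systems; the helper name and the 100-character budget are illustrative, not part of this commit.

```python
import hashlib
import re

from haystack.nodes import Crawler


def short_safe_name(url: str, page_content: str) -> str:
    """Sanitized, truncated URL plus a short hash of the full URL to avoid collisions."""
    # page_content is unused here but required by the naming-function signature.
    safe = re.sub("[<>:'/\\|?*\0 ]", "_", url)[:100]  # drop path-hostile characters, cap the length
    digest = hashlib.md5(url.encode("utf-8")).hexdigest()[-6:]
    return f"{safe}_{digest}"


crawler = Crawler(output_dir="crawled_files", crawler_naming_function=short_safe_name)
```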