diff --git a/docs/_src/api/api/crawler.md b/docs/_src/api/api/crawler.md
index 9169c91d5..494f97f1a 100644
--- a/docs/_src/api/api/crawler.md
+++ b/docs/_src/api/api/crawler.md
@@ -19,7 +19,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
| crawler = Crawler(output_dir="crawled_files")
| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
-| filter_urls= ["haystack\.deepset\.ai\/overview\/"])
+| filter_urls= ["haystack.deepset.ai/overview/"])
```
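Note that the entries in `filter_urls` are treated as regular-expression patterns: the crawler joins them with `|` and compiles the result, so both the plain form above and a fully escaped pattern match the same pages. A minimal sketch, using the same Haystack URL as the example above:

```python
import re

# Both patterns match the overview pages; the second is the stricter, fully escaped form.
patterns = ["haystack.deepset.ai/overview/", r"haystack\.deepset\.ai/overview/"]
url = "https://haystack.deepset.ai/overview/get-started"
assert all(re.compile(p).search(url) for p in patterns)
```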
@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### Crawler.\_\_init\_\_
```python
-def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None)
+def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None)
```
Init object with basic params for crawling (can be overwritten later).
@@ -51,13 +51,19 @@ E.g. the text can be inside a span with style="display: none"
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the first 129 characters of the page URL (sanitized so that it is a valid file name on Mac, Unix, and Windows) plus the last six characters of the MD5 hash of the full, unprocessed URL (a usage sketch follows this parameter list).
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+    This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
+    2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+    This example generates a file name by hashing the concatenation of the URL and the page content with MD5.
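A minimal usage sketch for the parameter described above, assuming the standard Haystack v1.x import path; the output directory and URL are the same placeholders used in the example at the top of this page, and `crawler_depth=0` restricts the crawl to the given URL:

```python
import re

from haystack.nodes import Crawler  # import path assumed for Haystack v1.x

# Name each file after the sanitized URL only, without the default hash suffix.
def name_by_url(url: str, page_content: str) -> str:
    return re.sub("[<>:'/\\|?*\0 ]", "_", url)

crawler = Crawler(output_dir="crawled_files", crawler_naming_function=name_by_url)
file_paths = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"], crawler_depth=0)
```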
#### Crawler.crawl
```python
-def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None) -> List[Path]
+def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path]
```
-Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
+Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -84,6 +90,12 @@ In this case the id will be generated by using the content and the defined metad
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the first 129 characters of the page URL (sanitized so that it is a valid file name on Mac, Unix, and Windows) plus the last six characters of the MD5 hash of the full, unprocessed URL.
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+    This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
+    2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+    This example generates a file name by hashing the concatenation of the URL and the page content with MD5.
**Returns**:
@@ -94,7 +106,7 @@ List of paths where the crawled webpages got stored
#### Crawler.run
```python
-def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
+def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
```
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -119,6 +131,12 @@ E.g. the text can be inside a span with style="display: none"
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the first 129 characters of the page URL (sanitized so that it is a valid file name on Mac, Unix, and Windows) plus the last six characters of the MD5 hash of the full, unprocessed URL.
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+    This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
+    2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+    This example generates a file name by hashing the concatenation of the URL and the page content with MD5.
**Returns**:
diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json
index 2648f35e5..f85d8c62a 100644
--- a/haystack/json-schemas/haystack-pipeline-master.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-master.schema.json
@@ -2009,6 +2009,11 @@
"loading_wait_time": {
"title": "Loading Wait Time",
"type": "integer"
+ },
+ "crawler_naming_function": {
+ "title": "Crawler Naming Function",
+ "type": "string",
+ "default": null
}
},
"required": [
diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py
index 8f7071ec5..16f703d5e 100644
--- a/haystack/nodes/connector/crawler.py
+++ b/haystack/nodes/connector/crawler.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Tuple, Union
+from typing import Callable, List, Optional, Dict, Tuple, Union, Any
import re
import sys
@@ -7,6 +7,7 @@ import time
import logging
from pathlib import Path
from urllib.parse import urlparse
+import hashlib
try:
from webdriver_manager.chrome import ChromeDriverManager
@@ -37,7 +38,7 @@ class Crawler(BaseComponent):
| crawler = Crawler(output_dir="crawled_files")
| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
- | filter_urls= ["haystack\.deepset\.ai\/overview\/"])
+ | filter_urls= ["haystack.deepset.ai/overview/"])
```
"""
@@ -53,6 +54,7 @@ class Crawler(BaseComponent):
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
+ crawler_naming_function: Optional[Callable[[str, str], str]] = None,
):
"""
Init object with basic params for crawling (can be overwritten later).
@@ -74,6 +76,12 @@ class Crawler(BaseComponent):
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
+ :param crawler_naming_function: A function mapping the crawled page to a file name.
+ By default, the file name is generated from the first 129 characters of the page URL (sanitized so that it is a valid file name on Mac, Unix, and Windows) plus the last six characters of the MD5 hash of the full, unprocessed URL.
+ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+    This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
+ 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+    This example generates a file name by hashing the concatenation of the URL and the page content with MD5.
"""
super().__init__()
@@ -106,6 +114,7 @@ class Crawler(BaseComponent):
self.id_hash_keys = id_hash_keys
self.extract_hidden_text = extract_hidden_text
self.loading_wait_time = loading_wait_time
+ self.crawler_naming_function = crawler_naming_function
def crawl(
self,
@@ -117,6 +126,7 @@ class Crawler(BaseComponent):
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
loading_wait_time: Optional[int] = None,
+ crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> List[Path]:
"""
-Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
+Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -140,6 +150,12 @@ class Crawler(BaseComponent):
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
+ :param crawler_naming_function: A function mapping the crawled page to a file name.
+ By default, the file name is generated from the first 129 characters of the page URL (sanitized so that it is a valid file name on Mac, Unix, and Windows) plus the last six characters of the MD5 hash of the full, unprocessed URL.
+ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+    This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
+ 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+    This example generates a file name by hashing the concatenation of the URL and the page content with MD5.
:return: List of paths where the crawled webpages got stored
"""
@@ -160,6 +176,8 @@ class Crawler(BaseComponent):
extract_hidden_text = self.extract_hidden_text
if loading_wait_time is None:
loading_wait_time = self.loading_wait_time
+ if crawler_naming_function is None:
+ crawler_naming_function = self.crawler_naming_function
output_dir = Path(output_dir)
if not output_dir.exists():
@@ -182,6 +200,7 @@ class Crawler(BaseComponent):
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
+ crawler_naming_function=crawler_naming_function,
)
else:
file_paths += self._write_to_files(
@@ -189,6 +208,7 @@ class Crawler(BaseComponent):
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
+ crawler_naming_function=crawler_naming_function,
)
# follow one level of sublinks if requested
if crawler_depth == 1:
@@ -211,6 +231,7 @@ class Crawler(BaseComponent):
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
+ crawler_naming_function=crawler_naming_function,
)
return file_paths
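Since `crawl()` returns the paths of the JSON files it wrote (one `Document` dictionary per file, as `_write_to_files` below shows), here is a hedged sketch of reading a crawled page back; `file_paths` is assumed to be the return value of a previous `crawl()` call:

```python
import json

from haystack.schema import Document  # import path assumed for Haystack v1.x

# Load one crawled file back into a Document and inspect its metadata and text length.
with open(file_paths[0], "r", encoding="utf-8") as f:
    doc = Document.from_dict(json.load(f))

print(doc.meta["url"], len(doc.content))
```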
@@ -220,9 +241,10 @@ class Crawler(BaseComponent):
urls: List[str],
output_dir: Path,
extract_hidden_text: bool,
- base_url: str = None,
+ base_url: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None,
loading_wait_time: Optional[int] = None,
+ crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> List[Path]:
paths = []
for link in urls:
@@ -236,18 +258,30 @@ class Crawler(BaseComponent):
else:
text = el.text
- link_split_values = link.replace("https://", "").split("/")
- file_name = f"{'_'.join(link_split_values)}.json"
- file_path = output_dir / file_name
-
- data = {}
+ data: Dict[str, Any] = {}
data["meta"] = {"url": link}
if base_url:
data["meta"]["base_url"] = base_url
data["content"] = text
document = Document.from_dict(data, id_hash_keys=id_hash_keys)
- with open(file_path, "w", encoding="utf-8") as f:
- json.dump(document.to_dict(), f)
+
+ if crawler_naming_function is not None:
+ file_name_prefix = crawler_naming_function(link, text)
+ else:
+ file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
+ file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
+ file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
+
+ file_path = output_dir / f"{file_name_prefix}.json"
+
+ try:
+ with open(file_path, "w", encoding="utf-8") as f:
+ json.dump(document.to_dict(), f)
+            except Exception:
+ logging.exception(
+ f"Crawler can't save the content of '{link}' under '{file_path}'. This webpage will be skipped, but links from this page will still be crawled. Make sure the path above is accessible and the file name is valid. If the file name is invalid, consider setting 'crawler_naming_function' to another function."
+ )
+
paths.append(file_path)
return paths
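For reference, a standalone sketch of the default naming branch above, which can be used to predict the file name a given URL will receive (the URL is a placeholder):

```python
import hashlib
import re

def default_file_name_prefix(link: str) -> str:
    # Sanitize the first 129 characters of the URL and append the last six
    # characters of the MD5 hexdigest of the full URL, as in the fallback above.
    file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
    file_name_hash = hashlib.md5(link.encode("utf-8")).hexdigest()
    return f"{file_name_link}_{file_name_hash[-6:]}"

print(default_file_name_prefix("https://haystack.deepset.ai/overview/get-started") + ".json")
```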
@@ -263,6 +297,7 @@ class Crawler(BaseComponent):
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
+ crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
"""
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -285,6 +320,12 @@ class Crawler(BaseComponent):
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
+ :param crawler_naming_function: A function mapping the crawled page to a file name.
+ By default, the file name is generated from the first 129 characters of the page URL (sanitized so that it is a valid file name on Mac, Unix, and Windows) plus the last six characters of the MD5 hash of the full, unprocessed URL.
+ E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+    This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
+ 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+    This example generates a file name by hashing the concatenation of the URL and the page content with MD5.
:return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
"""
@@ -297,6 +338,7 @@ class Crawler(BaseComponent):
overwrite_existing_files=overwrite_existing_files,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
+ crawler_naming_function=crawler_naming_function,
)
results: Dict[str, Union[List[Document], List[Path]]] = {}
if return_documents:
@@ -321,6 +363,7 @@ class Crawler(BaseComponent):
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
+ crawler_naming_function: Optional[Callable[[str, str], str]] = None,
):
return self.run(
output_dir=output_dir,
@@ -332,6 +375,7 @@ class Crawler(BaseComponent):
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
+ crawler_naming_function=crawler_naming_function,
)
@staticmethod
@@ -350,11 +394,9 @@ class Crawler(BaseComponent):
self,
base_url: str,
filter_urls: Optional[List] = None,
- already_found_links: List = None,
+ already_found_links: Optional[List] = None,
loading_wait_time: Optional[int] = None,
) -> set:
- if filter_urls:
- filter_pattern = re.compile("|".join(filter_urls))
self.driver.get(base_url)
if loading_wait_time is not None:
@@ -362,6 +404,8 @@ class Crawler(BaseComponent):
a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
sub_links = set()
+ filter_pattern = re.compile("|".join(filter_urls)) if filter_urls is not None else None
+
for i in a_elements:
try:
sub_link = i.get_attribute("href")
@@ -375,7 +419,8 @@ class Crawler(BaseComponent):
if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
):
- if filter_urls:
+ if filter_pattern is not None:
if filter_pattern.search(sub_link):
sub_links.add(sub_link)
else:
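To make the refactored filter handling above concrete, a small sketch of how the compiled pattern gates sub-links; the candidate links are invented for illustration:

```python
import re

filter_urls = ["haystack.deepset.ai/overview/"]
filter_pattern = re.compile("|".join(filter_urls)) if filter_urls is not None else None

candidate_links = [
    "https://haystack.deepset.ai/overview/get-started",
    "https://haystack.deepset.ai/blog/some-post",
]
# Mirrors the branch above: with no filter every link is kept, otherwise only matches survive.
kept = [link for link in candidate_links if filter_pattern is None or filter_pattern.search(link)]
print(kept)  # only the /overview/ link is kept
```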
diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py
index 81796f2a4..a464e3bad 100644
--- a/test/nodes/test_connector.py
+++ b/test/nodes/test_connector.py
@@ -2,6 +2,9 @@ from typing import List
import json
from pathlib import Path
+import re
+import hashlib
+import os
import pytest
from selenium.webdriver.common.by import By
@@ -184,3 +187,32 @@ def test_crawler_loading_wait_time(test_url, tmp_path):
assert content_in_results(crawler, test_url + "/index.html", paths)
assert content_in_results(crawler, test_url + "/page1.html", paths)
assert content_in_results(crawler, test_url + "/page2.html", paths)
+
+
+def test_crawler_default_naming_function(test_url, tmp_path):
+ crawler = Crawler(output_dir=tmp_path)
+
+ link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html"
+ file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
+ file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
+ expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json"
+
+ paths = crawler.crawl(urls=[link], crawler_depth=0)
+
+ assert os.path.exists(paths[0])
+ assert paths[0] == Path(expected_crawled_file_path)
+
+
+def test_crawler_naming_function(test_url, tmp_path):
+ crawler = Crawler(
+ output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link)
+ )
+
+ link = f"{test_url}/page_dynamic.html"
+ file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link)
+ expected_crawled_file_path = tmp_path / f"{file_name_link}.json"
+
+ paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
+
+ assert os.path.exists(paths[0])
+ assert paths[0] == expected_crawled_file_path
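Not part of this diff, but as a hedged sketch: the content-hash variant from the docstring (example 2) could be exercised in the same style as the tests above, reusing the `page_dynamic.html` fixture:

```python
def test_crawler_content_hash_naming_function(test_url, tmp_path):
    # Name files after the MD5 hash of URL + page content, as in docstring example 2.
    crawler = Crawler(
        output_dir=tmp_path,
        crawler_naming_function=lambda url, content: hashlib.md5(f"{url}{content}".encode("utf-8")).hexdigest(),
    )

    paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)

    assert os.path.exists(paths[0])
    assert len(paths[0].stem) == 32  # an MD5 hexdigest is 32 hex characters
```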
diff --git a/test/samples/crawler/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html b/test/samples/crawler/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html
new file mode 100644
index 000000000..732cc2b77
--- /dev/null
+++ b/test/samples/crawler/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html
@@ -0,0 +1 @@
+This is a page with a very long name to do some tests.
\ No newline at end of file