Fix crawler long file names (#2723)

* Change the name under which a crawled page is saved, to avoid long-file-name errors on some file systems

* Custom naming function for saving crawled files

* Update Documentation & Code Style

* Remove bad characters from the file name and prefix

* Add test for naming function

* Update Documentation & Code Style

* Fix expensive regex recalculation and linter warnings

* Check for exceptions on file dump

* Remove param_naming variable

* Fix file paths on Windows, Linux and Mac

* Update Documentation & Code Style

* Test using one of the docstring examples

* Change default naming function
Update docstrings

* Apply formatting rules

* Update Documentation & Code Style

* Fix mypy incompatible assignment error

* Remove unused type declaration

* Fix typo

* Update tests for naming function

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Daniel Augustus Bichuetti Silva 2022-07-11 07:16:32 -03:00 committed by GitHub
parent ba08fc86f5
commit 77a513fe49
5 changed files with 119 additions and 18 deletions


@@ -19,7 +19,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
| crawler = Crawler(output_dir="crawled_files")
| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
| filter_urls= ["haystack\.deepset\.ai\/overview\/"])
| filter_urls= ["haystack.deepset.ai/overview/"])
```
<a id="crawler.Crawler.__init__"></a>
@@ -27,7 +27,7 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### Crawler.\_\_init\_\_
```python
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None)
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None)
```
Init object with basic params for crawling (can be overwritten later).
@@ -51,13 +51,19 @@ E.g. the text can be inside a span with style="display: none"
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
- `crawler_naming_function`: A function mapping the crawled page to a file name (see the usage sketch below).
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
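A minimal usage sketch (illustrative; it assumes the standard `from haystack.nodes import Crawler` import):

```python
import re

from haystack.nodes import Crawler

# Replace characters that are not allowed in file names with underscores.
crawler = Crawler(
    output_dir="crawled_files",
    crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url),
)
```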
<a id="crawler.Crawler.crawl"></a>
#### Crawler.crawl
```python
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None) -> List[Path]
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path]
```
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -84,6 +90,12 @@ In this case the id will be generated by using the content and the defined metad
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
- `crawler_naming_function`: A function mapping the crawled page to a file name (a per-call sketch follows below).
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
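A per-call sketch, reusing the `crawler` instance from the sketch above; the MD5-only lambda is just one possible choice:

```python
import hashlib

# Override the naming function for this crawl only: name each file after the MD5 hash of its URL.
paths = crawler.crawl(
    urls=["https://haystack.deepset.ai/overview/get-started"],
    crawler_naming_function=lambda url, page_content: hashlib.md5(url.encode("utf-8")).hexdigest(),
)
```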
**Returns**:
@@ -94,7 +106,7 @@ List of paths where the crawled webpages got stored
#### Crawler.run
```python
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
```
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -119,6 +131,12 @@ E.g. the text can be inside a span with style="display: none"
- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
- `crawler_naming_function`: A function mapping the crawled page to a file name (see the sketch below).
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
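A sketch of calling `run()` directly with the new parameter, again reusing the `crawler` instance from above:

```python
import hashlib

output, edge = crawler.run(
    urls=["https://haystack.deepset.ai/overview/get-started"],
    crawler_naming_function=lambda url, page_content: hashlib.md5(url.encode("utf-8")).hexdigest(),
)
file_paths = output["paths"]  # "paths" is the documented key when return_documents is left at False
```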
**Returns**:


@@ -2009,6 +2009,11 @@
"loading_wait_time": {
"title": "Loading Wait Time",
"type": "integer"
},
"crawler_naming_function": {
"title": "Crawler Naming Function",
"type": "string",
"default": null
}
},
"required": [


@@ -1,4 +1,4 @@
from typing import List, Optional, Dict, Tuple, Union
from typing import Callable, List, Optional, Dict, Tuple, Union, Any
import re
import sys
@@ -7,6 +7,7 @@ import time
import logging
from pathlib import Path
from urllib.parse import urlparse
import hashlib
try:
from webdriver_manager.chrome import ChromeDriverManager
@@ -37,7 +38,7 @@ class Crawler(BaseComponent):
| crawler = Crawler(output_dir="crawled_files")
| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
| filter_urls= ["haystack\.deepset\.ai\/overview\/"])
| filter_urls= ["haystack.deepset.ai/overview/"])
```
"""
@@ -53,6 +54,7 @@ class Crawler(BaseComponent):
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
):
"""
Init object with basic params for crawling (can be overwritten later).
@@ -74,6 +76,12 @@
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
"""
super().__init__()
@@ -106,6 +114,7 @@
self.id_hash_keys = id_hash_keys
self.extract_hidden_text = extract_hidden_text
self.loading_wait_time = loading_wait_time
self.crawler_naming_function = crawler_naming_function
def crawl(
self,
@@ -117,6 +126,7 @@
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> List[Path]:
"""
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -140,6 +150,12 @@
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
:return: List of paths where the crawled webpages got stored
"""
@@ -160,6 +176,8 @@
extract_hidden_text = self.extract_hidden_text
if loading_wait_time is None:
loading_wait_time = self.loading_wait_time
if crawler_naming_function is None:
crawler_naming_function = self.crawler_naming_function
output_dir = Path(output_dir)
if not output_dir.exists():
@@ -182,6 +200,7 @@
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
else:
file_paths += self._write_to_files(
@@ -189,6 +208,7 @@
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
# follow one level of sublinks if requested
if crawler_depth == 1:
@@ -211,6 +231,7 @@
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
return file_paths
@@ -220,9 +241,10 @@
urls: List[str],
output_dir: Path,
extract_hidden_text: bool,
base_url: str = None,
base_url: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> List[Path]:
paths = []
for link in urls:
@@ -236,18 +258,30 @@
else:
text = el.text
link_split_values = link.replace("https://", "").split("/")
file_name = f"{'_'.join(link_split_values)}.json"
file_path = output_dir / file_name
data = {}
data: Dict[str, Any] = {}
data["meta"] = {"url": link}
if base_url:
data["meta"]["base_url"] = base_url
data["content"] = text
document = Document.from_dict(data, id_hash_keys=id_hash_keys)
with open(file_path, "w", encoding="utf-8") as f:
json.dump(document.to_dict(), f)
if crawler_naming_function is not None:
file_name_prefix = crawler_naming_function(link, text)
else:
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
file_path = output_dir / f"{file_name_prefix}.json"
try:
with open(file_path, "w", encoding="utf-8") as f:
json.dump(document.to_dict(), f)
except Exception as e:
logging.exception(
f"Crawler can't save the content of '{link}' under '{file_path}'. This webpage will be skipped, but links from this page will still be crawled. Make sure the path above is accessible and the file name is valid. If the file name is invalid, consider setting 'crawler_naming_function' to another function."
)
paths.append(file_path)
return paths
@@ -263,6 +297,7 @@
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
"""
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -285,6 +320,12 @@
:param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
E.g. 2: Crawler will wait 2 seconds before scraping page
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page URL (sanitized so it is a valid path on Mac, Unix, and Windows) and the last six characters of the MD5 hash of the unprocessed page URL.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
This example generates a file name from the URL by replacing all characters that are not allowed in file names with underscores.
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example generates a file name from the MD5 hash of the URL concatenated with the page content.
:return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
"""
@@ -297,6 +338,7 @@
overwrite_existing_files=overwrite_existing_files,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
results: Dict[str, Union[List[Document], List[Path]]] = {}
if return_documents:
@@ -321,6 +363,7 @@
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
):
return self.run(
output_dir=output_dir,
@@ -332,6 +375,7 @@
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
@staticmethod
@@ -350,11 +394,9 @@
self,
base_url: str,
filter_urls: Optional[List] = None,
already_found_links: List = None,
already_found_links: Optional[List] = None,
loading_wait_time: Optional[int] = None,
) -> set:
if filter_urls:
filter_pattern = re.compile("|".join(filter_urls))
self.driver.get(base_url)
if loading_wait_time is not None:
@@ -362,6 +404,8 @@
a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
sub_links = set()
filter_pattern = re.compile("|".join(filter_urls)) if filter_urls is not None else None
for i in a_elements:
try:
sub_link = i.get_attribute("href")
@@ -375,7 +419,8 @@
if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
):
if filter_urls:
if filter_pattern is not None:
if filter_pattern.search(sub_link):
sub_links.add(sub_link)
else:


@@ -2,6 +2,9 @@ from typing import List
import json
from pathlib import Path
import re
import hashlib
import os
import pytest
from selenium.webdriver.common.by import By
@@ -184,3 +187,32 @@ def test_crawler_loading_wait_time(test_url, tmp_path):
assert content_in_results(crawler, test_url + "/index.html", paths)
assert content_in_results(crawler, test_url + "/page1.html", paths)
assert content_in_results(crawler, test_url + "/page2.html", paths)
def test_crawler_default_naming_function(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html"
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json"
paths = crawler.crawl(urls=[link], crawler_depth=0)
assert os.path.exists(paths[0])
assert paths[0] == Path(expected_crawled_file_path)
def test_crawler_naming_function(test_url, tmp_path):
crawler = Crawler(
output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link)
)
link = f"{test_url}/page_dynamic.html"
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link)
expected_crawled_file_path = tmp_path / f"{file_name_link}.json"
paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
assert os.path.exists(paths[0])
assert paths[0] == expected_crawled_file_path
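For reference, a reusable naming function that could be passed instead of a lambda. It mirrors the approach of the new default (sanitize, truncate, append a short hash of the full URL) so that file names stay well below the roughly 255-character limit of common file systems; the helper name and the 100-character budget are illustrative, not part of this commit.

```python
import hashlib
import re

from haystack.nodes import Crawler


def short_safe_name(url: str, page_content: str) -> str:
    """Sanitized, truncated URL plus a short hash of the full URL to avoid collisions."""
    # page_content is unused here but required by the naming-function signature.
    safe = re.sub("[<>:'/\\|?*\0 ]", "_", url)[:100]  # drop path-hostile characters, cap the length
    digest = hashlib.md5(url.encode("utf-8")).hexdigest()[-6:]
    return f"{safe}_{digest}"


crawler = Crawler(output_dir="crawled_files", crawler_naming_function=short_safe_name)
```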