feat!: Increase Crawler standardization regarding Pipelines (#4122)

* feat!(Crawler): Integrate Crawler in the Pipeline.

+ Output Documents from `crawl()`/`run()` (see the usage sketch below, just before the diff)
+ Optional file saving via `output_dir`
+ Optional Document meta field holding the saved file path

* refactor: add Optional decl.

* chore: dummy commit

* chore: dummy commit

* refactor: improve overwrite flow

* refactor: change custom file path meta logic + add test

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
Daniel Bichuetti 2023-02-22 13:34:19 -03:00 committed by GitHub
parent 49ed21b82d
commit e0b0fe1bc3
2 changed files with 257 additions and 186 deletions
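
In short, the Crawler now emits Haystack Document objects directly from `crawl()` and `run()`; writing one JSON file per page to `output_dir` and recording that path under `file_path_meta_field_name` are both opt-in. A minimal usage sketch of the new behaviour (the URL and the directory name are illustrative, not taken from this diff):

    from haystack.nodes.connector.crawler import Crawler

    crawler = Crawler(
        output_dir="crawled_files",  # optional: save one JSON file per crawled page
        file_path_meta_field_name="file_path",  # optional: store the saved path in document meta
    )
    documents = crawler.crawl(urls=["https://haystack.deepset.ai/"], crawler_depth=0)
    for doc in documents:
        print(doc.meta.get("file_path"), doc.content[:80])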

haystack/nodes/connector/crawler.py

@@ -1,31 +1,29 @@
-from typing import Callable, List, Optional, Dict, Tuple, Union, Any
+import hashlib
+import json
+import logging
 import os
 import re
 import sys
-import json
 import time
-import logging
 from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse
-import hashlib

 try:
-    from webdriver_manager.chrome import ChromeDriverManager
-    from selenium.webdriver.chrome.service import Service
-    from selenium.webdriver.chrome.options import Options
-    from selenium.webdriver.common.by import By
-    from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
     from selenium import webdriver
+    from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
+    from selenium.webdriver.chrome.options import Options
+    from selenium.webdriver.chrome.service import Service
+    from selenium.webdriver.common.by import By
+    from webdriver_manager.chrome import ChromeDriverManager
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed

     _optional_component_not_installed(__name__, "crawler", ie)

+from haystack.errors import NodeError
 from haystack.nodes.base import BaseComponent
 from haystack.schema import Document
-from haystack.errors import NodeError

 logger = logging.getLogger(__name__)
@@ -49,28 +47,27 @@ class Crawler(BaseComponent):

     def __init__(
         self,
-        output_dir: str,
         urls: Optional[List[str]] = None,
         crawler_depth: int = 1,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files=True,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text=True,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files=True,
+        file_path_meta_field_name: Optional[str] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
         webdriver_options: Optional[List[str]] = None,
     ):
         """
         Init object with basic params for crawling (can be overwritten later).

-        :param output_dir: Path for the directory to store files
         :param urls: List of http(s) address(es) (can also be supplied later when calling crawl())
         :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
                               0: Only initial list of urls
                               1: Follow links found on the initial URLs (but no further)
         :param filter_urls: Optional list of regular expressions that the crawled URLs must comply with.
                             All URLs not matching at least one of the regular expressions will be dropped.
-        :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
                              attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
                              not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -80,6 +77,9 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
                                   dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
                                   E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param output_dir: If provided, the crawled documents will be saved as JSON files in this directory.
+        :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
                                         By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
                                         E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link)
@@ -164,7 +164,6 @@ class Crawler(BaseComponent):
             logger.info("'chrome-driver' will be automatically installed.")
             self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
         self.urls = urls
-        self.output_dir = output_dir
         self.crawler_depth = crawler_depth
         self.filter_urls = filter_urls
         self.overwrite_existing_files = overwrite_existing_files
@@ -172,22 +171,25 @@ class Crawler(BaseComponent):
         self.extract_hidden_text = extract_hidden_text
         self.loading_wait_time = loading_wait_time
         self.crawler_naming_function = crawler_naming_function
+        self.output_dir = output_dir
+        self.file_path_meta_field_name = file_path_meta_field_name

     def __del__(self):
         self.driver.quit()

     def crawl(
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = None,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
+        file_path_meta_field_name: Optional[str] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> List[Path]:
+    ) -> List[Document]:
         """
         Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
         file per URL, including text and basic meta data).
@@ -195,7 +197,6 @@ class Crawler(BaseComponent):
         All parameters are optional here and only meant to overwrite instance attributes at runtime.
         If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.

-        :param output_dir: Path for the directory to store files
         :param urls: List of http addresses or single http address
         :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
                               0: Only initial list of urls
@@ -210,6 +211,8 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
                                   dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
                                   E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param output_dir: If provided, the crawled documents will be saved as JSON files in this directory.
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
                                         By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
                                         E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link)
@@ -217,7 +220,7 @@ class Crawler(BaseComponent):
                                         2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
                                         This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.

-        :return: List of paths where the crawled webpages got stored
+        :return: List of Documents that were created during crawling
        """
        # use passed params or fallback to instance attributes
        if id_hash_keys is None:
@@ -236,42 +239,57 @@ class Crawler(BaseComponent):
             extract_hidden_text = self.extract_hidden_text
         if loading_wait_time is None:
             loading_wait_time = self.loading_wait_time
+        if file_path_meta_field_name is None:
+            file_path_meta_field_name = self.file_path_meta_field_name
         if crawler_naming_function is None:
             crawler_naming_function = self.crawler_naming_function

-        output_dir = Path(output_dir)
-        if not output_dir.exists():
-            output_dir.mkdir(parents=True)
-
-        file_paths: list = []
-
-        is_not_empty = len(list(output_dir.rglob("*"))) > 0
-        if is_not_empty and not overwrite_existing_files:
-            logger.info(
-                "Found data stored in `%s`. Delete this first if you really want to fetch new data.", output_dir
-            )
-        else:
-            logger.info("Fetching from %s to `%s`", urls, output_dir)
-
-            # Start by writing out the initial list of urls
-            if filter_urls:
-                pattern = re.compile("|".join(filter_urls))
-                for url in urls:
-                    if pattern.search(url):
-                        file_paths += self._write_to_files(
-                            [url],
-                            output_dir=output_dir,
-                            extract_hidden_text=extract_hidden_text,
-                            loading_wait_time=loading_wait_time,
-                            crawler_naming_function=crawler_naming_function,
-                        )
-            else:
-                file_paths += self._write_to_files(
-                    urls,
-                    output_dir=output_dir,
-                    extract_hidden_text=extract_hidden_text,
-                    loading_wait_time=loading_wait_time,
-                    crawler_naming_function=crawler_naming_function,
-                )
-            # follow one level of sublinks if requested
-            if crawler_depth == 1:
-                sub_links: Dict[str, List] = {}
+        if isinstance(output_dir, str):
+            output_dir = Path(output_dir)
+
+        if output_dir:
+            if not output_dir.exists():
+                output_dir.mkdir(parents=True)
+
+            is_not_empty = len(list(output_dir.rglob("*"))) > 0
+            if is_not_empty and not overwrite_existing_files:
+                logger.warning(
+                    "Found data stored in `%s`. Use an empty folder or set `overwrite_existing_files=True`, "
+                    "if you want to overwrite any already present saved files.",
+                    output_dir,
+                )
+            else:
+                logger.info("Fetching from %s to `%s`", urls, output_dir)
+
+        documents: List[Document] = []
+
+        # Start by crawling the initial list of urls
+        if filter_urls:
+            pattern = re.compile("|".join(filter_urls))
+            for url in urls:
+                if pattern.search(url):
+                    documents += self._crawl_urls(
+                        [url],
+                        extract_hidden_text=extract_hidden_text,
+                        loading_wait_time=loading_wait_time,
+                        id_hash_keys=id_hash_keys,
+                        output_dir=output_dir,
+                        overwrite_existing_files=overwrite_existing_files,
+                        file_path_meta_field_name=file_path_meta_field_name,
+                        crawler_naming_function=crawler_naming_function,
+                    )
+        else:
+            documents += self._crawl_urls(
+                urls,
+                extract_hidden_text=extract_hidden_text,
+                loading_wait_time=loading_wait_time,
+                id_hash_keys=id_hash_keys,
+                output_dir=output_dir,
+                overwrite_existing_files=overwrite_existing_files,
+                file_path_meta_field_name=file_path_meta_field_name,
+                crawler_naming_function=crawler_naming_function,
+            )
+
+        # follow one level of sublinks if requested
+        if crawler_depth == 1:
+            sub_links: Dict[str, List] = {}
@@ -286,31 +304,97 @@ class Crawler(BaseComponent):
-                        )
-                    )
-                for url, extracted_sublink in sub_links.items():
-                    file_paths += self._write_to_files(
-                        extracted_sublink,
-                        output_dir=output_dir,
-                        base_url=url,
-                        id_hash_keys=id_hash_keys,
-                        extract_hidden_text=extract_hidden_text,
-                        loading_wait_time=loading_wait_time,
-                        crawler_naming_function=crawler_naming_function,
-                    )
+                    )
+                )
+            for url, extracted_sublink in sub_links.items():
+                documents += self._crawl_urls(
+                    extracted_sublink,
+                    base_url=url,
+                    extract_hidden_text=extract_hidden_text,
+                    loading_wait_time=loading_wait_time,
+                    id_hash_keys=id_hash_keys,
+                    output_dir=output_dir,
+                    overwrite_existing_files=overwrite_existing_files,
+                    file_path_meta_field_name=file_path_meta_field_name,
+                    crawler_naming_function=crawler_naming_function,
+                )

-        return file_paths
+        return documents

-    def _write_to_files(
-        self,
-        urls: List[str],
-        output_dir: Path,
-        extract_hidden_text: bool,
-        base_url: Optional[str] = None,
-        id_hash_keys: Optional[List[str]] = None,
-        loading_wait_time: Optional[int] = None,
-        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> List[Path]:
-        paths = []
+    def _create_document(
+        self, url: str, text: str, base_url: Optional[str] = None, id_hash_keys: Optional[List[str]] = None
+    ) -> Document:
+        """
+        Create a Document object from the given url and text.
+
+        :param url: The current url of the webpage.
+        :param text: The text content of the webpage.
+        :param base_url: The original url where we started to crawl.
+        :param id_hash_keys: The fields that should be used to generate the document id.
+        """
+        data: Dict[str, Any] = {}
+        data["meta"] = {"url": url}
+        if base_url:
+            data["meta"]["base_url"] = base_url
+        data["content"] = text
+        if id_hash_keys:
+            data["id_hash_keys"] = id_hash_keys
+        return Document.from_dict(data)
+
+    def _write_file(
+        self,
+        document: Document,
+        output_dir: Path,
+        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
+        overwrite_existing_files: Optional[bool] = None,
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> Path:
+        url = document.meta["url"]
+        if crawler_naming_function is not None:
+            file_name_prefix = crawler_naming_function(url, document.content)  # type: ignore
+        else:
+            file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", url[:129])
+            file_name_hash = hashlib.md5(f"{url}".encode("utf-8")).hexdigest()
+            file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
+
+        file_path = output_dir / f"{file_name_prefix}.json"
+
+        if file_path_meta_field_name:
+            document.meta[file_path_meta_field_name] = str(file_path)
+
+        try:
+            if overwrite_existing_files or not file_path.exists():
+                with open(file_path, "w", encoding="utf-8") as f:
+                    json.dump(document.to_dict(), f)
+            else:
+                logging.debug(
+                    "File '%s' already exists. Set 'overwrite_existing_files=True' to overwrite it.", file_path
+                )
+        except Exception:
+            logging.exception(
+                "Crawler can't save the content of '%s' under '%s'. "
+                "This webpage will be skipped, but links from this page will still be crawled. "
+                "Make sure the path above is accessible and the file name is valid. "
+                "If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
+                url,
+                file_path,
+            )
+
+        return file_path
+
+    def _crawl_urls(
+        self,
+        urls: List[str],
+        extract_hidden_text: bool,
+        base_url: Optional[str] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        loading_wait_time: Optional[int] = None,
+        overwrite_existing_files: Optional[bool] = False,
+        output_dir: Optional[Path] = None,
+        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> List[Document]:
+        documents: List[Document] = []
         for link in urls:
-            logger.info("writing contents from '%s'", link)
+            logger.info("Scraping contents from '%s'", link)
             self.driver.get(link)
             if loading_wait_time is not None:
                 time.sleep(loading_wait_time)
@@ -320,54 +404,37 @@ class Crawler(BaseComponent):
             else:
                 text = el.text

-            data: Dict[str, Any] = {}
-            data["meta"] = {"url": link}
-            if base_url:
-                data["meta"]["base_url"] = base_url
-            data["content"] = text
-            if id_hash_keys:
-                data["id_hash_keys"] = id_hash_keys
-            document = Document.from_dict(data)
-
-            if crawler_naming_function is not None:
-                file_name_prefix = crawler_naming_function(link, text)
-            else:
-                file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
-                file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
-                file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
-
-            file_path = output_dir / f"{file_name_prefix}.json"
-
-            try:
-                with open(file_path, "w", encoding="utf-8") as f:
-                    json.dump(document.to_dict(), f)
-            except Exception:
-                logging.exception(
-                    "Crawler can't save the content of '%s' under '%s'. "
-                    "This webpage will be skipped, but links from this page will still be crawled. "
-                    "Make sure the path above is accessible and the file name is valid. "
-                    "If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
-                    link,
-                    file_path,
-                )
-            paths.append(file_path)
-
-        return paths
+            document = self._create_document(url=link, text=text, base_url=base_url, id_hash_keys=id_hash_keys)
+
+            if output_dir:
+                file_path = self._write_file(
+                    document,
+                    output_dir,
+                    crawler_naming_function,
+                    file_path_meta_field_name=file_path_meta_field_name,
+                    overwrite_existing_files=overwrite_existing_files,
+                )
+                logger.debug("Saved content to '%s'", file_path)
+
+            documents.append(document)
+
+        logger.debug("Crawler results: %s Documents", len(documents))
+
+        return documents

     def run(  # type: ignore
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
-        return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = True,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> Tuple[Dict[str, List[Document]], str]:
         """
         Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -389,6 +456,7 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
                                   dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
                                   E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
                                         By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
                                         E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link)
@@ -396,10 +464,10 @@ class Crawler(BaseComponent):
                                         2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
                                         This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.

-        :return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
+        :return: Tuple({"documents": List of Documents, ...}, Name of output edge)
         """
-        file_paths = self.crawl(
+        documents = self.crawl(
             urls=urls,
             output_dir=output_dir,
             crawler_depth=crawler_depth,
@@ -407,34 +475,26 @@ class Crawler(BaseComponent):
             overwrite_existing_files=overwrite_existing_files,
             extract_hidden_text=extract_hidden_text,
             loading_wait_time=loading_wait_time,
+            id_hash_keys=id_hash_keys,
+            file_path_meta_field_name=file_path_meta_field_name,
             crawler_naming_function=crawler_naming_function,
         )
-        results: Dict[str, Union[List[Document], List[Path]]] = {}
-        if return_documents:
-            crawled_data = []
-            for _file in file_paths:
-                with open(_file.absolute(), "r") as read_file:
-                    document = json.load(read_file)
-                    document["id_hash_keys"] = id_hash_keys
-                    crawled_data.append(Document.from_dict(document))
-            results = {"documents": crawled_data}
-        else:
-            results = {"paths": file_paths}
+        results = {"documents": documents}

         return results, "output_1"

     def run_batch(  # type: ignore
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
-        return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = True,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
+        file_path_meta_field_name: Optional[str] = None,
     ):
         return self.run(
             output_dir=output_dir,
@@ -442,11 +502,11 @@ class Crawler(BaseComponent):
             crawler_depth=crawler_depth,
             filter_urls=filter_urls,
             overwrite_existing_files=overwrite_existing_files,
-            return_documents=return_documents,
             id_hash_keys=id_hash_keys,
             extract_hidden_text=extract_hidden_text,
             loading_wait_time=loading_wait_time,
             crawler_naming_function=crawler_naming_function,
+            file_path_meta_field_name=file_path_meta_field_name,
         )

     @staticmethod
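
With `run()` now keyed on "documents", downstream pipeline nodes receive Documents directly; the separate "paths" output and the `return_documents` flag are gone. A rough sketch of the node-level contract after this change (the URL and directory name are illustrative):

    crawler = Crawler(output_dir="crawled_files", file_path_meta_field_name="file_path")
    output, edge = crawler.run(urls=["https://haystack.deepset.ai/"], crawler_depth=0)
    assert edge == "output_1"
    documents = output["documents"]  # Documents, not file paths; saved paths live in doc.meta["file_path"]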

Crawler tests

@@ -7,10 +7,10 @@ import hashlib
 import os

 import pytest
 from selenium.webdriver.common.by import By

-from haystack.nodes.connector import Crawler
+from haystack.nodes.connector.crawler import Crawler
 from haystack.schema import Document

 from ..conftest import SAMPLES_PATH
@@ -64,12 +64,15 @@ def test_crawler(tmp_path):
     tmp_dir = tmp_path
     url = ["https://haystack.deepset.ai/"]

-    crawler = Crawler(output_dir=tmp_dir)
-    docs_path = crawler.crawl(urls=url, crawler_depth=0)
-    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
-    documents = results["documents"]
+    crawler = Crawler(output_dir=tmp_dir, file_path_meta_field_name="file_path")

-    for json_file, document in zip(docs_path, documents):
+    documents = crawler.crawl(urls=url, crawler_depth=0)
+    docs_path = [Path(doc.meta["file_path"]) for doc in documents]
+
+    results, _ = crawler.run(urls=url, crawler_depth=0)
+    docs_result = results["documents"]
+
+    for json_file, document in zip(docs_path, docs_result):
         assert isinstance(json_file, Path)
         assert isinstance(document, Document)
@@ -85,42 +88,45 @@ def test_crawler(tmp_path):


 def test_crawler_url_none_exception(tmp_path):
-    crawler = Crawler(tmp_path)
+    crawler = Crawler()
     with pytest.raises(ValueError):
         crawler.crawl()


 def test_crawler_depth_0_single_url(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    crawler = Crawler(output_dir=tmp_path, crawler_depth=0, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"])
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])


 def test_crawler_depth_0_many_urls(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
     _urls = [test_url + "/index.html", test_url + "/page1.html"]
-    paths = crawler.crawl(urls=_urls, crawler_depth=0)
-    assert len(paths) == 2
+    documents = crawler.crawl(urls=_urls, crawler_depth=0)
+    assert len(documents) == 2
+    paths = [doc.meta["file_path"] for doc in documents]
     assert content_in_results(crawler, test_url + "/index.html", paths)
     assert content_in_results(crawler, test_url + "/page1.html", paths)


 def test_crawler_depth_1_single_url(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
-    assert len(paths) == 3
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
+    assert len(documents) == 3
+    paths = [doc.meta["file_path"] for doc in documents]
     assert content_in_results(crawler, test_url + "/index.html", paths)
     assert content_in_results(crawler, test_url + "/page1.html", paths)
     assert content_in_results(crawler, test_url + "/page2.html", paths)


 def test_crawler_output_file_structure(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    path = Path(documents[0].meta["file_path"])
+    assert content_match(crawler, test_url + "/index.html", path)

-    with open(paths[0].absolute(), "r") as doc_file:
+    with open(path.absolute(), "r") as doc_file:
         data = json.load(doc_file)
         assert "content" in data
         assert "meta" in data
@@ -129,52 +135,40 @@ def test_crawler_output_file_structure(test_url, tmp_path):


 def test_crawler_filter_urls(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")

-    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])

     # Note: filter_urls can exclude pages listed in `urls` as well
-    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/page1.html", paths[0])
+    documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/page1.html", documents[0].meta["file_path"])

     assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)


-def test_crawler_return_document(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
-    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
-
-    for path, document in zip(paths["paths"], documents["documents"]):
-        with open(path.absolute(), "r") as doc_file:
-            file_content = json.load(doc_file)
-            assert file_content["meta"] == document.meta
-            assert file_content["content"] == document.content
-
-
 def test_crawler_extract_hidden_text(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path)
-    documents, _ = crawler.run(
-        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
-    )
+    documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0)
     crawled_content = documents["documents"][0].content
     assert "hidden text" in crawled_content

-    documents, _ = crawler.run(
-        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
-    )
+    documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0)
     crawled_content = documents["documents"][0].content
     assert "hidden text" not in crawled_content


 def test_crawler_loading_wait_time(test_url, tmp_path):
     loading_wait_time = 3
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(
+        urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time
+    )

-    assert len(paths) == 4
+    assert len(documents) == 4
+    paths = [doc.meta["file_path"] for doc in documents]

     with open(f"{SAMPLES_PATH.absolute()}/crawler/page_dynamic_result.txt", "r") as dynamic_result:
         dynamic_result_text = dynamic_result.readlines()
@@ -196,29 +190,46 @@ def test_crawler_loading_wait_time(test_url, tmp_path):


 def test_crawler_default_naming_function(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")

     link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html"
     file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
     file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
     expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json"

-    paths = crawler.crawl(urls=[link], crawler_depth=0)
+    documents = crawler.crawl(urls=[link], crawler_depth=0)

-    assert os.path.exists(paths[0])
-    assert paths[0] == Path(expected_crawled_file_path)
+    path = Path(documents[0].meta["file_path"])
+    assert os.path.exists(path)
+    assert path == Path(expected_crawled_file_path)


 def test_crawler_naming_function(test_url, tmp_path):
     crawler = Crawler(
-        output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link)
+        output_dir=tmp_path,
+        file_path_meta_field_name="file_path",
+        crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link),
     )

     link = f"{test_url}/page_dynamic.html"
     file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link)
     expected_crawled_file_path = tmp_path / f"{file_name_link}.json"

-    paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
-
-    assert os.path.exists(paths[0])
-    assert paths[0] == expected_crawled_file_path
+    documents = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
+    path = Path(documents[0].meta["file_path"])
+    assert os.path.exists(path)
+    assert path == expected_crawled_file_path
+
+
+def test_crawler_not_save_file(test_url):
+    crawler = Crawler()
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert documents[0].meta.get("file_path", None) is None
+
+
+def test_crawler_custom_meta_file_path_name(test_url, tmp_path):
+    crawler = Crawler()
+    documents = crawler.crawl(
+        urls=[test_url + "/index.html"], crawler_depth=0, output_dir=tmp_path, file_path_meta_field_name="custom"
+    )
+    assert documents[0].meta.get("custom", None) is not None
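
For context, this is roughly how the node can now sit at the start of an indexing pipeline, since it emits Documents that downstream nodes consume directly. The wiring below is an illustrative sketch, not part of this diff; the node names, the default PreProcessor, the InMemoryDocumentStore, and the `inputs=["File"]` root-node convention are assumptions:

    from haystack import Pipeline
    from haystack.document_stores import InMemoryDocumentStore
    from haystack.nodes import PreProcessor
    from haystack.nodes.connector.crawler import Crawler

    document_store = InMemoryDocumentStore()
    crawler = Crawler(urls=["https://haystack.deepset.ai/"], crawler_depth=0)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_node(component=crawler, name="Crawler", inputs=["File"])
    indexing_pipeline.add_node(component=PreProcessor(), name="PreProcessor", inputs=["Crawler"])
    indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
    indexing_pipeline.run()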