mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-26 16:46:58 +00:00
feat!: Increase Crawler standardization regarding Pipelines (#4122)
* feat!(Crawler): Integrate Crawler in the Pipeline.
  + Output Documents
  + Optional file saving
  + Optional Document meta about file path
* refactor: add Optional decl.
* chore: dummy commit
* chore: dummy commit
* refactor: improve overwrite flow
* refactor: change custom file path meta logic + add test
* Update haystack/nodes/connector/crawler.py (Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>)
* Update haystack/nodes/connector/crawler.py (Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>)
* Update haystack/nodes/connector/crawler.py (Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>)
* Update haystack/nodes/connector/crawler.py (Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>)
* Update haystack/nodes/connector/crawler.py (Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>)

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
This commit is contained in:
parent 49ed21b82d
commit e0b0fe1bc3
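In short, this change makes the Crawler behave like a regular pipeline node: Crawler.crawl() and Crawler.run() now return Haystack Document objects instead of file paths, writing JSON files to output_dir becomes optional, and the path of a saved file can be exposed through a configurable meta field. A minimal usage sketch of the new interface (the output directory name and the "file_path" meta key below are illustrative choices, not prescribed by the commit):

from haystack.nodes.connector.crawler import Crawler

# output_dir is now optional; when it is given, each crawled page is also dumped to a JSON file,
# and file_path_meta_field_name selects the meta key that stores the path of that file.
crawler = Crawler(
    output_dir="crawled_files",
    file_path_meta_field_name="file_path",
    crawler_depth=0,
)

documents = crawler.crawl(urls=["https://haystack.deepset.ai/"])  # returns List[Document]
for doc in documents:
    print(doc.meta["url"], doc.meta.get("file_path"))

The diff below covers haystack/nodes/connector/crawler.py and its tests.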
@@ -1,31 +1,29 @@
-from typing import Callable, List, Optional, Dict, Tuple, Union, Any
-
+import hashlib
+import json
+import logging
 import os
 import re
 import sys
-import json
 import time
-import logging
 from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse
-import hashlib
 
 try:
-    from webdriver_manager.chrome import ChromeDriverManager
-    from selenium.webdriver.chrome.service import Service
-    from selenium.webdriver.chrome.options import Options
-    from selenium.webdriver.common.by import By
-    from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
     from selenium import webdriver
+    from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
+    from selenium.webdriver.chrome.options import Options
+    from selenium.webdriver.chrome.service import Service
+    from selenium.webdriver.common.by import By
+    from webdriver_manager.chrome import ChromeDriverManager
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed

     _optional_component_not_installed(__name__, "crawler", ie)

+from haystack.errors import NodeError
 from haystack.nodes.base import BaseComponent
 from haystack.schema import Document
-from haystack.errors import NodeError


 logger = logging.getLogger(__name__)
@@ -49,28 +47,27 @@ class Crawler(BaseComponent):
 
     def __init__(
         self,
-        output_dir: str,
         urls: Optional[List[str]] = None,
         crawler_depth: int = 1,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files=True,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text=True,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files=True,
+        file_path_meta_field_name: Optional[str] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
         webdriver_options: Optional[List[str]] = None,
     ):
         """
         Init object with basic params for crawling (can be overwritten later).
 
-        :param output_dir: Path for the directory to store files
         :param urls: List of http(s) address(es) (can also be supplied later when calling crawl())
         :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
             0: Only initial list of urls
             1: Follow links found on the initial URLs (but no further)
         :param filter_urls: Optional list of regular expressions that the crawled URLs must comply with.
             All URLs not matching at least one of the regular expressions will be dropped.
-        :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -80,6 +77,9 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
             dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
             E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param output_dir: If provided, the crawled documents will be saved as JSON files in this directory.
+        :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
             By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
             E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
@@ -164,7 +164,6 @@ class Crawler(BaseComponent):
             logger.info("'chrome-driver' will be automatically installed.")
             self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
         self.urls = urls
-        self.output_dir = output_dir
         self.crawler_depth = crawler_depth
         self.filter_urls = filter_urls
         self.overwrite_existing_files = overwrite_existing_files
@@ -172,22 +171,25 @@ class Crawler(BaseComponent):
         self.extract_hidden_text = extract_hidden_text
         self.loading_wait_time = loading_wait_time
         self.crawler_naming_function = crawler_naming_function
+        self.output_dir = output_dir
+        self.file_path_meta_field_name = file_path_meta_field_name
 
     def __del__(self):
         self.driver.quit()
 
     def crawl(
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = None,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
+        file_path_meta_field_name: Optional[str] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> List[Path]:
+    ) -> List[Document]:
         """
         Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
         file per URL, including text and basic meta data).
@@ -195,7 +197,6 @@ class Crawler(BaseComponent):
         All parameters are optional here and only meant to overwrite instance attributes at runtime.
         If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.
 
-        :param output_dir: Path for the directory to store files
         :param urls: List of http addresses or single http address
         :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
             0: Only initial list of urls
@@ -210,6 +211,8 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
             dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
             E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param output_dir: If provided, the crawled documents will be saved as JSON files in this directory.
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
             By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
             E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
@@ -217,7 +220,7 @@ class Crawler(BaseComponent):
                 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
                     This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
 
-        :return: List of paths where the crawled webpages got stored
+        :return: List of Documents that were created during crawling
         """
         # use passed params or fallback to instance attributes
         if id_hash_keys is None:
@@ -236,81 +239,162 @@ class Crawler(BaseComponent):
             extract_hidden_text = self.extract_hidden_text
         if loading_wait_time is None:
             loading_wait_time = self.loading_wait_time
+        if file_path_meta_field_name is None:
+            file_path_meta_field_name = self.file_path_meta_field_name
         if crawler_naming_function is None:
             crawler_naming_function = self.crawler_naming_function
 
-        output_dir = Path(output_dir)
-        if not output_dir.exists():
-            output_dir.mkdir(parents=True)
+        if isinstance(output_dir, str):
+            output_dir = Path(output_dir)
 
-        file_paths: list = []
-        is_not_empty = len(list(output_dir.rglob("*"))) > 0
-        if is_not_empty and not overwrite_existing_files:
-            logger.info(
-                "Found data stored in `%s`. Delete this first if you really want to fetch new data.", output_dir
-            )
-        else:
-            logger.info("Fetching from %s to `%s`", urls, output_dir)
+        if output_dir:
+            if not output_dir.exists():
+                output_dir.mkdir(parents=True)
 
-            # Start by writing out the initial list of urls
-            if filter_urls:
-                pattern = re.compile("|".join(filter_urls))
-                for url in urls:
-                    if pattern.search(url):
-                        file_paths += self._write_to_files(
-                            [url],
-                            output_dir=output_dir,
-                            extract_hidden_text=extract_hidden_text,
-                            loading_wait_time=loading_wait_time,
-                            crawler_naming_function=crawler_naming_function,
-                        )
-            else:
-                file_paths += self._write_to_files(
-                    urls,
-                    output_dir=output_dir,
-                    extract_hidden_text=extract_hidden_text,
-                    loading_wait_time=loading_wait_time,
-                    crawler_naming_function=crawler_naming_function,
+            is_not_empty = len(list(output_dir.rglob("*"))) > 0
+            if is_not_empty and not overwrite_existing_files:
+                logger.warning(
+                    "Found data stored in `%s`. Use an empty folder or set `overwrite_existing_files=True`, "
+                    "if you want to overwrite any already present saved files.",
+                    output_dir,
                 )
-            # follow one level of sublinks if requested
-            if crawler_depth == 1:
-                sub_links: Dict[str, List] = {}
-                for url_ in urls:
-                    already_found_links: List = list(sum(list(sub_links.values()), []))
-                    sub_links[url_] = list(
-                        self._extract_sublinks_from_url(
-                            base_url=url_,
-                            filter_urls=filter_urls,
-                            already_found_links=already_found_links,
-                            loading_wait_time=loading_wait_time,
-                        )
-                    )
-                for url, extracted_sublink in sub_links.items():
-                    file_paths += self._write_to_files(
-                        extracted_sublink,
-                        output_dir=output_dir,
-                        base_url=url,
-                        id_hash_keys=id_hash_keys,
+            else:
+                logger.info("Fetching from %s to `%s`", urls, output_dir)
+
+        documents: List[Document] = []
+
+        # Start by crawling the initial list of urls
+        if filter_urls:
+            pattern = re.compile("|".join(filter_urls))
+            for url in urls:
+                if pattern.search(url):
+                    documents += self._crawl_urls(
+                        [url],
                         extract_hidden_text=extract_hidden_text,
                         loading_wait_time=loading_wait_time,
+                        id_hash_keys=id_hash_keys,
+                        output_dir=output_dir,
+                        overwrite_existing_files=overwrite_existing_files,
+                        file_path_meta_field_name=file_path_meta_field_name,
                         crawler_naming_function=crawler_naming_function,
                     )
+        else:
+            documents += self._crawl_urls(
+                urls,
+                extract_hidden_text=extract_hidden_text,
+                loading_wait_time=loading_wait_time,
+                id_hash_keys=id_hash_keys,
+                output_dir=output_dir,
+                overwrite_existing_files=overwrite_existing_files,
+                file_path_meta_field_name=file_path_meta_field_name,
+                crawler_naming_function=crawler_naming_function,
+            )
 
-        return file_paths
+        # follow one level of sublinks if requested
+        if crawler_depth == 1:
+            sub_links: Dict[str, List] = {}
+            for url_ in urls:
+                already_found_links: List = list(sum(list(sub_links.values()), []))
+                sub_links[url_] = list(
+                    self._extract_sublinks_from_url(
+                        base_url=url_,
+                        filter_urls=filter_urls,
+                        already_found_links=already_found_links,
+                        loading_wait_time=loading_wait_time,
+                    )
+                )
+            for url, extracted_sublink in sub_links.items():
+                documents += self._crawl_urls(
+                    extracted_sublink,
+                    base_url=url,
+                    extract_hidden_text=extract_hidden_text,
+                    loading_wait_time=loading_wait_time,
+                    id_hash_keys=id_hash_keys,
+                    output_dir=output_dir,
+                    overwrite_existing_files=overwrite_existing_files,
+                    file_path_meta_field_name=file_path_meta_field_name,
+                    crawler_naming_function=crawler_naming_function,
+                )
 
-    def _write_to_files(
+        return documents
+
+    def _create_document(
+        self, url: str, text: str, base_url: Optional[str] = None, id_hash_keys: Optional[List[str]] = None
+    ) -> Document:
+        """
+        Create a Document object from the given url and text.
+        :param url: The current url of the webpage.
+        :param text: The text content of the webpage.
+        :param base_url: The original url where we started to crawl.
+        :param id_hash_keys: The fields that should be used to generate the document id.
+        """
+
+        data: Dict[str, Any] = {}
+        data["meta"] = {"url": url}
+        if base_url:
+            data["meta"]["base_url"] = base_url
+        data["content"] = text
+        if id_hash_keys:
+            data["id_hash_keys"] = id_hash_keys
+
+        return Document.from_dict(data)
+
+    def _write_file(
+        self,
+        document: Document,
+        output_dir: Path,
+        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
+        overwrite_existing_files: Optional[bool] = None,
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> Path:
+        url = document.meta["url"]
+        if crawler_naming_function is not None:
+            file_name_prefix = crawler_naming_function(url, document.content)  # type: ignore
+        else:
+            file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", url[:129])
+            file_name_hash = hashlib.md5(f"{url}".encode("utf-8")).hexdigest()
+            file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
+
+        file_path = output_dir / f"{file_name_prefix}.json"
+
+        if file_path_meta_field_name:
+            document.meta[file_path_meta_field_name] = str(file_path)
+
+        try:
+            if overwrite_existing_files or not file_path.exists():
+                with open(file_path, "w", encoding="utf-8") as f:
+                    json.dump(document.to_dict(), f)
+            else:
+                logging.debug(
+                    "File '%s' already exists. Set 'overwrite_existing_files=True' to overwrite it.", file_path
+                )
+        except Exception:
+            logging.exception(
+                "Crawler can't save the content of '%s' under '%s'. "
+                "This webpage will be skipped, but links from this page will still be crawled. "
+                "Make sure the path above is accessible and the file name is valid. "
+                "If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
+                url,
+                file_path,
+            )
+
+        return file_path
+
+    def _crawl_urls(
         self,
         urls: List[str],
-        output_dir: Path,
         extract_hidden_text: bool,
         base_url: Optional[str] = None,
         id_hash_keys: Optional[List[str]] = None,
         loading_wait_time: Optional[int] = None,
+        overwrite_existing_files: Optional[bool] = False,
+        output_dir: Optional[Path] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> List[Path]:
-        paths = []
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> List[Document]:
+        documents: List[Document] = []
         for link in urls:
-            logger.info("writing contents from '%s'", link)
+            logger.info("Scraping contents from '%s'", link)
             self.driver.get(link)
             if loading_wait_time is not None:
                 time.sleep(loading_wait_time)
@@ -320,54 +404,37 @@ class Crawler(BaseComponent):
             else:
                 text = el.text
 
-            data: Dict[str, Any] = {}
-            data["meta"] = {"url": link}
-            if base_url:
-                data["meta"]["base_url"] = base_url
-            data["content"] = text
-            if id_hash_keys:
-                data["id_hash_keys"] = id_hash_keys
-            document = Document.from_dict(data)
+            document = self._create_document(url=link, text=text, base_url=base_url, id_hash_keys=id_hash_keys)
 
-            if crawler_naming_function is not None:
-                file_name_prefix = crawler_naming_function(link, text)
-            else:
-                file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
-                file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
-                file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
-
-            file_path = output_dir / f"{file_name_prefix}.json"
-
-            try:
-                with open(file_path, "w", encoding="utf-8") as f:
-                    json.dump(document.to_dict(), f)
-            except Exception:
-                logging.exception(
-                    "Crawler can't save the content of '%s' under '%s'. "
-                    "This webpage will be skipped, but links from this page will still be crawled. "
-                    "Make sure the path above is accessible and the file name is valid. "
-                    "If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
-                    link,
-                    file_path,
+            if output_dir:
+                file_path = self._write_file(
+                    document,
+                    output_dir,
+                    crawler_naming_function,
+                    file_path_meta_field_name=file_path_meta_field_name,
+                    overwrite_existing_files=overwrite_existing_files,
                 )
+                logger.debug("Saved content to '%s'", file_path)
 
-            paths.append(file_path)
+            documents.append(document)
 
-        return paths
+        logger.debug("Crawler results: %s Documents", len(documents))
+
+        return documents
 
     def run(  # type: ignore
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
-        return_documents: Optional[bool] = False,
        id_hash_keys: Optional[List[str]] = None,
        extract_hidden_text: Optional[bool] = True,
        loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> Tuple[Dict[str, List[Document]], str]:
         """
         Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
 
@@ -389,6 +456,7 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
             dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
             E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
             By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
             E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
@@ -396,10 +464,10 @@ class Crawler(BaseComponent):
                 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
                     This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
 
-        :return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
+        :return: Tuple({"documents": List of Documents, ...}, Name of output edge)
         """
 
-        file_paths = self.crawl(
+        documents = self.crawl(
             urls=urls,
             output_dir=output_dir,
             crawler_depth=crawler_depth,
@@ -407,34 +475,26 @@ class Crawler(BaseComponent):
             overwrite_existing_files=overwrite_existing_files,
             extract_hidden_text=extract_hidden_text,
             loading_wait_time=loading_wait_time,
             id_hash_keys=id_hash_keys,
+            file_path_meta_field_name=file_path_meta_field_name,
             crawler_naming_function=crawler_naming_function,
         )
-        results: Dict[str, Union[List[Document], List[Path]]] = {}
-        if return_documents:
-            crawled_data = []
-            for _file in file_paths:
-                with open(_file.absolute(), "r") as read_file:
-                    document = json.load(read_file)
-                    document["id_hash_keys"] = id_hash_keys
-                    crawled_data.append(Document.from_dict(document))
-            results = {"documents": crawled_data}
-        else:
-            results = {"paths": file_paths}
+        results = {"documents": documents}
 
         return results, "output_1"
 
     def run_batch(  # type: ignore
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
-        return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = True,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
+        file_path_meta_field_name: Optional[str] = None,
     ):
         return self.run(
             output_dir=output_dir,
@@ -442,11 +502,11 @@ class Crawler(BaseComponent):
             crawler_depth=crawler_depth,
             filter_urls=filter_urls,
             overwrite_existing_files=overwrite_existing_files,
-            return_documents=return_documents,
             id_hash_keys=id_hash_keys,
             extract_hidden_text=extract_hidden_text,
             loading_wait_time=loading_wait_time,
             crawler_naming_function=crawler_naming_function,
+            file_path_meta_field_name=file_path_meta_field_name,
         )
 
     @staticmethod
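The crawler_naming_function parameter documented above is easier to grasp with a complete callable. The sketch below mirrors the default naming scheme implemented in _write_file (the function name and the output directory are made up for illustration, and actually crawling still requires a working Selenium/Chrome setup):

import hashlib
import re

from haystack.nodes.connector.crawler import Crawler

def url_based_name(url: str, page_content: str) -> str:
    # Same idea as the default: a sanitized URL prefix plus the last 6 hex digits of the URL's MD5 sum.
    sanitized = re.sub("[<>:'/\\|?*\0 ]", "_", url[:129])
    digest = hashlib.md5(url.encode("utf-8")).hexdigest()
    return f"{sanitized}_{digest[-6:]}"

crawler = Crawler(output_dir="crawled_files", crawler_naming_function=url_based_name)

The remaining hunks update the Crawler tests to the new Document-based interface.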
@@ -7,10 +7,10 @@ import hashlib
 import os
 
 import pytest
 
 from selenium.webdriver.common.by import By
 
-from haystack.nodes.connector import Crawler
+from haystack.nodes.connector.crawler import Crawler
 from haystack.schema import Document
 
 from ..conftest import SAMPLES_PATH
@@ -64,12 +64,15 @@ def test_crawler(tmp_path):
     tmp_dir = tmp_path
     url = ["https://haystack.deepset.ai/"]
 
-    crawler = Crawler(output_dir=tmp_dir)
-    docs_path = crawler.crawl(urls=url, crawler_depth=0)
-    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
-    documents = results["documents"]
+    crawler = Crawler(output_dir=tmp_dir, file_path_meta_field_name="file_path")
 
-    for json_file, document in zip(docs_path, documents):
+    documents = crawler.crawl(urls=url, crawler_depth=0)
+    docs_path = [Path(doc.meta["file_path"]) for doc in documents]
+
+    results, _ = crawler.run(urls=url, crawler_depth=0)
+    docs_result = results["documents"]
+
+    for json_file, document in zip(docs_path, docs_result):
         assert isinstance(json_file, Path)
         assert isinstance(document, Document)
@@ -85,42 +88,45 @@ def test_crawler(tmp_path):
 
 
 def test_crawler_url_none_exception(tmp_path):
-    crawler = Crawler(tmp_path)
+    crawler = Crawler()
     with pytest.raises(ValueError):
         crawler.crawl()
 
 
 def test_crawler_depth_0_single_url(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    crawler = Crawler(output_dir=tmp_path, crawler_depth=0, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"])
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])
 
 
 def test_crawler_depth_0_many_urls(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
     _urls = [test_url + "/index.html", test_url + "/page1.html"]
-    paths = crawler.crawl(urls=_urls, crawler_depth=0)
-    assert len(paths) == 2
+    documents = crawler.crawl(urls=_urls, crawler_depth=0)
+    assert len(documents) == 2
+    paths = [doc.meta["file_path"] for doc in documents]
     assert content_in_results(crawler, test_url + "/index.html", paths)
     assert content_in_results(crawler, test_url + "/page1.html", paths)
 
 
 def test_crawler_depth_1_single_url(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
-    assert len(paths) == 3
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
+    assert len(documents) == 3
+    paths = [doc.meta["file_path"] for doc in documents]
     assert content_in_results(crawler, test_url + "/index.html", paths)
     assert content_in_results(crawler, test_url + "/page1.html", paths)
     assert content_in_results(crawler, test_url + "/page2.html", paths)
 
 
 def test_crawler_output_file_structure(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    path = Path(documents[0].meta["file_path"])
+    assert content_match(crawler, test_url + "/index.html", path)
 
-    with open(paths[0].absolute(), "r") as doc_file:
+    with open(path.absolute(), "r") as doc_file:
         data = json.load(doc_file)
         assert "content" in data
         assert "meta" in data
@@ -129,52 +135,40 @@ def test_crawler_output_file_structure(test_url, tmp_path):
 
 
 def test_crawler_filter_urls(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
 
-    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])
 
     # Note: filter_urls can exclude pages listed in `urls` as well
-    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/page1.html", paths[0])
+    documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/page1.html", documents[0].meta["file_path"])
     assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)
 
 
-def test_crawler_return_document(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
-    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
-
-    for path, document in zip(paths["paths"], documents["documents"]):
-        with open(path.absolute(), "r") as doc_file:
-            file_content = json.load(doc_file)
-            assert file_content["meta"] == document.meta
-            assert file_content["content"] == document.content
-
-
 def test_crawler_extract_hidden_text(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path)
-    documents, _ = crawler.run(
-        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
-    )
+    documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0)
     crawled_content = documents["documents"][0].content
     assert "hidden text" in crawled_content
 
-    documents, _ = crawler.run(
-        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
-    )
+    documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0)
     crawled_content = documents["documents"][0].content
     assert "hidden text" not in crawled_content
 
 
 def test_crawler_loading_wait_time(test_url, tmp_path):
     loading_wait_time = 3
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(
+        urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time
+    )
 
-    assert len(paths) == 4
+    assert len(documents) == 4
 
+    paths = [doc.meta["file_path"] for doc in documents]
+
     with open(f"{SAMPLES_PATH.absolute()}/crawler/page_dynamic_result.txt", "r") as dynamic_result:
         dynamic_result_text = dynamic_result.readlines()
@@ -196,29 +190,46 @@ def test_crawler_loading_wait_time(test_url, tmp_path):
 
 
 def test_crawler_default_naming_function(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
 
     link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html"
     file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
     file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
     expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json"
 
-    paths = crawler.crawl(urls=[link], crawler_depth=0)
+    documents = crawler.crawl(urls=[link], crawler_depth=0)
 
-    assert os.path.exists(paths[0])
-    assert paths[0] == Path(expected_crawled_file_path)
+    path = Path(documents[0].meta["file_path"])
+    assert os.path.exists(path)
+    assert path == Path(expected_crawled_file_path)
 
 
 def test_crawler_naming_function(test_url, tmp_path):
     crawler = Crawler(
-        output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link)
+        output_dir=tmp_path,
+        file_path_meta_field_name="file_path",
+        crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link),
     )
 
     link = f"{test_url}/page_dynamic.html"
     file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link)
     expected_crawled_file_path = tmp_path / f"{file_name_link}.json"
 
-    paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
+    documents = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
+    path = Path(documents[0].meta["file_path"])
+    assert os.path.exists(path)
+    assert path == expected_crawled_file_path
 
-    assert os.path.exists(paths[0])
-    assert paths[0] == expected_crawled_file_path
 
+def test_crawler_not_save_file(test_url):
+    crawler = Crawler()
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert documents[0].meta.get("file_path", None) is None
+
+
+def test_crawler_custom_meta_file_path_name(test_url, tmp_path):
+    crawler = Crawler()
+    documents = crawler.crawl(
+        urls=[test_url + "/index.html"], crawler_depth=0, output_dir=tmp_path, file_path_meta_field_name="custom"
+    )
+    assert documents[0].meta.get("custom", None) is not None
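Taken together, the tests pin down the new pipeline contract: run() always returns a {"documents": [...]} dict on the "output_1" edge, whether or not files are written, so downstream nodes can consume the Documents directly. A short sketch of that contract (the URL is illustrative):

from haystack.nodes.connector.crawler import Crawler

crawler = Crawler(crawler_depth=0)
results, edge = crawler.run(urls=["https://haystack.deepset.ai/"])

assert edge == "output_1"
documents = results["documents"]  # always Documents now; the old "paths" key is gone
print(len(documents), documents[0].meta["url"])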