From e0b0fe1bc32b29f739ca45c51659bcc075de8afa Mon Sep 17 00:00:00 2001 From: Daniel Bichuetti Date: Wed, 22 Feb 2023 13:34:19 -0300 Subject: [PATCH] feat!: Increase Crawler standardization regarding Pipelines (#4122) * feat!(Crawler): Integrate Crawler in the Pipeline. +Output Documents +Optional file saving +Optional Document meta about file path * refactor: add Optional decl. * chore: dummy commit * chore: dummy commit * refactor: improve overwrite flow * refactor: change custom file path meta logic + add test * Update haystack/nodes/connector/crawler.py Co-authored-by: Massimiliano Pippi * Update haystack/nodes/connector/crawler.py Co-authored-by: Massimiliano Pippi * Update haystack/nodes/connector/crawler.py Co-authored-by: Massimiliano Pippi * Update haystack/nodes/connector/crawler.py Co-authored-by: Massimiliano Pippi * Update haystack/nodes/connector/crawler.py Co-authored-by: Massimiliano Pippi --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Co-authored-by: Massimiliano Pippi --- haystack/nodes/connector/crawler.py | 316 +++++++++++++++++----------- test/nodes/test_connector.py | 127 ++++++----- 2 files changed, 257 insertions(+), 186 deletions(-) diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index 0864b7175..5d6a1e168 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -1,31 +1,29 @@ -from typing import Callable, List, Optional, Dict, Tuple, Union, Any - +import hashlib +import json +import logging import os import re import sys -import json import time -import logging from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from urllib.parse import urlparse -import hashlib try: - from webdriver_manager.chrome import ChromeDriverManager - from selenium.webdriver.chrome.service import Service - from selenium.webdriver.chrome.options import Options - from selenium.webdriver.common.by import By - from selenium.common.exceptions import StaleElementReferenceException, WebDriverException from selenium import webdriver + from selenium.common.exceptions import StaleElementReferenceException, WebDriverException + from selenium.webdriver.chrome.options import Options + from selenium.webdriver.chrome.service import Service + from selenium.webdriver.common.by import By + from webdriver_manager.chrome import ChromeDriverManager except (ImportError, ModuleNotFoundError) as ie: from haystack.utils.import_utils import _optional_component_not_installed _optional_component_not_installed(__name__, "crawler", ie) +from haystack.errors import NodeError from haystack.nodes.base import BaseComponent from haystack.schema import Document -from haystack.errors import NodeError - logger = logging.getLogger(__name__) @@ -49,28 +47,27 @@ class Crawler(BaseComponent): def __init__( self, - output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, - overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, + output_dir: Union[str, Path, None] = None, + overwrite_existing_files=True, + file_path_meta_field_name: Optional[str] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None, ): """ Init object with basic params for crawling (can be overwritten later). 
- :param output_dir: Path for the directory to store files :param urls: List of http(s) address(es) (can also be supplied later when calling crawl()) :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options: 0: Only initial list of urls 1: Follow links found on the initial URLs (but no further) :param filter_urls: Optional list of regular expressions that the crawled URLs must comply with. All URLs not matching at least one of the regular expressions will be dropped. - :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). @@ -80,6 +77,9 @@ class Crawler(BaseComponent): :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted. E.g. 2: Crawler will wait 2 seconds before scraping page + :param output_dir: If provided, the crawled documents will be saved as JSON files in this directory. + :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content + :param file_path_meta_field_name: If provided, the file path will be stored in this meta field. :param crawler_naming_function: A function mapping the crawled page to a file name. By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url. E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link) @@ -164,7 +164,6 @@ class Crawler(BaseComponent): logger.info("'chrome-driver' will be automatically installed.") self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) self.urls = urls - self.output_dir = output_dir self.crawler_depth = crawler_depth self.filter_urls = filter_urls self.overwrite_existing_files = overwrite_existing_files @@ -172,22 +171,25 @@ class Crawler(BaseComponent): self.extract_hidden_text = extract_hidden_text self.loading_wait_time = loading_wait_time self.crawler_naming_function = crawler_naming_function + self.output_dir = output_dir + self.file_path_meta_field_name = file_path_meta_field_name def __del__(self): self.driver.quit() def crawl( self, - output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, - overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, + output_dir: Union[str, Path, None] = None, + overwrite_existing_files: Optional[bool] = None, + file_path_meta_field_name: Optional[str] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, - ) -> List[Path]: + ) -> List[Document]: """ Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON file per URL, including text and basic meta data). @@ -195,7 +197,6 @@ class Crawler(BaseComponent): All parameters are optional here and only meant to overwrite instance attributes at runtime. 
If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used. - :param output_dir: Path for the directory to store files :param urls: List of http addresses or single http address :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options: 0: Only initial list of urls @@ -210,6 +211,8 @@ class Crawler(BaseComponent): :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted. E.g. 2: Crawler will wait 2 seconds before scraping page + :param output_dir: If provided, the crawled documents will be saved as JSON files in this directory. + :param file_path_meta_field_name: If provided, the file path will be stored in this meta field. :param crawler_naming_function: A function mapping the crawled page to a file name. By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url. E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link) @@ -217,7 +220,7 @@ class Crawler(BaseComponent): 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. - :return: List of paths where the crawled webpages got stored + :return: List of Documents that were created during crawling """ # use passed params or fallback to instance attributes if id_hash_keys is None: @@ -236,81 +239,162 @@ class Crawler(BaseComponent): extract_hidden_text = self.extract_hidden_text if loading_wait_time is None: loading_wait_time = self.loading_wait_time + if file_path_meta_field_name is None: + file_path_meta_field_name = self.file_path_meta_field_name if crawler_naming_function is None: crawler_naming_function = self.crawler_naming_function - output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir(parents=True) + if isinstance(output_dir, str): + output_dir = Path(output_dir) - file_paths: list = [] - is_not_empty = len(list(output_dir.rglob("*"))) > 0 - if is_not_empty and not overwrite_existing_files: - logger.info( - "Found data stored in `%s`. Delete this first if you really want to fetch new data.", output_dir - ) - else: - logger.info("Fetching from %s to `%s`", urls, output_dir) + if output_dir: + if not output_dir.exists(): + output_dir.mkdir(parents=True) - # Start by writing out the initial list of urls - if filter_urls: - pattern = re.compile("|".join(filter_urls)) - for url in urls: - if pattern.search(url): - file_paths += self._write_to_files( - [url], - output_dir=output_dir, - extract_hidden_text=extract_hidden_text, - loading_wait_time=loading_wait_time, - crawler_naming_function=crawler_naming_function, - ) - else: - file_paths += self._write_to_files( - urls, - output_dir=output_dir, - extract_hidden_text=extract_hidden_text, - loading_wait_time=loading_wait_time, - crawler_naming_function=crawler_naming_function, + is_not_empty = len(list(output_dir.rglob("*"))) > 0 + if is_not_empty and not overwrite_existing_files: + logger.warning( + "Found data stored in `%s`. 
Use an empty folder or set `overwrite_existing_files=True`, " + "if you want to overwrite any already present saved files.", + output_dir, ) - # follow one level of sublinks if requested - if crawler_depth == 1: - sub_links: Dict[str, List] = {} - for url_ in urls: - already_found_links: List = list(sum(list(sub_links.values()), [])) - sub_links[url_] = list( - self._extract_sublinks_from_url( - base_url=url_, - filter_urls=filter_urls, - already_found_links=already_found_links, - loading_wait_time=loading_wait_time, - ) - ) - for url, extracted_sublink in sub_links.items(): - file_paths += self._write_to_files( - extracted_sublink, - output_dir=output_dir, - base_url=url, - id_hash_keys=id_hash_keys, + else: + logger.info("Fetching from %s to `%s`", urls, output_dir) + + documents: List[Document] = [] + + # Start by crawling the initial list of urls + if filter_urls: + pattern = re.compile("|".join(filter_urls)) + for url in urls: + if pattern.search(url): + documents += self._crawl_urls( + [url], extract_hidden_text=extract_hidden_text, loading_wait_time=loading_wait_time, + id_hash_keys=id_hash_keys, + output_dir=output_dir, + overwrite_existing_files=overwrite_existing_files, + file_path_meta_field_name=file_path_meta_field_name, crawler_naming_function=crawler_naming_function, ) + else: + documents += self._crawl_urls( + urls, + extract_hidden_text=extract_hidden_text, + loading_wait_time=loading_wait_time, + id_hash_keys=id_hash_keys, + output_dir=output_dir, + overwrite_existing_files=overwrite_existing_files, + file_path_meta_field_name=file_path_meta_field_name, + crawler_naming_function=crawler_naming_function, + ) - return file_paths + # follow one level of sublinks if requested + if crawler_depth == 1: + sub_links: Dict[str, List] = {} + for url_ in urls: + already_found_links: List = list(sum(list(sub_links.values()), [])) + sub_links[url_] = list( + self._extract_sublinks_from_url( + base_url=url_, + filter_urls=filter_urls, + already_found_links=already_found_links, + loading_wait_time=loading_wait_time, + ) + ) + for url, extracted_sublink in sub_links.items(): + documents += self._crawl_urls( + extracted_sublink, + base_url=url, + extract_hidden_text=extract_hidden_text, + loading_wait_time=loading_wait_time, + id_hash_keys=id_hash_keys, + output_dir=output_dir, + overwrite_existing_files=overwrite_existing_files, + file_path_meta_field_name=file_path_meta_field_name, + crawler_naming_function=crawler_naming_function, + ) - def _write_to_files( + return documents + + def _create_document( + self, url: str, text: str, base_url: Optional[str] = None, id_hash_keys: Optional[List[str]] = None + ) -> Document: + """ + Create a Document object from the given url and text. + :param url: The current url of the webpage. + :param text: The text content of the webpage. + :param base_url: The original url where we started to crawl. + :param id_hash_keys: The fields that should be used to generate the document id. 
+ """ + + data: Dict[str, Any] = {} + data["meta"] = {"url": url} + if base_url: + data["meta"]["base_url"] = base_url + data["content"] = text + if id_hash_keys: + data["id_hash_keys"] = id_hash_keys + + return Document.from_dict(data) + + def _write_file( + self, + document: Document, + output_dir: Path, + crawler_naming_function: Optional[Callable[[str, str], str]] = None, + overwrite_existing_files: Optional[bool] = None, + file_path_meta_field_name: Optional[str] = None, + ) -> Path: + url = document.meta["url"] + if crawler_naming_function is not None: + file_name_prefix = crawler_naming_function(url, document.content) # type: ignore + else: + file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", url[:129]) + file_name_hash = hashlib.md5(f"{url}".encode("utf-8")).hexdigest() + file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}" + + file_path = output_dir / f"{file_name_prefix}.json" + + if file_path_meta_field_name: + document.meta[file_path_meta_field_name] = str(file_path) + + try: + if overwrite_existing_files or not file_path.exists(): + with open(file_path, "w", encoding="utf-8") as f: + json.dump(document.to_dict(), f) + else: + logging.debug( + "File '%s' already exists. Set 'overwrite_existing_files=True' to overwrite it.", file_path + ) + except Exception: + logging.exception( + "Crawler can't save the content of '%s' under '%s'. " + "This webpage will be skipped, but links from this page will still be crawled. " + "Make sure the path above is accessible and the file name is valid. " + "If the file name is invalid, consider setting 'crawler_naming_function' to another function.", + url, + file_path, + ) + + return file_path + + def _crawl_urls( self, urls: List[str], - output_dir: Path, extract_hidden_text: bool, base_url: Optional[str] = None, id_hash_keys: Optional[List[str]] = None, loading_wait_time: Optional[int] = None, + overwrite_existing_files: Optional[bool] = False, + output_dir: Optional[Path] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, - ) -> List[Path]: - paths = [] + file_path_meta_field_name: Optional[str] = None, + ) -> List[Document]: + documents: List[Document] = [] for link in urls: - logger.info("writing contents from '%s'", link) + logger.info("Scraping contents from '%s'", link) self.driver.get(link) if loading_wait_time is not None: time.sleep(loading_wait_time) @@ -320,54 +404,37 @@ class Crawler(BaseComponent): else: text = el.text - data: Dict[str, Any] = {} - data["meta"] = {"url": link} - if base_url: - data["meta"]["base_url"] = base_url - data["content"] = text - if id_hash_keys: - data["id_hash_keys"] = id_hash_keys - document = Document.from_dict(data) + document = self._create_document(url=link, text=text, base_url=base_url, id_hash_keys=id_hash_keys) - if crawler_naming_function is not None: - file_name_prefix = crawler_naming_function(link, text) - else: - file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129]) - file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest() - file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}" - - file_path = output_dir / f"{file_name_prefix}.json" - - try: - with open(file_path, "w", encoding="utf-8") as f: - json.dump(document.to_dict(), f) - except Exception: - logging.exception( - "Crawler can't save the content of '%s' under '%s'. " - "This webpage will be skipped, but links from this page will still be crawled. " - "Make sure the path above is accessible and the file name is valid. 
" - "If the file name is invalid, consider setting 'crawler_naming_function' to another function.", - link, - file_path, + if output_dir: + file_path = self._write_file( + document, + output_dir, + crawler_naming_function, + file_path_meta_field_name=file_path_meta_field_name, + overwrite_existing_files=overwrite_existing_files, ) + logger.debug("Saved content to '%s'", file_path) - paths.append(file_path) + documents.append(document) - return paths + logger.debug("Crawler results: %s Documents", len(documents)) + + return documents def run( # type: ignore self, - output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, - overwrite_existing_files: Optional[bool] = None, - return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, + output_dir: Union[str, Path, None] = None, + overwrite_existing_files: Optional[bool] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, - ) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]: + file_path_meta_field_name: Optional[str] = None, + ) -> Tuple[Dict[str, List[Document]], str]: """ Method to be executed when the Crawler is used as a Node within a Haystack pipeline. @@ -389,6 +456,7 @@ class Crawler(BaseComponent): :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted. E.g. 2: Crawler will wait 2 seconds before scraping page + :param file_path_meta_field_name: If provided, the file path will be stored in this meta field. :param crawler_naming_function: A function mapping the crawled page to a file name. By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url. E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link) @@ -396,10 +464,10 @@ class Crawler(BaseComponent): 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. 
- :return: Tuple({"paths": List of filepaths, ...}, Name of output edge) + :return: Tuple({"documents": List of Documents, ...}, Name of output edge) """ - file_paths = self.crawl( + documents = self.crawl( urls=urls, output_dir=output_dir, crawler_depth=crawler_depth, @@ -407,34 +475,26 @@ class Crawler(BaseComponent): overwrite_existing_files=overwrite_existing_files, extract_hidden_text=extract_hidden_text, loading_wait_time=loading_wait_time, + id_hash_keys=id_hash_keys, + file_path_meta_field_name=file_path_meta_field_name, crawler_naming_function=crawler_naming_function, ) - results: Dict[str, Union[List[Document], List[Path]]] = {} - if return_documents: - crawled_data = [] - for _file in file_paths: - with open(_file.absolute(), "r") as read_file: - document = json.load(read_file) - document["id_hash_keys"] = id_hash_keys - crawled_data.append(Document.from_dict(document)) - results = {"documents": crawled_data} - else: - results = {"paths": file_paths} + results = {"documents": documents} return results, "output_1" def run_batch( # type: ignore self, - output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, - overwrite_existing_files: Optional[bool] = None, - return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, + output_dir: Union[str, Path, None] = None, + overwrite_existing_files: Optional[bool] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, + file_path_meta_field_name: Optional[str] = None, ): return self.run( output_dir=output_dir, @@ -442,11 +502,11 @@ class Crawler(BaseComponent): crawler_depth=crawler_depth, filter_urls=filter_urls, overwrite_existing_files=overwrite_existing_files, - return_documents=return_documents, id_hash_keys=id_hash_keys, extract_hidden_text=extract_hidden_text, loading_wait_time=loading_wait_time, crawler_naming_function=crawler_naming_function, + file_path_meta_field_name=file_path_meta_field_name, ) @staticmethod diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py index 92b8218e9..0bb1622dc 100644 --- a/test/nodes/test_connector.py +++ b/test/nodes/test_connector.py @@ -7,10 +7,10 @@ import hashlib import os import pytest + from selenium.webdriver.common.by import By - -from haystack.nodes.connector import Crawler +from haystack.nodes.connector.crawler import Crawler from haystack.schema import Document from ..conftest import SAMPLES_PATH @@ -64,12 +64,15 @@ def test_crawler(tmp_path): tmp_dir = tmp_path url = ["https://haystack.deepset.ai/"] - crawler = Crawler(output_dir=tmp_dir) - docs_path = crawler.crawl(urls=url, crawler_depth=0) - results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True) - documents = results["documents"] + crawler = Crawler(output_dir=tmp_dir, file_path_meta_field_name="file_path") - for json_file, document in zip(docs_path, documents): + documents = crawler.crawl(urls=url, crawler_depth=0) + docs_path = [Path(doc.meta["file_path"]) for doc in documents] + + results, _ = crawler.run(urls=url, crawler_depth=0) + docs_result = results["documents"] + + for json_file, document in zip(docs_path, docs_result): assert isinstance(json_file, Path) assert isinstance(document, Document) @@ -85,42 +88,45 @@ def test_crawler(tmp_path): def test_crawler_url_none_exception(tmp_path): - crawler = Crawler(tmp_path) + crawler = Crawler() with 
pytest.raises(ValueError): crawler.crawl() def test_crawler_depth_0_single_url(test_url, tmp_path): - crawler = Crawler(output_dir=tmp_path) - paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0) - assert len(paths) == 1 - assert content_match(crawler, test_url + "/index.html", paths[0]) + crawler = Crawler(output_dir=tmp_path, crawler_depth=0, file_path_meta_field_name="file_path") + documents = crawler.crawl(urls=[test_url + "/index.html"]) + assert len(documents) == 1 + assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"]) def test_crawler_depth_0_many_urls(test_url, tmp_path): - crawler = Crawler(output_dir=tmp_path) + crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path") _urls = [test_url + "/index.html", test_url + "/page1.html"] - paths = crawler.crawl(urls=_urls, crawler_depth=0) - assert len(paths) == 2 + documents = crawler.crawl(urls=_urls, crawler_depth=0) + assert len(documents) == 2 + paths = [doc.meta["file_path"] for doc in documents] assert content_in_results(crawler, test_url + "/index.html", paths) assert content_in_results(crawler, test_url + "/page1.html", paths) def test_crawler_depth_1_single_url(test_url, tmp_path): - crawler = Crawler(output_dir=tmp_path) - paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1) - assert len(paths) == 3 + crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path") + documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1) + assert len(documents) == 3 + paths = [doc.meta["file_path"] for doc in documents] assert content_in_results(crawler, test_url + "/index.html", paths) assert content_in_results(crawler, test_url + "/page1.html", paths) assert content_in_results(crawler, test_url + "/page2.html", paths) def test_crawler_output_file_structure(test_url, tmp_path): - crawler = Crawler(output_dir=tmp_path) - paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0) - assert content_match(crawler, test_url + "/index.html", paths[0]) + crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path") + documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0) + path = Path(documents[0].meta["file_path"]) + assert content_match(crawler, test_url + "/index.html", path) - with open(paths[0].absolute(), "r") as doc_file: + with open(path.absolute(), "r") as doc_file: data = json.load(doc_file) assert "content" in data assert "meta" in data @@ -129,52 +135,40 @@ def test_crawler_output_file_structure(test_url, tmp_path): def test_crawler_filter_urls(test_url, tmp_path): - crawler = Crawler(output_dir=tmp_path) + crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path") - paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1) - assert len(paths) == 1 - assert content_match(crawler, test_url + "/index.html", paths[0]) + documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1) + assert len(documents) == 1 + assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"]) # Note: filter_urls can exclude pages listed in `urls` as well - paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1) - assert len(paths) == 1 - assert content_match(crawler, test_url + "/page1.html", paths[0]) + documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1) + assert len(documents) == 1 + 
assert content_match(crawler, test_url + "/page1.html", documents[0].meta["file_path"]) assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1) -def test_crawler_return_document(test_url, tmp_path): - crawler = Crawler(output_dir=tmp_path) - documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True) - paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False) - - for path, document in zip(paths["paths"], documents["documents"]): - with open(path.absolute(), "r") as doc_file: - file_content = json.load(doc_file) - assert file_content["meta"] == document.meta - assert file_content["content"] == document.content - - def test_crawler_extract_hidden_text(test_url, tmp_path): crawler = Crawler(output_dir=tmp_path) - documents, _ = crawler.run( - urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True - ) + documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0) crawled_content = documents["documents"][0].content assert "hidden text" in crawled_content - documents, _ = crawler.run( - urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True - ) + documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0) crawled_content = documents["documents"][0].content assert "hidden text" not in crawled_content def test_crawler_loading_wait_time(test_url, tmp_path): loading_wait_time = 3 - crawler = Crawler(output_dir=tmp_path) - paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time) + crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path") + documents = crawler.crawl( + urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time + ) - assert len(paths) == 4 + assert len(documents) == 4 + + paths = [doc.meta["file_path"] for doc in documents] with open(f"{SAMPLES_PATH.absolute()}/crawler/page_dynamic_result.txt", "r") as dynamic_result: dynamic_result_text = dynamic_result.readlines() @@ -196,29 +190,46 @@ def test_crawler_loading_wait_time(test_url, tmp_path): def test_crawler_default_naming_function(test_url, tmp_path): - crawler = Crawler(output_dir=tmp_path) + crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path") link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html" file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129]) file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest() expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json" - paths = crawler.crawl(urls=[link], crawler_depth=0) + documents = crawler.crawl(urls=[link], crawler_depth=0) - assert os.path.exists(paths[0]) - assert paths[0] == Path(expected_crawled_file_path) + path = Path(documents[0].meta["file_path"]) + assert os.path.exists(path) + assert path == Path(expected_crawled_file_path) def test_crawler_naming_function(test_url, tmp_path): crawler = Crawler( - output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link) + output_dir=tmp_path, + file_path_meta_field_name="file_path", + crawler_naming_function=lambda link, text: 
re.sub("[<>:'/\\|?*\0 ]", "_", link), ) link = f"{test_url}/page_dynamic.html" file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link) expected_crawled_file_path = tmp_path / f"{file_name_link}.json" - paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0) + documents = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0) + path = Path(documents[0].meta["file_path"]) + assert os.path.exists(path) + assert path == expected_crawled_file_path - assert os.path.exists(paths[0]) - assert paths[0] == expected_crawled_file_path + +def test_crawler_not_save_file(test_url): + crawler = Crawler() + documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0) + assert documents[0].meta.get("file_path", None) is None + + +def test_crawler_custom_meta_file_path_name(test_url, tmp_path): + crawler = Crawler() + documents = crawler.crawl( + urls=[test_url + "/index.html"], crawler_depth=0, output_dir=tmp_path, file_path_meta_field_name="custom" + ) + assert documents[0].meta.get("custom", None) is not None
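
A minimal usage sketch of the API after this patch, for reviewers trying the change locally. It assumes the branch is installed with the crawler extras (selenium, webdriver-manager) and that Chrome is available; the URL and directory names are illustrative, not part of the patch.

    # Sketch only: exercises the new behavior introduced by this patch
    # (crawl()/run() return Documents; file saving is opt-in via output_dir).
    from haystack.nodes.connector.crawler import Crawler

    crawler = Crawler(
        crawler_depth=0,
        output_dir="crawled_pages",             # optional: also write one JSON file per page
        file_path_meta_field_name="file_path",  # optional: record each file path in doc.meta
    )

    # crawl() now returns a List[Document] instead of a List[Path]
    documents = crawler.crawl(urls=["https://haystack.deepset.ai/"])
    for doc in documents:
        print(doc.meta["url"], doc.meta.get("file_path"))

    # As a pipeline node, run() always puts the Documents on the "documents" key;
    # the old return_documents flag and "paths" output are gone (breaking change).
    results, _ = crawler.run(urls=["https://haystack.deepset.ai/"], crawler_depth=0)
    print(len(results["documents"]))

If output_dir is omitted, nothing is written to disk and no file-path meta field is set, matching test_crawler_not_save_file above.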