feat!: Increase Crawler standardization regarding Pipelines (#4122)

* feat!(Crawler): Integrate Crawler in the Pipeline.

+ Output Documents from `crawl()`/`run()` (see the usage sketch below, just before the diff)
+ Optional file saving via `output_dir`
+ Optional Document meta field holding the saved file path

* refactor: add Optional decl.

* chore: dummy commit

* chore: dummy commit

* refactor: improve overwrite flow

* refactor: change custom file path meta logic + add test

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
Daniel Bichuetti 2023-02-22 13:34:19 -03:00 committed by GitHub
parent 49ed21b82d
commit e0b0fe1bc3
2 changed files with 257 additions and 186 deletions
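
In short, the Crawler now emits Haystack Document objects directly from `crawl()` and `run()`; writing one JSON file per page to `output_dir` and recording that path under `file_path_meta_field_name` are both opt-in. A minimal usage sketch of the new behaviour (the URL and the directory name are illustrative, not taken from this diff):

    from haystack.nodes.connector.crawler import Crawler

    crawler = Crawler(
        output_dir="crawled_files",  # optional: save one JSON file per crawled page
        file_path_meta_field_name="file_path",  # optional: store the saved path in document meta
    )
    documents = crawler.crawl(urls=["https://haystack.deepset.ai/"], crawler_depth=0)
    for doc in documents:
        print(doc.meta.get("file_path"), doc.content[:80])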

haystack/nodes/connector/crawler.py

@@ -1,31 +1,29 @@
-from typing import Callable, List, Optional, Dict, Tuple, Union, Any
+import hashlib
+import json
+import logging
 import os
 import re
 import sys
-import json
 import time
-import logging
 from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse
-import hashlib

 try:
-    from webdriver_manager.chrome import ChromeDriverManager
-    from selenium.webdriver.chrome.service import Service
-    from selenium.webdriver.chrome.options import Options
-    from selenium.webdriver.common.by import By
-    from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
     from selenium import webdriver
+    from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
+    from selenium.webdriver.chrome.options import Options
+    from selenium.webdriver.chrome.service import Service
+    from selenium.webdriver.common.by import By
+    from webdriver_manager.chrome import ChromeDriverManager
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed

     _optional_component_not_installed(__name__, "crawler", ie)

+from haystack.errors import NodeError
 from haystack.nodes.base import BaseComponent
 from haystack.schema import Document
-from haystack.errors import NodeError

 logger = logging.getLogger(__name__)
@@ -49,28 +47,27 @@ class Crawler(BaseComponent):

     def __init__(
         self,
-        output_dir: str,
         urls: Optional[List[str]] = None,
         crawler_depth: int = 1,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files=True,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text=True,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files=True,
+        file_path_meta_field_name: Optional[str] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
         webdriver_options: Optional[List[str]] = None,
     ):
         """
         Init object with basic params for crawling (can be overwritten later).

-        :param output_dir: Path for the directory to store files
         :param urls: List of http(s) address(es) (can also be supplied later when calling crawl())
         :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
                               0: Only initial list of urls
                               1: Follow links found on the initial URLs (but no further)
         :param filter_urls: Optional list of regular expressions that the crawled URLs must comply with.
                             All URLs not matching at least one of the regular expressions will be dropped.
-        :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
                              attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
                              not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -80,6 +77,9 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
                                   dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
                                   E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param output_dir: If provided, the crawled documents will be saved as JSON files in this directory.
+        :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
                                         By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
                                         E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link)
@@ -164,7 +164,6 @@ class Crawler(BaseComponent):
             logger.info("'chrome-driver' will be automatically installed.")
             self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
         self.urls = urls
-        self.output_dir = output_dir
         self.crawler_depth = crawler_depth
         self.filter_urls = filter_urls
         self.overwrite_existing_files = overwrite_existing_files
@@ -172,22 +171,25 @@ class Crawler(BaseComponent):
         self.extract_hidden_text = extract_hidden_text
         self.loading_wait_time = loading_wait_time
         self.crawler_naming_function = crawler_naming_function
+        self.output_dir = output_dir
+        self.file_path_meta_field_name = file_path_meta_field_name

     def __del__(self):
         self.driver.quit()

     def crawl(
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = None,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
+        file_path_meta_field_name: Optional[str] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> List[Path]:
+    ) -> List[Document]:
         """
         Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
         file per URL, including text and basic meta data).
@@ -195,7 +197,6 @@ class Crawler(BaseComponent):
         All parameters are optional here and only meant to overwrite instance attributes at runtime.
         If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.

-        :param output_dir: Path for the directory to store files
         :param urls: List of http addresses or single http address
         :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
                               0: Only initial list of urls
@@ -210,6 +211,8 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
                                   dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
                                   E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param output_dir: If provided, the crawled documents will be saved as JSON files in this directory.
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
                                         By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
                                         E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link)
@@ -217,7 +220,7 @@ class Crawler(BaseComponent):
                                         2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
                                         This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.

-        :return: List of paths where the crawled webpages got stored
+        :return: List of Documents that were created during crawling
        """
        # use passed params or fallback to instance attributes
        if id_hash_keys is None:
@@ -236,42 +239,57 @@ class Crawler(BaseComponent):
             extract_hidden_text = self.extract_hidden_text
         if loading_wait_time is None:
             loading_wait_time = self.loading_wait_time
+        if file_path_meta_field_name is None:
+            file_path_meta_field_name = self.file_path_meta_field_name
         if crawler_naming_function is None:
             crawler_naming_function = self.crawler_naming_function

-        output_dir = Path(output_dir)
-        if not output_dir.exists():
-            output_dir.mkdir(parents=True)
-
-        file_paths: list = []
-
-        is_not_empty = len(list(output_dir.rglob("*"))) > 0
-        if is_not_empty and not overwrite_existing_files:
-            logger.info(
-                "Found data stored in `%s`. Delete this first if you really want to fetch new data.", output_dir
-            )
-        else:
-            logger.info("Fetching from %s to `%s`", urls, output_dir)
-
-            # Start by writing out the initial list of urls
-            if filter_urls:
-                pattern = re.compile("|".join(filter_urls))
-                for url in urls:
-                    if pattern.search(url):
-                        file_paths += self._write_to_files(
-                            [url],
-                            output_dir=output_dir,
-                            extract_hidden_text=extract_hidden_text,
-                            loading_wait_time=loading_wait_time,
-                            crawler_naming_function=crawler_naming_function,
-                        )
-            else:
-                file_paths += self._write_to_files(
-                    urls,
-                    output_dir=output_dir,
-                    extract_hidden_text=extract_hidden_text,
-                    loading_wait_time=loading_wait_time,
-                    crawler_naming_function=crawler_naming_function,
-                )
-            # follow one level of sublinks if requested
-            if crawler_depth == 1:
-                sub_links: Dict[str, List] = {}
+        if isinstance(output_dir, str):
+            output_dir = Path(output_dir)
+
+        if output_dir:
+            if not output_dir.exists():
+                output_dir.mkdir(parents=True)
+
+            is_not_empty = len(list(output_dir.rglob("*"))) > 0
+            if is_not_empty and not overwrite_existing_files:
+                logger.warning(
+                    "Found data stored in `%s`. Use an empty folder or set `overwrite_existing_files=True`, "
+                    "if you want to overwrite any already present saved files.",
+                    output_dir,
+                )
+            else:
+                logger.info("Fetching from %s to `%s`", urls, output_dir)
+
+        documents: List[Document] = []
+
+        # Start by crawling the initial list of urls
+        if filter_urls:
+            pattern = re.compile("|".join(filter_urls))
+            for url in urls:
+                if pattern.search(url):
+                    documents += self._crawl_urls(
+                        [url],
+                        extract_hidden_text=extract_hidden_text,
+                        loading_wait_time=loading_wait_time,
+                        id_hash_keys=id_hash_keys,
+                        output_dir=output_dir,
+                        overwrite_existing_files=overwrite_existing_files,
+                        file_path_meta_field_name=file_path_meta_field_name,
+                        crawler_naming_function=crawler_naming_function,
+                    )
+        else:
+            documents += self._crawl_urls(
+                urls,
+                extract_hidden_text=extract_hidden_text,
+                loading_wait_time=loading_wait_time,
+                id_hash_keys=id_hash_keys,
+                output_dir=output_dir,
+                overwrite_existing_files=overwrite_existing_files,
+                file_path_meta_field_name=file_path_meta_field_name,
+                crawler_naming_function=crawler_naming_function,
+            )
+
+        # follow one level of sublinks if requested
+        if crawler_depth == 1:
+            sub_links: Dict[str, List] = {}
@@ -286,31 +304,97 @@ class Crawler(BaseComponent):
-                        )
-                    )
-                for url, extracted_sublink in sub_links.items():
-                    file_paths += self._write_to_files(
-                        extracted_sublink,
-                        output_dir=output_dir,
-                        base_url=url,
-                        id_hash_keys=id_hash_keys,
-                        extract_hidden_text=extract_hidden_text,
-                        loading_wait_time=loading_wait_time,
-                        crawler_naming_function=crawler_naming_function,
-                    )
+                    )
+                )
+            for url, extracted_sublink in sub_links.items():
+                documents += self._crawl_urls(
+                    extracted_sublink,
+                    base_url=url,
+                    extract_hidden_text=extract_hidden_text,
+                    loading_wait_time=loading_wait_time,
+                    id_hash_keys=id_hash_keys,
+                    output_dir=output_dir,
+                    overwrite_existing_files=overwrite_existing_files,
+                    file_path_meta_field_name=file_path_meta_field_name,
+                    crawler_naming_function=crawler_naming_function,
+                )

-        return file_paths
+        return documents

-    def _write_to_files(
-        self,
-        urls: List[str],
-        output_dir: Path,
-        extract_hidden_text: bool,
-        base_url: Optional[str] = None,
-        id_hash_keys: Optional[List[str]] = None,
-        loading_wait_time: Optional[int] = None,
-        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> List[Path]:
-        paths = []
+    def _create_document(
+        self, url: str, text: str, base_url: Optional[str] = None, id_hash_keys: Optional[List[str]] = None
+    ) -> Document:
+        """
+        Create a Document object from the given url and text.
+
+        :param url: The current url of the webpage.
+        :param text: The text content of the webpage.
+        :param base_url: The original url where we started to crawl.
+        :param id_hash_keys: The fields that should be used to generate the document id.
+        """
+        data: Dict[str, Any] = {}
+        data["meta"] = {"url": url}
+        if base_url:
+            data["meta"]["base_url"] = base_url
+        data["content"] = text
+        if id_hash_keys:
+            data["id_hash_keys"] = id_hash_keys
+        return Document.from_dict(data)
+
+    def _write_file(
+        self,
+        document: Document,
+        output_dir: Path,
+        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
+        overwrite_existing_files: Optional[bool] = None,
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> Path:
+        url = document.meta["url"]
+        if crawler_naming_function is not None:
+            file_name_prefix = crawler_naming_function(url, document.content)  # type: ignore
+        else:
+            file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", url[:129])
+            file_name_hash = hashlib.md5(f"{url}".encode("utf-8")).hexdigest()
+            file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
+
+        file_path = output_dir / f"{file_name_prefix}.json"
+
+        if file_path_meta_field_name:
+            document.meta[file_path_meta_field_name] = str(file_path)
+
+        try:
+            if overwrite_existing_files or not file_path.exists():
+                with open(file_path, "w", encoding="utf-8") as f:
+                    json.dump(document.to_dict(), f)
+            else:
+                logging.debug(
+                    "File '%s' already exists. Set 'overwrite_existing_files=True' to overwrite it.", file_path
+                )
+        except Exception:
+            logging.exception(
+                "Crawler can't save the content of '%s' under '%s'. "
+                "This webpage will be skipped, but links from this page will still be crawled. "
+                "Make sure the path above is accessible and the file name is valid. "
+                "If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
+                url,
+                file_path,
+            )
+
+        return file_path
+
+    def _crawl_urls(
+        self,
+        urls: List[str],
+        extract_hidden_text: bool,
+        base_url: Optional[str] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        loading_wait_time: Optional[int] = None,
+        overwrite_existing_files: Optional[bool] = False,
+        output_dir: Optional[Path] = None,
+        crawler_naming_function: Optional[Callable[[str, str], str]] = None,
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> List[Document]:
+        documents: List[Document] = []
         for link in urls:
-            logger.info("writing contents from '%s'", link)
+            logger.info("Scraping contents from '%s'", link)
             self.driver.get(link)
             if loading_wait_time is not None:
                 time.sleep(loading_wait_time)
@@ -320,54 +404,37 @@ class Crawler(BaseComponent):
             else:
                 text = el.text

-            data: Dict[str, Any] = {}
-            data["meta"] = {"url": link}
-            if base_url:
-                data["meta"]["base_url"] = base_url
-            data["content"] = text
-            if id_hash_keys:
-                data["id_hash_keys"] = id_hash_keys
-            document = Document.from_dict(data)
-
-            if crawler_naming_function is not None:
-                file_name_prefix = crawler_naming_function(link, text)
-            else:
-                file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
-                file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
-                file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
-
-            file_path = output_dir / f"{file_name_prefix}.json"
-
-            try:
-                with open(file_path, "w", encoding="utf-8") as f:
-                    json.dump(document.to_dict(), f)
-            except Exception:
-                logging.exception(
-                    "Crawler can't save the content of '%s' under '%s'. "
-                    "This webpage will be skipped, but links from this page will still be crawled. "
-                    "Make sure the path above is accessible and the file name is valid. "
-                    "If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
-                    link,
-                    file_path,
-                )
-            paths.append(file_path)
-
-        return paths
+            document = self._create_document(url=link, text=text, base_url=base_url, id_hash_keys=id_hash_keys)
+
+            if output_dir:
+                file_path = self._write_file(
+                    document,
+                    output_dir,
+                    crawler_naming_function,
+                    file_path_meta_field_name=file_path_meta_field_name,
+                    overwrite_existing_files=overwrite_existing_files,
+                )
+                logger.debug("Saved content to '%s'", file_path)
+
+            documents.append(document)
+
+        logger.debug("Crawler results: %s Documents", len(documents))
+
+        return documents

     def run(  # type: ignore
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
-        return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = True,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
-    ) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
+        file_path_meta_field_name: Optional[str] = None,
+    ) -> Tuple[Dict[str, List[Document]], str]:
         """
         Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -389,6 +456,7 @@ class Crawler(BaseComponent):
         :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
                                   dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
                                   E.g. 2: Crawler will wait 2 seconds before scraping page
+        :param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
         :param crawler_naming_function: A function mapping the crawled page to a file name.
                                         By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
                                         E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link)
@@ -396,10 +464,10 @@ class Crawler(BaseComponent):
                                         2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
                                         This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.

-        :return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
+        :return: Tuple({"documents": List of Documents, ...}, Name of output edge)
         """
-        file_paths = self.crawl(
+        documents = self.crawl(
             urls=urls,
             output_dir=output_dir,
             crawler_depth=crawler_depth,
@@ -407,34 +475,26 @@ class Crawler(BaseComponent):
             overwrite_existing_files=overwrite_existing_files,
             extract_hidden_text=extract_hidden_text,
             loading_wait_time=loading_wait_time,
+            id_hash_keys=id_hash_keys,
+            file_path_meta_field_name=file_path_meta_field_name,
             crawler_naming_function=crawler_naming_function,
         )
-        results: Dict[str, Union[List[Document], List[Path]]] = {}
-        if return_documents:
-            crawled_data = []
-            for _file in file_paths:
-                with open(_file.absolute(), "r") as read_file:
-                    document = json.load(read_file)
-                    document["id_hash_keys"] = id_hash_keys
-                    crawled_data.append(Document.from_dict(document))
-            results = {"documents": crawled_data}
-        else:
-            results = {"paths": file_paths}
+        results = {"documents": documents}

         return results, "output_1"

     def run_batch(  # type: ignore
         self,
-        output_dir: Union[str, Path, None] = None,
         urls: Optional[List[str]] = None,
         crawler_depth: Optional[int] = None,
         filter_urls: Optional[List] = None,
-        overwrite_existing_files: Optional[bool] = None,
-        return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = True,
         loading_wait_time: Optional[int] = None,
+        output_dir: Union[str, Path, None] = None,
+        overwrite_existing_files: Optional[bool] = None,
         crawler_naming_function: Optional[Callable[[str, str], str]] = None,
+        file_path_meta_field_name: Optional[str] = None,
     ):
         return self.run(
             output_dir=output_dir,
@@ -442,11 +502,11 @@ class Crawler(BaseComponent):
             crawler_depth=crawler_depth,
             filter_urls=filter_urls,
             overwrite_existing_files=overwrite_existing_files,
-            return_documents=return_documents,
             id_hash_keys=id_hash_keys,
             extract_hidden_text=extract_hidden_text,
             loading_wait_time=loading_wait_time,
             crawler_naming_function=crawler_naming_function,
+            file_path_meta_field_name=file_path_meta_field_name,
         )

     @staticmethod
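
With `run()` now keyed on "documents", downstream pipeline nodes receive Documents directly; the separate "paths" output and the `return_documents` flag are gone. A rough sketch of the node-level contract after this change (the URL and directory name are illustrative):

    crawler = Crawler(output_dir="crawled_files", file_path_meta_field_name="file_path")
    output, edge = crawler.run(urls=["https://haystack.deepset.ai/"], crawler_depth=0)
    assert edge == "output_1"
    documents = output["documents"]  # Documents, not file paths; saved paths live in doc.meta["file_path"]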

Crawler tests

@@ -7,10 +7,10 @@ import hashlib
 import os

 import pytest
 from selenium.webdriver.common.by import By

-from haystack.nodes.connector import Crawler
+from haystack.nodes.connector.crawler import Crawler
 from haystack.schema import Document

 from ..conftest import SAMPLES_PATH
@@ -64,12 +64,15 @@ def test_crawler(tmp_path):
     tmp_dir = tmp_path
     url = ["https://haystack.deepset.ai/"]

-    crawler = Crawler(output_dir=tmp_dir)
-    docs_path = crawler.crawl(urls=url, crawler_depth=0)
-    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
-    documents = results["documents"]
+    crawler = Crawler(output_dir=tmp_dir, file_path_meta_field_name="file_path")

-    for json_file, document in zip(docs_path, documents):
+    documents = crawler.crawl(urls=url, crawler_depth=0)
+    docs_path = [Path(doc.meta["file_path"]) for doc in documents]
+
+    results, _ = crawler.run(urls=url, crawler_depth=0)
+    docs_result = results["documents"]
+
+    for json_file, document in zip(docs_path, docs_result):
         assert isinstance(json_file, Path)
         assert isinstance(document, Document)
@@ -85,42 +88,45 @@ def test_crawler(tmp_path):


 def test_crawler_url_none_exception(tmp_path):
-    crawler = Crawler(tmp_path)
+    crawler = Crawler()
     with pytest.raises(ValueError):
         crawler.crawl()


 def test_crawler_depth_0_single_url(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    crawler = Crawler(output_dir=tmp_path, crawler_depth=0, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"])
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])


 def test_crawler_depth_0_many_urls(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
     _urls = [test_url + "/index.html", test_url + "/page1.html"]
-    paths = crawler.crawl(urls=_urls, crawler_depth=0)
-    assert len(paths) == 2
+    documents = crawler.crawl(urls=_urls, crawler_depth=0)
+    assert len(documents) == 2
+    paths = [doc.meta["file_path"] for doc in documents]
     assert content_in_results(crawler, test_url + "/index.html", paths)
     assert content_in_results(crawler, test_url + "/page1.html", paths)


 def test_crawler_depth_1_single_url(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
-    assert len(paths) == 3
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
+    assert len(documents) == 3
+    paths = [doc.meta["file_path"] for doc in documents]
     assert content_in_results(crawler, test_url + "/index.html", paths)
     assert content_in_results(crawler, test_url + "/page1.html", paths)
     assert content_in_results(crawler, test_url + "/page2.html", paths)


 def test_crawler_output_file_structure(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    path = Path(documents[0].meta["file_path"])
+    assert content_match(crawler, test_url + "/index.html", path)

-    with open(paths[0].absolute(), "r") as doc_file:
+    with open(path.absolute(), "r") as doc_file:
         data = json.load(doc_file)
         assert "content" in data
         assert "meta" in data
@@ -129,52 +135,40 @@ def test_crawler_output_file_structure(test_url, tmp_path):


 def test_crawler_filter_urls(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")

-    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/index.html", paths[0])
+    documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])

     # Note: filter_urls can exclude pages listed in `urls` as well
-    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
-    assert len(paths) == 1
-    assert content_match(crawler, test_url + "/page1.html", paths[0])
+    documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
+    assert len(documents) == 1
+    assert content_match(crawler, test_url + "/page1.html", documents[0].meta["file_path"])

     assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)


-def test_crawler_return_document(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
-    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
-    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
-
-    for path, document in zip(paths["paths"], documents["documents"]):
-        with open(path.absolute(), "r") as doc_file:
-            file_content = json.load(doc_file)
-            assert file_content["meta"] == document.meta
-            assert file_content["content"] == document.content
-
-
 def test_crawler_extract_hidden_text(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path)
-    documents, _ = crawler.run(
-        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
-    )
+    documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0)
     crawled_content = documents["documents"][0].content
     assert "hidden text" in crawled_content

-    documents, _ = crawler.run(
-        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
-    )
+    documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0)
     crawled_content = documents["documents"][0].content
     assert "hidden text" not in crawled_content


 def test_crawler_loading_wait_time(test_url, tmp_path):
     loading_wait_time = 3
-    crawler = Crawler(output_dir=tmp_path)
-    paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
+    documents = crawler.crawl(
+        urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time
+    )

-    assert len(paths) == 4
+    assert len(documents) == 4
+    paths = [doc.meta["file_path"] for doc in documents]

     with open(f"{SAMPLES_PATH.absolute()}/crawler/page_dynamic_result.txt", "r") as dynamic_result:
         dynamic_result_text = dynamic_result.readlines()
@@ -196,29 +190,46 @@ def test_crawler_loading_wait_time(test_url, tmp_path):


 def test_crawler_default_naming_function(test_url, tmp_path):
-    crawler = Crawler(output_dir=tmp_path)
+    crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")

     link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html"
     file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
     file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
     expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json"

-    paths = crawler.crawl(urls=[link], crawler_depth=0)
+    documents = crawler.crawl(urls=[link], crawler_depth=0)

-    assert os.path.exists(paths[0])
-    assert paths[0] == Path(expected_crawled_file_path)
+    path = Path(documents[0].meta["file_path"])
+    assert os.path.exists(path)
+    assert path == Path(expected_crawled_file_path)


 def test_crawler_naming_function(test_url, tmp_path):
     crawler = Crawler(
-        output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link)
+        output_dir=tmp_path,
+        file_path_meta_field_name="file_path",
+        crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link),
     )

     link = f"{test_url}/page_dynamic.html"
     file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link)
     expected_crawled_file_path = tmp_path / f"{file_name_link}.json"

-    paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
-
-    assert os.path.exists(paths[0])
-    assert paths[0] == expected_crawled_file_path
+    documents = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
+    path = Path(documents[0].meta["file_path"])
+    assert os.path.exists(path)
+    assert path == expected_crawled_file_path
+
+
+def test_crawler_not_save_file(test_url):
+    crawler = Crawler()
+    documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert documents[0].meta.get("file_path", None) is None
+
+
+def test_crawler_custom_meta_file_path_name(test_url, tmp_path):
+    crawler = Crawler()
+    documents = crawler.crawl(
+        urls=[test_url + "/index.html"], crawler_depth=0, output_dir=tmp_path, file_path_meta_field_name="custom"
+    )
+    assert documents[0].meta.get("custom", None) is not None
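
For context, this is roughly how the node can now sit at the start of an indexing pipeline, since it emits Documents that downstream nodes consume directly. The wiring below is an illustrative sketch, not part of this diff; the node names, the default PreProcessor, the InMemoryDocumentStore, and the `inputs=["File"]` root-node convention are assumptions:

    from haystack import Pipeline
    from haystack.document_stores import InMemoryDocumentStore
    from haystack.nodes import PreProcessor
    from haystack.nodes.connector.crawler import Crawler

    document_store = InMemoryDocumentStore()
    crawler = Crawler(urls=["https://haystack.deepset.ai/"], crawler_depth=0)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_node(component=crawler, name="Crawler", inputs=["File"])
    indexing_pipeline.add_node(component=PreProcessor(), name="PreProcessor", inputs=["Crawler"])
    indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
    indexing_pipeline.run()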