feat!: Increase Crawler standardization regarding Pipelines (#4122)

* feat!(Crawler): Integrate Crawler in the Pipeline.

+Output Documents
+Optional file saving
+Optional Document meta field with the saved file path (see the usage sketch after the commit message)

* refactor: add Optional decl.

* chore: dummy commit

* chore: dummy commit

* refactor: improve overwrite flow

* refactor: change custom file path meta logic + add test

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

* Update haystack/nodes/connector/crawler.py

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
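
A minimal usage sketch of the reworked API, assuming the signatures shown in the diff below; the URL matches the updated tests, the output directory name is illustrative, and Chrome must be available (the chromedriver is installed automatically via webdriver-manager).

from haystack.nodes.connector.crawler import Crawler

crawler = Crawler(
    urls=["https://haystack.deepset.ai/"],   # same URL as in the updated tests
    crawler_depth=0,
    output_dir="crawled_files",              # now optional: save one JSON file per page (illustrative name)
    file_path_meta_field_name="file_path",   # optional: store the saved path in Document.meta
)

# crawl() now returns a List[Document] instead of a List[Path]
documents = crawler.crawl()
print(documents[0].meta.get("file_path"))    # set only because output_dir was provided

# run() is what a Pipeline invokes; it now returns {"documents": [...]} on edge "output_1"
results, edge = crawler.run(urls=["https://haystack.deepset.ai/"], crawler_depth=0)
assert edge == "output_1"
pipeline_documents = results["documents"]
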
Daniel Bichuetti authored on 2023-02-22 13:34:19 -03:00, committed by GitHub
parent 49ed21b82d
commit e0b0fe1bc3
2 changed files with 257 additions and 186 deletions

File: haystack/nodes/connector/crawler.py

@@ -1,31 +1,29 @@
from typing import Callable, List, Optional, Dict, Tuple, Union, Any
import hashlib
import json
import logging
import os
import re
import sys
import json
import time
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse
import hashlib
try:
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
except (ImportError, ModuleNotFoundError) as ie:
from haystack.utils.import_utils import _optional_component_not_installed
_optional_component_not_installed(__name__, "crawler", ie)
from haystack.errors import NodeError
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from haystack.errors import NodeError
logger = logging.getLogger(__name__)
@@ -49,28 +47,27 @@ class Crawler(BaseComponent):
def __init__(
self,
output_dir: str,
urls: Optional[List[str]] = None,
crawler_depth: int = 1,
filter_urls: Optional[List] = None,
overwrite_existing_files=True,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
output_dir: Union[str, Path, None] = None,
overwrite_existing_files=True,
file_path_meta_field_name: Optional[str] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
webdriver_options: Optional[List[str]] = None,
):
"""
Init object with basic params for crawling (can be overwritten later).
:param output_dir: Path for the directory to store files
:param urls: List of http(s) address(es) (can also be supplied later when calling crawl())
:param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
0: Only initial list of urls
1: Follow links found on the initial URLs (but no further)
:param filter_urls: Optional list of regular expressions that the crawled URLs must comply with.
All URLs not matching at least one of the regular expressions will be dropped.
:param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -80,6 +77,9 @@ class Crawler(BaseComponent):
:param loading_wait_time: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulations. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler will wait 2 seconds before scraping the page
:param output_dir: If provided, the crawled documents will be saved as JSON files in this directory.
:param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
:param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
@@ -164,7 +164,6 @@ class Crawler(BaseComponent):
logger.info("'chrome-driver' will be automatically installed.")
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
self.urls = urls
self.output_dir = output_dir
self.crawler_depth = crawler_depth
self.filter_urls = filter_urls
self.overwrite_existing_files = overwrite_existing_files
@@ -172,22 +171,25 @@ class Crawler(BaseComponent):
self.extract_hidden_text = extract_hidden_text
self.loading_wait_time = loading_wait_time
self.crawler_naming_function = crawler_naming_function
self.output_dir = output_dir
self.file_path_meta_field_name = file_path_meta_field_name
def __del__(self):
self.driver.quit()
def crawl(
self,
output_dir: Union[str, Path, None] = None,
urls: Optional[List[str]] = None,
crawler_depth: Optional[int] = None,
filter_urls: Optional[List] = None,
overwrite_existing_files: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
loading_wait_time: Optional[int] = None,
output_dir: Union[str, Path, None] = None,
overwrite_existing_files: Optional[bool] = None,
file_path_meta_field_name: Optional[str] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> List[Path]:
) -> List[Document]:
"""
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
file per URL, including text and basic metadata).
@@ -195,7 +197,6 @@ class Crawler(BaseComponent):
All parameters are optional here and only meant to overwrite instance attributes at runtime.
If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.
:param output_dir: Path for the directory to store files
:param urls: List of http addresses or single http address
:param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
0: Only initial list of urls
@@ -210,6 +211,8 @@ class Crawler(BaseComponent):
:param loading_wait_time: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulations. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler will wait 2 seconds before scraping the page
:param output_dir: If provided, the crawled documents will be saved as JSON files in this directory.
:param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
@@ -217,7 +220,7 @@ class Crawler(BaseComponent):
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
:return: List of paths where the crawled webpages got stored
:return: List of Documents that were created during crawling
"""
# use passed params or fallback to instance attributes
if id_hash_keys is None:
@@ -236,81 +239,162 @@ class Crawler(BaseComponent):
extract_hidden_text = self.extract_hidden_text
if loading_wait_time is None:
loading_wait_time = self.loading_wait_time
if file_path_meta_field_name is None:
file_path_meta_field_name = self.file_path_meta_field_name
if crawler_naming_function is None:
crawler_naming_function = self.crawler_naming_function
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir(parents=True)
if isinstance(output_dir, str):
output_dir = Path(output_dir)
file_paths: list = []
is_not_empty = len(list(output_dir.rglob("*"))) > 0
if is_not_empty and not overwrite_existing_files:
logger.info(
"Found data stored in `%s`. Delete this first if you really want to fetch new data.", output_dir
)
else:
logger.info("Fetching from %s to `%s`", urls, output_dir)
if output_dir:
if not output_dir.exists():
output_dir.mkdir(parents=True)
# Start by writing out the initial list of urls
if filter_urls:
pattern = re.compile("|".join(filter_urls))
for url in urls:
if pattern.search(url):
file_paths += self._write_to_files(
[url],
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
)
else:
file_paths += self._write_to_files(
urls,
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
is_not_empty = len(list(output_dir.rglob("*"))) > 0
if is_not_empty and not overwrite_existing_files:
logger.warning(
"Found data stored in `%s`. Use an empty folder or set `overwrite_existing_files=True`, "
"if you want to overwrite any already present saved files.",
output_dir,
)
# follow one level of sublinks if requested
if crawler_depth == 1:
sub_links: Dict[str, List] = {}
for url_ in urls:
already_found_links: List = list(sum(list(sub_links.values()), []))
sub_links[url_] = list(
self._extract_sublinks_from_url(
base_url=url_,
filter_urls=filter_urls,
already_found_links=already_found_links,
loading_wait_time=loading_wait_time,
)
)
for url, extracted_sublink in sub_links.items():
file_paths += self._write_to_files(
extracted_sublink,
output_dir=output_dir,
base_url=url,
id_hash_keys=id_hash_keys,
else:
logger.info("Fetching from %s to `%s`", urls, output_dir)
documents: List[Document] = []
# Start by crawling the initial list of urls
if filter_urls:
pattern = re.compile("|".join(filter_urls))
for url in urls:
if pattern.search(url):
documents += self._crawl_urls(
[url],
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
id_hash_keys=id_hash_keys,
output_dir=output_dir,
overwrite_existing_files=overwrite_existing_files,
file_path_meta_field_name=file_path_meta_field_name,
crawler_naming_function=crawler_naming_function,
)
else:
documents += self._crawl_urls(
urls,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
id_hash_keys=id_hash_keys,
output_dir=output_dir,
overwrite_existing_files=overwrite_existing_files,
file_path_meta_field_name=file_path_meta_field_name,
crawler_naming_function=crawler_naming_function,
)
return file_paths
# follow one level of sublinks if requested
if crawler_depth == 1:
sub_links: Dict[str, List] = {}
for url_ in urls:
already_found_links: List = list(sum(list(sub_links.values()), []))
sub_links[url_] = list(
self._extract_sublinks_from_url(
base_url=url_,
filter_urls=filter_urls,
already_found_links=already_found_links,
loading_wait_time=loading_wait_time,
)
)
for url, extracted_sublink in sub_links.items():
documents += self._crawl_urls(
extracted_sublink,
base_url=url,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
id_hash_keys=id_hash_keys,
output_dir=output_dir,
overwrite_existing_files=overwrite_existing_files,
file_path_meta_field_name=file_path_meta_field_name,
crawler_naming_function=crawler_naming_function,
)
def _write_to_files(
return documents
def _create_document(
self, url: str, text: str, base_url: Optional[str] = None, id_hash_keys: Optional[List[str]] = None
) -> Document:
"""
Create a Document object from the given url and text.
:param url: The current url of the webpage.
:param text: The text content of the webpage.
:param base_url: The original url where we started to crawl.
:param id_hash_keys: The fields that should be used to generate the document id.
"""
data: Dict[str, Any] = {}
data["meta"] = {"url": url}
if base_url:
data["meta"]["base_url"] = base_url
data["content"] = text
if id_hash_keys:
data["id_hash_keys"] = id_hash_keys
return Document.from_dict(data)
def _write_file(
self,
document: Document,
output_dir: Path,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
overwrite_existing_files: Optional[bool] = None,
file_path_meta_field_name: Optional[str] = None,
) -> Path:
url = document.meta["url"]
if crawler_naming_function is not None:
file_name_prefix = crawler_naming_function(url, document.content) # type: ignore
else:
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", url[:129])
file_name_hash = hashlib.md5(f"{url}".encode("utf-8")).hexdigest()
file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
file_path = output_dir / f"{file_name_prefix}.json"
if file_path_meta_field_name:
document.meta[file_path_meta_field_name] = str(file_path)
try:
if overwrite_existing_files or not file_path.exists():
with open(file_path, "w", encoding="utf-8") as f:
json.dump(document.to_dict(), f)
else:
logging.debug(
"File '%s' already exists. Set 'overwrite_existing_files=True' to overwrite it.", file_path
)
except Exception:
logging.exception(
"Crawler can't save the content of '%s' under '%s'. "
"This webpage will be skipped, but links from this page will still be crawled. "
"Make sure the path above is accessible and the file name is valid. "
"If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
url,
file_path,
)
return file_path
def _crawl_urls(
self,
urls: List[str],
output_dir: Path,
extract_hidden_text: bool,
base_url: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None,
loading_wait_time: Optional[int] = None,
overwrite_existing_files: Optional[bool] = False,
output_dir: Optional[Path] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> List[Path]:
paths = []
file_path_meta_field_name: Optional[str] = None,
) -> List[Document]:
documents: List[Document] = []
for link in urls:
logger.info("writing contents from '%s'", link)
logger.info("Scraping contents from '%s'", link)
self.driver.get(link)
if loading_wait_time is not None:
time.sleep(loading_wait_time)
@@ -320,54 +404,37 @@ class Crawler(BaseComponent):
else:
text = el.text
data: Dict[str, Any] = {}
data["meta"] = {"url": link}
if base_url:
data["meta"]["base_url"] = base_url
data["content"] = text
if id_hash_keys:
data["id_hash_keys"] = id_hash_keys
document = Document.from_dict(data)
document = self._create_document(url=link, text=text, base_url=base_url, id_hash_keys=id_hash_keys)
if crawler_naming_function is not None:
file_name_prefix = crawler_naming_function(link, text)
else:
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
file_name_prefix = f"{file_name_link}_{file_name_hash[-6:]}"
file_path = output_dir / f"{file_name_prefix}.json"
try:
with open(file_path, "w", encoding="utf-8") as f:
json.dump(document.to_dict(), f)
except Exception:
logging.exception(
"Crawler can't save the content of '%s' under '%s'. "
"This webpage will be skipped, but links from this page will still be crawled. "
"Make sure the path above is accessible and the file name is valid. "
"If the file name is invalid, consider setting 'crawler_naming_function' to another function.",
link,
file_path,
if output_dir:
file_path = self._write_file(
document,
output_dir,
crawler_naming_function,
file_path_meta_field_name=file_path_meta_field_name,
overwrite_existing_files=overwrite_existing_files,
)
logger.debug("Saved content to '%s'", file_path)
paths.append(file_path)
documents.append(document)
return paths
logger.debug("Crawler results: %s Documents", len(documents))
return documents
def run( # type: ignore
self,
output_dir: Union[str, Path, None] = None,
urls: Optional[List[str]] = None,
crawler_depth: Optional[int] = None,
filter_urls: Optional[List] = None,
overwrite_existing_files: Optional[bool] = None,
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
output_dir: Union[str, Path, None] = None,
overwrite_existing_files: Optional[bool] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
file_path_meta_field_name: Optional[str] = None,
) -> Tuple[Dict[str, List[Document]], str]:
"""
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -389,6 +456,7 @@ class Crawler(BaseComponent):
:param loading_wait_time: Seconds to wait for the page to load before scraping. Recommended when the page relies on
dynamic DOM manipulations. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler will wait 2 seconds before scraping the page
:param file_path_meta_field_name: If provided, the file path will be stored in this meta field.
:param crawler_naming_function: A function mapping the crawled page to a file name.
By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
@@ -396,10 +464,10 @@ class Crawler(BaseComponent):
2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
:return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
:return: Tuple({"documents": List of Documents, ...}, Name of output edge)
"""
file_paths = self.crawl(
documents = self.crawl(
urls=urls,
output_dir=output_dir,
crawler_depth=crawler_depth,
@@ -407,34 +475,26 @@ class Crawler(BaseComponent):
overwrite_existing_files=overwrite_existing_files,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
id_hash_keys=id_hash_keys,
file_path_meta_field_name=file_path_meta_field_name,
crawler_naming_function=crawler_naming_function,
)
results: Dict[str, Union[List[Document], List[Path]]] = {}
if return_documents:
crawled_data = []
for _file in file_paths:
with open(_file.absolute(), "r") as read_file:
document = json.load(read_file)
document["id_hash_keys"] = id_hash_keys
crawled_data.append(Document.from_dict(document))
results = {"documents": crawled_data}
else:
results = {"paths": file_paths}
results = {"documents": documents}
return results, "output_1"
def run_batch( # type: ignore
self,
output_dir: Union[str, Path, None] = None,
urls: Optional[List[str]] = None,
crawler_depth: Optional[int] = None,
filter_urls: Optional[List] = None,
overwrite_existing_files: Optional[bool] = None,
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
output_dir: Union[str, Path, None] = None,
overwrite_existing_files: Optional[bool] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None,
file_path_meta_field_name: Optional[str] = None,
):
return self.run(
output_dir=output_dir,
@@ -442,11 +502,11 @@ class Crawler(BaseComponent):
crawler_depth=crawler_depth,
filter_urls=filter_urls,
overwrite_existing_files=overwrite_existing_files,
return_documents=return_documents,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
crawler_naming_function=crawler_naming_function,
file_path_meta_field_name=file_path_meta_field_name,
)
@staticmethod

File: Crawler test suite

@@ -7,10 +7,10 @@ import hashlib
import os
import pytest
from selenium.webdriver.common.by import By
from haystack.nodes.connector import Crawler
from haystack.nodes.connector.crawler import Crawler
from haystack.schema import Document
from ..conftest import SAMPLES_PATH
@@ -64,12 +64,15 @@ def test_crawler(tmp_path):
tmp_dir = tmp_path
url = ["https://haystack.deepset.ai/"]
crawler = Crawler(output_dir=tmp_dir)
docs_path = crawler.crawl(urls=url, crawler_depth=0)
results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
documents = results["documents"]
crawler = Crawler(output_dir=tmp_dir, file_path_meta_field_name="file_path")
for json_file, document in zip(docs_path, documents):
documents = crawler.crawl(urls=url, crawler_depth=0)
docs_path = [Path(doc.meta["file_path"]) for doc in documents]
results, _ = crawler.run(urls=url, crawler_depth=0)
docs_result = results["documents"]
for json_file, document in zip(docs_path, docs_result):
assert isinstance(json_file, Path)
assert isinstance(document, Document)
@@ -85,42 +88,45 @@ def test_crawler(tmp_path):
def test_crawler_url_none_exception(tmp_path):
crawler = Crawler(tmp_path)
crawler = Crawler()
with pytest.raises(ValueError):
crawler.crawl()
def test_crawler_depth_0_single_url(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
assert len(paths) == 1
assert content_match(crawler, test_url + "/index.html", paths[0])
crawler = Crawler(output_dir=tmp_path, crawler_depth=0, file_path_meta_field_name="file_path")
documents = crawler.crawl(urls=[test_url + "/index.html"])
assert len(documents) == 1
assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])
def test_crawler_depth_0_many_urls(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
_urls = [test_url + "/index.html", test_url + "/page1.html"]
paths = crawler.crawl(urls=_urls, crawler_depth=0)
assert len(paths) == 2
documents = crawler.crawl(urls=_urls, crawler_depth=0)
assert len(documents) == 2
paths = [doc.meta["file_path"] for doc in documents]
assert content_in_results(crawler, test_url + "/index.html", paths)
assert content_in_results(crawler, test_url + "/page1.html", paths)
def test_crawler_depth_1_single_url(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
assert len(paths) == 3
crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
assert len(documents) == 3
paths = [doc.meta["file_path"] for doc in documents]
assert content_in_results(crawler, test_url + "/index.html", paths)
assert content_in_results(crawler, test_url + "/page1.html", paths)
assert content_in_results(crawler, test_url + "/page2.html", paths)
def test_crawler_output_file_structure(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
assert content_match(crawler, test_url + "/index.html", paths[0])
crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
path = Path(documents[0].meta["file_path"])
assert content_match(crawler, test_url + "/index.html", path)
with open(paths[0].absolute(), "r") as doc_file:
with open(path.absolute(), "r") as doc_file:
data = json.load(doc_file)
assert "content" in data
assert "meta" in data
@@ -129,52 +135,40 @@ def test_crawler_output_file_structure(test_url, tmp_path):
def test_crawler_filter_urls(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
assert len(paths) == 1
assert content_match(crawler, test_url + "/index.html", paths[0])
documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
assert len(documents) == 1
assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])
# Note: filter_urls can exclude pages listed in `urls` as well
paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
assert len(paths) == 1
assert content_match(crawler, test_url + "/page1.html", paths[0])
documents = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
assert len(documents) == 1
assert content_match(crawler, test_url + "/page1.html", documents[0].meta["file_path"])
assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)
def test_crawler_return_document(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
for path, document in zip(paths["paths"], documents["documents"]):
with open(path.absolute(), "r") as doc_file:
file_content = json.load(doc_file)
assert file_content["meta"] == document.meta
assert file_content["content"] == document.content
def test_crawler_extract_hidden_text(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
documents, _ = crawler.run(
urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
)
documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0)
crawled_content = documents["documents"][0].content
assert "hidden text" in crawled_content
documents, _ = crawler.run(
urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
)
documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0)
crawled_content = documents["documents"][0].content
assert "hidden text" not in crawled_content
def test_crawler_loading_wait_time(test_url, tmp_path):
loading_wait_time = 3
crawler = Crawler(output_dir=tmp_path)
paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time)
crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
documents = crawler.crawl(
urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time
)
assert len(paths) == 4
assert len(documents) == 4
paths = [doc.meta["file_path"] for doc in documents]
with open(f"{SAMPLES_PATH.absolute()}/crawler/page_dynamic_result.txt", "r") as dynamic_result:
dynamic_result_text = dynamic_result.readlines()
@@ -196,29 +190,46 @@ def test_crawler_loading_wait_time(test_url, tmp_path):
def test_crawler_default_naming_function(test_url, tmp_path):
crawler = Crawler(output_dir=tmp_path)
crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
link = f"{test_url}/page_with_a_very_long_name_to_do_some_tests_Now_let's_add_some_text_just_to_pass_the_129_chars_mark_and_trigger_the_chars_limit_of_the_default_naming_function.html"
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link[:129])
file_name_hash = hashlib.md5(f"{link}".encode("utf-8")).hexdigest()
expected_crawled_file_path = f"{tmp_path}/{file_name_link}_{file_name_hash[-6:]}.json"
paths = crawler.crawl(urls=[link], crawler_depth=0)
documents = crawler.crawl(urls=[link], crawler_depth=0)
assert os.path.exists(paths[0])
assert paths[0] == Path(expected_crawled_file_path)
path = Path(documents[0].meta["file_path"])
assert os.path.exists(path)
assert path == Path(expected_crawled_file_path)
def test_crawler_naming_function(test_url, tmp_path):
crawler = Crawler(
output_dir=tmp_path, crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link)
output_dir=tmp_path,
file_path_meta_field_name="file_path",
crawler_naming_function=lambda link, text: re.sub("[<>:'/\\|?*\0 ]", "_", link),
)
link = f"{test_url}/page_dynamic.html"
file_name_link = re.sub("[<>:'/\\|?*\0 ]", "_", link)
expected_crawled_file_path = tmp_path / f"{file_name_link}.json"
paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
documents = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=0)
path = Path(documents[0].meta["file_path"])
assert os.path.exists(path)
assert path == expected_crawled_file_path
assert os.path.exists(paths[0])
assert paths[0] == expected_crawled_file_path
def test_crawler_not_save_file(test_url):
crawler = Crawler()
documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
assert documents[0].meta.get("file_path", None) is None
def test_crawler_custom_meta_file_path_name(test_url, tmp_path):
crawler = Crawler()
documents = crawler.crawl(
urls=[test_url + "/index.html"], crawler_depth=0, output_dir=tmp_path, file_path_meta_field_name="custom"
)
assert documents[0].meta.get("custom", None) is not None