from typing import List

import json
from pathlib import Path

import pytest
from selenium.webdriver.common.by import By

from haystack.nodes.connector import Crawler
from haystack.schema import Document

from ..conftest import SAMPLES_PATH
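

# The unit tests below crawl static sample pages under SAMPLES_PATH/crawler, served
# from the local filesystem via a file:// URL, so they need no network access.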
@pytest.fixture(scope="session")
def test_url():
    return f"file://{SAMPLES_PATH.absolute()}/crawler"


def content_match(crawler: Crawler, url: str, crawled_page: Path):
    """
    Check whether the content stored in `crawled_page` matches the content that
    Selenium renders for `url`.

    :param crawler: the tested Crawler object
    :param url: the URL of the expected page
    :param crawled_page: the output of Crawler (one element of the paths list)
    """
    crawler.driver.get(url)
    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")
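
    # `textContent` includes text from elements that are hidden via CSS, while
    # `element.text` returns only the visible text, so mirror the crawler's
    # extract_hidden_text setting when building the expected content.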
    if crawler.extract_hidden_text:
        expected_crawled_content = body.get_attribute("textContent")
    else:
        expected_crawled_content = body.text

    with open(crawled_page, "r") as crawled_file:
        page_data = json.load(crawled_file)
        return page_data["content"] == expected_crawled_content


def content_in_results(crawler: Crawler, url: str, results: List[Path], expected_matches_count=1):
    """
    Check whether the crawler's results contain exactly `expected_matches_count`
    pages whose content matches the page at `url`.

    :param crawler: the tested Crawler object
    :param url: the URL of the page to find in the results
    :param results: the crawler's output (list of paths)
    :param expected_matches_count: how many copies of this page should be present in the results (default 1)
    """
    return sum(content_match(crawler, url, path) for path in results) == expected_matches_count


#
# Integration
#
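

# This test crawls the live https://haystack.deepset.ai/ site, so it requires
# network access and runs only as part of the integration suite.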
@pytest.mark.integration
def test_crawler(tmp_path):
    url = ["https://haystack.deepset.ai/"]

    crawler = Crawler(output_dir=tmp_path)
    docs_path = crawler.crawl(urls=url, crawler_depth=0)
    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
    documents = results["documents"]

    for json_file, document in zip(docs_path, documents):
        assert isinstance(json_file, Path)
        assert isinstance(document, Document)

        with open(json_file.absolute(), "r") as read_file:
            file_content = json.load(read_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content


#
# Unit tests
#
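

# Calling crawl() without any URLs (none given to the constructor, none given to
# crawl() itself) should raise a ValueError.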
def test_crawler_url_none_exception(tmp_path):
    crawler = Crawler(tmp_path)
    with pytest.raises(ValueError):
        crawler.crawl()
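

# With crawler_depth=0 only the URLs passed in `urls` are fetched; links found on
# those pages are not followed.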
def test_crawler_depth_0_single_url(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/index.html", paths[0])


def test_crawler_depth_0_many_urls(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    _urls = [test_url + "/index.html", test_url + "/page1.html"]
    paths = crawler.crawl(urls=_urls, crawler_depth=0)
    assert len(paths) == 2
    assert content_in_results(crawler, test_url + "/index.html", paths)
    assert content_in_results(crawler, test_url + "/page1.html", paths)
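

# With crawler_depth=1 the crawler also follows links found on the start page one
# level deep, so index.html plus the two pages it links to are expected in the output.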
def test_crawler_depth_1_single_url(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
    assert len(paths) == 3
    assert content_in_results(crawler, test_url + "/index.html", paths)
    assert content_in_results(crawler, test_url + "/page1.html", paths)
    assert content_in_results(crawler, test_url + "/page2.html", paths)
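

# Each crawled page is persisted as a JSON file that contains at least a "content"
# field and a "meta" field.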
def test_crawler_output_file_structure(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
    assert content_match(crawler, test_url + "/index.html", paths[0])

    with open(paths[0].absolute(), "r") as doc_file:
        data = json.load(doc_file)
        assert "content" in data
        assert "meta" in data
        assert isinstance(data["content"], str)
        assert len(data["content"].split()) > 2
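

# filter_urls restricts which crawled pages are kept. The escaped dot in
# "google\\.com" below suggests the patterns are matched as regular expressions
# against the URLs; a pattern that matches nothing should yield no output at all.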
def test_crawler_filter_urls(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)

    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/index.html", paths[0])

    # Note: filter_urls can exclude pages listed in `urls` as well
    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/page1.html", paths[0])

    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\\.com"], crawler_depth=1)
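

# run() is the pipeline entry point: with return_documents=True the output dict
# carries Document objects under "documents", while return_documents=False returns
# the JSON file paths under "paths". Both variants should describe the same content.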
def test_crawler_return_document(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)

    for path, document in zip(paths["paths"], documents["documents"]):
        with open(path.absolute(), "r") as doc_file:
            file_content = json.load(doc_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content
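

# page_w_hidden_text.html contains text that is not visible on the rendered page;
# it should appear in the extracted content only when extract_hidden_text=True.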
def test_crawler_extract_hidden_text(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    documents, _ = crawler.run(
        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
    )
    crawled_content = documents["documents"][0].content
    assert "hidden text" in crawled_content

    documents, _ = crawler.run(
        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
    )
    crawled_content = documents["documents"][0].content
    assert "hidden text" not in crawled_content