haystack/test/nodes/test_connector.py

from typing import List

import json
from pathlib import Path

import pytest

from haystack.nodes.connector import Crawler
from haystack.schema import Document

from ..conftest import SAMPLES_PATH


@pytest.fixture(scope="session")
def test_url():
    return f"file://{SAMPLES_PATH.absolute()}/crawler"


def content_match(crawler: Crawler, url: str, crawled_page: Path):
    """
    :param crawler: the tested Crawler object
    :param url: the URL of the expected page
    :param crawled_page: the output of Crawler (one element of the paths list)
    """
    crawler.driver.get(url)
    body = crawler.driver.find_element_by_tag_name("body")

    if crawler.extract_hidden_text:
        expected_crawled_content = body.get_attribute("textContent")
    else:
        expected_crawled_content = body.text

    with open(crawled_page, "r") as crawled_file:
        page_data = json.load(crawled_file)
        return page_data["content"] == expected_crawled_content


def content_in_results(crawler: Crawler, url: str, results: List[Path], expected_matches_count=1):
    """
    Makes sure there is exactly one matching page in the list of pages returned
    by the crawler.

    :param crawler: the tested Crawler object
    :param url: the URL of the page to find in the results
    :param results: the crawler's output (list of paths)
    :param expected_matches_count: how many copies of this page should be present in the results (default 1)
    """
    return sum(content_match(crawler, url, path) for path in results) == expected_matches_count


#
# Integration
#


@pytest.mark.integration
def test_crawler(tmp_path):
    tmp_dir = tmp_path
    url = ["https://haystack.deepset.ai/"]

    crawler = Crawler(output_dir=tmp_dir)
    docs_path = crawler.crawl(urls=url, crawler_depth=0)
    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
    documents = results["documents"]

    for json_file, document in zip(docs_path, documents):
        assert isinstance(json_file, Path)
        assert isinstance(document, Document)

        with open(json_file.absolute(), "r") as read_file:
            file_content = json.load(read_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content


#
# Unit tests
#


def test_crawler_url_none_exception(tmp_path):
    crawler = Crawler(tmp_path)
    with pytest.raises(ValueError):
        crawler.crawl()


def test_crawler_depth_0_single_url(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/index.html", paths[0])


def test_crawler_depth_0_many_urls(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    _urls = [test_url + "/index.html", test_url + "/page1.html"]
    paths = crawler.crawl(urls=_urls, crawler_depth=0)
    assert len(paths) == 2
    assert content_in_results(crawler, test_url + "/index.html", paths)
    assert content_in_results(crawler, test_url + "/page1.html", paths)


def test_crawler_depth_1_single_url(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
    assert len(paths) == 3
    assert content_in_results(crawler, test_url + "/index.html", paths)
    assert content_in_results(crawler, test_url + "/page1.html", paths)
    assert content_in_results(crawler, test_url + "/page2.html", paths)


def test_crawler_output_file_structure(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
    assert content_match(crawler, test_url + "/index.html", paths[0])

    with open(paths[0].absolute(), "r") as doc_file:
        data = json.load(doc_file)
        assert "content" in data
        assert "meta" in data
        assert isinstance(data["content"], str)
        assert len(data["content"].split()) > 2


def test_crawler_filter_urls(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)

    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/index.html", paths[0])

    # Note: filter_urls can exclude pages listed in `urls` as well
    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/page1.html", paths[0])
    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)


def test_crawler_return_document(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)

    for path, document in zip(paths["paths"], documents["documents"]):
        with open(path.absolute(), "r") as doc_file:
            file_content = json.load(doc_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content


def test_crawler_extract_hidden_text(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    documents, _ = crawler.run(
        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0, return_documents=True
    )
    crawled_content = documents["documents"][0].content
    assert "hidden text" in crawled_content

    documents, _ = crawler.run(
        urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=False, crawler_depth=0, return_documents=True
    )
    crawled_content = documents["documents"][0].content
    assert "hidden text" not in crawled_content