mirror of https://github.com/deepset-ai/haystack.git (synced 2025-07-27 19:00:35 +00:00)

* Change return types of file converters
* Change return types of preprocessor
* Change return types of crawler
* Adapt utils functions to new return types
* Adapt __init__.py to new method names
* Prevent circular imports
* Update Documentation & Code Style
* Let DocStores' run method accept Documents
* Adapt tests to new return types
* Update Documentation & Code Style
* Put "# type: ignore" in the right place
* Remove id_hash_keys property from Document primitive
* Update Documentation & Code Style
* Adapt tests to new return types and missing id_hash_keys property
* Fix mypy
* Fix mypy
* Adapt PDFToTextOCRConverter
* Remove id_hash_keys from RestAPI tests
* Update Documentation & Code Style
* Rename tests
* Remove redundant setting of content_type="text"
* Add DeprecationWarning
* Add id_hash_keys to elasticsearch_index_to_document_store
* Change document type from dict to Document in PreProcessor test
* Fix file path in Tutorial 5
* Remove added output in Tutorial 5
* Update Documentation & Code Style
* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http
* Adapt tutorials to new return types
* Adapt tutorial 14 to new return types
* Update Documentation & Code Style
* Change assertions to HaystackErrors
* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
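The tests below exercise the crawler changes listed above. As a minimal usage sketch, inferred from the tests themselves rather than from the official docs: crawl() writes one JSON file per page and returns the file paths, while run(..., return_documents=True) returns Document objects directly.

from pathlib import Path

from haystack.nodes.connector import Crawler

crawler = Crawler(output_dir="crawled_files")

# crawl() -> list of Path objects, one JSON file per crawled page
paths = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"], crawler_depth=0)
assert all(isinstance(p, Path) for p in paths)

# run() returns a (results, edge) tuple, as the tuple unpacking in the tests
# suggests; with return_documents=True the results dict carries Document
# objects instead of file paths
results, _ = crawler.run(
    urls=["https://haystack.deepset.ai/overview/get-started"],
    crawler_depth=0,
    return_documents=True,
)
documents = results["documents"]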
124 lines · 4.4 KiB · Python
import json
from pathlib import Path
from re import search

import pytest

from haystack.nodes.connector import Crawler
from haystack.schema import Document


def test_crawler_url_none_exception(tmp_path):
    # Calling crawl() without any URLs must raise a ValueError
    tmp_dir = tmp_path / "crawled_files"

    with pytest.raises(ValueError):
        Crawler(tmp_dir).crawl()


def test_crawler_depth(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/get-started"]

    # With crawler_depth=0 only the given pages are fetched, so the number of
    # result files equals the number of input URLs
    crawler = Crawler(output_dir=tmp_dir)
    doc_path = crawler.crawl(urls=_url, crawler_depth=0)
    assert len(doc_path) == 1

    _urls = [
        "https://haystack.deepset.ai/overview/v0.8.0/get-started",
        "https://haystack.deepset.ai/overview/v0.7.0/get-started",
        "https://haystack.deepset.ai/overview/v0.6.0/get-started",
    ]
    doc_path = crawler.crawl(urls=_urls, crawler_depth=0)
    assert len(doc_path) == 3

    # crawler_depth=1 also follows links found on the start page
    doc_path = crawler.crawl(urls=_url, crawler_depth=1)
    assert len(doc_path) > 1

    # Each result is the path of a JSON file with the page's content and meta
    for json_file in doc_path:
        assert isinstance(json_file, Path)
        with open(json_file.absolute(), "r") as read_file:
            data = json.load(read_file)
            assert "content" in data
            assert "meta" in data
            assert isinstance(data["content"], str)
            assert len(data["content"].split()) > 2


def test_crawler_filter_urls(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/v0.8.0/"]

    crawler = Crawler(output_dir=tmp_dir)
    # filter_urls takes regex patterns; only matching URLs are crawled.
    # Raw strings keep Python from warning about the escaped characters.
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v0\.9\.0\/"])
    assert len(doc_path) == 0

    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v0\.8\.0\/"])
    assert len(doc_path) > 0

    doc_path = crawler.crawl(urls=_url, filter_urls=[r"google\.com"])
    assert len(doc_path) == 0


def test_crawler_content(tmp_path):
    tmp_dir = tmp_path / "crawled_files"

    # Text snippets that each crawled page is expected to contain
    partial_content_match: list = [
        {
            "url": "https://haystack.deepset.ai/overview/v0.7.0/intro",
            "partial_content": [
                "What is Haystack",
                "Utilize all transformer based models",
                "a Retriever-Reader pipeline in order",
                "Passing on only a small candidate set",
                "fast indexing and querying",
                "Fine-tune models to your own domain",
                "smoothly switch when new ones get published",
            ],
        },
        {
            "url": "https://haystack.deepset.ai/overview/v0.7.0/use-cases",
            "partial_content": [
                "Semantic Search System",
                "Store your documents in the database of ",
                "results are chosen based on compatibility in",
                "Apply a set of standard questions to each document",
                "Return a NO_ANSWER if a given document",
                "like what is the revenue forecast for 2020?",
                "overview of academic papers and internal business",
            ],
        },
    ]

    crawler = Crawler(output_dir=tmp_dir)
    for _dict in partial_content_match:
        url: str = _dict["url"]
        partial_content: list = _dict["partial_content"]

        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
        assert len(doc_path) == 1

        for json_file in doc_path:
            assert isinstance(json_file, Path)
            with open(json_file.absolute(), "r") as read_file:
                content = json.load(read_file)
                assert isinstance(content["content"], str)
                # Each snippet is checked both as a regex and as a literal
                # substring; the substring check is the stricter of the two
                for partial_line in partial_content:
                    assert search(partial_line, content["content"])
                    assert partial_line in content["content"]


def test_crawler_return_document(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/docs/v0.5.0/intromd"]

    crawler = Crawler(output_dir=tmp_dir)
    docs_path = crawler.crawl(urls=_url, crawler_depth=1)
    # With return_documents=True, run() yields Document objects instead of paths
    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
    documents = results["documents"]

    # Each written JSON file must mirror the corresponding Document
    for json_file, document in zip(docs_path, documents):
        assert isinstance(json_file, Path)
        assert isinstance(document, Document)

        with open(json_file.absolute(), "r") as read_file:
            file_content = json.load(read_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content