import json
from pathlib import Path
from re import search

import pytest

from haystack.nodes.connector import Crawler
from haystack.schema import Document


def test_crawler_url_none_exception(tmp_path):
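    """Calling crawl() without providing any URLs should raise a ValueError."""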
    tmp_dir = tmp_path / "crawled_files"

    with pytest.raises(ValueError):
        Crawler(tmp_dir).crawl()


def test_crawler_depth(tmp_path):
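    """crawler_depth=0 should fetch only the given pages; crawler_depth=1 should also follow links one level deep."""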
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/get-started"]

    crawler = Crawler(output_dir=tmp_dir)
    doc_path = crawler.crawl(urls=_url, crawler_depth=0)
    assert len(doc_path) == 1

    _urls = [
        "https://haystack.deepset.ai/overview/v1.2.0/get-started",
        "https://haystack.deepset.ai/overview/v1.1.0/get-started",
        "https://haystack.deepset.ai/overview/v1.0.0/get-started",
    ]
    doc_path = crawler.crawl(urls=_urls, crawler_depth=0)
    assert len(doc_path) == 3

    doc_path = crawler.crawl(urls=_url, crawler_depth=1)
    assert len(doc_path) > 1

    for json_file in doc_path:
        assert isinstance(json_file, Path)
        with open(json_file.absolute(), "r") as read_file:
            data = json.load(read_file)
            assert "content" in data
            assert "meta" in data
            assert isinstance(data["content"], str)
            assert len(data["content"].split()) > 2


def test_crawler_filter_urls(tmp_path):
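    """filter_urls should restrict crawling to pages whose URL matches at least one of the given regex patterns."""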
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/v1.2.0/"]

    crawler = Crawler(output_dir=tmp_dir)
    # Raw strings keep the regex escapes intact; in a plain string, "\." is an invalid escape sequence.
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v1\.3\.0\/"])
    assert len(doc_path) == 0

    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v1\.2\.0\/"])
    assert len(doc_path) > 0

    doc_path = crawler.crawl(urls=_url, filter_urls=[r"google\.com"])
    assert len(doc_path) == 0


def test_crawler_content(tmp_path):
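    """The crawled JSON files should contain the expected text snippets from the live pages."""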
    tmp_dir = tmp_path / "crawled_files"

    partial_content_match: list = [
        {
            "url": "https://haystack.deepset.ai/overview/v1.1.0/intro",
            "partial_content": [
                "Haystack is an open-source framework ",
                "for building search systems that work intelligently ",
                "over large document collections.",
                "Recent advances in NLP have enabled the application of ",
                "question answering, retrieval and summarization ",
                "to real world settings and Haystack is designed to be ",
                "the bridge between research and industry.",
            ],
        },
        {
            "url": "https://haystack.deepset.ai/overview/v1.1.0/use-cases",
            "partial_content": [
                "Expect to see results that highlight",
                "the very sentence that contains the answer to your question.",
                "Thanks to the power of Transformer based language models,",
                "results are chosen based on compatibility in meaning",
                "rather than lexical overlap.",
            ],
        },
    ]

    crawler = Crawler(output_dir=tmp_dir)
    for _dict in partial_content_match:
        url: str = _dict["url"]
        partial_content: list = _dict["partial_content"]

        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
        assert len(doc_path) == 1

        for json_file in doc_path:
            assert isinstance(json_file, Path)
            with open(json_file.absolute(), "r") as read_file:
                content = json.load(read_file)
                assert isinstance(content["content"], str)
                for partial_line in partial_content:
                    assert search(partial_line, content["content"])
                    assert partial_line in content["content"]


def test_crawler_return_document(tmp_path):
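    """run() with return_documents=True should yield Document objects whose content and meta match the crawled JSON files."""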
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/docs/v1.0.0/intromd"]

    crawler = Crawler(output_dir=tmp_dir)
    docs_path = crawler.crawl(urls=_url, crawler_depth=1)
    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
    documents = results["documents"]

    for json_file, document in zip(docs_path, documents):
        assert isinstance(json_file, Path)
        assert isinstance(document, Document)

        with open(json_file.absolute(), "r") as read_file:
            file_content = json.load(read_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content