haystack/test/test_connector.py
bogdankostic 834f8c4902
Change return types of indexing pipeline nodes (#2342)
* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2022-03-29 13:53:35 +02:00
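The return-type change this commit describes is what the tests below exercise: Crawler.crawl() now returns the paths of the JSON files it wrote, and Crawler.run(..., return_documents=True) returns Document objects. A minimal sketch of that usage, separate from the test file itself; the output directory name and URL are placeholders, everything else mirrors the calls made in the tests:

from haystack.nodes.connector import Crawler

crawler = Crawler(output_dir="crawled_files")  # placeholder directory
# crawl() writes one JSON file per page and returns their paths (list of pathlib.Path)
paths = crawler.crawl(urls=["https://example.com"], crawler_depth=0)
# run() with return_documents=True returns Document objects instead of file paths
results, _ = crawler.run(urls=["https://example.com"], crawler_depth=0, return_documents=True)
documents = results["documents"]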


import json
from pathlib import Path
from re import search

import pytest

from haystack.nodes.connector import Crawler
from haystack.schema import Document


def test_crawler_url_none_exception(tmp_path):
    # Crawling without any URLs must raise a ValueError.
    tmp_dir = tmp_path / "crawled_files"

    with pytest.raises(ValueError):
        Crawler(tmp_dir).crawl()


def test_crawler_depth(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/get-started"]

    crawler = Crawler(output_dir=tmp_dir)

    # With crawler_depth=0 only the given pages themselves are crawled.
    doc_path = crawler.crawl(urls=_url, crawler_depth=0)
    assert len(doc_path) == 1

    _urls = [
        "https://haystack.deepset.ai/overview/v0.8.0/get-started",
        "https://haystack.deepset.ai/overview/v0.7.0/get-started",
        "https://haystack.deepset.ai/overview/v0.6.0/get-started",
    ]
    doc_path = crawler.crawl(urls=_urls, crawler_depth=0)
    assert len(doc_path) == 3

    # With crawler_depth=1, pages linked from the start URL are crawled as well.
    doc_path = crawler.crawl(urls=_url, crawler_depth=1)
    assert len(doc_path) > 1

    # Each crawled page is written to a JSON file with "content" and "meta" fields.
    for json_file in doc_path:
        assert isinstance(json_file, Path)
        with open(json_file.absolute(), "r") as read_file:
            data = json.load(read_file)
            assert "content" in data
            assert "meta" in data
            assert isinstance(data["content"], str)
            assert len(data["content"].split()) > 2


def test_crawler_filter_urls(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/v0.8.0/"]

    crawler = Crawler(output_dir=tmp_dir)

    # filter_urls takes regex patterns; only URLs matching a pattern are crawled.
    # Raw strings avoid invalid-escape warnings for "\." and "\/".
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v0\.9\.0\/"])
    assert len(doc_path) == 0
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v0\.8\.0\/"])
    assert len(doc_path) > 0
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"google\.com"])
    assert len(doc_path) == 0


def test_crawler_content(tmp_path):
    tmp_dir = tmp_path / "crawled_files"

    partial_content_match: list = [
        {
            "url": "https://haystack.deepset.ai/overview/v0.7.0/intro",
            "partial_content": [
                "What is Haystack",
                "Utilize all transformer based models",
                "a Retriever-Reader pipeline in order",
                "Passing on only a small candidate set",
                "fast indexing and querying",
                "Fine-tune models to your own domain",
                "smoothly switch when new ones get published",
            ],
        },
        {
            "url": "https://haystack.deepset.ai/overview/v0.7.0/use-cases",
            "partial_content": [
                "Semantic Search System",
                "Store your documents in the database of ",
                "results are chosen based on compatibility in",
                "Apply a set of standard questions to each document",
                "Return a NO_ANSWER if a given document",
                "like what is the revenue forecast for 2020?",
                "overview of academic papers and internal business",
            ],
        },
    ]

    crawler = Crawler(output_dir=tmp_dir)
    for _dict in partial_content_match:
        url: str = _dict["url"]
        partial_content: list = _dict["partial_content"]

        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
        assert len(doc_path) == 1

        # The crawled JSON file must contain the expected text snippets.
        for json_file in doc_path:
            assert isinstance(json_file, Path)
            with open(json_file.absolute(), "r") as read_file:
                content = json.load(read_file)
                assert isinstance(content["content"], str)
                for partial_line in partial_content:
                    assert search(partial_line, content["content"])
                    assert partial_line in content["content"]


def test_crawler_return_document(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/docs/v0.5.0/intromd"]

    crawler = Crawler(output_dir=tmp_dir)
    docs_path = crawler.crawl(urls=_url, crawler_depth=1)

    # run() with return_documents=True returns Document objects instead of file paths.
    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
    documents = results["documents"]

    # The returned Documents must match the content written to the JSON files.
    for json_file, document in zip(docs_path, documents):
        assert isinstance(json_file, Path)
        assert isinstance(document, Document)

        with open(json_file.absolute(), "r") as read_file:
            file_content = json.load(read_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content