[CI refactoring] Rewrite Crawler tests (#2557)

* Rewrite the crawler tests (the old ones were very slow) and fix a small crawler bug

* Update Documentation & Code Style

* Compile the regex only once

* Factor out the HTML files & add content checks to most tests

* Clarify that even starting URLs can be excluded (see the usage sketch below)

* Update Documentation & Code Style

* Change signature

* Fix failing test

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Authored by Sara Zan on 2022-06-06 17:52:37 +02:00, committed by GitHub
parent 0a4477d315
commit 83648b9bc0
5 changed files with 161 additions and 112 deletions
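
In practical terms, the main behavioural change is that `filter_urls` now also applies to the starting URLs, not only to discovered sub-links. A minimal usage sketch, assuming the `Crawler` API shown in the diffs below (the URLs and output directory are made up for illustration):

    from pathlib import Path

    from haystack.nodes.connector import Crawler

    crawler = Crawler(output_dir=Path("crawled_files"))  # illustrative output directory

    # The filter patterns are OR-joined into a single compiled regex; any starting URL
    # that does not match is skipped even at crawler_depth=0, so only the /docs page
    # below would be written out.
    paths = crawler.crawl(
        urls=["https://example.com/docs/intro", "https://example.com/blog/post"],
        filter_urls=[r"example\.com/docs"],
        crawler_depth=0,
    )
    # crawl() returns one JSON file path (pathlib.Path) per page that was fetched.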


@@ -149,18 +149,23 @@ class Crawler(BaseComponent):
             logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.")
         else:
             logger.info(f"Fetching from {urls} to `{output_dir}`")
-            sub_links: Dict[str, List] = {}
-            # don't go beyond the initial list of urls
-            if crawler_depth == 0:
+            # Start by writing out the initial list of urls
+            if filter_urls:
+                pattern = re.compile("|".join(filter_urls))
+                for url in urls:
+                    if pattern.search(url):
+                        file_paths += self._write_to_files([url], output_dir=output_dir)
+            else:
                 file_paths += self._write_to_files(urls, output_dir=output_dir)
 
-            # follow one level of sublinks
-            elif crawler_depth == 1:
+            # follow one level of sublinks if requested
+            if crawler_depth == 1:
+                sub_links: Dict[str, List] = {}
                 for url_ in urls:
-                    existed_links: List = list(sum(list(sub_links.values()), []))
+                    already_found_links: List = list(sum(list(sub_links.values()), []))
                     sub_links[url_] = list(
                         self._extract_sublinks_from_url(
-                            base_url=url_, filter_urls=filter_urls, existed_links=existed_links
+                            base_url=url_, filter_urls=filter_urls, already_found_links=already_found_links
                         )
                     )
                 for url, extracted_sublink in sub_links.items():
@@ -277,24 +282,23 @@ class Crawler(BaseComponent):
         return base_url_.path == sub_link_.path and base_url_.netloc == sub_link_.netloc
 
     def _extract_sublinks_from_url(
-        self, base_url: str, filter_urls: Optional[List] = None, existed_links: List = None
+        self, base_url: str, filter_urls: Optional[List] = None, already_found_links: List = None
     ) -> set:
+        if filter_urls:
+            filter_pattern = re.compile("|".join(filter_urls))
+
         self.driver.get(base_url)
         a_elements = self.driver.find_elements_by_xpath("//a[@href]")
         sub_links = set()
-        if not (existed_links and base_url in existed_links):
-            if filter_urls:
-                if re.compile("|".join(filter_urls)).search(base_url):
-                    sub_links.add(base_url)
 
         for i in a_elements:
             sub_link = i.get_attribute("href")
-            if not (existed_links and sub_link in existed_links):
+            if not (already_found_links and sub_link in already_found_links):
                 if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
                     not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
                 ):
                     if filter_urls:
-                        if re.compile("|".join(filter_urls)).search(sub_link):
+                        if filter_pattern.search(sub_link):
                             sub_links.add(sub_link)
                     else:
                         sub_links.add(sub_link)

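This second hunk also drops the block that re-added the base URL inside `_extract_sublinks_from_url` (the starting URLs are now written out up front) and compiles the OR-joined filter regex once instead of once per link. The filtering it ends up doing is roughly the following standalone sketch (link values are made up; the internal-URL and in-page-navigation checks are omitted):

    import re

    filter_urls = ["docs/"]                                  # hypothetical filter patterns
    already_found_links = ["https://example.com/docs/old"]   # hypothetical, already crawled

    # Compile the OR-joined filter regex a single time, not once per <a href="..."> element.
    filter_pattern = re.compile("|".join(filter_urls)) if filter_urls else None

    candidate_links = [
        "https://example.com/docs/new",   # kept: not seen before and matches the filter
        "https://example.com/docs/old",   # skipped: already found
        "https://example.com/about",      # skipped: matches no filter pattern
    ]

    sub_links = set()
    for sub_link in candidate_links:
        if already_found_links and sub_link in already_found_links:
            continue
        if filter_pattern is None or filter_pattern.search(sub_link):
            sub_links.add(sub_link)

    assert sub_links == {"https://example.com/docs/new"}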

@@ -1,114 +1,51 @@
-import os
 import json
-import shutil
-import tempfile
 from pathlib import Path
-from re import search
 import pytest
 from haystack.nodes.connector import Crawler
 from haystack.schema import Document
-def test_crawler_url_none_exception(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    with pytest.raises(ValueError):
-        Crawler(tmp_dir).crawl()
+from ..conftest import SAMPLES_PATH
-def test_crawler_depth(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/overview/get-started"]
-    crawler = Crawler(output_dir=tmp_dir)
-    doc_path = crawler.crawl(urls=_url, crawler_depth=0)
-    assert len(doc_path) == 1
-    _urls = [
-        "https://haystack.deepset.ai/overview/v1.2.0/get-started",
-        "https://haystack.deepset.ai/overview/v1.1.0/get-started",
-        "https://haystack.deepset.ai/overview/v1.0.0/get-started",
-    ]
-    doc_path = crawler.crawl(urls=_urls, crawler_depth=0)
-    assert len(doc_path) == 3
-    doc_path = crawler.crawl(urls=_url, crawler_depth=1)
-    assert len(doc_path) > 1
-    for json_file in doc_path:
-        assert isinstance(json_file, Path)
-        with open(json_file.absolute(), "r") as read_file:
-            data = json.load(read_file)
-            assert "content" in data
-            assert "meta" in data
-            assert isinstance(data["content"], str)
-            assert len(data["content"].split()) > 2
+@pytest.fixture(scope="session")
+def test_url():
+    return f"file://{SAMPLES_PATH.absolute()}/crawler"
-def test_crawler_filter_urls(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/overview/v1.2.0/"]
+def content_match(crawler: Crawler, url: str, crawled_page: Path):
+    """
+    :param crawler: the tested Crawler object
+    :param url: the URL of the page expected in the crawler's output
+    :param crawled_page: the output of Crawler (one element of the paths list)
+    """
+    crawler.driver.get(url)
+    body = crawler.driver.find_element_by_tag_name("body")
+    expected_crawled_content = body.text
+    with open(crawled_page, "r") as crawled_file:
+        page_data = json.load(crawled_file)
+        return page_data["content"] == expected_crawled_content
+#
+# Integration
+#
+@pytest.mark.integration
+def test_crawler(tmp_path):
+    tmp_dir = tmp_path
+    url = ["https://haystack.deepset.ai/"]
     crawler = Crawler(output_dir=tmp_dir)
-    doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v1\.3\.0\/"])
-    assert len(doc_path) == 0
-    doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v1\.2\.0\/"])
-    assert len(doc_path) > 0
-    doc_path = crawler.crawl(urls=_url, filter_urls=["google\.com"])
-    assert len(doc_path) == 0
-def test_crawler_content(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    partial_content_match: list = [
-        {
-            "url": "https://haystack.deepset.ai/overview/v1.1.0/intro",
-            "partial_content": [
-                "Haystack is an open-source framework ",
-                "for building search systems that work intelligently ",
-                "over large document collections.",
-                "Recent advances in NLP have enabled the application of ",
-                "question answering, retrieval and summarization ",
-                "to real world settings and Haystack is designed to be ",
-                "the bridge between research and industry.",
-            ],
-        },
-        {
-            "url": "https://haystack.deepset.ai/overview/v1.1.0/use-cases",
-            "partial_content": [
-                "Expect to see results that highlight",
-                "the very sentence that contains the answer to your question.",
-                "Thanks to the power of Transformer based language models,",
-                "results are chosen based on compatibility in meaning",
-                "rather than lexical overlap.",
-            ],
-        },
-    ]
-    crawler = Crawler(output_dir=tmp_dir)
-    for _dict in partial_content_match:
-        url: str = _dict["url"]
-        partial_content: list = _dict["partial_content"]
-        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
-        assert len(doc_path) == 1
-        for json_file in doc_path:
-            assert isinstance(json_file, Path)
-            with open(json_file.absolute(), "r") as read_file:
-                content = json.load(read_file)
-                assert isinstance(content["content"], str)
-                for partial_line in partial_content:
-                    assert search(partial_line, content["content"])
-                    assert partial_line in content["content"]
-def test_crawler_return_document(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/docs/v1.0.0/intromd"]
-    crawler = Crawler(output_dir=tmp_dir)
-    docs_path = crawler.crawl(urls=_url, crawler_depth=1)
-    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
+    docs_path = crawler.crawl(urls=url, crawler_depth=0)
+    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
     documents = results["documents"]
     for json_file, document in zip(docs_path, documents):
@@ -119,3 +56,78 @@ def test_crawler_return_document(tmp_path):
             file_content = json.load(read_file)
             assert file_content["meta"] == document.meta
             assert file_content["content"] == document.content
+#
+# Unit tests
+#
+def test_crawler_url_none_exception(tmp_path):
+    crawler = Crawler(tmp_path)
+    with pytest.raises(ValueError):
+        crawler.crawl()
+def test_crawler_depth_0_single_url(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+def test_crawler_depth_0_many_urls(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    _urls = [test_url + "/index.html", test_url + "/page1.html"]
+    paths = crawler.crawl(urls=_urls, crawler_depth=0)
+    assert len(paths) == 2
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    assert content_match(crawler, test_url + "/page1.html", paths[1])
+def test_crawler_depth_1_single_url(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
+    assert len(paths) == 3
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    assert content_match(crawler, test_url + "/page1.html", paths[1])
+    assert content_match(crawler, test_url + "/page2.html", paths[2])
+def test_crawler_output_file_structure(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    with open(paths[0].absolute(), "r") as doc_file:
+        data = json.load(doc_file)
+        assert "content" in data
+        assert "meta" in data
+        assert isinstance(data["content"], str)
+        assert len(data["content"].split()) > 2
+def test_crawler_filter_urls(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    # Note: filter_urls can exclude pages listed in `urls` as well
+    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/page1.html", paths[0])
+    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)
+def test_crawler_return_document(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
+    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
+    for path, document in zip(paths["paths"], documents["documents"]):
+        with open(path.absolute(), "r") as doc_file:
+            file_content = json.load(doc_file)
+            assert file_content["meta"] == document.meta
+            assert file_content["content"] == document.content

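The rewritten unit tests crawl static pages from disk instead of the live website, which is what makes them fast and deterministic. A rough sketch of what the `test_url` fixture resolves to (the samples path here is an illustrative stand-in for `SAMPLES_PATH` imported from the suite's conftest):

    from pathlib import Path

    from haystack.nodes.connector import Crawler

    # Illustrative stand-in for SAMPLES_PATH from the test suite's conftest.
    SAMPLES_PATH = Path("test") / "samples"

    # The fixture turns the samples directory into a file:// base URL, so the
    # Selenium-backed crawler fetches the static pages without any network access.
    test_url = f"file://{SAMPLES_PATH.absolute()}/crawler"

    crawler = Crawler(output_dir=Path("crawled_files"))  # illustrative output directory
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
    assert len(paths) == 1
    # content_match() from the diff above then reloads the same URL in the crawler's
    # driver and compares the page body text with the JSON file written to disk.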

@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Test Home Page for Crawler</title>
+</head>
+<body>
+<p>home page content</p>
+<a href="page1.html">link to page 1</a>
+<a href="page2.html">link to page 2</a>
+</body>
+</html>


@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Test Page 1 for Crawler</title>
+</head>
+<body>
+<p>page 1 content</p>
+<a href="index.html">link to home</a>
+<a href="page2.html">link to page 2</a>
+</body>
+</html>


@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Test Page 2 for Crawler</title>
+</head>
+<body>
+<p>page 2 content</p>
+<a href="index.html">link to home</a>
+<a href="page1.html">link to page 1</a>
+</body>
+</html>
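
Each of the three sample pages links to the other two, so the link graph is fully connected and the depth assertions in the unit tests are deterministic: crawling index.html at depth 0 writes one file, and at depth 1 it also follows the two sub-links. A hypothetical run against these fixtures (the paths are illustrative):

    from pathlib import Path

    from haystack.nodes.connector import Crawler

    base_url = f"file://{Path('test/samples/crawler').absolute()}"  # illustrative location
    crawler = Crawler(output_dir=Path("crawled_files"))

    # Depth 0: only the starting page (index.html) is written out.
    assert len(crawler.crawl(urls=[base_url + "/index.html"], crawler_depth=0)) == 1

    # Depth 1: index.html plus the two pages it links to (page1.html and page2.html).
    assert len(crawler.crawl(urls=[base_url + "/index.html"], crawler_depth=1)) == 3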