Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-10-30 01:09:43 +00:00
[CI refactoring] Rewrite Crawler tests (#2557)
* Rewrite crawler tests (very slow) and fix small crawler bug
* Update Documentation & Code Style
* compile the regex only once
* Factor out the html files & add content check to most tests
* Clarify that even starting URLs can be excluded
* Update Documentation & Code Style
* Change signature
* Fix failing test
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
parent 0a4477d315
commit 83648b9bc0
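The core idea of the rewrite is to crawl tiny HTML files shipped with the test suite instead of the live haystack.deepset.ai website. A minimal sketch of that approach, assuming a local checkout where the sample pages added at the end of this diff live under test/samples/crawler:

from pathlib import Path

# Build a file:// URL pointing at the sample pages, as the new test_url fixture does.
# The samples directory below is an assumption about the local checkout layout.
samples_dir = Path("test/samples/crawler").absolute()
test_url = f"file://{samples_dir}"

print(test_url + "/index.html")  # the crawler can fetch this without any network access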
@@ -149,18 +149,23 @@ class Crawler(BaseComponent):
             logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.")
         else:
             logger.info(f"Fetching from {urls} to `{output_dir}`")
-            sub_links: Dict[str, List] = {}

-            # don't go beyond the initial list of urls
-            if crawler_depth == 0:
+            # Start by writing out the initial list of urls
+            if filter_urls:
+                pattern = re.compile("|".join(filter_urls))
+                for url in urls:
+                    if pattern.search(url):
+                        file_paths += self._write_to_files([url], output_dir=output_dir)
+            else:
                 file_paths += self._write_to_files(urls, output_dir=output_dir)
-            # follow one level of sublinks
-            elif crawler_depth == 1:
+            # follow one level of sublinks if requested
+            if crawler_depth == 1:
+                sub_links: Dict[str, List] = {}
                 for url_ in urls:
-                    existed_links: List = list(sum(list(sub_links.values()), []))
+                    already_found_links: List = list(sum(list(sub_links.values()), []))
                     sub_links[url_] = list(
                         self._extract_sublinks_from_url(
-                            base_url=url_, filter_urls=filter_urls, existed_links=existed_links
+                            base_url=url_, filter_urls=filter_urls, already_found_links=already_found_links
                         )
                     )
                 for url, extracted_sublink in sub_links.items():
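With this hunk, the starting URLs are written out for every crawler_depth, filter_urls is applied to them as well, and sublinks are only followed when crawler_depth == 1. A minimal usage sketch, with a hypothetical output directory and URL (running it would launch the Selenium-driven browser the Crawler needs):

from haystack.nodes.connector import Crawler

# Illustration only: a starting URL that matches none of the filter_urls patterns
# is now skipped entirely, even at crawler_depth=0.
crawler = Crawler(output_dir="crawled_files")
paths = crawler.crawl(
    urls=["https://haystack.deepset.ai/overview/get-started"],
    filter_urls=[r"haystack\.deepset\.ai"],
    crawler_depth=0,
)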
@@ -277,24 +282,23 @@ class Crawler(BaseComponent):
         return base_url_.path == sub_link_.path and base_url_.netloc == sub_link_.netloc

     def _extract_sublinks_from_url(
-        self, base_url: str, filter_urls: Optional[List] = None, existed_links: List = None
+        self, base_url: str, filter_urls: Optional[List] = None, already_found_links: List = None
     ) -> set:
+        if filter_urls:
+            filter_pattern = re.compile("|".join(filter_urls))
+
         self.driver.get(base_url)
         a_elements = self.driver.find_elements_by_xpath("//a[@href]")
         sub_links = set()
-        if not (existed_links and base_url in existed_links):
-            if filter_urls:
-                if re.compile("|".join(filter_urls)).search(base_url):
-                    sub_links.add(base_url)

         for i in a_elements:
             sub_link = i.get_attribute("href")
-            if not (existed_links and sub_link in existed_links):
+            if not (already_found_links and sub_link in already_found_links):
                 if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
                     not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
                 ):
                     if filter_urls:
-                        if re.compile("|".join(filter_urls)).search(sub_link):
+                        if filter_pattern.search(sub_link):
                             sub_links.add(sub_link)
                     else:
                         sub_links.add(sub_link)
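The "compile the regex only once" change boils down to joining the filter_urls patterns into one alternation and compiling it a single time, instead of calling re.compile() for every extracted link. A small self-contained sketch with hypothetical patterns and URLs:

import re

# filter_urls is a list of regular expressions; "|".join() turns them into one
# alternation that can be compiled once and reused for every link on the page.
filter_urls = ["index", "page1"]
filter_pattern = re.compile("|".join(filter_urls))

assert filter_pattern.search("https://example.com/index.html")
assert filter_pattern.search("https://example.com/page1.html")
assert filter_pattern.search("https://example.com/page2.html") is None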
@@ -1,114 +1,51 @@
+import os
 import json
+import shutil
+import tempfile
 from pathlib import Path
-from re import search

 import pytest

 from haystack.nodes.connector import Crawler
 from haystack.schema import Document

-def test_crawler_url_none_exception(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    with pytest.raises(ValueError):
-        Crawler(tmp_dir).crawl()
+from ..conftest import SAMPLES_PATH


-def test_crawler_depth(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/overview/get-started"]
-    crawler = Crawler(output_dir=tmp_dir)
-    doc_path = crawler.crawl(urls=_url, crawler_depth=0)
-    assert len(doc_path) == 1
-
-    _urls = [
-        "https://haystack.deepset.ai/overview/v1.2.0/get-started",
-        "https://haystack.deepset.ai/overview/v1.1.0/get-started",
-        "https://haystack.deepset.ai/overview/v1.0.0/get-started",
-    ]
-    doc_path = crawler.crawl(urls=_urls, crawler_depth=0)
-    assert len(doc_path) == 3
-
-    doc_path = crawler.crawl(urls=_url, crawler_depth=1)
-    assert len(doc_path) > 1
-
-    for json_file in doc_path:
-        assert isinstance(json_file, Path)
-        with open(json_file.absolute(), "r") as read_file:
-            data = json.load(read_file)
-            assert "content" in data
-            assert "meta" in data
-            assert isinstance(data["content"], str)
-            assert len(data["content"].split()) > 2
+@pytest.fixture(scope="session")
+def test_url():
+    return f"file://{SAMPLES_PATH.absolute()}/crawler"


-def test_crawler_filter_urls(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/overview/v1.2.0/"]
+def content_match(crawler: Crawler, url: str, crawled_page: Path):
+    """
+    :param crawler: the tested Crawler object
+    :param base_url: the URL from test_url fixture
+    :param page_name: the expected page
+    :param crawled_page: the output of Crawler (one element of the paths list)
+    """
+    crawler.driver.get(url)
+    body = crawler.driver.find_element_by_tag_name("body")
+    expected_crawled_content = body.text
+
+    with open(crawled_page, "r") as crawled_file:
+        page_data = json.load(crawled_file)
+        return page_data["content"] == expected_crawled_content
+
+
+#
+# Integration
+#
+
+
+@pytest.mark.integration
+def test_crawler(tmp_path):
+    tmp_dir = tmp_path
+    url = ["https://haystack.deepset.ai/"]

     crawler = Crawler(output_dir=tmp_dir)
-    doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v1\.3\.0\/"])
-    assert len(doc_path) == 0
+    docs_path = crawler.crawl(urls=url, crawler_depth=0)
+    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)

-    doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v1\.2\.0\/"])
-    assert len(doc_path) > 0
-
-    doc_path = crawler.crawl(urls=_url, filter_urls=["google\.com"])
-    assert len(doc_path) == 0
-
-
-def test_crawler_content(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-
-    partial_content_match: list = [
-        {
-            "url": "https://haystack.deepset.ai/overview/v1.1.0/intro",
-            "partial_content": [
-                "Haystack is an open-source framework ",
-                "for building search systems that work intelligently ",
-                "over large document collections.",
-                "Recent advances in NLP have enabled the application of ",
-                "question answering, retrieval and summarization ",
-                "to real world settings and Haystack is designed to be ",
-                "the bridge between research and industry.",
-            ],
-        },
-        {
-            "url": "https://haystack.deepset.ai/overview/v1.1.0/use-cases",
-            "partial_content": [
-                "Expect to see results that highlight",
-                "the very sentence that contains the answer to your question.",
-                "Thanks to the power of Transformer based language models,",
-                "results are chosen based on compatibility in meaning",
-                "rather than lexical overlap.",
-            ],
-        },
-    ]
-
-    crawler = Crawler(output_dir=tmp_dir)
-    for _dict in partial_content_match:
-        url: str = _dict["url"]
-        partial_content: list = _dict["partial_content"]
-
-        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
-        assert len(doc_path) == 1
-
-        for json_file in doc_path:
-            assert isinstance(json_file, Path)
-            with open(json_file.absolute(), "r") as read_file:
-                content = json.load(read_file)
-                assert isinstance(content["content"], str)
-                for partial_line in partial_content:
-                    assert search(partial_line, content["content"])
-                    assert partial_line in content["content"]
-
-
-def test_crawler_return_document(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/docs/v1.0.0/intromd"]
-
-    crawler = Crawler(output_dir=tmp_dir)
-    docs_path = crawler.crawl(urls=_url, crawler_depth=1)
-    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
-
     documents = results["documents"]

     for json_file, document in zip(docs_path, documents):
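The content_match helper added above compares what Selenium sees on a page with what the Crawler wrote to disk. A minimal sketch of just the comparison step, with the Selenium part factored out: page_matches is a hypothetical name, expected_text stands in for the <body> text reported by crawler.driver, and crawled_page for one of the JSON paths returned by crawl().

import json
from pathlib import Path

def page_matches(crawled_page: Path, expected_text: str) -> bool:
    # Each crawled page is stored as JSON with a "content" field holding the page text.
    with open(crawled_page, "r") as crawled_file:
        page_data = json.load(crawled_file)
    return page_data["content"] == expected_text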
@@ -119,3 +56,78 @@ def test_crawler_return_document(tmp_path):
             file_content = json.load(read_file)
             assert file_content["meta"] == document.meta
             assert file_content["content"] == document.content
+
+
+#
+# Unit tests
+#
+
+
+def test_crawler_url_none_exception(tmp_path):
+    crawler = Crawler(tmp_path)
+    with pytest.raises(ValueError):
+        crawler.crawl()
+
+
+def test_crawler_depth_0_single_url(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+
+
+def test_crawler_depth_0_many_urls(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    _urls = [test_url + "/index.html", test_url + "/page1.html"]
+    paths = crawler.crawl(urls=_urls, crawler_depth=0)
+    assert len(paths) == 2
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    assert content_match(crawler, test_url + "/page1.html", paths[1])
+
+
+def test_crawler_depth_1_single_url(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
+    assert len(paths) == 3
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    assert content_match(crawler, test_url + "/page1.html", paths[1])
+    assert content_match(crawler, test_url + "/page2.html", paths[2])
+
+
+def test_crawler_output_file_structure(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+
+    with open(paths[0].absolute(), "r") as doc_file:
+        data = json.load(doc_file)
+        assert "content" in data
+        assert "meta" in data
+        assert isinstance(data["content"], str)
+        assert len(data["content"].split()) > 2
+
+
+def test_crawler_filter_urls(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+
+    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+
+    # Note: filter_urls can exclude pages listed in `urls` as well
+    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/page1.html", paths[0])
+    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)
+
+
+def test_crawler_return_document(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
+    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
+
+    for path, document in zip(paths["paths"], documents["documents"]):
+        with open(path.absolute(), "r") as doc_file:
+            file_content = json.load(doc_file)
+            assert file_content["meta"] == document.meta
+            assert file_content["content"] == document.content
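The rewritten test_crawler_return_document exercises the two output modes of Crawler.run(): Documents when return_documents=True, JSON file paths otherwise. A minimal sketch of that usage outside pytest; the samples path is an assumption about the local checkout, and running it needs the same Selenium/Chrome setup the Crawler itself requires.

import json
from pathlib import Path
from haystack.nodes.connector import Crawler

url = f"file://{Path('test/samples/crawler').absolute()}/index.html"
crawler = Crawler(output_dir="crawled_files")

documents, _ = crawler.run(urls=[url], crawler_depth=0, return_documents=True)   # Haystack Documents
paths, _ = crawler.run(urls=[url], crawler_depth=0, return_documents=False)      # paths of JSON files

# Each written JSON file mirrors the fields of the corresponding Document.
for path, document in zip(paths["paths"], documents["documents"]):
    with open(path, "r") as doc_file:
        file_content = json.load(doc_file)
    assert file_content["meta"] == document.meta
    assert file_content["content"] == document.content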
test/samples/crawler/index.html (new file)
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Home Page for Crawler</title>
+</head>
+<body>
+    <p>home page content</p>
+    <a href="page1.html">link to page 1</a>
+    <a href="page2.html">link to page 2</a>
+</body>
+</html>
test/samples/crawler/page1.html (new file)
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page 1 for Crawler</title>
+</head>
+<body>
+    <p>page 1 content</p>
+    <a href="index.html">link to home</a>
+    <a href="page2.html">link to page 2</a>
+</body>
+</html>
test/samples/crawler/page2.html (new file)
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page 2 for Crawler</title>
+</head>
+<body>
+    <p>page 2 content</p>
+    <a href="index.html">link to home</a>
+    <a href="page1.html">link to page 1</a>
+</body>
+</html>