diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py
index d2378d9ad..f57eee7f4 100644
--- a/haystack/nodes/connector/crawler.py
+++ b/haystack/nodes/connector/crawler.py
@@ -149,18 +149,23 @@ class Crawler(BaseComponent):
             logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.")
         else:
             logger.info(f"Fetching from {urls} to `{output_dir}`")
 
-            sub_links: Dict[str, List] = {}
-            # don't go beyond the initial list of urls
-            if crawler_depth == 0:
+            # Start by writing out the initial list of urls
+            if filter_urls:
+                pattern = re.compile("|".join(filter_urls))
+                for url in urls:
+                    if pattern.search(url):
+                        file_paths += self._write_to_files([url], output_dir=output_dir)
+            else:
                 file_paths += self._write_to_files(urls, output_dir=output_dir)
-            # follow one level of sublinks
-            elif crawler_depth == 1:
+            # follow one level of sublinks if requested
+            if crawler_depth == 1:
+                sub_links: Dict[str, List] = {}
                 for url_ in urls:
-                    existed_links: List = list(sum(list(sub_links.values()), []))
+                    already_found_links: List = list(sum(list(sub_links.values()), []))
                     sub_links[url_] = list(
                         self._extract_sublinks_from_url(
-                            base_url=url_, filter_urls=filter_urls, existed_links=existed_links
+                            base_url=url_, filter_urls=filter_urls, already_found_links=already_found_links
                         )
                     )
                 for url, extracted_sublink in sub_links.items():
@@ -277,24 +282,23 @@ class Crawler(BaseComponent):
         return base_url_.path == sub_link_.path and base_url_.netloc == sub_link_.netloc
 
     def _extract_sublinks_from_url(
-        self, base_url: str, filter_urls: Optional[List] = None, existed_links: List = None
+        self, base_url: str, filter_urls: Optional[List] = None, already_found_links: List = None
     ) -> set:
+        if filter_urls:
+            filter_pattern = re.compile("|".join(filter_urls))
+
         self.driver.get(base_url)
         a_elements = self.driver.find_elements_by_xpath("//a[@href]")
         sub_links = set()
-        if not (existed_links and base_url in existed_links):
-            if filter_urls:
-                if re.compile("|".join(filter_urls)).search(base_url):
-                    sub_links.add(base_url)
 
         for i in a_elements:
            sub_link = i.get_attribute("href")
-            if not (existed_links and sub_link in existed_links):
+            if not (already_found_links and sub_link in already_found_links):
                 if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
                     not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
                 ):
                     if filter_urls:
-                        if re.compile("|".join(filter_urls)).search(sub_link):
+                        if filter_pattern.search(sub_link):
                             sub_links.add(sub_link)
                     else:
                         sub_links.add(sub_link)
diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py
index 07115dfcf..e2cd5f9c2 100644
--- a/test/nodes/test_connector.py
+++ b/test/nodes/test_connector.py
@@ -1,114 +1,51 @@
+import os
 import json
+import shutil
+import tempfile
 from pathlib import Path
-from re import search
 
 import pytest
+
 from haystack.nodes.connector import Crawler
 from haystack.schema import Document
 
-
-def test_crawler_url_none_exception(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    with pytest.raises(ValueError):
-        Crawler(tmp_dir).crawl()
+from ..conftest import SAMPLES_PATH
 
 
-def test_crawler_depth(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/overview/get-started"]
-    crawler = Crawler(output_dir=tmp_dir)
-    doc_path = crawler.crawl(urls=_url, crawler_depth=0)
-    assert len(doc_path) == 1
-
-    _urls = [
-        "https://haystack.deepset.ai/overview/v1.2.0/get-started",
-        "https://haystack.deepset.ai/overview/v1.1.0/get-started",
-        "https://haystack.deepset.ai/overview/v1.0.0/get-started",
-    ]
-    doc_path = crawler.crawl(urls=_urls, crawler_depth=0)
-    assert len(doc_path) == 3
-
-    doc_path = crawler.crawl(urls=_url, crawler_depth=1)
-    assert len(doc_path) > 1
-
-    for json_file in doc_path:
-        assert isinstance(json_file, Path)
-        with open(json_file.absolute(), "r") as read_file:
-            data = json.load(read_file)
-            assert "content" in data
-            assert "meta" in data
-            assert isinstance(data["content"], str)
-            assert len(data["content"].split()) > 2
+@pytest.fixture(scope="session")
+def test_url():
+    return f"file://{SAMPLES_PATH.absolute()}/crawler"
 
 
-def test_crawler_filter_urls(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/overview/v1.2.0/"]
+def content_match(crawler: Crawler, url: str, crawled_page: Path):
+    """
+    :param crawler: the tested Crawler object
+    :param url: the URL of the page expected to have been crawled
+    :param crawled_page: the output of Crawler (one element of the paths list)
+    :return: True if the crawled file contains the same text as the page body
+    """
+    crawler.driver.get(url)
+    body = crawler.driver.find_element_by_tag_name("body")
+    expected_crawled_content = body.text
+
+    with open(crawled_page, "r") as crawled_file:
+        page_data = json.load(crawled_file)
+        return page_data["content"] == expected_crawled_content
+
+
+#
+# Integration
+#
+
+
+@pytest.mark.integration
+def test_crawler(tmp_path):
+    tmp_dir = tmp_path
+    url = ["https://haystack.deepset.ai/"]
 
     crawler = Crawler(output_dir=tmp_dir)
-    doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v1\.3\.0\/"])
-    assert len(doc_path) == 0
-
-    doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v1\.2\.0\/"])
-    assert len(doc_path) > 0
-
-    doc_path = crawler.crawl(urls=_url, filter_urls=["google\.com"])
-    assert len(doc_path) == 0
-
-
-def test_crawler_content(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-
-    partial_content_match: list = [
-        {
-            "url": "https://haystack.deepset.ai/overview/v1.1.0/intro",
-            "partial_content": [
-                "Haystack is an open-source framework ",
-                "for building search systems that work intelligently ",
-                "over large document collections.",
-                "Recent advances in NLP have enabled the application of ",
-                "question answering, retrieval and summarization ",
-                "to real world settings and Haystack is designed to be ",
-                "the bridge between research and industry.",
-            ],
-        },
-        {
-            "url": "https://haystack.deepset.ai/overview/v1.1.0/use-cases",
-            "partial_content": [
-                "Expect to see results that highlight",
-                "the very sentence that contains the answer to your question.",
-                "Thanks to the power of Transformer based language models,",
-                "results are chosen based on compatibility in meaning",
-                "rather than lexical overlap.",
-            ],
-        },
-    ]
-
-    crawler = Crawler(output_dir=tmp_dir)
-    for _dict in partial_content_match:
-        url: str = _dict["url"]
-        partial_content: list = _dict["partial_content"]
-
-        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
-        assert len(doc_path) == 1
-
-        for json_file in doc_path:
-            assert isinstance(json_file, Path)
-            with open(json_file.absolute(), "r") as read_file:
-                content = json.load(read_file)
-                assert isinstance(content["content"], str)
-                for partial_line in partial_content:
-                    assert search(partial_line, content["content"])
-                    assert partial_line in content["content"]
-
-
-def test_crawler_return_document(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/docs/v1.0.0/intromd"]
-
-    crawler = Crawler(output_dir=tmp_dir)
-    docs_path = crawler.crawl(urls=_url, crawler_depth=1)
-    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
+    docs_path = crawler.crawl(urls=url, crawler_depth=0)
+    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
     documents = results["documents"]
 
     for json_file, document in zip(docs_path, documents):
@@ -119,3 +56,78 @@ def test_crawler_return_document(tmp_path):
             file_content = json.load(read_file)
             assert file_content["meta"] == document.meta
             assert file_content["content"] == document.content
+
+
+#
+# Unit tests
+#
+
+
+def test_crawler_url_none_exception(tmp_path):
+    crawler = Crawler(tmp_path)
+    with pytest.raises(ValueError):
+        crawler.crawl()
+
+
+def test_crawler_depth_0_single_url(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+
+
+def test_crawler_depth_0_many_urls(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    _urls = [test_url + "/index.html", test_url + "/page1.html"]
+    paths = crawler.crawl(urls=_urls, crawler_depth=0)
+    assert len(paths) == 2
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    assert content_match(crawler, test_url + "/page1.html", paths[1])
+
+
+def test_crawler_depth_1_single_url(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
+    assert len(paths) == 3
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    assert content_match(crawler, test_url + "/page1.html", paths[1])
+    assert content_match(crawler, test_url + "/page2.html", paths[2])
+
+
+def test_crawler_output_file_structure(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+
+    with open(paths[0].absolute(), "r") as doc_file:
+        data = json.load(doc_file)
+        assert "content" in data
+        assert "meta" in data
+        assert isinstance(data["content"], str)
+        assert len(data["content"].split()) > 2
+
+
+def test_crawler_filter_urls(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+
+    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+
+    # Note: filter_urls can exclude pages listed in `urls` as well
+    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/page1.html", paths[0])
+    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)
+
+
+def test_crawler_return_document(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
+    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
+
+    for path, document in zip(paths["paths"], documents["documents"]):
+        with open(path.absolute(), "r") as doc_file:
+            file_content = json.load(doc_file)
+            assert file_content["meta"] == document.meta
+            assert file_content["content"] == document.content
diff --git a/test/samples/crawler/index.html b/test/samples/crawler/index.html
new file mode 100644
index 000000000..15ce68605
--- /dev/null
+++ b/test/samples/crawler/index.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Home Page for Crawler</title>
+</head>
+<body>
+    <p>home page content</p>
+    <a href="page1.html">link to page 1</a>
+    <a href="page2.html">link to page 2</a>
+</body>
+</html>
\ No newline at end of file
diff --git a/test/samples/crawler/page1.html b/test/samples/crawler/page1.html
new file mode 100644
index 000000000..9f5bfc46a
--- /dev/null
+++ b/test/samples/crawler/page1.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page 1 for Crawler</title>
+</head>
+<body>
+    <p>page 1 content</p>
+    <a href="index.html">link to home</a>
+    <a href="page2.html">link to page 2</a>
+</body>
+</html>
\ No newline at end of file
diff --git a/test/samples/crawler/page2.html b/test/samples/crawler/page2.html
new file mode 100644
index 000000000..21e023dbd
--- /dev/null
+++ b/test/samples/crawler/page2.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page 2 for Crawler</title>
+</head>
+<body>
+    <p>page 2 content</p>
+    <a href="index.html">link to home</a>
+    <a href="page1.html">link to page 1</a>
+</body>
+</html>
\ No newline at end of file
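For reference, a minimal sketch of how the changed `crawl()` behaves against the sample pages added above. It assumes only the `Crawler` API as it appears in this diff; the `test/samples/crawler` working-directory path and the `crawled_files` output directory are illustrative, not part of the patch, and Selenium must be set up as usual for the Crawler node.

```python
from pathlib import Path

from haystack.nodes.connector import Crawler

# Illustrative path only; adjust to wherever the sample pages live.
samples = Path("test/samples/crawler").absolute()
crawler = Crawler(output_dir="crawled_files")

# crawler_depth=0 without filter_urls writes out only the start URLs.
paths = crawler.crawl(urls=[f"file://{samples}/index.html"], crawler_depth=0)
print(paths)  # one JSON file, for index.html

# With this patch, filter_urls is also applied to the start URLs, so a
# pattern that matches only "page1" skips index.html and keeps page1.html.
filtered = crawler.crawl(
    urls=[f"file://{samples}/index.html"],
    filter_urls=["page1"],
    crawler_depth=1,
)
print(filtered)  # one JSON file, for page1.html
```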