[CI refactoring] Rewrite Crawler tests (#2557)

* Rewrite the crawler tests (the old ones were very slow) and fix a small crawler bug

* Update Documentation & Code Style

* Compile the regex only once

* Factor out the HTML files & add content checks to most tests

* Clarify that even starting URLs can be excluded (see the usage sketch below)

* Update Documentation & Code Style

* Change signature

* Fix failing test

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Authored by Sara Zan on 2022-06-06 17:52:37 +02:00, committed by GitHub
parent 0a4477d315
commit 83648b9bc0
5 changed files with 161 additions and 112 deletions
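
In practical terms, the main behavioural change is that `filter_urls` now also applies to the starting URLs, not only to discovered sub-links. A minimal usage sketch, assuming the `Crawler` API shown in the diffs below (the URLs and output directory are made up for illustration):

    from pathlib import Path

    from haystack.nodes.connector import Crawler

    crawler = Crawler(output_dir=Path("crawled_files"))  # illustrative output directory

    # The filter patterns are OR-joined into a single compiled regex; any starting URL
    # that does not match is skipped even at crawler_depth=0, so only the /docs page
    # below would be written out.
    paths = crawler.crawl(
        urls=["https://example.com/docs/intro", "https://example.com/blog/post"],
        filter_urls=[r"example\.com/docs"],
        crawler_depth=0,
    )
    # crawl() returns one JSON file path (pathlib.Path) per page that was fetched.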


@@ -149,18 +149,23 @@ class Crawler(BaseComponent):
             logger.info(f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data.")
         else:
             logger.info(f"Fetching from {urls} to `{output_dir}`")
-            sub_links: Dict[str, List] = {}
-            # don't go beyond the initial list of urls
-            if crawler_depth == 0:
+            # Start by writing out the initial list of urls
+            if filter_urls:
+                pattern = re.compile("|".join(filter_urls))
+                for url in urls:
+                    if pattern.search(url):
+                        file_paths += self._write_to_files([url], output_dir=output_dir)
+            else:
                 file_paths += self._write_to_files(urls, output_dir=output_dir)
 
-            # follow one level of sublinks
-            elif crawler_depth == 1:
+            # follow one level of sublinks if requested
+            if crawler_depth == 1:
+                sub_links: Dict[str, List] = {}
                 for url_ in urls:
-                    existed_links: List = list(sum(list(sub_links.values()), []))
+                    already_found_links: List = list(sum(list(sub_links.values()), []))
                     sub_links[url_] = list(
                         self._extract_sublinks_from_url(
-                            base_url=url_, filter_urls=filter_urls, existed_links=existed_links
+                            base_url=url_, filter_urls=filter_urls, already_found_links=already_found_links
                         )
                     )
                 for url, extracted_sublink in sub_links.items():
@@ -277,24 +282,23 @@ class Crawler(BaseComponent):
         return base_url_.path == sub_link_.path and base_url_.netloc == sub_link_.netloc
 
     def _extract_sublinks_from_url(
-        self, base_url: str, filter_urls: Optional[List] = None, existed_links: List = None
+        self, base_url: str, filter_urls: Optional[List] = None, already_found_links: List = None
     ) -> set:
+        if filter_urls:
+            filter_pattern = re.compile("|".join(filter_urls))
+
         self.driver.get(base_url)
         a_elements = self.driver.find_elements_by_xpath("//a[@href]")
         sub_links = set()
-        if not (existed_links and base_url in existed_links):
-            if filter_urls:
-                if re.compile("|".join(filter_urls)).search(base_url):
-                    sub_links.add(base_url)
 
         for i in a_elements:
             sub_link = i.get_attribute("href")
-            if not (existed_links and sub_link in existed_links):
+            if not (already_found_links and sub_link in already_found_links):
                 if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
                     not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
                 ):
                     if filter_urls:
-                        if re.compile("|".join(filter_urls)).search(sub_link):
+                        if filter_pattern.search(sub_link):
                             sub_links.add(sub_link)
                     else:
                         sub_links.add(sub_link)

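This second hunk also drops the block that re-added the base URL inside `_extract_sublinks_from_url` (the starting URLs are now written out up front) and compiles the OR-joined filter regex once instead of once per link. The filtering it ends up doing is roughly the following standalone sketch (link values are made up; the internal-URL and in-page-navigation checks are omitted):

    import re

    filter_urls = ["docs/"]                                  # hypothetical filter patterns
    already_found_links = ["https://example.com/docs/old"]   # hypothetical, already crawled

    # Compile the OR-joined filter regex a single time, not once per <a href="..."> element.
    filter_pattern = re.compile("|".join(filter_urls)) if filter_urls else None

    candidate_links = [
        "https://example.com/docs/new",   # kept: not seen before and matches the filter
        "https://example.com/docs/old",   # skipped: already found
        "https://example.com/about",      # skipped: matches no filter pattern
    ]

    sub_links = set()
    for sub_link in candidate_links:
        if already_found_links and sub_link in already_found_links:
            continue
        if filter_pattern is None or filter_pattern.search(sub_link):
            sub_links.add(sub_link)

    assert sub_links == {"https://example.com/docs/new"}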

@@ -1,114 +1,51 @@
-import os
 import json
-import shutil
-import tempfile
 from pathlib import Path
-from re import search
 import pytest
 from haystack.nodes.connector import Crawler
 from haystack.schema import Document
-def test_crawler_url_none_exception(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    with pytest.raises(ValueError):
-        Crawler(tmp_dir).crawl()
+from ..conftest import SAMPLES_PATH
-def test_crawler_depth(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/overview/get-started"]
-    crawler = Crawler(output_dir=tmp_dir)
-    doc_path = crawler.crawl(urls=_url, crawler_depth=0)
-    assert len(doc_path) == 1
-    _urls = [
-        "https://haystack.deepset.ai/overview/v1.2.0/get-started",
-        "https://haystack.deepset.ai/overview/v1.1.0/get-started",
-        "https://haystack.deepset.ai/overview/v1.0.0/get-started",
-    ]
-    doc_path = crawler.crawl(urls=_urls, crawler_depth=0)
-    assert len(doc_path) == 3
-    doc_path = crawler.crawl(urls=_url, crawler_depth=1)
-    assert len(doc_path) > 1
-    for json_file in doc_path:
-        assert isinstance(json_file, Path)
-        with open(json_file.absolute(), "r") as read_file:
-            data = json.load(read_file)
-            assert "content" in data
-            assert "meta" in data
-            assert isinstance(data["content"], str)
-            assert len(data["content"].split()) > 2
+@pytest.fixture(scope="session")
+def test_url():
+    return f"file://{SAMPLES_PATH.absolute()}/crawler"
-def test_crawler_filter_urls(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/overview/v1.2.0/"]
+def content_match(crawler: Crawler, url: str, crawled_page: Path):
+    """
+    :param crawler: the tested Crawler object
+    :param url: the URL of the page expected in the crawler's output
+    :param crawled_page: the output of Crawler (one element of the paths list)
+    """
+    crawler.driver.get(url)
+    body = crawler.driver.find_element_by_tag_name("body")
+    expected_crawled_content = body.text
+    with open(crawled_page, "r") as crawled_file:
+        page_data = json.load(crawled_file)
+        return page_data["content"] == expected_crawled_content
+#
+# Integration
+#
+@pytest.mark.integration
+def test_crawler(tmp_path):
+    tmp_dir = tmp_path
+    url = ["https://haystack.deepset.ai/"]
     crawler = Crawler(output_dir=tmp_dir)
-    doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v1\.3\.0\/"])
-    assert len(doc_path) == 0
-    doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v1\.2\.0\/"])
-    assert len(doc_path) > 0
-    doc_path = crawler.crawl(urls=_url, filter_urls=["google\.com"])
-    assert len(doc_path) == 0
-def test_crawler_content(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    partial_content_match: list = [
-        {
-            "url": "https://haystack.deepset.ai/overview/v1.1.0/intro",
-            "partial_content": [
-                "Haystack is an open-source framework ",
-                "for building search systems that work intelligently ",
-                "over large document collections.",
-                "Recent advances in NLP have enabled the application of ",
-                "question answering, retrieval and summarization ",
-                "to real world settings and Haystack is designed to be ",
-                "the bridge between research and industry.",
-            ],
-        },
-        {
-            "url": "https://haystack.deepset.ai/overview/v1.1.0/use-cases",
-            "partial_content": [
-                "Expect to see results that highlight",
-                "the very sentence that contains the answer to your question.",
-                "Thanks to the power of Transformer based language models,",
-                "results are chosen based on compatibility in meaning",
-                "rather than lexical overlap.",
-            ],
-        },
-    ]
-    crawler = Crawler(output_dir=tmp_dir)
-    for _dict in partial_content_match:
-        url: str = _dict["url"]
-        partial_content: list = _dict["partial_content"]
-        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
-        assert len(doc_path) == 1
-        for json_file in doc_path:
-            assert isinstance(json_file, Path)
-            with open(json_file.absolute(), "r") as read_file:
-                content = json.load(read_file)
-                assert isinstance(content["content"], str)
-                for partial_line in partial_content:
-                    assert search(partial_line, content["content"])
-                    assert partial_line in content["content"]
-def test_crawler_return_document(tmp_path):
-    tmp_dir = tmp_path / "crawled_files"
-    _url = ["https://haystack.deepset.ai/docs/v1.0.0/intromd"]
-    crawler = Crawler(output_dir=tmp_dir)
-    docs_path = crawler.crawl(urls=_url, crawler_depth=1)
-    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
+    docs_path = crawler.crawl(urls=url, crawler_depth=0)
+    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
     documents = results["documents"]
     for json_file, document in zip(docs_path, documents):
@@ -119,3 +56,78 @@ def test_crawler_return_document(tmp_path):
             file_content = json.load(read_file)
             assert file_content["meta"] == document.meta
             assert file_content["content"] == document.content
+#
+# Unit tests
+#
+def test_crawler_url_none_exception(tmp_path):
+    crawler = Crawler(tmp_path)
+    with pytest.raises(ValueError):
+        crawler.crawl()
+def test_crawler_depth_0_single_url(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+def test_crawler_depth_0_many_urls(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    _urls = [test_url + "/index.html", test_url + "/page1.html"]
+    paths = crawler.crawl(urls=_urls, crawler_depth=0)
+    assert len(paths) == 2
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    assert content_match(crawler, test_url + "/page1.html", paths[1])
+def test_crawler_depth_1_single_url(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
+    assert len(paths) == 3
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    assert content_match(crawler, test_url + "/page1.html", paths[1])
+    assert content_match(crawler, test_url + "/page2.html", paths[2])
+def test_crawler_output_file_structure(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    with open(paths[0].absolute(), "r") as doc_file:
+        data = json.load(doc_file)
+        assert "content" in data
+        assert "meta" in data
+        assert isinstance(data["content"], str)
+        assert len(data["content"].split()) > 2
+def test_crawler_filter_urls(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/index.html", paths[0])
+    # Note: filter_urls can exclude pages listed in `urls` as well
+    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
+    assert len(paths) == 1
+    assert content_match(crawler, test_url + "/page1.html", paths[0])
+    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)
+def test_crawler_return_document(test_url, tmp_path):
+    crawler = Crawler(output_dir=tmp_path)
+    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
+    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
+    for path, document in zip(paths["paths"], documents["documents"]):
+        with open(path.absolute(), "r") as doc_file:
+            file_content = json.load(doc_file)
+            assert file_content["meta"] == document.meta
+            assert file_content["content"] == document.content

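The rewritten unit tests crawl static pages from disk instead of the live website, which is what makes them fast and deterministic. A rough sketch of what the `test_url` fixture resolves to (the samples path here is an illustrative stand-in for `SAMPLES_PATH` imported from the suite's conftest):

    from pathlib import Path

    from haystack.nodes.connector import Crawler

    # Illustrative stand-in for SAMPLES_PATH from the test suite's conftest.
    SAMPLES_PATH = Path("test") / "samples"

    # The fixture turns the samples directory into a file:// base URL, so the
    # Selenium-backed crawler fetches the static pages without any network access.
    test_url = f"file://{SAMPLES_PATH.absolute()}/crawler"

    crawler = Crawler(output_dir=Path("crawled_files"))  # illustrative output directory
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
    assert len(paths) == 1
    # content_match() from the diff above then reloads the same URL in the crawler's
    # driver and compares the page body text with the JSON file written to disk.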

@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Test Home Page for Crawler</title>
+</head>
+<body>
+<p>home page content</p>
+<a href="page1.html">link to page 1</a>
+<a href="page2.html">link to page 2</a>
+</body>
+</html>


@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Test Page 1 for Crawler</title>
+</head>
+<body>
+<p>page 1 content</p>
+<a href="index.html">link to home</a>
+<a href="page2.html">link to page 2</a>
+</body>
+</html>


@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Test Page 2 for Crawler</title>
+</head>
+<body>
+<p>page 2 content</p>
+<a href="index.html">link to home</a>
+<a href="page1.html">link to page 1</a>
+</body>
+</html>
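
Each of the three sample pages links to the other two, so the link graph is fully connected and the depth assertions in the unit tests are deterministic: crawling index.html at depth 0 writes one file, and at depth 1 it also follows the two sub-links. A hypothetical run against these fixtures (the paths are illustrative):

    from pathlib import Path

    from haystack.nodes.connector import Crawler

    base_url = f"file://{Path('test/samples/crawler').absolute()}"  # illustrative location
    crawler = Crawler(output_dir=Path("crawled_files"))

    # Depth 0: only the starting page (index.html) is written out.
    assert len(crawler.crawl(urls=[base_url + "/index.html"], crawler_depth=0)) == 1

    # Depth 1: index.html plus the two pages it links to (page1.html and page2.html).
    assert len(crawler.crawl(urls=[base_url + "/index.html"], crawler_depth=1)) == 3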