haystack/test/nodes/test_connector.py

import json
from pathlib import Path

import pytest

from haystack.nodes.connector import Crawler
from haystack.schema import Document

from ..conftest import SAMPLES_PATH


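# The unit tests below crawl local sample pages through file:// URLs, so they
# need no network access; only the integration test hits a live website.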
@pytest.fixture(scope="session")
def test_url():
    return f"file://{SAMPLES_PATH.absolute()}/crawler"


def content_match(crawler: Crawler, url: str, crawled_page: Path):
"""
:param crawler: the tested Crawler object
:param base_url: the URL from test_url fixture
:param page_name: the expected page
:param crawled_page: the output of Crawler (one element of the paths list)
"""
    crawler.driver.get(url)
    body = crawler.driver.find_element_by_tag_name("body")
    expected_crawled_content = body.text

    with open(crawled_page, "r") as crawled_file:
        page_data = json.load(crawled_file)
        return page_data["content"] == expected_crawled_content


#
# Integration
#

@pytest.mark.integration
def test_crawler(tmp_path):
    url = ["https://haystack.deepset.ai/"]

    crawler = Crawler(output_dir=tmp_path)
    docs_path = crawler.crawl(urls=url, crawler_depth=0)
    results, _ = crawler.run(urls=url, crawler_depth=0, return_documents=True)
    documents = results["documents"]
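    # crawl() writes one JSON file per crawled page, while run() with
    # return_documents=True yields Document objects; both views must agree.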
    for json_file, document in zip(docs_path, documents):
        assert isinstance(json_file, Path)
        assert isinstance(document, Document)
        with open(json_file.absolute(), "r") as read_file:
            file_content = json.load(read_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content


#
# Unit tests
#

def test_crawler_url_none_exception(tmp_path):
    crawler = Crawler(tmp_path)
    with pytest.raises(ValueError):
        crawler.crawl()


def test_crawler_depth_0_single_url(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/index.html", paths[0])


def test_crawler_depth_0_many_urls(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    _urls = [test_url + "/index.html", test_url + "/page1.html"]
    paths = crawler.crawl(urls=_urls, crawler_depth=0)
    assert len(paths) == 2
    assert content_match(crawler, test_url + "/index.html", paths[0])
    assert content_match(crawler, test_url + "/page1.html", paths[1])


def test_crawler_depth_1_single_url(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
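    # index.html links to page1.html and page2.html, so depth 1 yields three pages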
    assert len(paths) == 3
    assert content_match(crawler, test_url + "/index.html", paths[0])
    assert content_match(crawler, test_url + "/page1.html", paths[1])
    assert content_match(crawler, test_url + "/page2.html", paths[2])


def test_crawler_output_file_structure(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    paths = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
    assert content_match(crawler, test_url + "/index.html", paths[0])

    with open(paths[0].absolute(), "r") as doc_file:
        data = json.load(doc_file)
        assert "content" in data
        assert "meta" in data
        assert isinstance(data["content"], str)
        assert len(data["content"].split()) > 2


def test_crawler_filter_urls(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)

    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["index"], crawler_depth=1)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/index.html", paths[0])

    # Note: filter_urls can exclude pages listed in `urls` as well
    paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
    assert len(paths) == 1
    assert content_match(crawler, test_url + "/page1.html", paths[0])
    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=[r"google\.com"], crawler_depth=1)


def test_crawler_return_document(test_url, tmp_path):
    crawler = Crawler(output_dir=tmp_path)
    documents, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=True)
    paths, _ = crawler.run(urls=[test_url + "/index.html"], crawler_depth=0, return_documents=False)
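    # The second element of run()'s return value is ignored here; the results
    # dict carries "documents" with return_documents=True and "paths" otherwise.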
    for path, document in zip(paths["paths"], documents["documents"]):
        with open(path.absolute(), "r") as doc_file:
            file_content = json.load(doc_file)
            assert file_content["meta"] == document.meta
            assert file_content["content"] == document.content