haystack/test/test_connector.py

import json
from pathlib import Path
from re import search

import pytest
from haystack.nodes.connector import Crawler
from haystack.schema import Document


def test_crawler_url_none_exception(tmp_path):
    """Expect ``ValueError`` when a crawl is started without any URLs.

    The crawler is built from a directory only and ``crawl()`` is called with
    no arguments. Both steps stay inside the ``pytest.raises`` block, so the
    error may come from either of them.
    """
    target_dir = tmp_path / "crawled_files"
    with pytest.raises(ValueError):
        crawler = Crawler(target_dir)
        crawler.crawl()


def test_crawler_depth(tmp_path):
    """Check the number and shape of files returned for different depths.

    With ``crawler_depth=0`` the test expects one file per requested URL.
    With ``crawler_depth=1`` it expects more than one file for a single start
    URL, each being a JSON file with a textual ``content`` entry and a
    ``meta`` entry.
    """
    output_dir = tmp_path / "crawled_files"
    start_page = ["https://haystack.deepset.ai/overview/get-started"]
    versioned_pages = [
        "https://haystack.deepset.ai/overview/v0.8.0/get-started",
        "https://haystack.deepset.ai/overview/v0.7.0/get-started",
        "https://haystack.deepset.ai/overview/v0.6.0/get-started",
    ]
    crawler = Crawler(output_dir=output_dir)

    # Depth 0 with a single URL: exactly one file.
    single_result = crawler.crawl(urls=start_page, crawler_depth=0)
    assert len(single_result) == 1

    # Depth 0 with three URLs: exactly three files.
    multi_result = crawler.crawl(urls=versioned_pages, crawler_depth=0)
    assert len(multi_result) == 3

    # Depth 1 from a single URL: more than one file is expected.
    deep_result = crawler.crawl(urls=start_page, crawler_depth=1)
    assert len(deep_result) > 1

    for crawled_file in deep_result:
        assert isinstance(crawled_file, Path)
        with open(crawled_file.absolute(), "r") as handle:
            payload = json.load(handle)

        assert "content" in payload
        assert "meta" in payload
        text = payload["content"]
        assert isinstance(text, str)
        # Require at least three whitespace-separated tokens of content.
        assert len(text.split()) > 2


def test_crawler_filter_urls(tmp_path):
    """Check that ``filter_urls`` controls which pages produce output files.

    The same start URL is crawled three times with different filter patterns:
    a pattern for another docs version and a pattern for an unrelated domain
    are both expected to yield no files, while a pattern matching the start
    URL itself is expected to yield at least one.

    The patterns are raw strings so that ``\\.`` and ``\\/`` reach the crawler
    unchanged without relying on Python's lenient (and deprecated) handling of
    unrecognised escape sequences in ordinary string literals.
    """
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/v0.8.0/"]

    crawler = Crawler(output_dir=tmp_dir)

    # Pattern targets v0.9.0, which the v0.8.0 start URL does not match.
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v0\.9\.0\/"])
    assert len(doc_path) == 0

    # Pattern targets the same version as the start URL.
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v0\.8\.0\/"])
    assert len(doc_path) > 0

    # Pattern targets a completely different domain.
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"google\.com"])
    assert len(doc_path) == 0


def test_crawler_content(tmp_path):
    """Check that crawled pages contain a set of expected text snippets.

    Each entry pairs a URL with literal snippets that must appear in the
    ``content`` field of the single JSON file produced by a depth-0 crawl of
    that URL.
    """
    tmp_dir = tmp_path / "crawled_files"

    partial_content_match: list = [
        {
            "url": "https://haystack.deepset.ai/overview/v0.7.0/intro",
            "partial_content": [
                "What is Haystack",
                "Utilize all transformer based models",
                "a Retriever-Reader pipeline in order",
                "Passing on only a small candidate set",
                "fast indexing and querying",
                "Fine-tune models to your own domain",
                "smoothly switch when new ones get published",
            ],
        },
        {
            "url": "https://haystack.deepset.ai/overview/v0.7.0/use-cases",
            "partial_content": [
                "Semantic Search System",
                "Store your documents in the database of ",
                "results are chosen based on compatibility in",
                "Apply a set of standard questions to each document",
                "Return a NO_ANSWER if a given document",
                "like what is the revenue forecast for 2020?",
                "overview of academic papers and internal business",
            ],
        },
    ]

    crawler = Crawler(output_dir=tmp_dir)
    for _dict in partial_content_match:
        url: str = _dict["url"]
        partial_content: list = _dict["partial_content"]

        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
        assert len(doc_path) == 1

        for json_file in doc_path:
            assert isinstance(json_file, Path)
            with open(json_file.absolute(), "r") as read_file:
                content = json.load(read_file)

            assert isinstance(content["content"], str)
            for partial_line in partial_content:
                # Plain substring check only. The snippets are literal text,
                # not regular expressions: feeding them to ``re.search``
                # unescaped would treat characters such as the ``?`` in
                # "2020?" as regex operators and accept text that does not
                # actually contain the snippet.
                assert partial_line in content["content"]


def test_crawler_return_document(tmp_path):
    """Check that ``run(return_documents=True)`` agrees with the files from ``crawl``.

    The same URL is crawled twice at depth 1: once through ``crawl`` (which
    yields file paths) and once through ``run`` with ``return_documents=True``
    (whose first return value holds the documents under ``"documents"``).
    Each file's ``meta`` and ``content`` must equal those of the document at
    the same position.
    """
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/docs/v0.5.0/intromd"]

    crawler = Crawler(output_dir=tmp_dir)
    docs_path = crawler.crawl(urls=_url, crawler_depth=1)
    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
    documents = results["documents"]

    # ``zip`` stops at the shorter input, so a count mismatch would otherwise
    # leave trailing files or documents unchecked without failing the test.
    assert len(docs_path) == len(documents)

    for json_file, document in zip(docs_path, documents):
        assert isinstance(json_file, Path)
        assert isinstance(document, Document)

        with open(json_file.absolute(), "r") as read_file:
            file_content = json.load(read_file)

        assert file_content["meta"] == document.meta
        assert file_content["content"] == document.content
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00			`import json`
			`from pathlib import Path`
			`from re import search`

			`import pytest`
Improve dependency management (#1994) * Fist attempt at using setup.cfg for dependency management * Trying the new package on the CI and in Docker too * Add composite extras_require * Add the safe_import function for document store imports and add some try-catch statements on rest_api and ui imports * Fix bug on class import and rephrase error message * Introduce typing for optional modules and add type: ignore in sparse.py * Include importlib_metadata backport for py3.7 * Add colab group to extra_requires * Fix pillow version * Fix grpcio * Separate out the crawler as another extra * Make paths relative in rest_api and ui * Update the test matrix in the CI * Add try catch statements around the optional imports too to account for direct imports * Never mix direct deps with self-references and add ES deps to the base install * Refactor several paths in tests to make them insensitive to the execution path * Include tstadel review and re-introduce Milvus1 in the tests suite, to fix * Wrap pdf conversion utils into safe_import * Update some tutorials and rever Milvus1 as default for now, see #2067 * Fix mypy config Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-01-26 18:12:55 +01:00			`from haystack.nodes.connector import Crawler`
Change return types of indexing pipeline nodes (#2342) * Change return types of file converters * Change return types of preprocessor * Change return types of crawler * Adapt utils to functions to new return types * Adapt __init__.py to new method names * Prevent circular imports * Update Documentation & Code Style * Let DocStores' run method accept Documents * Adapt tests to new return types * Update Documentation & Code Style * Put "# type: ignore" to right place * Remove id_hash_keys property from Document primitive * Update Documentation & Code Style * Adapt tests to new return types and missing id_hash_keys property * Fix mypy * Fix mypy * Adapt PDFToTextOCRConverter * Remove id_hash_keys from RestAPI tests * Update Documentation & Code Style * Rename tests * Remove redundant setting of content_type="text" * Add DeprecationWarning * Add id_hash_keys to elasticsearch_index_to_document_store * Change document type from dict to Docuemnt in PreProcessor test * Fix file path in Tutorial 5 * Remove added output in Tutorial 5 * Update Documentation & Code Style * Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http * Adapt tutorials to new return types * Adapt tutorial 14 to new return types * Update Documentation & Code Style * Change assertions to HaystackErrors * Import HaystackError correctly Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-03-29 13:53:35 +02:00			`from haystack.schema import Document`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00

			`def test_crawler_url_none_exception(tmp_path):`
			`tmp_dir = tmp_path / "crawled_files"`
			`with pytest.raises(ValueError):`
			`Crawler(tmp_dir).crawl()`


			`def test_crawler_depth(tmp_path):`
			`tmp_dir = tmp_path / "crawled_files"`
Editing docs read.me for new docs website workflow (#1372) * editing docs read.me for new docs website workflow * added new links to docs 2021-08-30 14:59:40 +02:00			`_url = ["https://haystack.deepset.ai/overview/get-started"]`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00			`crawler = Crawler(output_dir=tmp_dir)`
			`doc_path = crawler.crawl(urls=_url, crawler_depth=0)`
			`assert len(doc_path) == 1`

			`_urls = [`
Editing docs read.me for new docs website workflow (#1372) * editing docs read.me for new docs website workflow * added new links to docs 2021-08-30 14:59:40 +02:00			`"https://haystack.deepset.ai/overview/v0.8.0/get-started",`
			`"https://haystack.deepset.ai/overview/v0.7.0/get-started",`
			`"https://haystack.deepset.ai/overview/v0.6.0/get-started",`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00			`]`
			`doc_path = crawler.crawl(urls=_urls, crawler_depth=0)`
			`assert len(doc_path) == 3`

			`doc_path = crawler.crawl(urls=_url, crawler_depth=1)`
			`assert len(doc_path) > 1`

			`for json_file in doc_path:`
			`assert isinstance(json_file, Path)`
			`with open(json_file.absolute(), "r") as read_file:`
			`data = json.load(read_file)`
Apply black formatting (#2115) * Testing black on ui/ * Applying black on docstores * Add latest docstring and tutorial changes * Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too * Remove comments * Relax constraints on pydoc-markdown * Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade * Fix a couple of bugs * Add a type: ignore that was missing somehow * Give path to black * Apply Black * Apply Black * Relocate a couple of type: ignore * Update documentation * Make Linux CI run after applying Black * Triggering Black * Apply Black * Remove dependency, does not work well * Remove manually double trailing commas * Update documentation Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-02-03 13:43:18 +01:00			`assert "content" in data`
			`assert "meta" in data`
			`assert isinstance(data["content"], str)`
			`assert len(data["content"].split()) > 2`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00

			`def test_crawler_filter_urls(tmp_path):`
			`tmp_dir = tmp_path / "crawled_files"`
Editing docs read.me for new docs website workflow (#1372) * editing docs read.me for new docs website workflow * added new links to docs 2021-08-30 14:59:40 +02:00			`_url = ["https://haystack.deepset.ai/overview/v0.8.0/"]`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00
			`crawler = Crawler(output_dir=tmp_dir)`
Editing docs read.me for new docs website workflow (#1372) * editing docs read.me for new docs website workflow * added new links to docs 2021-08-30 14:59:40 +02:00			`doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v0\.9\.0\/"])`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00			`assert len(doc_path) == 0`

Editing docs read.me for new docs website workflow (#1372) * editing docs read.me for new docs website workflow * added new links to docs 2021-08-30 14:59:40 +02:00			`doc_path = crawler.crawl(urls=_url, filter_urls=["haystack\.deepset\.ai\/overview\/v0\.8\.0\/"])`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00			`assert len(doc_path) > 0`

			`doc_path = crawler.crawl(urls=_url, filter_urls=["google\.com"])`
			`assert len(doc_path) == 0`


			`def test_crawler_content(tmp_path):`
			`tmp_dir = tmp_path / "crawled_files"`

			`partial_content_match: list = [`
Apply black formatting (#2115) * Testing black on ui/ * Applying black on docstores * Add latest docstring and tutorial changes * Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too * Remove comments * Relax constraints on pydoc-markdown * Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade * Fix a couple of bugs * Add a type: ignore that was missing somehow * Give path to black * Apply Black * Apply Black * Relocate a couple of type: ignore * Update documentation * Make Linux CI run after applying Black * Triggering Black * Apply Black * Remove dependency, does not work well * Remove manually double trailing commas * Update documentation Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-02-03 13:43:18 +01:00			`{`
			`"url": "https://haystack.deepset.ai/overview/v0.7.0/intro",`
			`"partial_content": [`
			`"What is Haystack",`
			`"Utilize all transformer based models",`
			`"a Retriever-Reader pipeline in order",`
			`"Passing on only a small candidate set",`
			`"fast indexing and querying",`
			`"Fine-tune models to your own domain",`
			`"smoothly switch when new ones get published",`
			`],`
			`},`
			`{`
			`"url": "https://haystack.deepset.ai/overview/v0.7.0/use-cases",`
			`"partial_content": [`
			`"Semantic Search System",`
			`"Store your documents in the database of ",`
			`"results are chosen based on compatibility in",`
			`"Apply a set of standard questions to each document",`
			`"Return a NO_ANSWER if a given document",`
			`"like what is the revenue forecast for 2020?",`
			`"overview of academic papers and internal business",`
			`],`
			`},`
			`]`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00
			`crawler = Crawler(output_dir=tmp_dir)`
			`for _dict in partial_content_match:`
Apply black formatting (#2115) * Testing black on ui/ * Applying black on docstores * Add latest docstring and tutorial changes * Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too * Remove comments * Relax constraints on pydoc-markdown * Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade * Fix a couple of bugs * Add a type: ignore that was missing somehow * Give path to black * Apply Black * Apply Black * Relocate a couple of type: ignore * Update documentation * Make Linux CI run after applying Black * Triggering Black * Apply Black * Remove dependency, does not work well * Remove manually double trailing commas * Update documentation Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-02-03 13:43:18 +01:00			`url: str = _dict["url"]`
			`partial_content: list = _dict["partial_content"]`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00
			`doc_path = crawler.crawl(urls=[url], crawler_depth=0)`
			`assert len(doc_path) == 1`

			`for json_file in doc_path:`
			`assert isinstance(json_file, Path)`
			`with open(json_file.absolute(), "r") as read_file:`
			`content = json.load(read_file)`
Apply black formatting (#2115) * Testing black on ui/ * Applying black on docstores * Add latest docstring and tutorial changes * Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too * Remove comments * Relax constraints on pydoc-markdown * Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade * Fix a couple of bugs * Add a type: ignore that was missing somehow * Give path to black * Apply Black * Apply Black * Relocate a couple of type: ignore * Update documentation * Make Linux CI run after applying Black * Triggering Black * Apply Black * Remove dependency, does not work well * Remove manually double trailing commas * Update documentation Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-02-03 13:43:18 +01:00			`assert isinstance(content["content"], str)`
Add tests for Crawler (#1339) 2021-08-18 17:05:44 +05:00			`for partial_line in partial_content:`
Apply black formatting (#2115) * Testing black on ui/ * Applying black on docstores * Add latest docstring and tutorial changes * Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too * Remove comments * Relax constraints on pydoc-markdown * Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade * Fix a couple of bugs * Add a type: ignore that was missing somehow * Give path to black * Apply Black * Apply Black * Relocate a couple of type: ignore * Update documentation * Make Linux CI run after applying Black * Triggering Black * Apply Black * Remove dependency, does not work well * Remove manually double trailing commas * Update documentation Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-02-03 13:43:18 +01:00			`assert search(partial_line, content["content"])`
			`assert partial_line in content["content"]`
Add Crawler support for indexing pipeline (#1360) 2021-08-24 17:25:22 +05:00

			`def test_crawler_return_document(tmp_path):`
			`tmp_dir = tmp_path / "crawled_files"`
			`_url = ["https://haystack.deepset.ai/docs/v0.5.0/intromd"]`

			`crawler = Crawler(output_dir=tmp_dir)`
			`docs_path = crawler.crawl(urls=_url, crawler_depth=1)`
			`results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)`
Apply black formatting (#2115) * Testing black on ui/ * Applying black on docstores * Add latest docstring and tutorial changes * Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too * Remove comments * Relax constraints on pydoc-markdown * Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade * Fix a couple of bugs * Add a type: ignore that was missing somehow * Give path to black * Apply Black * Apply Black * Relocate a couple of type: ignore * Update documentation * Make Linux CI run after applying Black * Triggering Black * Apply Black * Remove dependency, does not work well * Remove manually double trailing commas * Update documentation Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-02-03 13:43:18 +01:00			`documents = results["documents"]`
Add Crawler support for indexing pipeline (#1360) 2021-08-24 17:25:22 +05:00
			`for json_file, document in zip(docs_path, documents):`
			`assert isinstance(json_file, Path)`
Change return types of indexing pipeline nodes (#2342) * Change return types of file converters * Change return types of preprocessor * Change return types of crawler * Adapt utils to functions to new return types * Adapt __init__.py to new method names * Prevent circular imports * Update Documentation & Code Style * Let DocStores' run method accept Documents * Adapt tests to new return types * Update Documentation & Code Style * Put "# type: ignore" to right place * Remove id_hash_keys property from Document primitive * Update Documentation & Code Style * Adapt tests to new return types and missing id_hash_keys property * Fix mypy * Fix mypy * Adapt PDFToTextOCRConverter * Remove id_hash_keys from RestAPI tests * Update Documentation & Code Style * Rename tests * Remove redundant setting of content_type="text" * Add DeprecationWarning * Add id_hash_keys to elasticsearch_index_to_document_store * Change document type from dict to Docuemnt in PreProcessor test * Fix file path in Tutorial 5 * Remove added output in Tutorial 5 * Update Documentation & Code Style * Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http * Adapt tutorials to new return types * Adapt tutorial 14 to new return types * Update Documentation & Code Style * Change assertions to HaystackErrors * Import HaystackError correctly Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-03-29 13:53:35 +02:00			`assert isinstance(document, Document)`
Add Crawler support for indexing pipeline (#1360) 2021-08-24 17:25:22 +05:00
			`with open(json_file.absolute(), "r") as read_file:`
			`file_content = json.load(read_file)`
Change return types of indexing pipeline nodes (#2342) * Change return types of file converters * Change return types of preprocessor * Change return types of crawler * Adapt utils to functions to new return types * Adapt __init__.py to new method names * Prevent circular imports * Update Documentation & Code Style * Let DocStores' run method accept Documents * Adapt tests to new return types * Update Documentation & Code Style * Put "# type: ignore" to right place * Remove id_hash_keys property from Document primitive * Update Documentation & Code Style * Adapt tests to new return types and missing id_hash_keys property * Fix mypy * Fix mypy * Adapt PDFToTextOCRConverter * Remove id_hash_keys from RestAPI tests * Update Documentation & Code Style * Rename tests * Remove redundant setting of content_type="text" * Add DeprecationWarning * Add id_hash_keys to elasticsearch_index_to_document_store * Change document type from dict to Docuemnt in PreProcessor test * Fix file path in Tutorial 5 * Remove added output in Tutorial 5 * Update Documentation & Code Style * Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http * Adapt tutorials to new return types * Adapt tutorial 14 to new return types * Update Documentation & Code Style * Change assertions to HaystackErrors * Import HaystackError correctly Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-03-29 13:53:35 +02:00			`assert file_content["meta"] == document.meta`
			`assert file_content["content"] == document.content`