# haystack/test/nodes/test_web_search.py

import os
import unittest
from unittest.mock import MagicMock, patch

import pytest

from haystack.nodes.search_engine import WebSearch
from haystack.schema import Document

# Optional-dependency probe: remember whether `googleapiclient` can be imported
# so the GoogleAPI tests in this module can skip themselves when it is missing.
try:
    import googleapiclient

    googleapi_installed = True
except ImportError:
    googleapi_installed = False


@pytest.mark.skipif(
    not os.environ.get("SERPERDEV_API_KEY"),
    reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",
)
@pytest.mark.integration
def test_web_search():
    """Run a WebSearch query with the SERPERDEV_API_KEY credential and check the output.

    Skipped when the SERPERDEV_API_KEY environment variable is unset or empty.
    """
    serper_key = os.environ.get("SERPERDEV_API_KEY")
    searcher = WebSearch(api_key=serper_key)

    # run() returns a pair; only the first element (the output dict) is inspected.
    output, _ = searcher.run(query="Who is the boyfriend of Olivia Wilde?")

    assert "documents" in output
    found = output["documents"]
    assert len(found) > 0
    assert isinstance(found[0], Document)


@pytest.mark.skipif(
    not os.environ.get("SERPERDEV_API_KEY", None),
    reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",
)
@pytest.mark.integration
def test_web_search_with_site_keyword():
    """Run a WebSearch query restricted with `site:` operators and check the result links.

    Every returned document's `meta["link"]` must contain "nasa" or "lifewire".
    Skipped when the SERPERDEV_API_KEY environment variable is unset or empty.
    """
    ws = WebSearch(api_key=os.environ.get("SERPERDEV_API_KEY", None))
    result, _ = ws.run(query='site:lifewire.com OR site:nasa.gov "electric vehicles"')
    assert "documents" in result
    assert len(result["documents"]) > 0
    assert isinstance(result["documents"][0], Document)

    # Gather the offending links (rather than reducing to a single boolean) so
    # that a failure shows exactly which results came from an unexpected site.
    off_site_links = [
        doc.meta["link"]
        for doc in result["documents"]
        if "nasa" not in doc.meta["link"] and "lifewire" not in doc.meta["link"]
    ]
    assert (
        not off_site_links
    ), f"Some documents are not from the specified sites lifewire.com or nasa.gov: {off_site_links}"


@pytest.mark.unit
def test_web_search_with_google_api_provider():
    """Build a WebSearch with the "GoogleAPI" provider and call its patched run().

    NOTE(review): WebSearch.run itself is replaced by a mock in this test, so the
    content checked at the end is the canned return value configured on that mock.
    """
    if not googleapi_installed:
        pytest.skip("google-api-python-client is not installed, skipping test.")

    dummy_key = "dummy_api_key"
    dummy_engine_id = "dummy_search_engine_id"
    question = "The founder of Python"
    canned_output = ([{"content": "Guido van Rossum"}], None)

    with patch("haystack.nodes.search_engine.WebSearch.run") as patched_run:
        patched_run.return_value = canned_output
        searcher = WebSearch(
            api_key=dummy_key,
            search_engine_provider="GoogleAPI",
            search_engine_kwargs={"engine_id": dummy_engine_id},
        )
        answers, _ = searcher.run(query=question)

        # The mock must have been invoked exactly once, with only the query keyword.
        patched_run.assert_called_once_with(query=question)

        first_answer = answers[0]
        assert "guido" in first_answer["content"].lower()


@pytest.mark.unit
def test_web_search_with_google_api_client():
    """Check that a "GoogleAPI" WebSearch drives the mocked Google client and returns documents.

    `googleapiclient.discovery.build` is patched, so the client used during the
    test is a chain of MagicMock objects: build() -> service.cse() -> cse.list()
    -> list.execute(). The test verifies the calls made along that chain and that
    run() returns a non-empty "documents" entry built from the mocked response.
    """
    if not googleapi_installed:
        pytest.skip("google-api-python-client is not installed, skipping test.")

    GOOGLE_API_KEY = "dummy_api_key"
    SEARCH_ENGINE_ID = "dummy_search_engine_id"
    query = "The founder of Python"

    with patch("googleapiclient.discovery.build") as mock_build:
        mock_service = MagicMock()
        mock_cse = MagicMock()
        mock_list = MagicMock()

        # Wire the mock chain so execute() yields a single canned search hit.
        mock_build.return_value = mock_service
        mock_service.cse.return_value = mock_cse
        mock_cse.list.return_value = mock_list
        mock_list.execute.return_value = {
            "items": [
                {
                    "title": "Guido van Rossum",
                    "snippet": "The founder of Python programming language.",
                    "link": "https://example.com/guido",
                }
            ]
        }

        ws = WebSearch(
            api_key=GOOGLE_API_KEY,
            search_engine_provider="GoogleAPI",
            search_engine_kwargs={"engine_id": SEARCH_ENGINE_ID},
        )
        result, _ = ws.run(query=query)

        # The client must be built and queried exactly once with the configured values.
        mock_build.assert_called_once_with("customsearch", "v1", developerKey=GOOGLE_API_KEY)
        mock_service.cse.assert_called_once()
        mock_cse.list.assert_called_once_with(q=query, cx=SEARCH_ENGINE_ID, num=10)
        mock_list.execute.assert_called_once()

        # Previously the returned value was never inspected; make sure the single
        # mocked hit actually comes back from run() as a Document.
        assert "documents" in result
        assert len(result["documents"]) == 1
        assert isinstance(result["documents"][0], Document)
feat: Add agent tools (#4437) * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet\|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet\|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Fix mypy * Minor example fixes * Fix the descriptions * PR feedback updates * More fixes * TopPSampler: handle top p None value, add unit test * Add top_k to WebSearch * Use boilerpy3 instead trafilatura * Remove date finding * Add more WebRetriever docs * Refactor long methods * making the 
preprocessor optional * hide WebSearch and make NeuralWebSearch a pipeline * remove unused imports * add WebQAPipeline and split example into two * change example search engine to SerperDev * Turn off progress bars in WebRetriever's PreProcesssor * Agent tool examples - final updates * Add webqa test, search results ranking scores * Better answer box handling for SerperDev and SerpAPI * Minor fixes * pylint * pylint fixes * extract TopPSampler from WebRetriever * use sampler only for WebRetriever modes other than snippet * add web retriever tests * add web retriever tests * exclude rdflib@6.3.2 due to license issues * add test for preprocessed docs and kwargs examples in docstrings * Move test_webqa_pipeline to test/pipelines * change docstring for join_documents_and_scores * Use WebQAPipeline in examples/web_lfqa.py * Use WebQAPipeline in examples/web_lfqa.py * Move test_webqa_pipeline to e2e * Updated lg * Sampler added automatically in WebQAPipeline, no need to add it * Updated lg * Updated lg * :ignore Update agent tools examples to new templates (#4503) * Update examples to new templates * Add print back * fix linting and black format issues --------- Co-authored-by: Daniel Bichuetti <daniel.bichuetti@gmail.com> Co-authored-by: agnieszka-m <amarzec13@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai> 2023-03-27 18:14:58 +02:00			`import os`
Pouyanpi/feat/search engine/providers/google api (#4722) * feat: implement google api search engine provider Signed-off-by: Pouyan <prezakhanipr@gmail.com> --------- Signed-off-by: Pouyan <prezakhanipr@gmail.com> 2023-05-02 17:09:17 +02:00			`import unittest`
			`from unittest.mock import MagicMock, patch`
feat: Add agent tools (#4437) * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet\|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet\|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Fix mypy * Minor example fixes * Fix the descriptions * PR feedback updates * More fixes * TopPSampler: handle top p None value, add unit test * Add top_k to WebSearch * Use boilerpy3 instead trafilatura * Remove date finding * Add more WebRetriever docs * Refactor long methods * making the 
preprocessor optional * hide WebSearch and make NeuralWebSearch a pipeline * remove unused imports * add WebQAPipeline and split example into two * change example search engine to SerperDev * Turn off progress bars in WebRetriever's PreProcesssor * Agent tool examples - final updates * Add webqa test, search results ranking scores * Better answer box handling for SerperDev and SerpAPI * Minor fixes * pylint * pylint fixes * extract TopPSampler from WebRetriever * use sampler only for WebRetriever modes other than snippet * add web retriever tests * add web retriever tests * exclude rdflib@6.3.2 due to license issues * add test for preprocessed docs and kwargs examples in docstrings * Move test_webqa_pipeline to test/pipelines * change docstring for join_documents_and_scores * Use WebQAPipeline in examples/web_lfqa.py * Use WebQAPipeline in examples/web_lfqa.py * Move test_webqa_pipeline to e2e * Updated lg * Sampler added automatically in WebQAPipeline, no need to add it * Updated lg * Updated lg * :ignore Update agent tools examples to new templates (#4503) * Update examples to new templates * Add print back * fix linting and black format issues --------- Co-authored-by: Daniel Bichuetti <daniel.bichuetti@gmail.com> Co-authored-by: agnieszka-m <amarzec13@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai> 2023-03-27 18:14:58 +02:00
			`import pytest`

			`from haystack.nodes.search_engine import WebSearch`
			`from haystack.schema import Document`

Pouyanpi/feat/search engine/providers/google api (#4722) * feat: implement google api search engine provider Signed-off-by: Pouyan <prezakhanipr@gmail.com> --------- Signed-off-by: Pouyan <prezakhanipr@gmail.com> 2023-05-02 17:09:17 +02:00			`try:`
			`import googleapiclient`

			`googleapi_installed = True`
			`except ImportError:`
			`googleapi_installed = False`

feat: Add agent tools (#4437) * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet\|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet\|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Fix mypy * Minor example fixes * Fix the descriptions * PR feedback updates * More fixes * TopPSampler: handle top p None value, add unit test * Add top_k to WebSearch * Use boilerpy3 instead trafilatura * Remove date finding * Add more WebRetriever docs * Refactor long methods * making the 
preprocessor optional * hide WebSearch and make NeuralWebSearch a pipeline * remove unused imports * add WebQAPipeline and split example into two * change example search engine to SerperDev * Turn off progress bars in WebRetriever's PreProcesssor * Agent tool examples - final updates * Add webqa test, search results ranking scores * Better answer box handling for SerperDev and SerpAPI * Minor fixes * pylint * pylint fixes * extract TopPSampler from WebRetriever * use sampler only for WebRetriever modes other than snippet * add web retriever tests * add web retriever tests * exclude rdflib@6.3.2 due to license issues * add test for preprocessed docs and kwargs examples in docstrings * Move test_webqa_pipeline to test/pipelines * change docstring for join_documents_and_scores * Use WebQAPipeline in examples/web_lfqa.py * Use WebQAPipeline in examples/web_lfqa.py * Move test_webqa_pipeline to e2e * Updated lg * Sampler added automatically in WebQAPipeline, no need to add it * Updated lg * Updated lg * :ignore Update agent tools examples to new templates (#4503) * Update examples to new templates * Add print back * fix linting and black format issues --------- Co-authored-by: Daniel Bichuetti <daniel.bichuetti@gmail.com> Co-authored-by: agnieszka-m <amarzec13@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai> 2023-03-27 18:14:58 +02:00
			`@pytest.mark.skipif(`
			`not os.environ.get("SERPERDEV_API_KEY", None),`
			`reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",`
			`)`
			`@pytest.mark.integration`
			`def test_web_search():`
			`ws = WebSearch(api_key=os.environ.get("SERPERDEV_API_KEY", None))`
			`result, _ = ws.run(query="Who is the boyfriend of Olivia Wilde?")`
			`assert "documents" in result`
			`assert len(result["documents"]) > 0`
			`assert isinstance(result["documents"][0], Document)`


			`@pytest.mark.skipif(`
			`not os.environ.get("SERPERDEV_API_KEY", None),`
			`reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",`
			`)`
			`@pytest.mark.integration`
			`def test_web_search_with_site_keyword():`
			`ws = WebSearch(api_key=os.environ.get("SERPERDEV_API_KEY", None))`
			`result, _ = ws.run(query='site:lifewire.com OR site:nasa.gov "electric vehicles"')`
			`assert "documents" in result`
			`assert len(result["documents"]) > 0`
			`assert isinstance(result["documents"][0], Document)`
			`assert all(`
			`["nasa" in doc.meta["link"] or "lifewire" in doc.meta["link"] for doc in result["documents"]]`
			`), "Some documents are not from the specified sites lifewire.com or nasa.gov."`
Pouyanpi/feat/search engine/providers/google api (#4722) * feat: implement google api search engine provider Signed-off-by: Pouyan <prezakhanipr@gmail.com> --------- Signed-off-by: Pouyan <prezakhanipr@gmail.com> 2023-05-02 17:09:17 +02:00

			`@pytest.mark.unit`
			`def test_web_search_with_google_api_provider():`
			`if not googleapi_installed:`
			`pytest.skip("google-api-python-client is not installed, skipping test.")`

			`GOOGLE_API_KEY = "dummy_api_key"`
			`SEARCH_ENGINE_ID = "dummy_search_engine_id"`
			`query = "The founder of Python"`

			`with patch("haystack.nodes.search_engine.WebSearch.run") as mock_run:`
			`mock_run.return_value = ([{"content": "Guido van Rossum"}], None)`
			`ws = WebSearch(`
			`api_key=GOOGLE_API_KEY,`
			`search_engine_provider="GoogleAPI",`
			`search_engine_kwargs={"engine_id": SEARCH_ENGINE_ID},`
			`)`
			`result, _ = ws.run(query=query)`

			`mock_run.assert_called_once_with(query=query)`

			`assert "guido" in result[0]["content"].lower()`


			`@pytest.mark.unit`
			`def test_web_search_with_google_api_client():`
			`if not googleapi_installed:`
			`pytest.skip("google-api-python-client is not installed, skipping test.")`

			`GOOGLE_API_KEY = "dummy_api_key"`
			`SEARCH_ENGINE_ID = "dummy_search_engine_id"`
			`query = "The founder of Python"`

			`with patch("googleapiclient.discovery.build") as mock_build:`
			`mock_service = MagicMock()`
			`mock_cse = MagicMock()`
			`mock_list = MagicMock()`

			`mock_build.return_value = mock_service`
			`mock_service.cse.return_value = mock_cse`
			`mock_cse.list.return_value = mock_list`
			`mock_list.execute.return_value = {`
			`"items": [`
			`{`
			`"title": "Guido van Rossum",`
			`"snippet": "The founder of Python programming language.",`
			`"link": "https://example.com/guido",`
			`}`
			`]`
			`}`

			`ws = WebSearch(`
			`api_key=GOOGLE_API_KEY,`
			`search_engine_provider="GoogleAPI",`
			`search_engine_kwargs={"engine_id": SEARCH_ENGINE_ID},`
			`)`
			`result, _ = ws.run(query=query)`

			`mock_build.assert_called_once_with("customsearch", "v1", developerKey=GOOGLE_API_KEY)`
			`mock_service.cse.assert_called_once()`
			`mock_cse.list.assert_called_once_with(q=query, cx=SEARCH_ENGINE_ID, num=10)`
			`mock_list.execute.assert_called_once()`