haystack/test/nodes/test_web_search.py

108 lines
3.7 KiB
Python
Raw Normal View History

feat: Add agent tools (#4437) * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Fix mypy * Minor example fixes * Fix the descriptions * PR feedback updates * More fixes * TopPSampler: handle top p None value, add unit test * Add top_k to WebSearch * Use boilerpy3 instead trafilatura * Remove date finding * Add more WebRetriever docs * Refactor long methods * making the preprocessor optional * hide WebSearch and make NeuralWebSearch a pipeline * remove unused imports * add WebQAPipeline and split example into two * change example search engine to SerperDev * Turn off progress bars in WebRetriever's PreProcesssor * Agent tool examples - final updates * Add webqa test, search results ranking scores * Better answer box handling for SerperDev and SerpAPI * Minor fixes * pylint * pylint fixes * extract TopPSampler from WebRetriever * use sampler only for WebRetriever modes other than snippet * add web retriever tests * add web retriever tests * exclude rdflib@6.3.2 due to license issues * add test for preprocessed docs and kwargs examples in docstrings * Move test_webqa_pipeline to test/pipelines * change docstring for join_documents_and_scores * Use WebQAPipeline in examples/web_lfqa.py * Use WebQAPipeline in examples/web_lfqa.py * Move test_webqa_pipeline to e2e * Updated lg * Sampler added automatically in WebQAPipeline, no need to add it * Updated lg * Updated lg * :ignore Update agent tools examples to new templates (#4503) * Update examples to new templates * Add print back * fix linting and black format issues --------- Co-authored-by: Daniel Bichuetti <daniel.bichuetti@gmail.com> Co-authored-by: agnieszka-m <amarzec13@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai>
2023-03-27 18:14:58 +02:00
import os
import unittest
from unittest.mock import MagicMock, patch
feat: Add agent tools (#4437) * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Fix mypy * Minor example fixes * Fix the descriptions * PR feedback updates * More fixes * TopPSampler: handle top p None value, add unit test * Add top_k to WebSearch * Use boilerpy3 instead trafilatura * Remove date finding * Add more WebRetriever docs * Refactor long methods * making the preprocessor optional * hide WebSearch and make NeuralWebSearch a pipeline * remove unused imports * add WebQAPipeline and split example into two * change example search engine to SerperDev * Turn off progress bars in WebRetriever's PreProcesssor * Agent tool examples - final updates * Add webqa test, search results ranking scores * Better answer box handling for SerperDev and SerpAPI * Minor fixes * pylint * pylint fixes * extract TopPSampler from WebRetriever * use sampler only for WebRetriever modes other than snippet * add web retriever tests * add web retriever tests * exclude rdflib@6.3.2 due to license issues * add test for preprocessed docs and kwargs examples in docstrings * Move test_webqa_pipeline to test/pipelines * change docstring for join_documents_and_scores * Use WebQAPipeline in examples/web_lfqa.py * Use WebQAPipeline in examples/web_lfqa.py * Move test_webqa_pipeline to e2e * Updated lg * Sampler added automatically in WebQAPipeline, no need to add it * Updated lg * Updated lg * :ignore Update agent tools examples to new templates (#4503) * Update examples to new templates * Add print back * fix linting and black format issues --------- Co-authored-by: Daniel Bichuetti <daniel.bichuetti@gmail.com> Co-authored-by: agnieszka-m <amarzec13@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai>
2023-03-27 18:14:58 +02:00
import pytest
from haystack.nodes.search_engine import WebSearch
from haystack.schema import Document
try:
import googleapiclient
googleapi_installed = True
except ImportError:
googleapi_installed = False
feat: Add agent tools (#4437) * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Initial commit, add search_engine * Add TopPSampler * Add more TopPSampler unit tests * Remove SearchEngineSampler (converted to TopPSampler) * Add some basic WebSearch unit tests * Rename unit tests * Add WebRetriever into agent_tools * Adjust to WebRetriever * Add WebRetriever mode [snippet|document] * Minor changes * SerperDev: add peopleAlsoAsk search results * First agent for hotpotqa * Making WebRetriever work on hotpotqa * refactor: minor WebRetriever improvements (#4377) * refactor: remove doc ids rebuild + antecipate cache * refactor: improve caching, fix Document ids * Minor WebRetriever improvements * Overlooked minor fixes * feat: add Bing API as search engine * refactor: let kwargs pass-through * feat: increase search context * check sampler result, improve batch typing * refactor: increase mypy compliance * Fix mypy * Minor example fixes * Fix the descriptions * PR feedback updates * More fixes * TopPSampler: handle top p None value, add unit test * Add top_k to WebSearch * Use boilerpy3 instead trafilatura * Remove date finding * Add more WebRetriever docs * Refactor long methods * making the preprocessor optional * hide WebSearch and make NeuralWebSearch a pipeline * remove unused imports * add WebQAPipeline and split example into two * change example search engine to SerperDev * Turn off progress bars in WebRetriever's PreProcesssor * Agent tool examples - final updates * Add webqa test, search results ranking scores * Better answer box handling for SerperDev and SerpAPI * Minor fixes * pylint * pylint fixes * extract TopPSampler from WebRetriever * use sampler only for WebRetriever modes other than snippet * add web retriever tests * add web retriever tests * exclude rdflib@6.3.2 due to license issues * add test for preprocessed docs and kwargs examples in docstrings * Move test_webqa_pipeline to test/pipelines * change docstring for join_documents_and_scores * Use WebQAPipeline in examples/web_lfqa.py * Use WebQAPipeline in examples/web_lfqa.py * Move test_webqa_pipeline to e2e * Updated lg * Sampler added automatically in WebQAPipeline, no need to add it * Updated lg * Updated lg * :ignore Update agent tools examples to new templates (#4503) * Update examples to new templates * Add print back * fix linting and black format issues --------- Co-authored-by: Daniel Bichuetti <daniel.bichuetti@gmail.com> Co-authored-by: agnieszka-m <amarzec13@gmail.com> Co-authored-by: Julian Risch <julian.risch@deepset.ai>
2023-03-27 18:14:58 +02:00
@pytest.mark.skipif(
not os.environ.get("SERPERDEV_API_KEY", None),
reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",
)
@pytest.mark.integration
def test_web_search():
ws = WebSearch(api_key=os.environ.get("SERPERDEV_API_KEY", None))
result, _ = ws.run(query="Who is the boyfriend of Olivia Wilde?")
assert "documents" in result
assert len(result["documents"]) > 0
assert isinstance(result["documents"][0], Document)
@pytest.mark.skipif(
not os.environ.get("SERPERDEV_API_KEY", None),
reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",
)
@pytest.mark.integration
def test_web_search_with_site_keyword():
ws = WebSearch(api_key=os.environ.get("SERPERDEV_API_KEY", None))
result, _ = ws.run(query='site:lifewire.com OR site:nasa.gov "electric vehicles"')
assert "documents" in result
assert len(result["documents"]) > 0
assert isinstance(result["documents"][0], Document)
assert all(
["nasa" in doc.meta["link"] or "lifewire" in doc.meta["link"] for doc in result["documents"]]
), "Some documents are not from the specified sites lifewire.com or nasa.gov."
@pytest.mark.unit
def test_web_search_with_google_api_provider():
if not googleapi_installed:
pytest.skip("google-api-python-client is not installed, skipping test.")
GOOGLE_API_KEY = "dummy_api_key"
SEARCH_ENGINE_ID = "dummy_search_engine_id"
query = "The founder of Python"
with patch("haystack.nodes.search_engine.WebSearch.run") as mock_run:
mock_run.return_value = ([{"content": "Guido van Rossum"}], None)
ws = WebSearch(
api_key=GOOGLE_API_KEY,
search_engine_provider="GoogleAPI",
search_engine_kwargs={"engine_id": SEARCH_ENGINE_ID},
)
result, _ = ws.run(query=query)
mock_run.assert_called_once_with(query=query)
assert "guido" in result[0]["content"].lower()
@pytest.mark.unit
def test_web_search_with_google_api_client():
if not googleapi_installed:
pytest.skip("google-api-python-client is not installed, skipping test.")
GOOGLE_API_KEY = "dummy_api_key"
SEARCH_ENGINE_ID = "dummy_search_engine_id"
query = "The founder of Python"
with patch("googleapiclient.discovery.build") as mock_build:
mock_service = MagicMock()
mock_cse = MagicMock()
mock_list = MagicMock()
mock_build.return_value = mock_service
mock_service.cse.return_value = mock_cse
mock_cse.list.return_value = mock_list
mock_list.execute.return_value = {
"items": [
{
"title": "Guido van Rossum",
"snippet": "The founder of Python programming language.",
"link": "https://example.com/guido",
}
]
}
ws = WebSearch(
api_key=GOOGLE_API_KEY,
search_engine_provider="GoogleAPI",
search_engine_kwargs={"engine_id": SEARCH_ENGINE_ID},
)
result, _ = ws.run(query=query)
mock_build.assert_called_once_with("customsearch", "v1", developerKey=GOOGLE_API_KEY)
mock_service.cse.assert_called_once()
mock_cse.list.assert_called_once_with(q=query, cx=SEARCH_ENGINE_ID, num=10)
mock_list.execute.assert_called_once()