haystack/test/nodes/test_web_retriever.py

199 lines
8.4 KiB
Python
Raw Normal View History

import os
from typing import Dict, Tuple
import pytest
import requests
from boilerpy3.extractors import ArticleExtractor
from haystack import Document, Pipeline
from haystack.nodes import WebSearch, WebRetriever, PromptNode
@pytest.mark.unit
def test_web_retriever_mode_raw_documents(monkeypatch):
expected_search_results = {
"documents": [
Document(
content="Eddard Stark",
score=0.9090909090909091,
meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
id_hash_keys=["content"],
id="f408db6de8de0ffad0cb47cf8830dbb8",
),
Document(
content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
score=0.09090909090909091,
meta={
"title": "Arya Stark's Father - Crossword Clue Answers",
"link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
"position": 1,
"score": 0.09090909090909091,
},
id_hash_keys=["content"],
id="51779277acf94cf90e7663db137c0732",
),
]
}
def mock_web_search_run(self, query: str) -> Tuple[Dict, str]:
return expected_search_results, "output_1"
class MockResponse:
def __init__(self, text, status_code):
self.text = text
self.status_code = status_code
def get(url, headers, timeout):
return MockResponse("mocked", 200)
def get_content(self, text: str) -> str:
return "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."
monkeypatch.setattr(WebSearch, "run", mock_web_search_run)
monkeypatch.setattr(ArticleExtractor, "get_content", get_content)
monkeypatch.setattr(requests, "get", get)
web_retriever = WebRetriever(api_key="", top_search_results=2, mode="raw_documents")
result = web_retriever.retrieve(query="Who is the father of Arya Stark?")
assert len(result) == 1
assert isinstance(result[0], Document)
assert (
result[0].content
== "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."
)
assert result[0].score == None
assert result[0].meta["url"] == "https://crossword-solver.io/clue/arya-stark%27s-father/"
# Only preprocessed docs but not raw docs should have the _split_id field
assert "_split_id" not in result[0].meta
@pytest.mark.unit
def test_web_retriever_mode_preprocessed_documents(monkeypatch):
expected_search_results = {
"documents": [
Document(
content="Eddard Stark",
score=0.9090909090909091,
meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
id_hash_keys=["content"],
id="f408db6de8de0ffad0cb47cf8830dbb8",
),
Document(
content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
score=0.09090909090909091,
meta={
"title": "Arya Stark's Father - Crossword Clue Answers",
"link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
"position": 1,
"score": 0.09090909090909091,
},
id_hash_keys=["content"],
id="51779277acf94cf90e7663db137c0732",
),
]
}
def mock_web_search_run(self, query: str) -> Tuple[Dict, str]:
return expected_search_results, "output_1"
class MockResponse:
def __init__(self, text, status_code):
self.text = text
self.status_code = status_code
def get(url, headers, timeout):
return MockResponse("mocked", 200)
def get_content(self, text: str) -> str:
return "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."
monkeypatch.setattr(WebSearch, "run", mock_web_search_run)
monkeypatch.setattr(ArticleExtractor, "get_content", get_content)
monkeypatch.setattr(requests, "get", get)
web_retriever = WebRetriever(api_key="", top_search_results=2, mode="preprocessed_documents")
result = web_retriever.retrieve(query="Who is the father of Arya Stark?")
assert len(result) == 1
assert isinstance(result[0], Document)
assert (
result[0].content
== "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."
)
assert result[0].score == None
assert result[0].meta["url"] == "https://crossword-solver.io/clue/arya-stark%27s-father/"
assert result[0].meta["_split_id"] == 0
@pytest.mark.unit
def test_web_retriever_mode_snippets(monkeypatch):
expected_search_results = {
"documents": [
Document(
content="Eddard Stark",
score=0.9090909090909091,
meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
id_hash_keys=["content"],
id="f408db6de8de0ffad0cb47cf8830dbb8",
),
Document(
content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
score=0.09090909090909091,
meta={
"title": "Arya Stark's Father - Crossword Clue Answers",
"link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
"position": 1,
"score": 0.09090909090909091,
},
id_hash_keys=["content"],
id="51779277acf94cf90e7663db137c0732",
),
]
}
def mock_web_search_run(self, query: str) -> Tuple[Dict, str]:
return expected_search_results, "output_1"
monkeypatch.setattr(WebSearch, "run", mock_web_search_run)
web_retriever = WebRetriever(api_key="", top_search_results=2)
result = web_retriever.retrieve(query="Who is the father of Arya Stark?")
assert result == expected_search_results["documents"]
@pytest.mark.unit
@pytest.mark.parametrize("top_k", [1, 3, 6])
def test_top_k_parameter(mock_web_search, top_k):
web_retriever = WebRetriever(api_key="some_invalid_key", mode="snippets")
result = web_retriever.retrieve(query="Who is the boyfriend of Olivia Wilde?", top_k=top_k)
assert len(result) == top_k
assert all(isinstance(doc, Document) for doc in result)
@pytest.mark.integration
@pytest.mark.skipif(
not os.environ.get("SERPERDEV_API_KEY", None),
reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",
)
@pytest.mark.skipif(
not os.environ.get("OPENAI_API_KEY", None),
reason="Please export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
@pytest.mark.parametrize("top_k", [2, 4])
def test_top_k_parameter_in_pipeline(top_k):
# test that WebRetriever top_k param is NOT ignored in a pipeline
prompt_node = PromptNode(
"gpt-3.5-turbo",
api_key=os.environ.get("OPENAI_API_KEY"),
max_length=256,
default_prompt_template="question-answering-with-document-scores",
)
retriever = WebRetriever(api_key=os.environ.get("SERPERDEV_API_KEY"))
pipe = Pipeline()
pipe.add_node(component=retriever, name="WebRetriever", inputs=["Query"])
pipe.add_node(component=prompt_node, name="QAwithScoresPrompt", inputs=["WebRetriever"])
result = pipe.run(query="What year was Obama president", params={"WebRetriever": {"top_k": top_k}})
assert len(result["results"]) == top_k