import os
from typing import Dict, Tuple

import pytest
import requests
from boilerpy3.extractors import ArticleExtractor

from haystack import Document, Pipeline
from haystack.nodes import WebSearch, WebRetriever, PromptNode


# Query sent to the retriever by the three mode tests below.
_QUERY = "Who is the father of Arya Stark?"

# Link of the only mocked search hit that carries a non-empty URL.
_CROSSWORD_URL = "https://crossword-solver.io/clue/arya-stark%27s-father/"

# Text the patched ArticleExtractor.get_content returns for every page.
_EXTRACTED_PAGE_TEXT = "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."


class _MockResponse:
    """Minimal stand-in for the object returned by ``requests.get``."""

    def __init__(self, text, status_code):
        self.text = text
        self.status_code = status_code


def _make_search_results() -> Dict:
    """Build a fresh copy of the canned ``WebSearch.run`` output.

    A new dict (with new ``Document`` objects) is created on every call so that
    tests never share mutable state with each other.
    """
    return {
        "documents": [
            Document(
                content="Eddard Stark",
                score=0.9090909090909091,
                meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
                id_hash_keys=["content"],
                id="f408db6de8de0ffad0cb47cf8830dbb8",
            ),
            Document(
                content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
                score=0.09090909090909091,
                meta={
                    "title": "Arya Stark's Father - Crossword Clue Answers",
                    "link": _CROSSWORD_URL,
                    "position": 1,
                    "score": 0.09090909090909091,
                },
                id_hash_keys=["content"],
                id="51779277acf94cf90e7663db137c0732",
            ),
        ]
    }


def _patch_web_search(monkeypatch, search_results: Dict) -> None:
    """Replace ``WebSearch.run`` so it returns ``search_results`` without any network call."""

    def mock_web_search_run(self, query: str) -> Tuple[Dict, str]:
        return search_results, "output_1"

    monkeypatch.setattr(WebSearch, "run", mock_web_search_run)


def _patch_page_download(monkeypatch) -> None:
    """Replace ``requests.get`` and ``ArticleExtractor.get_content`` with canned responses."""

    def get(url, headers, timeout):
        return _MockResponse("mocked", 200)

    def get_content(self, text: str) -> str:
        return _EXTRACTED_PAGE_TEXT

    monkeypatch.setattr(ArticleExtractor, "get_content", get_content)
    monkeypatch.setattr(requests, "get", get)


@pytest.mark.unit
def test_web_retriever_mode_raw_documents(monkeypatch):
    """In ``raw_documents`` mode the retriever returns the extracted page text unsplit."""
    _patch_web_search(monkeypatch, _make_search_results())
    _patch_page_download(monkeypatch)

    web_retriever = WebRetriever(api_key="", top_search_results=2, mode="raw_documents")
    result = web_retriever.retrieve(query=_QUERY)

    # Two hits are mocked but only one document is expected back
    # (presumably the hit with the empty link is skipped -- confirm against WebRetriever).
    assert len(result) == 1
    assert isinstance(result[0], Document)
    assert result[0].content == _EXTRACTED_PAGE_TEXT
    assert result[0].score is None
    assert result[0].meta["url"] == _CROSSWORD_URL
    # Only preprocessed docs but not raw docs should have the _split_id field
    assert "_split_id" not in result[0].meta


@pytest.mark.unit
def test_web_retriever_mode_preprocessed_documents(monkeypatch):
    """In ``preprocessed_documents`` mode the returned document carries a ``_split_id``."""
    _patch_web_search(monkeypatch, _make_search_results())
    _patch_page_download(monkeypatch)

    web_retriever = WebRetriever(api_key="", top_search_results=2, mode="preprocessed_documents")
    result = web_retriever.retrieve(query=_QUERY)

    assert len(result) == 1
    assert isinstance(result[0], Document)
    assert result[0].content == _EXTRACTED_PAGE_TEXT
    assert result[0].score is None
    assert result[0].meta["url"] == _CROSSWORD_URL
    assert result[0].meta["_split_id"] == 0


@pytest.mark.unit
def test_web_retriever_mode_snippets(monkeypatch):
    """With no explicit ``mode`` the retriever returns the search snippets unchanged."""
    expected_search_results = _make_search_results()
    _patch_web_search(monkeypatch, expected_search_results)

    web_retriever = WebRetriever(api_key="", top_search_results=2)
    result = web_retriever.retrieve(query=_QUERY)

    assert result == expected_search_results["documents"]


@pytest.mark.unit
@pytest.mark.parametrize("top_k", [1, 3, 6])
def test_top_k_parameter(mock_web_search, top_k):
    """``retrieve(top_k=...)`` returns exactly ``top_k`` documents in ``snippets`` mode.

    NOTE(review): the ``mock_web_search`` fixture is not defined in this file;
    presumably it is provided by a conftest -- verify.
    """
    web_retriever = WebRetriever(api_key="some_invalid_key", mode="snippets")
    result = web_retriever.retrieve(query="Who is the boyfriend of Olivia Wilde?", top_k=top_k)

    assert len(result) == top_k
    assert all(isinstance(doc, Document) for doc in result)


@pytest.mark.integration
@pytest.mark.skipif(
    not os.environ.get("SERPERDEV_API_KEY", None),
    reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",
)
@pytest.mark.skipif(
    not os.environ.get("OPENAI_API_KEY", None),
    reason="Please export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
@pytest.mark.parametrize("top_k", [2, 4])
def test_top_k_parameter_in_pipeline(top_k):
    """A ``top_k`` passed via pipeline ``params`` controls the number of results."""
    # test that WebRetriever top_k param is NOT ignored in a pipeline
    prompt_node = PromptNode(
        "gpt-3.5-turbo",
        api_key=os.environ.get("OPENAI_API_KEY"),
        max_length=256,
        default_prompt_template="question-answering-with-document-scores",
    )

    retriever = WebRetriever(api_key=os.environ.get("SERPERDEV_API_KEY"))

    pipe = Pipeline()
    pipe.add_node(component=retriever, name="WebRetriever", inputs=["Query"])
    pipe.add_node(component=prompt_node, name="QAwithScoresPrompt", inputs=["WebRetriever"])

    result = pipe.run(query="What year was Obama president", params={"WebRetriever": {"top_k": top_k}})
    assert len(result["results"]) == top_k