feat: Add SerperDevWebSearch Haystack 2.0 component (#5712)

* Add SerperDev

* Add release note

* PR Feedback

* Simplify, remove one-liner

* Update haystack/preview/components/websearch/serper_dev.py

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>

* Update haystack/preview/components/websearch/serper_dev.py

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>

* Fix formatting

* PR feedback

* Fix tests

* Function rename

* Remove scoring, update tests

* PR feedback

* Fix return

* small adjustments

* fix tests

* add e2e test

* fix release notes

* fix tests

* fix e2e

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
This commit is contained in:
Vladimir Blagojevic 2023-09-06 17:31:42 +02:00 committed by GitHub
parent 0bbc219a59
commit c5edb45c10
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 340 additions and 0 deletions

View File

@ -16,6 +16,9 @@ on:
env:
  PYTHON_VERSION: "3.8"
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
  SERPERDEV_API_KEY: ${{ secrets.SERPERDEV_API_KEY }}
jobs:
  e2e:

View File

@ -0,0 +1,15 @@
import os
import pytest
from haystack.preview import Document
from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch
@pytest.mark.skipif(
    not os.environ.get("SERPERDEV_API_KEY"),
    reason="Export an env var called SERPERDEV_API_KEY containing the SerperDev API key to run this test.",
)
def test_web_search_top_k():
    """
    Run a query through SerperDevWebSearch without mocking and check that exactly
    `top_k` Documents come back. Skipped when SERPERDEV_API_KEY is unset or empty.
    """
    searcher = SerperDevWebSearch(api_key=os.environ.get("SERPERDEV_API_KEY"), top_k=10)
    output = searcher.run(query="Who is the boyfriend of Olivia Wilde?")
    documents = output["documents"]
    assert len(documents) == 10
    for document in documents:
        assert isinstance(document, Document)

View File

@ -0,0 +1 @@
from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch

View File

@ -0,0 +1,140 @@
import json
import logging
from typing import Dict, List, Optional, Any
import requests
from haystack.preview import Document, component, default_from_dict, default_to_dict, ComponentError
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Endpoint that SerperDevWebSearch.run() sends its POST search requests to.
SERPERDEV_BASE_URL = "https://google.serper.dev/search"
class SerperDevError(ComponentError):
    """Error raised when a request to the SerperDev API fails."""
@component
class SerperDevWebSearch:
    """
    Web search component backed by the SerperDev API.

    Given a query, it returns the most relevant search results as Documents: the snippet of each
    result becomes the Document content, and the other fields of the result (for example title and
    link) are stored in the Document metadata.

    See the [Serper Dev website](https://serper.dev/) for more details.
    """

    def __init__(
        self,
        api_key: str,
        top_k: Optional[int] = 10,
        allowed_domains: Optional[List[str]] = None,
        search_params: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for the SerperDev API.
        :param top_k: Maximum number of documents to return. If None, all documents are returned.
        :param allowed_domains: List of domains to limit the search to.
        :param search_params: Additional parameters passed to the SerperDev API.
        For example, you can set 'num' to 20 to increase the number of search results.
        See the [Serper Dev website](https://serper.dev/) for more details.
        :raises ValueError: If `api_key` is None.
        """
        if api_key is None:
            raise ValueError("API key for SerperDev API must be set.")
        self.api_key = api_key
        self.top_k = top_k
        self.allowed_domains = allowed_domains
        self.search_params = search_params or {}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        Note that the API key is included in the serialized init parameters.
        """
        return default_to_dict(
            self,
            api_key=self.api_key,
            top_k=self.top_k,
            allowed_domains=self.allowed_domains,
            search_params=self.search_params,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SerperDevWebSearch":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(self, query: str):
        """
        Search the SerperDev API for the given query and return the results as a list of Documents.

        The returned list is ordered as: answer box (if any), organic results, "people also ask"
        entries, and is then truncated to `top_k` items.

        :param query: Query string.
        :return: A dictionary with a single key, "documents", holding a List[Document].
        :raises TimeoutError: If the request to the API times out.
        :raises SerperDevError: If the request fails for any other reason, including an HTTP error status.
        """
        # Restrict the search to the allowed domains, e.g. "site:a.com OR site:b.com <query>"
        if self.allowed_domains:
            query_prepend = " OR ".join(f"site:{domain}" for domain in self.allowed_domains) + " "
        else:
            query_prepend = ""

        # search_params is unpacked last, so user-supplied values override the defaults set here
        payload = json.dumps(
            {"q": query_prepend + query, "gl": "us", "hl": "en", "autocorrect": True, **self.search_params}
        )
        headers = {"X-API-KEY": self.api_key, "Content-Type": "application/json"}

        try:
            response = requests.post(SERPERDEV_BASE_URL, headers=headers, data=payload, timeout=30)
            response.raise_for_status()  # Will raise an HTTPError for bad responses
        except requests.Timeout as error:
            # Chain the original exception so the underlying cause stays visible in the traceback
            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error
        except requests.RequestException as e:
            raise SerperDevError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

        # If we reached this point, it means the request was successful and we can proceed
        json_result = response.json()

        # we get the snippet from the json result and put it in the content field of the document;
        # a missing "organic" section or a result without a snippet must not crash the component
        organic = [
            Document(
                metadata={k: v for k, v in result.items() if k != "snippet"},
                content=result.get("snippet", ""),
            )
            for result in json_result.get("organic") or []
        ]

        # answer box is what search engine shows as a direct answer to the query
        answer_box = []
        answer_dict = json_result.get("answerBox")
        if answer_dict:
            highlighted_answers = answer_dict.get("snippetHighlighted")
            answer_box_content = None
            # Prefer the highlighted snippet, which can be either a list or a plain string
            if isinstance(highlighted_answers, list) and highlighted_answers:
                answer_box_content = highlighted_answers[0]
            elif isinstance(highlighted_answers, str):
                answer_box_content = highlighted_answers
            if not answer_box_content:
                # Fall back to the first of these fields that actually holds a non-empty value
                answer_box_content = next(
                    (answer_dict[key] for key in ("snippet", "answer", "title") if answer_dict.get(key)), None
                )
            if answer_box_content:
                answer_box = [
                    Document(
                        content=answer_box_content,
                        metadata={"title": answer_dict.get("title", ""), "link": answer_dict.get("link", "")},
                    )
                ]

        # these are related questions that search engine shows
        people_also_ask = []
        for result in json_result.get("peopleAlsoAsk") or []:
            title = result.get("title", "")
            people_also_ask.append(
                Document(
                    # use the title as content when the entry has no (or an empty) snippet
                    content=result.get("snippet") or title,
                    metadata={"title": title, "link": result.get("link", None)},
                )
            )

        documents = answer_box + organic + people_also_ask
        logger.debug("Serper Dev returned %s documents for the query '%s'", len(documents), query)
        return {"documents": documents[: self.top_k]}

View File

@ -0,0 +1,3 @@
---
preview:
- Adds SerperDevWebSearch component to retrieve URLs from the web. See https://serper.dev/ for more information.

View File

@ -0,0 +1,178 @@
from unittest.mock import Mock, patch
import pytest
from requests import Timeout, RequestException, HTTPError
from haystack.preview import Document
from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch, SerperDevError
# Canned SerperDev API response returned by the mocked `requests.post` in the
# `mock_serper_dev_search_result` fixture. It contains nine "organic" results and
# no "answerBox" or "peopleAlsoAsk" sections.
EXAMPLE_SERPERDEV_RESPONSE = {
"searchParameters": {
"q": "Who is the boyfriend of Olivia Wilde?",
"gl": "us",
"hl": "en",
"autocorrect": True,
"type": "search",
},
"organic": [
{
"title": "Olivia Wilde embraces Jason Sudeikis amid custody battle, Harry Styles split - Page Six",
"link": "https://pagesix.com/2023/01/29/olivia-wilde-hugs-it-out-with-jason-sudeikis-after-harry-styles-split/",
"snippet": "Looks like Olivia Wilde and Jason Sudeikis are starting 2023 on good terms. Amid their highly publicized custody battle and the actress' ...",
"date": "Jan 29, 2023",
"position": 1,
},
{
"title": "Olivia Wilde Is 'Quietly Dating' Again Following Harry Styles Split: 'He Makes Her Happy'",
"link": "https://www.yahoo.com/now/olivia-wilde-quietly-dating-again-183844364.html",
"snippet": "Olivia Wilde is “quietly dating again” following her November 2022 split from Harry Styles, a source exclusively tells Life & Style.",
"date": "Feb 10, 2023",
"position": 2,
},
{
"title": "Olivia Wilde and Harry Styles' Relationship Timeline: The Way They Were - Us Weekly",
"link": "https://www.usmagazine.com/celebrity-news/pictures/olivia-wilde-and-harry-styles-relationship-timeline/",
"snippet": "Olivia Wilde started dating Harry Styles after ending her years-long engagement to Jason Sudeikis — see their relationship timeline.",
"date": "Mar 10, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSgTcalNFvptTbYBiDXX55s8yCGfn6F1qbed9DAN16LvynTr9GayK5SPmY&s",
"position": 3,
},
{
"title": "Olivia Wilde Is 'Ready to Date Again' After Harry Styles Split - Us Weekly",
"link": "https://www.usmagazine.com/celebrity-news/news/olivia-wilde-is-ready-to-date-again-after-harry-styles-split/",
"snippet": "Ready for love! Olivia Wilde is officially back on the dating scene following her split from her ex-boyfriend, Harry Styles.",
"date": "Mar 1, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRCRAeRy5sVE631ZctzbzuOF70xkIOHaTvh2K7dYvdiVBwALiKrIjpscok&s",
"position": 4,
},
{
"title": "Harry Styles and Olivia Wilde's Definitive Relationship Timeline - Harper's Bazaar",
"link": "https://www.harpersbazaar.com/celebrity/latest/a35172115/harry-styles-olivia-wilde-relationship-timeline/",
"snippet": "November 2020: News breaks about Olivia splitting from fiancé Jason Sudeikis. ... In mid-November, news breaks of Olivia Wilde's split from Jason ...",
"date": "Feb 23, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRRqw3fvZOIGHEepxCc7yFAWYsS_v_1H6X-4nxyFJxdfRuFQw_BrI6JVzI&s",
"position": 5,
},
{
"title": "Harry Styles and Olivia Wilde's Relationship Timeline - People",
"link": "https://people.com/music/harry-styles-olivia-wilde-relationship-timeline/",
"snippet": "Harry Styles and Olivia Wilde first met on the set of Don't Worry Darling and stepped out as a couple in January 2021. Relive all their biggest relationship ...",
"position": 6,
},
{
"title": "Jason Sudeikis and Olivia Wilde's Relationship Timeline - People",
"link": "https://people.com/movies/jason-sudeikis-olivia-wilde-relationship-timeline/",
"snippet": "Jason Sudeikis and Olivia Wilde ended their engagement of seven years in 2020. Here's a complete timeline of their relationship.",
"date": "Mar 24, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSleZoXusQyJJe2WMgIuck_cVaJ8AE0_hU2QxsXzYvKANi55UQlv82yAVI&s",
"position": 7,
},
{
"title": "Olivia Wilde's anger at ex-boyfriend Harry Styles: She resents him and thinks he was using her | Marca",
"link": "https://www.marca.com/en/lifestyle/celebrities/2023/02/23/63f779a4e2704e8d988b4624.html",
"snippet": "The two started dating after Wilde split up with actor Jason Sudeikisin 2020. However, their relationship came to an end last November.",
"date": "Feb 23, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQBgJF2mSnIWCvPrqUqM4WTI9xPNWPyLvHuune85swpB1yE_G8cy_7KRh0&s",
"position": 8,
},
{
"title": "Olivia Wilde's dating history: Who has the actress dated? | The US Sun",
"link": "https://www.the-sun.com/entertainment/5221040/olivia-wildes-dating-history/",
"snippet": "AMERICAN actress Olivia Wilde started dating Harry Styles in January 2021 after breaking off her engagement the year prior.",
"date": "Nov 19, 2022",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTpm8BToVFHJoH6yRggg0fLocLT9mt6lwsnRxFFDNdDGhDydzQiSKZ9__g&s",
"position": 9,
},
],
"relatedSearches": [
{"query": "Harry Styles girlfriends in order"},
{"query": "Harry Styles and Olivia Wilde engaged"},
{"query": "Harry Styles and Olivia Wilde wedding"},
{"query": "Who is Harry Styles married to"},
{"query": "Jason Sudeikis Olivia Wilde relationship"},
{"query": "Olivia Wilde and Jason Sudeikis kids"},
{"query": "Olivia Wilde children"},
{"query": "Harry Styles and Olivia Wilde age difference"},
{"query": "Jason Sudeikis Olivia Wilde, Harry Styles"},
],
}
@pytest.fixture
def mock_serper_dev_search_result():
    """
    Patch the `requests` module used by the serper_dev component so that `requests.post`
    returns a response with status code 200 whose `json()` is EXAMPLE_SERPERDEV_RESPONSE.
    Yields the patched module mock.
    """
    canned_response = Mock(status_code=200, json=lambda: EXAMPLE_SERPERDEV_RESPONSE)
    with patch("haystack.preview.components.websearch.serper_dev.requests") as patched_requests:
        patched_requests.post.return_value = canned_response
        yield patched_requests
class TestSerperDevSearchAPI:
    """Unit tests for SerperDevWebSearch: (de)serialization, top_k truncation and error handling."""

    @pytest.mark.unit
    def test_to_dict(self):
        """to_dict() reports the component type and every init parameter."""
        expected = {
            "type": "SerperDevWebSearch",
            "init_parameters": {
                "api_key": "test_key",
                "top_k": 10,
                "allowed_domains": ["test.com"],
                "search_params": {"param": "test"},
            },
        }
        searcher = SerperDevWebSearch(
            api_key="test_key", top_k=10, allowed_domains=["test.com"], search_params={"param": "test"}
        )
        assert searcher.to_dict() == expected

    @pytest.mark.unit
    def test_from_dict(self):
        """from_dict() restores every init parameter onto the instance."""
        serialized = {
            "type": "SerperDevWebSearch",
            "init_parameters": {
                "api_key": "test_key",
                "top_k": 10,
                "allowed_domains": ["test.com"],
                "search_params": {"param": "test"},
            },
        }
        searcher = SerperDevWebSearch.from_dict(serialized)
        assert searcher.api_key == "test_key"
        assert searcher.top_k == 10
        assert searcher.allowed_domains == ["test.com"]
        assert searcher.search_params == {"param": "test"}

    @pytest.mark.unit
    @pytest.mark.parametrize("top_k", [1, 5, 7])
    def test_web_search_top_k(self, mock_serper_dev_search_result, top_k: int):
        """run() returns exactly top_k Documents when the mocked response has more results than that."""
        searcher = SerperDevWebSearch(api_key="some_invalid_key", top_k=top_k)
        output = searcher.run(query="Who is the boyfriend of Olivia Wilde?")
        documents = output["documents"]
        assert len(documents) == top_k
        for document in documents:
            assert isinstance(document, Document)

    @pytest.mark.unit
    def test_timeout_error(self):
        """A requests Timeout raised by requests.post surfaces as the built-in TimeoutError."""
        with patch("requests.post", side_effect=Timeout):
            searcher = SerperDevWebSearch(api_key="some_invalid_key")
            with pytest.raises(TimeoutError):
                searcher.run(query="Who is the boyfriend of Olivia Wilde?")

    @pytest.mark.unit
    def test_request_exception(self):
        """A RequestException raised by requests.post surfaces as SerperDevError."""
        with patch("requests.post", side_effect=RequestException):
            searcher = SerperDevWebSearch(api_key="some_invalid_key")
            with pytest.raises(SerperDevError):
                searcher.run(query="Who is the boyfriend of Olivia Wilde?")

    @pytest.mark.unit
    def test_bad_response_code(self):
        """An HTTPError raised by raise_for_status() surfaces as SerperDevError."""
        with patch("requests.post") as mock_post:
            failing_response = mock_post.return_value
            failing_response.status_code = 404
            failing_response.raise_for_status.side_effect = HTTPError
            searcher = SerperDevWebSearch(api_key="some_invalid_key")
            with pytest.raises(SerperDevError):
                searcher.run(query="Who is the boyfriend of Olivia Wilde?")