From c5edb45c102ece9e676c65ea8aa25141cce21177 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 6 Sep 2023 17:31:42 +0200 Subject: [PATCH] feat: Add `SerperDevWebSearch` Haystack 2.0 component (#5712) * Add SerperDev * Add release note * PR Feedback * Simplify, remove one-liner * Update haystack/preview/components/websearch/serper_dev.py Co-authored-by: ZanSara * Update haystack/preview/components/websearch/serper_dev.py Co-authored-by: ZanSara * Fix formatting * PR feedback * Fix tests * Function rename * Remove scoring, update tests * PR feedback * Fix return * small adjustments * fix tests * add e2e test * fix release notes * fix tests * fix e2e --------- Co-authored-by: ZanSara --- .github/workflows/e2e.yml | 3 + .../components/test_serperdev_websearch.py | 15 ++ .../preview/components/websearch/__init__.py | 1 + .../components/websearch/serper_dev.py | 140 ++++++++++++++ .../add-serper-dev-8c582749728e3699.yaml | 3 + test/preview/components/websearch/__init__.py | 0 .../components/websearch/test_serperdev.py | 178 ++++++++++++++++++ 7 files changed, 340 insertions(+) create mode 100644 e2e/preview/components/test_serperdev_websearch.py create mode 100644 haystack/preview/components/websearch/__init__.py create mode 100644 haystack/preview/components/websearch/serper_dev.py create mode 100644 releasenotes/notes/add-serper-dev-8c582749728e3699.yaml create mode 100644 test/preview/components/websearch/__init__.py create mode 100644 test/preview/components/websearch/test_serperdev.py diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index ca03f28d2..a37187302 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -16,6 +16,9 @@ on: env: PYTHON_VERSION: "3.8" + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} + SERPERDEV_API_KEY: ${{ secrets.SERPERDEV_API_KEY }} jobs: e2e: diff --git a/e2e/preview/components/test_serperdev_websearch.py 
b/e2e/preview/components/test_serperdev_websearch.py new file mode 100644 index 000000000..1df443532 --- /dev/null +++ b/e2e/preview/components/test_serperdev_websearch.py @@ -0,0 +1,15 @@ +import os +import pytest +from haystack.preview import Document +from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch + + +@pytest.mark.skipif( + not os.environ.get("SERPERDEV_API_KEY", None), + reason="Export an env var called SERPERDEV_API_KEY containing the SerperDev API key to run this test.", +) +def test_web_search_top_k(): + ws = SerperDevWebSearch(api_key=os.environ.get("SERPERDEV_API_KEY", None), top_k=10) + results = ws.run(query="Who is the boyfriend of Olivia Wilde?")["documents"] + assert len(results) == 10 + assert all(isinstance(doc, Document) for doc in results) diff --git a/haystack/preview/components/websearch/__init__.py b/haystack/preview/components/websearch/__init__.py new file mode 100644 index 000000000..df2072334 --- /dev/null +++ b/haystack/preview/components/websearch/__init__.py @@ -0,0 +1 @@ +from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch diff --git a/haystack/preview/components/websearch/serper_dev.py b/haystack/preview/components/websearch/serper_dev.py new file mode 100644 index 000000000..6f14f797d --- /dev/null +++ b/haystack/preview/components/websearch/serper_dev.py @@ -0,0 +1,140 @@ +import json +import logging +from typing import Dict, List, Optional, Any + +import requests + +from haystack.preview import Document, component, default_from_dict, default_to_dict, ComponentError + +logger = logging.getLogger(__name__) + + +SERPERDEV_BASE_URL = "https://google.serper.dev/search" + + +class SerperDevError(ComponentError): + ... + + +@component +class SerperDevWebSearch: + """ + Search engine using SerperDev API. Given a query, it returns a list of URLs that are the most relevant. + + See the [Serper Dev website](https://serper.dev/) for more details. 
+ """ + + def __init__( + self, + api_key: str, + top_k: Optional[int] = 10, + allowed_domains: Optional[List[str]] = None, + search_params: Optional[Dict[str, Any]] = None, + ): + """ + :param api_key: API key for the SerperDev API. + :param top_k: Number of documents to return. + :param allowed_domains: List of domains to limit the search to. + :param search_params: Additional parameters passed to the SerperDev API. + For example, you can set 'num' to 20 to increase the number of search results. + See the [Serper Dev website](https://serper.dev/) for more details. + """ + if api_key is None: + raise ValueError("API key for SerperDev API must be set.") + self.api_key = api_key + self.top_k = top_k + self.allowed_domains = allowed_domains + self.search_params = search_params or {} + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + api_key=self.api_key, + top_k=self.top_k, + allowed_domains=self.allowed_domains, + search_params=self.search_params, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SerperDevWebSearch": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, query: str): + """ + Search the SerperDev API for the given query and return the results as a list of Documents. + + :param query: Query string. 
+ :return: Dictionary with a "documents" key holding the list of Documents, truncated to `top_k`. + """ + query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else "" + + payload = json.dumps( + {"q": query_prepend + query, "gl": "us", "hl": "en", "autocorrect": True, **self.search_params} + ) + headers = {"X-API-KEY": self.api_key, "Content-Type": "application/json"} + + try: + response = requests.post(SERPERDEV_BASE_URL, headers=headers, data=payload, timeout=30) + response.raise_for_status() # Will raise an HTTPError for bad responses + except requests.Timeout as error: + raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error + + except requests.RequestException as e: + raise SerperDevError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e + + # If we reached this point, it means the request was successful and we can proceed + json_result = response.json() + + # the snippet becomes the document content; a result without one gets empty content instead of a KeyError + organic = [ + Document(metadata={k: v for k, v in d.items() if k != "snippet"}, content=d.get("snippet", "")) + for d in json_result.get("organic", []) + ] + + # answer box is what search engine shows as a direct answer to the query + answer_box = [] + if "answerBox" in json_result: + answer_dict = json_result["answerBox"] + highlighted_answers = answer_dict.get("snippetHighlighted") + answer_box_content = None + # Check if highlighted_answers is a list and has at least one element + if isinstance(highlighted_answers, list) and len(highlighted_answers) > 0: + answer_box_content = highlighted_answers[0] + elif isinstance(highlighted_answers, str): + answer_box_content = highlighted_answers + if not answer_box_content: + for key in ["snippet", "answer", "title"]: + if key in answer_dict: + answer_box_content = answer_dict[key] + break + if answer_box_content: + answer_box = [ + Document( + content=answer_box_content, + metadata={"title": answer_dict.get("title", ""), "link": answer_dict.get("link", "")}, + ) + ] +
+ # these are related questions that search engine shows + people_also_ask = [] + if "peopleAlsoAsk" in json_result: + for result in json_result["peopleAlsoAsk"]: + title = result.get("title", "") + people_also_ask.append( + Document( + content=result["snippet"] if result.get("snippet") else title, + metadata={"title": title, "link": result.get("link", None)}, + ) + ) + + documents = answer_box + organic + people_also_ask + + logger.debug("Serper Dev returned %s documents for the query '%s'", len(documents), query) + return {"documents": documents[: self.top_k]} diff --git a/releasenotes/notes/add-serper-dev-8c582749728e3699.yaml b/releasenotes/notes/add-serper-dev-8c582749728e3699.yaml new file mode 100644 index 000000000..8ce2fd9bb --- /dev/null +++ b/releasenotes/notes/add-serper-dev-8c582749728e3699.yaml @@ -0,0 +1,3 @@ +--- +preview: + - Adds SerperDevWebSearch component to retrieve URLs from the web. See https://serper.dev/ for more information. diff --git a/test/preview/components/websearch/__init__.py b/test/preview/components/websearch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/preview/components/websearch/test_serperdev.py b/test/preview/components/websearch/test_serperdev.py new file mode 100644 index 000000000..e094e6e69 --- /dev/null +++ b/test/preview/components/websearch/test_serperdev.py @@ -0,0 +1,178 @@ +from unittest.mock import Mock, patch + +import pytest +from requests import Timeout, RequestException, HTTPError + +from haystack.preview import Document +from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch, SerperDevError + + +EXAMPLE_SERPERDEV_RESPONSE = { + "searchParameters": { + "q": "Who is the boyfriend of Olivia Wilde?", + "gl": "us", + "hl": "en", + "autocorrect": True, + "type": "search", + }, + "organic": [ + { + "title": "Olivia Wilde embraces Jason Sudeikis amid custody battle, Harry Styles split - Page Six", + "link": 
"https://pagesix.com/2023/01/29/olivia-wilde-hugs-it-out-with-jason-sudeikis-after-harry-styles-split/", + "snippet": "Looks like Olivia Wilde and Jason Sudeikis are starting 2023 on good terms. Amid their highly publicized custody battle – and the actress' ...", + "date": "Jan 29, 2023", + "position": 1, + }, + { + "title": "Olivia Wilde Is 'Quietly Dating' Again Following Harry Styles Split: 'He Makes Her Happy'", + "link": "https://www.yahoo.com/now/olivia-wilde-quietly-dating-again-183844364.html", + "snippet": "Olivia Wilde is “quietly dating again” following her November 2022 split from Harry Styles, a source exclusively tells Life & Style.", + "date": "Feb 10, 2023", + "position": 2, + }, + { + "title": "Olivia Wilde and Harry Styles' Relationship Timeline: The Way They Were - Us Weekly", + "link": "https://www.usmagazine.com/celebrity-news/pictures/olivia-wilde-and-harry-styles-relationship-timeline/", + "snippet": "Olivia Wilde started dating Harry Styles after ending her years-long engagement to Jason Sudeikis — see their relationship timeline.", + "date": "Mar 10, 2023", + "imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSgTcalNFvptTbYBiDXX55s8yCGfn6F1qbed9DAN16LvynTr9GayK5SPmY&s", + "position": 3, + }, + { + "title": "Olivia Wilde Is 'Ready to Date Again' After Harry Styles Split - Us Weekly", + "link": "https://www.usmagazine.com/celebrity-news/news/olivia-wilde-is-ready-to-date-again-after-harry-styles-split/", + "snippet": "Ready for love! 
Olivia Wilde is officially back on the dating scene following her split from her ex-boyfriend, Harry Styles.", + "date": "Mar 1, 2023", + "imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRCRAeRy5sVE631ZctzbzuOF70xkIOHaTvh2K7dYvdiVBwALiKrIjpscok&s", + "position": 4, + }, + { + "title": "Harry Styles and Olivia Wilde's Definitive Relationship Timeline - Harper's Bazaar", + "link": "https://www.harpersbazaar.com/celebrity/latest/a35172115/harry-styles-olivia-wilde-relationship-timeline/", + "snippet": "November 2020: News breaks about Olivia splitting from fiancé Jason Sudeikis. ... In mid-November, news breaks of Olivia Wilde's split from Jason ...", + "date": "Feb 23, 2023", + "imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRRqw3fvZOIGHEepxCc7yFAWYsS_v_1H6X-4nxyFJxdfRuFQw_BrI6JVzI&s", + "position": 5, + }, + { + "title": "Harry Styles and Olivia Wilde's Relationship Timeline - People", + "link": "https://people.com/music/harry-styles-olivia-wilde-relationship-timeline/", + "snippet": "Harry Styles and Olivia Wilde first met on the set of Don't Worry Darling and stepped out as a couple in January 2021. Relive all their biggest relationship ...", + "position": 6, + }, + { + "title": "Jason Sudeikis and Olivia Wilde's Relationship Timeline - People", + "link": "https://people.com/movies/jason-sudeikis-olivia-wilde-relationship-timeline/", + "snippet": "Jason Sudeikis and Olivia Wilde ended their engagement of seven years in 2020. 
Here's a complete timeline of their relationship.", + "date": "Mar 24, 2023", + "imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSleZoXusQyJJe2WMgIuck_cVaJ8AE0_hU2QxsXzYvKANi55UQlv82yAVI&s", + "position": 7, + }, + { + "title": "Olivia Wilde's anger at ex-boyfriend Harry Styles: She resents him and thinks he was using her | Marca", + "link": "https://www.marca.com/en/lifestyle/celebrities/2023/02/23/63f779a4e2704e8d988b4624.html", + "snippet": "The two started dating after Wilde split up with actor Jason Sudeikisin 2020. However, their relationship came to an end last November.", + "date": "Feb 23, 2023", + "imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQBgJF2mSnIWCvPrqUqM4WTI9xPNWPyLvHuune85swpB1yE_G8cy_7KRh0&s", + "position": 8, + }, + { + "title": "Olivia Wilde's dating history: Who has the actress dated? | The US Sun", + "link": "https://www.the-sun.com/entertainment/5221040/olivia-wildes-dating-history/", + "snippet": "AMERICAN actress Olivia Wilde started dating Harry Styles in January 2021 after breaking off her engagement the year prior.", + "date": "Nov 19, 2022", + "imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTpm8BToVFHJoH6yRggg0fLocLT9mt6lwsnRxFFDNdDGhDydzQiSKZ9__g&s", + "position": 9, + }, + ], + "relatedSearches": [ + {"query": "Harry Styles girlfriends in order"}, + {"query": "Harry Styles and Olivia Wilde engaged"}, + {"query": "Harry Styles and Olivia Wilde wedding"}, + {"query": "Who is Harry Styles married to"}, + {"query": "Jason Sudeikis Olivia Wilde relationship"}, + {"query": "Olivia Wilde and Jason Sudeikis kids"}, + {"query": "Olivia Wilde children"}, + {"query": "Harry Styles and Olivia Wilde age difference"}, + {"query": "Jason Sudeikis Olivia Wilde, Harry Styles"}, + ], +} + + +@pytest.fixture +def mock_serper_dev_search_result(): + with patch("haystack.preview.components.websearch.serper_dev.requests") as mock_run: + mock_run.post.return_value = Mock(status_code=200, 
json=lambda: EXAMPLE_SERPERDEV_RESPONSE) + yield mock_run + + +class TestSerperDevSearchAPI: + @pytest.mark.unit + def test_to_dict(self): + component = SerperDevWebSearch( + api_key="test_key", top_k=10, allowed_domains=["test.com"], search_params={"param": "test"} + ) + data = component.to_dict() + assert data == { + "type": "SerperDevWebSearch", + "init_parameters": { + "api_key": "test_key", + "top_k": 10, + "allowed_domains": ["test.com"], + "search_params": {"param": "test"}, + }, + } + + @pytest.mark.unit + def test_from_dict(self): + data = { + "type": "SerperDevWebSearch", + "init_parameters": { + "api_key": "test_key", + "top_k": 10, + "allowed_domains": ["test.com"], + "search_params": {"param": "test"}, + }, + } + component = SerperDevWebSearch.from_dict(data) + assert component.api_key == "test_key" + assert component.top_k == 10 + assert component.allowed_domains == ["test.com"] + assert component.search_params == {"param": "test"} + + @pytest.mark.unit + @pytest.mark.parametrize("top_k", [1, 5, 7]) + def test_web_search_top_k(self, mock_serper_dev_search_result, top_k: int): + ws = SerperDevWebSearch(api_key="some_invalid_key", top_k=top_k) + results = ws.run(query="Who is the boyfriend of Olivia Wilde?")["documents"] + assert len(results) == top_k + assert all(isinstance(doc, Document) for doc in results) + + @pytest.mark.unit + @patch("requests.post") + def test_timeout_error(self, mock_post): + mock_post.side_effect = Timeout + ws = SerperDevWebSearch(api_key="some_invalid_key") + + with pytest.raises(TimeoutError): + ws.run(query="Who is the boyfriend of Olivia Wilde?") + + @pytest.mark.unit + @patch("requests.post") + def test_request_exception(self, mock_post): + mock_post.side_effect = RequestException + ws = SerperDevWebSearch(api_key="some_invalid_key") + + with pytest.raises(SerperDevError): + ws.run(query="Who is the boyfriend of Olivia Wilde?") + + @pytest.mark.unit + @patch("requests.post") + def test_bad_response_code(self, mock_post): 
+ mock_response = mock_post.return_value + mock_response.status_code = 404 + mock_response.raise_for_status.side_effect = HTTPError + ws = SerperDevWebSearch(api_key="some_invalid_key") + + with pytest.raises(SerperDevError): + ws.run(query="Who is the boyfriend of Olivia Wilde?")