mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-25 06:48:43 +00:00 
			
		
		
		
	Pouyanpi/feat/search engine/providers/google api (#4722)
* feat: implement google api search engine provider Signed-off-by: Pouyan <prezakhanipr@gmail.com> --------- Signed-off-by: Pouyan <prezakhanipr@gmail.com>
This commit is contained in:
		
							parent
							
								
									c88bc19791
								
							
						
					
					
						commit
						75ff768c21
					
				| @ -274,3 +274,78 @@ class BingAPI(SearchEngine): | ||||
| 
 | ||||
|         logger.debug("Bing API returned %s documents for the query '%s'", len(documents), query) | ||||
|         return documents[:top_k] | ||||
| 
 | ||||
| 
 | ||||
| class GoogleAPI(SearchEngine): | ||||
|     """Search engine using the Google API. See [Google Search API](https://developers.google.com/custom-search/v1/overview) for more details.""" | ||||
| 
 | ||||
|     def __init__( | ||||
|         self, | ||||
|         top_k: Optional[int] = 10, | ||||
|         api_key: Optional[str] = None, | ||||
|         engine_id: Optional[str] = None, | ||||
|         search_engine_kwargs: Optional[Dict[str, Any]] = None, | ||||
|     ): | ||||
|         """ | ||||
|         :param top_k: Number of documents to return. | ||||
|         :param api_key: API key for the Google API. | ||||
|         :param engine_id: Engine ID for the Google API. | ||||
|         :param search_engine_kwargs: Additional parameters passed to the Google API. As an example, you can pass the hl parameter to specify the language to use for the query: 'hl':'en'. | ||||
|         """ | ||||
|         super().__init__() | ||||
|         self.api_key = api_key | ||||
|         self.engine_id = engine_id | ||||
|         self.top_k = top_k | ||||
|         self.kwargs = search_engine_kwargs if search_engine_kwargs else {} | ||||
| 
 | ||||
|     def _validate_environment(self): | ||||
|         """ | ||||
|         Validate if the environment variables are set. | ||||
|         """ | ||||
|         if not self.api_key: | ||||
|             raise ValueError( | ||||
|                 "You need to provide an API key for the Google API. See https://developers.google.com/custom-search/v1/overview" | ||||
|             ) | ||||
|         if not self.engine_id: | ||||
|             raise ValueError( | ||||
|                 "You need to provide an engine ID for the Google API. See https://developers.google.com/custom-search/v1/overview" | ||||
|             ) | ||||
| 
 | ||||
|         # check if google api is installed | ||||
|         try: | ||||
|             from googleapiclient.discovery import build | ||||
|         except ImportError: | ||||
|             raise ImportError( | ||||
|                 "You need to install the Google API client. You can do so by running 'pip install google-api-python-client'." | ||||
|             ) | ||||
|         # create a custom search service | ||||
|         self.service = build("customsearch", "v1", developerKey=self.api_key) | ||||
| 
 | ||||
|     def search(self, query: str, **kwargs) -> List[Document]: | ||||
|         """ | ||||
|         :param query: Query string. | ||||
|         :param kwargs: Additional parameters passed to the Google API. | ||||
|                        As an example, you can pass the hl parameter to specify the language to use for the query: 'hl':'en'. | ||||
|                        If you don't specify the hl parameter, the default language for the user's location is used. | ||||
|                        For a complete list of the language codes, see [Language Codes](https://developers.google.com/custom-search/docs/xml_results#languageCollections). | ||||
|                        You can also pass the num parameter to specify the number of results to return: 'num':10. | ||||
|                        You can find a full list of parameters at [Query Parameters](https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list). | ||||
|         :return: List[Document] | ||||
|         """ | ||||
|         kwargs = {**self.kwargs, **kwargs} | ||||
|         self.engine_id = kwargs.pop("engine_id", self.engine_id) | ||||
| 
 | ||||
|         self._validate_environment() | ||||
| 
 | ||||
|         top_k = kwargs.pop("top_k", self.top_k) | ||||
|         params: Dict[str, Union[str, int, float]] = {"num": 10, **kwargs} | ||||
|         res = self.service.cse().list(q=query, cx=self.engine_id, **params).execute() | ||||
|         documents: List[Document] = [] | ||||
|         for i, result in enumerate(res["items"]): | ||||
|             documents.append( | ||||
|                 Document.from_dict( | ||||
|                     {"title": result["title"], "content": result["snippet"], "position": i, "link": result["link"]} | ||||
|                 ) | ||||
|             ) | ||||
|         logger.debug("Google API returned %s documents for the query '%s'", len(documents), query) | ||||
|         return documents[:top_k] | ||||
|  | ||||
| @ -1,10 +1,19 @@ | ||||
| import os | ||||
| import unittest | ||||
| from unittest.mock import MagicMock, patch | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from haystack.nodes.search_engine import WebSearch | ||||
| from haystack.schema import Document | ||||
| 
 | ||||
| try: | ||||
|     import googleapiclient | ||||
| 
 | ||||
|     googleapi_installed = True | ||||
| except ImportError: | ||||
|     googleapi_installed = False | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.skipif( | ||||
|     not os.environ.get("SERPERDEV_API_KEY", None), | ||||
| @ -33,3 +42,66 @@ def test_web_search_with_site_keyword(): | ||||
|     assert all( | ||||
|         ["nasa" in doc.meta["link"] or "lifewire" in doc.meta["link"] for doc in result["documents"]] | ||||
|     ), "Some documents are not from the specified sites lifewire.com or nasa.gov." | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.unit | ||||
| def test_web_search_with_google_api_provider(): | ||||
|     if not googleapi_installed: | ||||
|         pytest.skip("google-api-python-client is not installed, skipping test.") | ||||
| 
 | ||||
|     GOOGLE_API_KEY = "dummy_api_key" | ||||
|     SEARCH_ENGINE_ID = "dummy_search_engine_id" | ||||
|     query = "The founder of Python" | ||||
| 
 | ||||
|     with patch("haystack.nodes.search_engine.WebSearch.run") as mock_run: | ||||
|         mock_run.return_value = ([{"content": "Guido van Rossum"}], None) | ||||
|         ws = WebSearch( | ||||
|             api_key=GOOGLE_API_KEY, | ||||
|             search_engine_provider="GoogleAPI", | ||||
|             search_engine_kwargs={"engine_id": SEARCH_ENGINE_ID}, | ||||
|         ) | ||||
|         result, _ = ws.run(query=query) | ||||
| 
 | ||||
|         mock_run.assert_called_once_with(query=query) | ||||
| 
 | ||||
|         assert "guido" in result[0]["content"].lower() | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.unit | ||||
| def test_web_search_with_google_api_client(): | ||||
|     if not googleapi_installed: | ||||
|         pytest.skip("google-api-python-client is not installed, skipping test.") | ||||
| 
 | ||||
|     GOOGLE_API_KEY = "dummy_api_key" | ||||
|     SEARCH_ENGINE_ID = "dummy_search_engine_id" | ||||
|     query = "The founder of Python" | ||||
| 
 | ||||
|     with patch("googleapiclient.discovery.build") as mock_build: | ||||
|         mock_service = MagicMock() | ||||
|         mock_cse = MagicMock() | ||||
|         mock_list = MagicMock() | ||||
| 
 | ||||
|         mock_build.return_value = mock_service | ||||
|         mock_service.cse.return_value = mock_cse | ||||
|         mock_cse.list.return_value = mock_list | ||||
|         mock_list.execute.return_value = { | ||||
|             "items": [ | ||||
|                 { | ||||
|                     "title": "Guido van Rossum", | ||||
|                     "snippet": "The founder of Python programming language.", | ||||
|                     "link": "https://example.com/guido", | ||||
|                 } | ||||
|             ] | ||||
|         } | ||||
| 
 | ||||
|         ws = WebSearch( | ||||
|             api_key=GOOGLE_API_KEY, | ||||
|             search_engine_provider="GoogleAPI", | ||||
|             search_engine_kwargs={"engine_id": SEARCH_ENGINE_ID}, | ||||
|         ) | ||||
|         result, _ = ws.run(query=query) | ||||
| 
 | ||||
|         mock_build.assert_called_once_with("customsearch", "v1", developerKey=GOOGLE_API_KEY) | ||||
|         mock_service.cse.assert_called_once() | ||||
|         mock_cse.list.assert_called_once_with(q=query, cx=SEARCH_ENGINE_ID, num=10) | ||||
|         mock_list.execute.assert_called_once() | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Pouyan
						Pouyan