mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-25 23:08:49 +00:00 
			
		
		
		
	Pouyanpi/feat/search engine/providers/google api (#4722)
* feat: implement google api search engine provider Signed-off-by: Pouyan <prezakhanipr@gmail.com> --------- Signed-off-by: Pouyan <prezakhanipr@gmail.com>
This commit is contained in:
		
							parent
							
								
									c88bc19791
								
							
						
					
					
						commit
						75ff768c21
					
				| @ -274,3 +274,78 @@ class BingAPI(SearchEngine): | |||||||
| 
 | 
 | ||||||
|         logger.debug("Bing API returned %s documents for the query '%s'", len(documents), query) |         logger.debug("Bing API returned %s documents for the query '%s'", len(documents), query) | ||||||
|         return documents[:top_k] |         return documents[:top_k] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class GoogleAPI(SearchEngine):
    """Search engine using the Google API. See [Google Search API](https://developers.google.com/custom-search/v1/overview) for more details."""

    def __init__(
        self,
        top_k: Optional[int] = 10,
        api_key: Optional[str] = None,
        engine_id: Optional[str] = None,
        search_engine_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param top_k: Number of documents to return.
        :param api_key: API key for the Google API.
        :param engine_id: Engine ID for the Google API.
        :param search_engine_kwargs: Additional parameters passed to the Google API. As an example, you can pass the hl parameter to specify the language to use for the query: 'hl':'en'.
        """
        super().__init__()
        self.api_key = api_key
        self.engine_id = engine_id
        self.top_k = top_k
        self.kwargs = search_engine_kwargs if search_engine_kwargs else {}

    def _validate_environment(self):
        """
        Check that the API key and engine ID are set and that the Google API client is importable,
        then create the Custom Search service and store it on `self.service`.

        :raises ValueError: If `api_key` or `engine_id` is missing or empty.
        :raises ImportError: If `google-api-python-client` is not installed.
        """
        if not self.api_key:
            raise ValueError(
                "You need to provide an API key for the Google API. See https://developers.google.com/custom-search/v1/overview"
            )
        if not self.engine_id:
            raise ValueError(
                "You need to provide an engine ID for the Google API. See https://developers.google.com/custom-search/v1/overview"
            )

        # The client is an optional dependency, so it is imported lazily here.
        try:
            from googleapiclient.discovery import build
        except ImportError as err:
            # Chain the original error so the real import failure stays visible in the traceback.
            raise ImportError(
                "You need to install the Google API client. You can do so by running 'pip install google-api-python-client'."
            ) from err

        # create a custom search service
        self.service = build("customsearch", "v1", developerKey=self.api_key)

    def search(self, query: str, **kwargs) -> List[Document]:
        """
        :param query: Query string.
        :param kwargs: Additional parameters passed to the Google API.
                       As an example, you can pass the hl parameter to specify the language to use for the query: 'hl':'en'.
                       If you don't specify the hl parameter, the default language for the user's location is used.
                       For a complete list of the language codes, see [Language Codes](https://developers.google.com/custom-search/docs/xml_results#languageCollections).
                       You can also pass the num parameter to specify the number of results to return: 'num':10.
                       You can find a full list of parameters at [Query Parameters](https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list).
                       The special keys `engine_id` and `top_k` are consumed here and are not forwarded to the API.
        :return: List[Document] with at most `top_k` entries; an empty list if the API returned no results.
        """
        # Per-call kwargs take precedence over the ones given at construction time.
        kwargs = {**self.kwargs, **kwargs}
        # `engine_id` may arrive through the kwargs; it is stored on the instance because
        # `_validate_environment` reads it from there.
        self.engine_id = kwargs.pop("engine_id", self.engine_id)

        self._validate_environment()

        top_k = kwargs.pop("top_k", self.top_k)
        # `num` defaults to 10 unless the caller overrides it.
        params: Dict[str, Union[str, int, float]] = {"num": 10, **kwargs}
        res = self.service.cse().list(q=query, cx=self.engine_id, **params).execute()

        # The response carries no "items" key when the query has zero hits, and individual
        # items are not guaranteed to carry a "snippet"; neither case should raise KeyError.
        items = res.get("items", [])
        documents: List[Document] = [
            Document.from_dict(
                {
                    "title": result["title"],
                    "content": result.get("snippet", ""),
                    "position": i,
                    "link": result["link"],
                }
            )
            for i, result in enumerate(items)
        ]
        logger.debug("Google API returned %s documents for the query '%s'", len(documents), query)
        return documents[:top_k]
|  | |||||||
| @ -1,10 +1,19 @@ | |||||||
| import os | import os | ||||||
|  | import unittest | ||||||
|  | from unittest.mock import MagicMock, patch | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| from haystack.nodes.search_engine import WebSearch | from haystack.nodes.search_engine import WebSearch | ||||||
| from haystack.schema import Document | from haystack.schema import Document | ||||||
| 
 | 
 | ||||||
# google-api-python-client is an optional dependency; record whether it can be
# imported so the Google API tests can skip themselves when it is missing.
try:
    import googleapiclient
except ImportError:
    googleapi_installed = False
else:
    googleapi_installed = True
|  | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.skipif( | @pytest.mark.skipif( | ||||||
|     not os.environ.get("SERPERDEV_API_KEY", None), |     not os.environ.get("SERPERDEV_API_KEY", None), | ||||||
| @ -33,3 +42,66 @@ def test_web_search_with_site_keyword(): | |||||||
|     assert all( |     assert all( | ||||||
|         ["nasa" in doc.meta["link"] or "lifewire" in doc.meta["link"] for doc in result["documents"]] |         ["nasa" in doc.meta["link"] or "lifewire" in doc.meta["link"] for doc in result["documents"]] | ||||||
|     ), "Some documents are not from the specified sites lifewire.com or nasa.gov." |     ), "Some documents are not from the specified sites lifewire.com or nasa.gov." | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
@pytest.mark.unit
def test_web_search_with_google_api_provider():
    """WebSearch can be constructed with the GoogleAPI provider and its (patched) run() is invoked with the query."""
    if not googleapi_installed:
        pytest.skip("google-api-python-client is not installed, skipping test.")

    api_key = "dummy_api_key"
    engine_id = "dummy_search_engine_id"
    question = "The founder of Python"
    canned_output = ([{"content": "Guido van Rossum"}], None)

    # NOTE(review): WebSearch.run itself is replaced by a mock here, so the assertions below
    # only check the canned return value and the call arguments, not the provider's logic.
    with patch("haystack.nodes.search_engine.WebSearch.run", return_value=canned_output) as mock_run:
        web_search = WebSearch(
            api_key=api_key,
            search_engine_provider="GoogleAPI",
            search_engine_kwargs={"engine_id": engine_id},
        )
        result, _ = web_search.run(query=question)

        mock_run.assert_called_once_with(query=question)

        assert "guido" in result[0]["content"].lower()
|  | 
 | ||||||
|  | 
 | ||||||
@pytest.mark.unit
def test_web_search_with_google_api_client():
    """WebSearch with the GoogleAPI provider builds the customsearch client and issues one cse().list(...).execute() call."""
    if not googleapi_installed:
        pytest.skip("google-api-python-client is not installed, skipping test.")

    api_key = "dummy_api_key"
    engine_id = "dummy_search_engine_id"
    question = "The founder of Python"
    fake_response = {
        "items": [
            {
                "title": "Guido van Rossum",
                "snippet": "The founder of Python programming language.",
                "link": "https://example.com/guido",
            }
        ]
    }

    with patch("googleapiclient.discovery.build") as mock_build:
        # Wire up the chain build(...) -> service.cse() -> cse.list(...) -> request.execute().
        service = mock_build.return_value
        cse_resource = service.cse.return_value
        list_request = cse_resource.list.return_value
        list_request.execute.return_value = fake_response

        web_search = WebSearch(
            api_key=api_key,
            search_engine_provider="GoogleAPI",
            search_engine_kwargs={"engine_id": engine_id},
        )
        # The return value is unpacked (as a 2-tuple) but not inspected; only the client calls are checked.
        _result, _edge = web_search.run(query=question)

        mock_build.assert_called_once_with("customsearch", "v1", developerKey=api_key)
        service.cse.assert_called_once()
        cse_resource.list.assert_called_once_with(q=question, cx=engine_id, num=10)
        list_request.execute.assert_called_once()
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Pouyan
						Pouyan