mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 17:59:27 +00:00 
			
		
		
		
	feat: Allow WebRetriever to use custom LinkContentFetcher (#5662)
* Allow use of custom LinkContentFetcher * Add release note
This commit is contained in:
		
							parent
							
								
									07c85905f3
								
							
						
					
					
						commit
						e5e7bb9654
					
				| @ -51,6 +51,7 @@ class WebRetriever(BaseRetriever): | |||||||
|         api_key: str, |         api_key: str, | ||||||
|         search_engine_provider: Union[str, SearchEngine] = "SerperDev", |         search_engine_provider: Union[str, SearchEngine] = "SerperDev", | ||||||
|         allowed_domains: Optional[List[str]] = None, |         allowed_domains: Optional[List[str]] = None, | ||||||
|  |         link_content_fetcher: Optional[LinkContentFetcher] = None, | ||||||
|         top_search_results: Optional[int] = 10, |         top_search_results: Optional[int] = 10, | ||||||
|         top_k: Optional[int] = 5, |         top_k: Optional[int] = 5, | ||||||
|         mode: Literal["snippets", "raw_documents", "preprocessed_documents"] = "snippets", |         mode: Literal["snippets", "raw_documents", "preprocessed_documents"] = "snippets", | ||||||
| @ -64,6 +65,8 @@ class WebRetriever(BaseRetriever): | |||||||
|         :param api_key: API key for the search engine provider. |         :param api_key: API key for the search engine provider. | ||||||
|         :param search_engine_provider: Name of the search engine provider class, see `providers.py` for a list of supported providers. |         :param search_engine_provider: Name of the search engine provider class, see `providers.py` for a list of supported providers. | ||||||
|         :param allowed_domains: List of domains to restrict the search to. If not provided, the search is unrestricted. |         :param allowed_domains: List of domains to restrict the search to. If not provided, the search is unrestricted. | ||||||
|  |         :param link_content_fetcher: LinkContentFetcher to be used to fetch the content from the links. If not provided, | ||||||
|  |         the default LinkContentFetcher is used. | ||||||
|         :param top_search_results: Number of top search results to be retrieved. |         :param top_search_results: Number of top search results to be retrieved. | ||||||
|         :param top_k: Top k documents to be returned by the retriever. |         :param top_k: Top k documents to be returned by the retriever. | ||||||
|         :param mode: Whether to return snippets, raw documents, or preprocessed documents. Snippets are the default. |         :param mode: Whether to return snippets, raw documents, or preprocessed documents. Snippets are the default. | ||||||
| @ -80,6 +83,7 @@ class WebRetriever(BaseRetriever): | |||||||
|             allowed_domains=allowed_domains, |             allowed_domains=allowed_domains, | ||||||
|             search_engine_provider=search_engine_provider, |             search_engine_provider=search_engine_provider, | ||||||
|         ) |         ) | ||||||
|  |         self.link_content_fetcher = link_content_fetcher or LinkContentFetcher() | ||||||
|         self.mode = mode |         self.mode = mode | ||||||
|         self.cache_document_store = cache_document_store |         self.cache_document_store = cache_document_store | ||||||
|         self.document_store = cache_document_store |         self.document_store = cache_document_store | ||||||
| @ -186,15 +190,13 @@ class WebRetriever(BaseRetriever): | |||||||
|         if not links: |         if not links: | ||||||
|             return [] |             return [] | ||||||
| 
 | 
 | ||||||
|         fetcher = LinkContentFetcher(raise_on_failure=True) |  | ||||||
| 
 |  | ||||||
|         def link_fetch(link: SearchResult) -> List[Document]: |         def link_fetch(link: SearchResult) -> List[Document]: | ||||||
|             """ |             """ | ||||||
|             Encapsulate the link fetching logic in a function to be used in a ThreadPoolExecutor. |             Encapsulate the link fetching logic in a function to be used in a ThreadPoolExecutor. | ||||||
|             """ |             """ | ||||||
|             docs: List[Document] = [] |             docs: List[Document] = [] | ||||||
|             try: |             try: | ||||||
|                 docs = fetcher.fetch( |                 docs = self.link_content_fetcher.fetch( | ||||||
|                     url=link.url, |                     url=link.url, | ||||||
|                     doc_kwargs={ |                     doc_kwargs={ | ||||||
|                         "id_hash_keys": ["meta.url"], |                         "id_hash_keys": ["meta.url"], | ||||||
|  | |||||||
| @ -0,0 +1,4 @@ | |||||||
|  | --- | ||||||
|  | enhancements: | ||||||
|  |   - | | ||||||
|  |     Allow WebRetriever users to specify a custom LinkContentFetcher instance | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Vladimir Blagojevic
						Vladimir Blagojevic