From e5e7bb9654a54168ac15c08a19d6b76b24f25941 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Tue, 29 Aug 2023 15:46:48 +0200 Subject: [PATCH] feat: Allow WebRetriever to use custom LinkContentFetcher (#5662) * Allow use of custom LinkContentFetcher * Add release note --- haystack/nodes/retriever/web.py | 8 +++++--- ...llow-custom-link-content-fetcher-8728141d81d7a5e5.yaml | 4 ++++ 2 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/web-retriever-allow-custom-link-content-fetcher-8728141d81d7a5e5.yaml diff --git a/haystack/nodes/retriever/web.py b/haystack/nodes/retriever/web.py index e93e32d26..cfb50afd4 100644 --- a/haystack/nodes/retriever/web.py +++ b/haystack/nodes/retriever/web.py @@ -51,6 +51,7 @@ class WebRetriever(BaseRetriever): api_key: str, search_engine_provider: Union[str, SearchEngine] = "SerperDev", allowed_domains: Optional[List[str]] = None, + link_content_fetcher: Optional[LinkContentFetcher] = None, top_search_results: Optional[int] = 10, top_k: Optional[int] = 5, mode: Literal["snippets", "raw_documents", "preprocessed_documents"] = "snippets", @@ -64,6 +65,8 @@ class WebRetriever(BaseRetriever): :param api_key: API key for the search engine provider. :param search_engine_provider: Name of the search engine provider class, see `providers.py` for a list of supported providers. :param allowed_domains: List of domains to restrict the search to. If not provided, the search is unrestricted. + :param link_content_fetcher: LinkContentFetcher to be used to fetch the content from the links. If not provided, + the default LinkContentFetcher is used. :param top_search_results: Number of top search results to be retrieved. :param top_k: Top k documents to be returned by the retriever. :param mode: Whether to return snippets, raw documents, or preprocessed documents. Snippets are the default. 
@@ -80,6 +83,7 @@ class WebRetriever(BaseRetriever): allowed_domains=allowed_domains, search_engine_provider=search_engine_provider, ) + self.link_content_fetcher = link_content_fetcher or LinkContentFetcher() self.mode = mode self.cache_document_store = cache_document_store self.document_store = cache_document_store @@ -186,15 +190,13 @@ class WebRetriever(BaseRetriever): if not links: return [] - fetcher = LinkContentFetcher(raise_on_failure=True) - def link_fetch(link: SearchResult) -> List[Document]: """ Encapsulate the link fetching logic in a function to be used in a ThreadPoolExecutor. """ docs: List[Document] = [] try: - docs = fetcher.fetch( + docs = self.link_content_fetcher.fetch( url=link.url, doc_kwargs={ "id_hash_keys": ["meta.url"], diff --git a/releasenotes/notes/web-retriever-allow-custom-link-content-fetcher-8728141d81d7a5e5.yaml b/releasenotes/notes/web-retriever-allow-custom-link-content-fetcher-8728141d81d7a5e5.yaml new file mode 100644 index 000000000..fff5904a1 --- /dev/null +++ b/releasenotes/notes/web-retriever-allow-custom-link-content-fetcher-8728141d81d7a5e5.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Allow WebRetriever users to specify a custom LinkContentFetcher instance