feat: Improve LinkContentFetcher content type handling (#7920)

* LinkContentFetcher: add more default content type handlers * Update/add unit test * Add reno note * Add image content handler * Update unit test
2025-12-14 08:37:42 +00:00 · 2024-06-27 11:45:20 +02:00 · 2024-06-27 11:45:20 +02:00 · c2ed275a2d
commit c2ed275a2d
parent 535a281eec
3 changed files with 49 additions and 9 deletions
--- a/haystack/components/fetchers/link_content.py
+++ b/haystack/components/fetchers/link_content.py
@ -4,6 +4,7 @@
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from fnmatch import fnmatch
 from typing import Callable, Dict, List, Optional, Tuple
 import requests
@ -94,10 +95,12 @@ class LinkContentFetcher:
        # register default content handlers that extract data from the response
        self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
-        self.handlers["text/html"] = _text_content_handler
+        self.handlers["text/*"] = _text_content_handler
-        self.handlers["text/plain"] = _text_content_handler
+        self.handlers["application/json"] = _text_content_handler
-        self.handlers["application/pdf"] = _binary_content_handler
+        self.handlers["application/*"] = _binary_content_handler
-        self.handlers["application/octet-stream"] = _binary_content_handler
+        self.handlers["image/*"] = _binary_content_handler
        self.handlers["audio/*"] = _binary_content_handler
        self.handlers["video/*"] = _binary_content_handler
        @retry(
            reraise=True,
@ -175,7 +178,7 @@ class LinkContentFetcher:
        try:
            response = self._get_response(url)
            content_type = self._get_content_type(response)
-            handler: Callable = self.handlers[content_type]
+            handler: Callable = self._resolve_handler(content_type)
            stream = handler(response)
        except Exception as e:
            if self.raise_on_failure:
@ -217,6 +220,29 @@ class LinkContentFetcher:
        content_type = response.headers.get("Content-Type", "")
        return content_type.split(";")[0]
    def _resolve_handler(self, content_type: str) -> Callable[[Response], ByteStream]:
        """
        Resolves the handler for the given content type.
        First, it tries to find a direct match for the content type in the handlers dictionary.
        If no direct match is found, it tries to find a pattern match using the fnmatch function.
        If no pattern match is found, it returns the default handler for text/plain.
        :param content_type: The content type to resolve the handler for.
        :returns: The handler for the given content type, if found. Otherwise, the default handler for text/plain.
        """
        # direct match
        if content_type in self.handlers:
            return self.handlers[content_type]
        # pattern matches
        for pattern, handler in self.handlers.items():
            if fnmatch(content_type, pattern):
                return handler
        # default handler
        return self.handlers["text/plain"]
    def _switch_user_agent(self, retry_state: RetryCallState) -> None:
        """
        Switches the User-Agent for this LinkContentRetriever to the next one in the list of user agents.
--- a/releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml
+++ b/releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml
@ -0,0 +1,4 @@
 ---
 enhancements:
 - |
   Improve LinkContentFetcher to support a broader range of content types, including glob patterns for text, application, image, audio, and video types. This update introduces a more flexible content handler resolution mechanism that first looks for a direct match and then falls back to pattern matching, greatly improving the fetcher's adaptability to the various content types encountered on the web.
--- a/test/components/fetchers/test_link_content_fetcher.py
+++ b/test/components/fetchers/test_link_content_fetcher.py
@ -46,10 +46,12 @@ class TestLinkContentFetcher:
        assert fetcher.retry_attempts == 2
        assert fetcher.timeout == 3
        assert fetcher.handlers == {
-            "text/html": _text_content_handler,
+            "text/*": _text_content_handler,
-            "text/plain": _text_content_handler,
+            "application/json": _text_content_handler,
-            "application/pdf": _binary_content_handler,
+            "application/*": _binary_content_handler,
-            "application/octet-stream": _binary_content_handler,
+            "image/*": _binary_content_handler,
            "audio/*": _binary_content_handler,
            "video/*": _binary_content_handler,
        }
        assert hasattr(fetcher, "_get_response")
@ -191,3 +193,11 @@ class TestLinkContentFetcher:
        fetcher = LinkContentFetcher()
        with pytest.raises(requests.exceptions.ConnectionError):
            fetcher.run(["https://non_existent_website_dot.com/"])
    @pytest.mark.integration
    def test_link_content_fetcher_audio(self):
        """
        Fetches a short MP3 sample and checks that the first returned stream is
        non-empty and tagged with the audio/mpeg content type.
        """
        result = LinkContentFetcher().run(["https://download.samplelib.com/mp3/sample-3s.mp3"])
        audio_stream = result["streams"][0]
        assert audio_stream.meta["content_type"] == "audio/mpeg"
        assert len(audio_stream.data) > 0