From c2ed275a2d87fc2ae6686142a03debd3a19c3189 Mon Sep 17 00:00:00 2001
From: Vladimir Blagojevic <dovlex@gmail.com>
Date: Thu, 27 Jun 2024 11:45:20 +0200
Subject: [PATCH] feat: Improve LinkContentFetcher content type handling
 (#7920)

* LinkContentFetcher: add more default content type handlers

* Update/add unit test

* Add reno note

* Add image content handler

* Update unit test
---
 haystack/components/fetchers/link_content.py  | 36 ++++++++++++++++---
 ...fetcher-enhancements-49babe1c60888043.yaml |  4 +++
 .../fetchers/test_link_content_fetcher.py     | 18 +++++++---
 3 files changed, 49 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml

diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py
index b658ff245..ccc698f61 100644
--- a/haystack/components/fetchers/link_content.py
+++ b/haystack/components/fetchers/link_content.py
@@ -4,6 +4,7 @@
 
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
+from fnmatch import fnmatch
 from typing import Callable, Dict, List, Optional, Tuple
 
 import requests
@@ -94,10 +95,12 @@ class LinkContentFetcher:
 
         # register default content handlers that extract data from the response
         self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
-        self.handlers["text/html"] = _text_content_handler
-        self.handlers["text/plain"] = _text_content_handler
-        self.handlers["application/pdf"] = _binary_content_handler
-        self.handlers["application/octet-stream"] = _binary_content_handler
+        self.handlers["text/*"] = _text_content_handler
+        self.handlers["application/json"] = _text_content_handler
+        self.handlers["application/*"] = _binary_content_handler
+        self.handlers["image/*"] = _binary_content_handler
+        self.handlers["audio/*"] = _binary_content_handler
+        self.handlers["video/*"] = _binary_content_handler
 
         @retry(
             reraise=True,
@@ -175,7 +178,7 @@ class LinkContentFetcher:
         try:
             response = self._get_response(url)
             content_type = self._get_content_type(response)
-            handler: Callable = self.handlers[content_type]
+            handler: Callable = self._resolve_handler(content_type)
             stream = handler(response)
         except Exception as e:
             if self.raise_on_failure:
@@ -217,6 +220,29 @@ class LinkContentFetcher:
         content_type = response.headers.get("Content-Type", "")
         return content_type.split(";")[0]
 
+    def _resolve_handler(self, content_type: str) -> Callable[[Response], ByteStream]:
+        """
+        Resolves the handler for the given content type.
+
+        First, it tries to find a direct match for the content type in the handlers dictionary.
+        If no direct match is found, the registered keys are tried as fnmatch glob patterns, in insertion order.
+        If nothing matches, it falls back to the "text/plain" handler, or to the text handler if none is registered.
+
+        :param content_type: The content type to resolve the handler for.
+        :returns: The handler for the given content type, if found. Otherwise, the fallback text handler.
+        """
+        # direct match
+        if content_type in self.handlers:
+            return self.handlers[content_type]
+
+        # pattern matches
+        for pattern, handler in self.handlers.items():
+            if fnmatch(content_type, pattern):
+                return handler
+
+        # default handler; .get() keeps a miss from inserting a "text/plain" key into the defaultdict
+        return self.handlers.get("text/plain", _text_content_handler)
+
     def _switch_user_agent(self, retry_state: RetryCallState) -> None:
         """
         Switches the User-Agent for this LinkContentRetriever to the next one in the list of user agents.
diff --git a/releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml b/releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml
new file mode 100644
index 000000000..d6a7d2428
--- /dev/null
+++ b/releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+ - |
+   Improve LinkContentFetcher to support a broader range of content types, including glob patterns for text, application, image, audio, and video types. This update introduces a more flexible content handler resolution mechanism that tries a direct match first and then pattern matching, greatly improving the fetcher's adaptability to the various content types encountered on the web.
diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py
index ac99bc4cf..c6a4d5c55 100644
--- a/test/components/fetchers/test_link_content_fetcher.py
+++ b/test/components/fetchers/test_link_content_fetcher.py
@@ -46,10 +46,12 @@ class TestLinkContentFetcher:
         assert fetcher.retry_attempts == 2
         assert fetcher.timeout == 3
         assert fetcher.handlers == {
-            "text/html": _text_content_handler,
-            "text/plain": _text_content_handler,
-            "application/pdf": _binary_content_handler,
-            "application/octet-stream": _binary_content_handler,
+            "text/*": _text_content_handler,
+            "application/json": _text_content_handler,
+            "application/*": _binary_content_handler,
+            "image/*": _binary_content_handler,
+            "audio/*": _binary_content_handler,
+            "video/*": _binary_content_handler,
         }
         assert hasattr(fetcher, "_get_response")
 
@@ -191,3 +193,11 @@ class TestLinkContentFetcher:
         fetcher = LinkContentFetcher()
         with pytest.raises(requests.exceptions.ConnectionError):
             fetcher.run(["https://non_existent_website_dot.com/"])
+
+    @pytest.mark.integration
+    def test_link_content_fetcher_audio(self):
+        """Fetching an MP3 link yields a non-empty stream tagged as audio/mpeg."""
+        result = LinkContentFetcher().run(["https://download.samplelib.com/mp3/sample-3s.mp3"])
+        audio_stream = result["streams"][0]
+        assert audio_stream.meta["content_type"] == "audio/mpeg"
+        assert len(audio_stream.data) > 0