From c2ed275a2d87fc2ae6686142a03debd3a19c3189 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 27 Jun 2024 11:45:20 +0200 Subject: [PATCH] feat: Improve LinkContentFetcher content type handling (#7920) * LinkContentFetcher: add more default content type handlers * Update/add unit test * Add reno note * Add image content handler * Update unit test --- haystack/components/fetchers/link_content.py | 36 ++++++++++++++++--- ...fetcher-enhancements-49babe1c60888043.yaml | 4 +++ .../fetchers/test_link_content_fetcher.py | 18 +++++++--- 3 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py index b658ff245..ccc698f61 100644 --- a/haystack/components/fetchers/link_content.py +++ b/haystack/components/fetchers/link_content.py @@ -4,6 +4,7 @@ from collections import defaultdict from concurrent.futures import ThreadPoolExecutor +from fnmatch import fnmatch from typing import Callable, Dict, List, Optional, Tuple import requests @@ -94,10 +95,12 @@ class LinkContentFetcher: # register default content handlers that extract data from the response self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler) - self.handlers["text/html"] = _text_content_handler - self.handlers["text/plain"] = _text_content_handler - self.handlers["application/pdf"] = _binary_content_handler - self.handlers["application/octet-stream"] = _binary_content_handler + self.handlers["text/*"] = _text_content_handler + self.handlers["application/json"] = _text_content_handler + self.handlers["application/*"] = _binary_content_handler + self.handlers["image/*"] = _binary_content_handler + self.handlers["audio/*"] = _binary_content_handler + self.handlers["video/*"] = _binary_content_handler @retry( reraise=True, @@ -175,7 +178,7 @@ class LinkContentFetcher: try: 
response = self._get_response(url) content_type = self._get_content_type(response) - handler: Callable = self.handlers[content_type] + handler: Callable = self._resolve_handler(content_type) stream = handler(response) except Exception as e: if self.raise_on_failure: @@ -217,6 +220,29 @@ class LinkContentFetcher: content_type = response.headers.get("Content-Type", "") return content_type.split(";")[0] + def _resolve_handler(self, content_type: str) -> Callable[[Response], ByteStream]: + """ + Resolves the handler for the given content type. + + First, it tries to find a direct match for the content type in the handlers dictionary. + If no direct match is found, it tries to find a pattern match using the fnmatch function. + If no pattern match is found, it returns the default handler for text/plain. + + :param content_type: The content type to resolve the handler for. + :returns: The handler for the given content type, if found. Otherwise, the default handler for text/plain. + """ + # direct match + if content_type in self.handlers: + return self.handlers[content_type] + + # pattern matches + for pattern, handler in self.handlers.items(): + if fnmatch(content_type, pattern): + return handler + + # default handler + return self.handlers["text/plain"] + def _switch_user_agent(self, retry_state: RetryCallState) -> None: """ Switches the User-Agent for this LinkContentRetriever to the next one in the list of user agents. diff --git a/releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml b/releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml new file mode 100644 index 000000000..d6a7d2428 --- /dev/null +++ b/releasenotes/notes/link-content-fetcher-enhancements-49babe1c60888043.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Improve LinkContentFetcher to support a broader range of content types, including glob patterns for text, application, image, audio, and video types. 
This update introduces a more flexible content handler resolution mechanism, allowing for direct matches and pattern matching, thereby greatly improving the handler's adaptability to various content types encountered on the web. diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py index ac99bc4cf..c6a4d5c55 100644 --- a/test/components/fetchers/test_link_content_fetcher.py +++ b/test/components/fetchers/test_link_content_fetcher.py @@ -46,10 +46,12 @@ class TestLinkContentFetcher: assert fetcher.retry_attempts == 2 assert fetcher.timeout == 3 assert fetcher.handlers == { - "text/html": _text_content_handler, - "text/plain": _text_content_handler, - "application/pdf": _binary_content_handler, - "application/octet-stream": _binary_content_handler, + "text/*": _text_content_handler, + "application/json": _text_content_handler, + "application/*": _binary_content_handler, + "image/*": _binary_content_handler, + "audio/*": _binary_content_handler, + "video/*": _binary_content_handler, } assert hasattr(fetcher, "_get_response") @@ -191,3 +193,11 @@ class TestLinkContentFetcher: fetcher = LinkContentFetcher() with pytest.raises(requests.exceptions.ConnectionError): fetcher.run(["https://non_existent_website_dot.com/"]) + + @pytest.mark.integration + def test_link_content_fetcher_audio(self): + fetcher = LinkContentFetcher() + streams = fetcher.run(["https://download.samplelib.com/mp3/sample-3s.mp3"])["streams"] + first_stream = streams[0] + assert first_stream.meta["content_type"] == "audio/mpeg" + assert len(first_stream.data) > 0