mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-12 15:27:06 +00:00
feat: Improve LinkContentFetcher content type handling (#7920)
* LinkContentFetcher: add more default content type handlers * Update/add unit test * Add reno note * Add image content handler * Update unit test
This commit is contained in:
parent
535a281eec
commit
c2ed275a2d
@ -4,6 +4,7 @@
|
||||
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from fnmatch import fnmatch
|
||||
from typing import Callable, Dict, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
@ -94,10 +95,12 @@ class LinkContentFetcher:
|
||||
|
||||
# register default content handlers that extract data from the response
|
||||
self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
|
||||
self.handlers["text/html"] = _text_content_handler
|
||||
self.handlers["text/plain"] = _text_content_handler
|
||||
self.handlers["application/pdf"] = _binary_content_handler
|
||||
self.handlers["application/octet-stream"] = _binary_content_handler
|
||||
self.handlers["text/*"] = _text_content_handler
|
||||
self.handlers["application/json"] = _text_content_handler
|
||||
self.handlers["application/*"] = _binary_content_handler
|
||||
self.handlers["image/*"] = _binary_content_handler
|
||||
self.handlers["audio/*"] = _binary_content_handler
|
||||
self.handlers["video/*"] = _binary_content_handler
|
||||
|
||||
@retry(
|
||||
reraise=True,
|
||||
@ -175,7 +178,7 @@ class LinkContentFetcher:
|
||||
try:
|
||||
response = self._get_response(url)
|
||||
content_type = self._get_content_type(response)
|
||||
handler: Callable = self.handlers[content_type]
|
||||
handler: Callable = self._resolve_handler(content_type)
|
||||
stream = handler(response)
|
||||
except Exception as e:
|
||||
if self.raise_on_failure:
|
||||
@ -217,6 +220,29 @@ class LinkContentFetcher:
|
||||
content_type = response.headers.get("Content-Type", "")
|
||||
return content_type.split(";")[0]
|
||||
|
||||
def _resolve_handler(self, content_type: str) -> Callable[[Response], ByteStream]:
    """
    Resolves the handler for the given content type.

    Resolution order:

    1. An exact match of `content_type` against the keys of the handlers dictionary.
    2. An exact match of the normalized content type (surrounding whitespace removed, lower-cased).
       Media types are case-insensitive, so `Application/PDF` must resolve like `application/pdf`.
    3. The first registered glob pattern (for example `image/*`) that matches the normalized content
       type. Patterns are tried in registration order and compared case-insensitively.
    4. The handler registered for `text/plain`.

    :param content_type: The content type to resolve the handler for.
    :returns: The handler for the given content type, if found. Otherwise, the handler registered
        for text/plain.
    """
    # direct match on the value exactly as received
    if content_type in self.handlers:
        return self.handlers[content_type]

    # direct match after normalization; the membership test avoids triggering any
    # default factory the handlers mapping may have
    normalized = content_type.strip().lower()
    if normalized in self.handlers:
        return self.handlers[normalized]

    # pattern matches; both sides are lower-cased explicitly because fnmatch only
    # folds case on platforms whose file systems are case-insensitive
    for pattern, handler in self.handlers.items():
        if fnmatch(normalized, pattern.lower()):
            return handler

    # default handler
    return self.handlers["text/plain"]
|
||||
|
||||
def _switch_user_agent(self, retry_state: RetryCallState) -> None:
|
||||
"""
|
||||
Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Improve LinkContentFetcher to support a broader range of content types, including glob patterns for text, application, image, audio, and video types. This update introduces a more flexible content handler resolution mechanism, allowing for direct matches and pattern matching, thereby greatly improving the fetcher's adaptability to various content types encountered on the web.
|
||||
@ -46,10 +46,12 @@ class TestLinkContentFetcher:
|
||||
assert fetcher.retry_attempts == 2
|
||||
assert fetcher.timeout == 3
|
||||
assert fetcher.handlers == {
|
||||
"text/html": _text_content_handler,
|
||||
"text/plain": _text_content_handler,
|
||||
"application/pdf": _binary_content_handler,
|
||||
"application/octet-stream": _binary_content_handler,
|
||||
"text/*": _text_content_handler,
|
||||
"application/json": _text_content_handler,
|
||||
"application/*": _binary_content_handler,
|
||||
"image/*": _binary_content_handler,
|
||||
"audio/*": _binary_content_handler,
|
||||
"video/*": _binary_content_handler,
|
||||
}
|
||||
assert hasattr(fetcher, "_get_response")
|
||||
|
||||
@ -191,3 +193,11 @@ class TestLinkContentFetcher:
|
||||
fetcher = LinkContentFetcher()
|
||||
with pytest.raises(requests.exceptions.ConnectionError):
|
||||
fetcher.run(["https://non_existent_website_dot.com/"])
|
||||
|
||||
@pytest.mark.integration
def test_link_content_fetcher_audio(self):
    """Fetch a short MP3 sample and check that it comes back as a non-empty audio/mpeg stream."""
    audio_url = "https://download.samplelib.com/mp3/sample-3s.mp3"
    result = LinkContentFetcher().run([audio_url])
    audio_stream = result["streams"][0]
    assert audio_stream.meta["content_type"] == "audio/mpeg"
    assert len(audio_stream.data) > 0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user