feat: Improve LinkContentFetcher content type handling (#7920)

* LinkContentFetcher: add more default content type handlers

* Update/add unit test

* Add reno note

* Add image content handler

* Update unit test
This commit is contained in:
Vladimir Blagojevic 2024-06-27 11:45:20 +02:00 committed by GitHub
parent 535a281eec
commit c2ed275a2d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 49 additions and 9 deletions

View File

@ -4,6 +4,7 @@
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from fnmatch import fnmatch
from typing import Callable, Dict, List, Optional, Tuple
import requests
@ -94,10 +95,12 @@ class LinkContentFetcher:
# register default content handlers that extract data from the response
self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
self.handlers["text/html"] = _text_content_handler
self.handlers["text/plain"] = _text_content_handler
self.handlers["application/pdf"] = _binary_content_handler
self.handlers["application/octet-stream"] = _binary_content_handler
self.handlers["text/*"] = _text_content_handler
self.handlers["application/json"] = _text_content_handler
self.handlers["application/*"] = _binary_content_handler
self.handlers["image/*"] = _binary_content_handler
self.handlers["audio/*"] = _binary_content_handler
self.handlers["video/*"] = _binary_content_handler
@retry(
reraise=True,
@ -175,7 +178,7 @@ class LinkContentFetcher:
try:
response = self._get_response(url)
content_type = self._get_content_type(response)
handler: Callable = self.handlers[content_type]
handler: Callable = self._resolve_handler(content_type)
stream = handler(response)
except Exception as e:
if self.raise_on_failure:
@ -217,6 +220,29 @@ class LinkContentFetcher:
content_type = response.headers.get("Content-Type", "")
return content_type.split(";")[0]
def _resolve_handler(self, content_type: str) -> Callable[[Response], ByteStream]:
    """
    Resolves the handler for the given content type.

    Media type names are case-insensitive, so matching is done on a normalized (whitespace-stripped,
    lower-cased) form of the content type in addition to the value as given. Resolution order:

    1. A direct match for the content type, as given or normalized, in the handlers dictionary.
    2. The first registered glob pattern (for example `text/*`), in registration order, that matches
       the normalized content type.
    3. The handler registered for `text/plain`.

    :param content_type: The content type to resolve the handler for, e.g. `audio/mpeg`.
    :returns: The handler for the given content type, if found. Otherwise, the default handler for text/plain.
    """
    normalized = content_type.strip().lower()

    # direct match; membership is tested before indexing because `self.handlers` is a defaultdict,
    # so indexing alone would never miss and the pattern lookup below would be unreachable
    for candidate in (content_type, normalized):
        if candidate in self.handlers:
            return self.handlers[candidate]

    # pattern matches, compared case-insensitively on every platform
    for pattern, handler in self.handlers.items():
        if fnmatch(normalized, pattern.lower()):
            return handler

    # default handler
    return self.handlers["text/plain"]
def _switch_user_agent(self, retry_state: RetryCallState) -> None:
"""
Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
Improve LinkContentFetcher to support a broader range of content types, including glob patterns for text, application, image, audio, and video types. This update introduces a more flexible content handler resolution mechanism, allowing for direct matches and pattern matching, thereby greatly improving the component's adaptability to various content types encountered on the web.

View File

@ -46,10 +46,12 @@ class TestLinkContentFetcher:
assert fetcher.retry_attempts == 2
assert fetcher.timeout == 3
assert fetcher.handlers == {
"text/html": _text_content_handler,
"text/plain": _text_content_handler,
"application/pdf": _binary_content_handler,
"application/octet-stream": _binary_content_handler,
"text/*": _text_content_handler,
"application/json": _text_content_handler,
"application/*": _binary_content_handler,
"image/*": _binary_content_handler,
"audio/*": _binary_content_handler,
"video/*": _binary_content_handler,
}
assert hasattr(fetcher, "_get_response")
@ -191,3 +193,11 @@ class TestLinkContentFetcher:
fetcher = LinkContentFetcher()
with pytest.raises(requests.exceptions.ConnectionError):
fetcher.run(["https://non_existent_website_dot.com/"])
@pytest.mark.integration
def test_link_content_fetcher_audio(self):
    """Fetch a short MP3 sample over the network and check that a non-empty audio/mpeg stream is returned."""
    result = LinkContentFetcher().run(["https://download.samplelib.com/mp3/sample-3s.mp3"])
    audio_stream = result["streams"][0]
    assert audio_stream.meta["content_type"] == "audio/mpeg"
    assert len(audio_stream.data) > 0