feat: Improve LinkContentFetcher content type handling (#7920)

* LinkContentFetcher: add more default content type handlers

* Update/add unit test

* Add reno note

* Add image content handler

* Update unit test
This commit is contained in:
Vladimir Blagojevic 2024-06-27 11:45:20 +02:00 committed by GitHub
parent 535a281eec
commit c2ed275a2d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 49 additions and 9 deletions

View File

@ -4,6 +4,7 @@
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from fnmatch import fnmatch
from typing import Callable, Dict, List, Optional, Tuple from typing import Callable, Dict, List, Optional, Tuple
import requests import requests
@ -94,10 +95,12 @@ class LinkContentFetcher:
# register default content handlers that extract data from the response # register default content handlers that extract data from the response
self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler) self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
self.handlers["text/html"] = _text_content_handler self.handlers["text/*"] = _text_content_handler
self.handlers["text/plain"] = _text_content_handler self.handlers["application/json"] = _text_content_handler
self.handlers["application/pdf"] = _binary_content_handler self.handlers["application/*"] = _binary_content_handler
self.handlers["application/octet-stream"] = _binary_content_handler self.handlers["image/*"] = _binary_content_handler
self.handlers["audio/*"] = _binary_content_handler
self.handlers["video/*"] = _binary_content_handler
@retry( @retry(
reraise=True, reraise=True,
@ -175,7 +178,7 @@ class LinkContentFetcher:
try: try:
response = self._get_response(url) response = self._get_response(url)
content_type = self._get_content_type(response) content_type = self._get_content_type(response)
handler: Callable = self.handlers[content_type] handler: Callable = self._resolve_handler(content_type)
stream = handler(response) stream = handler(response)
except Exception as e: except Exception as e:
if self.raise_on_failure: if self.raise_on_failure:
@ -217,6 +220,29 @@ class LinkContentFetcher:
content_type = response.headers.get("Content-Type", "") content_type = response.headers.get("Content-Type", "")
return content_type.split(";")[0] return content_type.split(";")[0]
def _resolve_handler(self, content_type: str) -> Callable[[Response], ByteStream]:
"""
Resolves the handler for the given content type.
First, it tries to find a direct match for the content type in the handlers dictionary.
If no direct match is found, it tries to find a pattern match using the fnmatch function.
If no pattern match is found, it returns the default handler for text/plain.
:param content_type: The content type to resolve the handler for.
:returns: The handler for the given content type, if found. Otherwise, the default handler for text/plain.
"""
# direct match
if content_type in self.handlers:
return self.handlers[content_type]
# pattern matches
for pattern, handler in self.handlers.items():
if fnmatch(content_type, pattern):
return handler
# default handler
return self.handlers["text/plain"]
def _switch_user_agent(self, retry_state: RetryCallState) -> None: def _switch_user_agent(self, retry_state: RetryCallState) -> None:
""" """
Switches the User-Agent for this LinkContentRetriever to the next one in the list of user agents. Switches the User-Agent for this LinkContentRetriever to the next one in the list of user agents.

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
Improve LinkContentFetcher to support a broader range of content types, including glob patterns for text, application, image, audio, and video types. This update introduces a more flexible content handler resolution mechanism, allowing for direct matches and pattern matching, thereby greatly improving the fetcher's adaptability to various content types encountered on the web.

View File

@ -46,10 +46,12 @@ class TestLinkContentFetcher:
assert fetcher.retry_attempts == 2 assert fetcher.retry_attempts == 2
assert fetcher.timeout == 3 assert fetcher.timeout == 3
assert fetcher.handlers == { assert fetcher.handlers == {
"text/html": _text_content_handler, "text/*": _text_content_handler,
"text/plain": _text_content_handler, "application/json": _text_content_handler,
"application/pdf": _binary_content_handler, "application/*": _binary_content_handler,
"application/octet-stream": _binary_content_handler, "image/*": _binary_content_handler,
"audio/*": _binary_content_handler,
"video/*": _binary_content_handler,
} }
assert hasattr(fetcher, "_get_response") assert hasattr(fetcher, "_get_response")
@ -191,3 +193,11 @@ class TestLinkContentFetcher:
fetcher = LinkContentFetcher() fetcher = LinkContentFetcher()
with pytest.raises(requests.exceptions.ConnectionError): with pytest.raises(requests.exceptions.ConnectionError):
fetcher.run(["https://non_existent_website_dot.com/"]) fetcher.run(["https://non_existent_website_dot.com/"])
@pytest.mark.integration
def test_link_content_fetcher_audio(self):
    """Fetch a short MP3 sample and check that the first stream is non-empty and tagged audio/mpeg."""
    audio_url = "https://download.samplelib.com/mp3/sample-3s.mp3"
    result = LinkContentFetcher().run([audio_url])
    audio_stream = result["streams"][0]
    assert audio_stream.meta["content_type"] == "audio/mpeg"
    assert len(audio_stream.data) > 0