diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py index ccc698f61..93cb13ede 100644 --- a/haystack/components/fetchers/link_content.py +++ b/haystack/components/fetchers/link_content.py @@ -96,6 +96,7 @@ class LinkContentFetcher: # register default content handlers that extract data from the response self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler) self.handlers["text/*"] = _text_content_handler + self.handlers["text/html"] = _binary_content_handler self.handlers["application/json"] = _text_content_handler self.handlers["application/*"] = _binary_content_handler self.handlers["image/*"] = _binary_content_handler diff --git a/releasenotes/notes/fix-linkcontentfetcher-encoding-6c8df3c5b09fbc50.yaml b/releasenotes/notes/fix-linkcontentfetcher-encoding-6c8df3c5b09fbc50.yaml new file mode 100644 index 000000000..3dbf59254 --- /dev/null +++ b/releasenotes/notes/fix-linkcontentfetcher-encoding-6c8df3c5b09fbc50.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + Encoding of HTML files in LinkContentFetcher diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py index c6a4d5c55..35cbd5e40 100644 --- a/test/components/fetchers/test_link_content_fetcher.py +++ b/test/components/fetchers/test_link_content_fetcher.py @@ -47,6 +47,7 @@ class TestLinkContentFetcher: assert fetcher.timeout == 3 assert fetcher.handlers == { "text/*": _text_content_handler, + "text/html": _binary_content_handler, "application/json": _text_content_handler, "application/*": _binary_content_handler, "image/*": _binary_content_handler, @@ -78,7 +79,7 @@ class TestLinkContentFetcher: correct_response = b"