From 7e35280d4fede1b577e98e545148cfb7e8631f4f Mon Sep 17 00:00:00 2001 From: tstadel <60758086+tstadel@users.noreply.github.com> Date: Tue, 9 Jul 2024 15:28:49 +0200 Subject: [PATCH] fix: LinkContentFetcher html text encoding (#7975) * fix: content encoding of LinkContentFetcher * fix tests * add reno * only touch html --- haystack/components/fetchers/link_content.py | 1 + .../fix-linkcontentfetcher-encoding-6c8df3c5b09fbc50.yaml | 4 ++++ test/components/fetchers/test_link_content_fetcher.py | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/fix-linkcontentfetcher-encoding-6c8df3c5b09fbc50.yaml diff --git a/haystack/components/fetchers/link_content.py b/haystack/components/fetchers/link_content.py index ccc698f61..93cb13ede 100644 --- a/haystack/components/fetchers/link_content.py +++ b/haystack/components/fetchers/link_content.py @@ -96,6 +96,7 @@ class LinkContentFetcher: # register default content handlers that extract data from the response self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler) self.handlers["text/*"] = _text_content_handler + self.handlers["text/html"] = _binary_content_handler self.handlers["application/json"] = _text_content_handler self.handlers["application/*"] = _binary_content_handler self.handlers["image/*"] = _binary_content_handler diff --git a/releasenotes/notes/fix-linkcontentfetcher-encoding-6c8df3c5b09fbc50.yaml b/releasenotes/notes/fix-linkcontentfetcher-encoding-6c8df3c5b09fbc50.yaml new file mode 100644 index 000000000..3dbf59254 --- /dev/null +++ b/releasenotes/notes/fix-linkcontentfetcher-encoding-6c8df3c5b09fbc50.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + Fix the encoding of HTML content in LinkContentFetcher: `text/html` responses are now handled by the binary content handler. diff --git a/test/components/fetchers/test_link_content_fetcher.py b/test/components/fetchers/test_link_content_fetcher.py index c6a4d5c55..35cbd5e40 100644 --- a/test/components/fetchers/test_link_content_fetcher.py +++ 
b/test/components/fetchers/test_link_content_fetcher.py @@ -47,6 +47,7 @@ class TestLinkContentFetcher: assert fetcher.timeout == 3 assert fetcher.handlers == { "text/*": _text_content_handler, + "text/html": _binary_content_handler, "application/json": _text_content_handler, "application/*": _binary_content_handler, "image/*": _binary_content_handler, @@ -78,7 +79,7 @@ class TestLinkContentFetcher: correct_response = b"