fix: LinkContentFetcher html text encoding (#7975)

* fix: content encoding of LinkContentFetcher

* fix tests

* add reno

* only touch html
This commit is contained in:
tstadel 2024-07-09 15:28:49 +02:00 committed by GitHub
parent 583eb8a293
commit 7e35280d4f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 7 additions and 1 deletions

View File

@ -96,6 +96,7 @@ class LinkContentFetcher:
# register default content handlers that extract data from the response
self.handlers: Dict[str, Callable[[Response], ByteStream]] = defaultdict(lambda: _text_content_handler)
self.handlers["text/*"] = _text_content_handler
self.handlers["text/html"] = _binary_content_handler
self.handlers["application/json"] = _text_content_handler
self.handlers["application/*"] = _binary_content_handler
self.handlers["image/*"] = _binary_content_handler

View File

@ -0,0 +1,4 @@
---
fixes:
- |
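Fix encoding of HTML content in LinkContentFetcher: text/html responses are now handled by the binary content handler instead of the text content handler.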

View File

@ -47,6 +47,7 @@ class TestLinkContentFetcher:
assert fetcher.timeout == 3
assert fetcher.handlers == {
"text/*": _text_content_handler,
"text/html": _binary_content_handler,
"application/json": _text_content_handler,
"application/*": _binary_content_handler,
"image/*": _binary_content_handler,
@ -78,7 +79,7 @@ class TestLinkContentFetcher:
correct_response = b"<h1>Example test response</h1>"
with patch("haystack.components.fetchers.link_content.requests") as mock_run:
mock_run.get.return_value = Mock(
status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
status_code=200, content=b"<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
)
fetcher = LinkContentFetcher()
streams = fetcher.run(urls=["https://www.example.com"])["streams"]