fix: LinkContentFetcher - when no content is retrieved (e.g. the request is blocked), default to snippet text (#5656)

* When no content is retrieved (e.g. the request is blocked), default to the snippet text

* Add release note
This commit is contained in:
Vladimir Blagojevic 2023-08-29 10:57:47 +02:00 committed by GitHub
parent 2118f68769
commit f13b37db24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 12 additions and 3 deletions

View File

@ -200,8 +200,10 @@ class LinkContentFetcher(BaseComponent):
logger.debug("%s handler extracted content from %s", handler, url) logger.debug("%s handler extracted content from %s", handler, url)
extracted_doc["content"] = content extracted_doc["content"] = content
document = Document.from_dict(extracted_doc) else:
fetched_documents = self.processor.process(documents=[document]) if self.processor else [document] extracted_doc["content"] = extracted_doc.get("snippet_text", "") # fallback to snippet_text
document = Document.from_dict(extracted_doc)
fetched_documents = self.processor.process(documents=[document]) if self.processor else [document]
return fetched_documents return fetched_documents

View File

@ -0,0 +1,5 @@
---
enhancements:
- |
If LinkContentFetcher is blocked or receives any response code other than HTTPStatus.OK, it now returns a document
whose content is the search engine snippet if one is available, or an empty string otherwise.

View File

@ -279,7 +279,9 @@ def test_handle_various_response_errors(caplog, mocked_requests, error_code: int
docs = r.fetch(url=url) docs = r.fetch(url=url)
assert f"Couldn't retrieve content from {url}" in caplog.text assert f"Couldn't retrieve content from {url}" in caplog.text
assert docs == [] assert len(docs) == 1
assert isinstance(docs[0], Document)
assert docs[0].content == ""
@pytest.mark.unit @pytest.mark.unit