diff --git a/haystack/nodes/retriever/link_content.py b/haystack/nodes/retriever/link_content.py index fa984254b..7f353cb7f 100644 --- a/haystack/nodes/retriever/link_content.py +++ b/haystack/nodes/retriever/link_content.py @@ -200,8 +200,10 @@ class LinkContentFetcher(BaseComponent): logger.debug("%s handler extracted content from %s", handler, url) extracted_doc["content"] = content - document = Document.from_dict(extracted_doc) - fetched_documents = self.processor.process(documents=[document]) if self.processor else [document] + else: + extracted_doc["content"] = extracted_doc.get("snippet_text", "") # fallback to snippet_text + document = Document.from_dict(extracted_doc) + fetched_documents = self.processor.process(documents=[document]) if self.processor else [document] return fetched_documents diff --git a/releasenotes/notes/link-content-include-snippet-if-blocked-53b0e3108f010315.yaml b/releasenotes/notes/link-content-include-snippet-if-blocked-53b0e3108f010315.yaml new file mode 100644 index 000000000..b8291f1c3 --- /dev/null +++ b/releasenotes/notes/link-content-include-snippet-if-blocked-53b0e3108f010315.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + If LinkContentFetcher is blocked or receives any response code other than HTTPStatus.OK, it returns a document whose + content is the search engine snippet if one is available (empty otherwise), instead of returning no documents. diff --git a/test/nodes/test_link_content_fetcher.py b/test/nodes/test_link_content_fetcher.py index fd0767af8..a01aa0425 100644 --- a/test/nodes/test_link_content_fetcher.py +++ b/test/nodes/test_link_content_fetcher.py @@ -279,7 +279,9 @@ def test_handle_various_response_errors(caplog, mocked_requests, error_code: int docs = r.fetch(url=url) assert f"Couldn't retrieve content from {url}" in caplog.text - assert docs == [] + assert len(docs) == 1 + assert isinstance(docs[0], Document) + assert docs[0].content == "" @pytest.mark.unit