haystack/test/nodes/test_link_content_fetcher.py
Vladimir Blagojevic f21005f8ea
refactor: Extract link retrieval from WebRetriever, introduce LinkContentRetriever (#5227)
* Extract link retrieval from WebRetriever, introduce LinkContentRetriever

* Add example
---------

Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
Co-authored-by: Daria Fokina <daria.f93@gmail.com>
2023-07-13 12:54:40 +02:00

369 lines
11 KiB
Python

from unittest.mock import Mock, patch
import logging
import pytest
import requests
from requests import Response
from haystack import Document
from haystack.nodes import LinkContentFetcher
@pytest.fixture
def mocked_requests():
    """
    Replaces the `requests` name inside the link content module with a mock and yields that mock.

    The mocked `get` returns a response object whose status code is 200 and whose text is a fixed sample string.
    """
    with patch("haystack.nodes.retriever.link_content.requests") as requests_mock:
        # Keyword arguments passed to Mock become plain attributes on the mock
        fake_response = Mock(status_code=200, text="Sample content from webpage")
        requests_mock.get.return_value = fake_response
        yield requests_mock
@pytest.fixture
def mocked_article_extractor():
    """
    Patches boilerpy3's `ArticleExtractor.get_content` so it returns a fixed sample string while the test runs.
    """
    extractor_patch = patch(
        "boilerpy3.extractors.ArticleExtractor.get_content", return_value="Sample content from webpage"
    )
    # The patch is only active inside the with-block, i.e. for the duration of the test using this fixture
    with extractor_patch:
        yield
@pytest.mark.unit
def test_init():
    """
    Checks that a LinkContentFetcher created without arguments has no processor and exposes an "html" handler.
    """
    fetcher = LinkContentFetcher()

    assert fetcher.processor is None

    handlers = fetcher.handlers
    assert isinstance(handlers, dict)
    assert "html" in handlers
@pytest.mark.unit
def test_init_with_preprocessor():
    """
    Checks that a processor passed to LinkContentFetcher is stored and that the "html" handler is still present.
    """
    processor_stub = Mock()
    fetcher = LinkContentFetcher(processor=processor_stub)

    assert fetcher.processor == processor_stub

    handlers = fetcher.handlers
    assert isinstance(handlers, dict)
    assert "html" in handlers
@pytest.mark.unit
def test_fetch(mocked_requests, mocked_article_extractor):
    """
    Checks that fetch returns a single Document with the expected content when a processor is configured.
    """
    expected_text = "Sample content from webpage"

    # Stub processor whose process() hands back exactly one document
    processor_stub = Mock()
    processor_stub.process.return_value = [Document(expected_text)]
    fetcher = LinkContentFetcher(processor_stub)

    documents = fetcher.fetch(url="https://haystack.deepset.ai/", doc_kwargs={"text": expected_text})

    assert len(documents) == 1
    first_document = documents[0]
    assert isinstance(first_document, Document)
    assert first_document.content == expected_text
@pytest.mark.unit
def test_fetch_no_url(mocked_requests, mocked_article_extractor):
    """
    Ensures fetch raises InvalidURL with an "Invalid or missing URL" message when the URL is an empty string.
    """
    processor_stub = Mock()
    processor_stub.process.return_value = [Document("Sample content from webpage")]
    fetcher = LinkContentFetcher(processor=processor_stub)

    expected_error = requests.exceptions.InvalidURL
    with pytest.raises(expected_error, match="Invalid or missing URL"):
        fetcher.fetch(url="")
@pytest.mark.unit
def test_fetch_invalid_url(caplog, mocked_requests, mocked_article_extractor):
    """
    Ensures fetch raises InvalidURL when given a string that is not a valid URL.
    """
    fetcher = LinkContentFetcher()
    expected_error = requests.exceptions.InvalidURL

    with pytest.raises(expected_error):
        fetcher.fetch(url="this-is-invalid-url")
@pytest.mark.unit
def test_fetch_no_preprocessor(mocked_requests, mocked_article_extractor):
    """
    Checks that a fetcher without a processor returns a single Document holding the extracted content.
    """
    fetcher = LinkContentFetcher()

    documents = fetcher.fetch(url="https://www.example.com")

    assert len(documents) == 1
    only_document = documents[0]
    assert isinstance(only_document, Document)
    assert only_document.content == "Sample content from webpage"
@pytest.mark.unit
def test_fetch_correct_arguments(mocked_requests, mocked_article_extractor):
    """
    Ensures requests.get is called with the expected URL, timeout and headers for each fetch call.
    """
    fetcher = LinkContentFetcher()

    # Each case: (url, extra keyword arguments for fetch, timeout expected to reach requests.get).
    # The first case passes no timeout to fetch and expects requests.get to receive 3.
    cases = [
        ("https://www.example.com", {}, 3),
        ("https://deepset.ai", {"timeout": 10}, 10),
    ]

    for target_url, fetch_kwargs, expected_timeout in cases:
        fetcher.fetch(url=target_url, **fetch_kwargs)

        # call_args reflects the most recent call to the mock, i.e. the fetch just performed
        positional, keyword = mocked_requests.get.call_args
        assert positional[0] == target_url
        assert keyword["timeout"] == expected_timeout
        assert keyword["headers"] == fetcher.REQUEST_HEADERS
@pytest.mark.unit
def test_fetch_default_empty_content(mocked_requests):
    """
    Checks that fetch returns an empty list when content extraction yields an empty string.
    """
    fetcher = LinkContentFetcher()
    empty_extractor = patch("boilerpy3.extractors.ArticleExtractor.get_content", return_value="")

    with empty_extractor:
        documents = fetcher.fetch(url="https://www.example.com", timeout=10)

    assert "text" not in documents
    assert isinstance(documents, list)
    assert len(documents) == 0
@pytest.mark.unit
def test_fetch_exception_during_content_extraction_no_raise_on_failure(caplog, mocked_requests):
    """
    Checks that a default-constructed fetcher logs the failure instead of raising when content extraction throws.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher()

    extraction_error = Exception("Could not extract content")
    failing_extractor = patch("boilerpy3.extractors.ArticleExtractor.get_content", side_effect=extraction_error)
    with failing_extractor:
        outcome = fetcher.fetch(url="https://www.example.com")

    assert "text" not in outcome
    assert "failed to extract content from" in caplog.text
@pytest.mark.unit
def test_fetch_exception_during_content_extraction_raise_on_failure(caplog, mocked_requests):
    """
    Checks that the extraction exception propagates out of fetch when raise_on_failure is set to True.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher(raise_on_failure=True)

    extraction_error = Exception("Could not extract content")
    failing_extractor = patch("boilerpy3.extractors.ArticleExtractor.get_content", side_effect=extraction_error)
    with failing_extractor, pytest.raises(Exception, match="Could not extract content"):
        fetcher.fetch(url="https://www.example.com")
@pytest.mark.unit
def test_fetch_exception_during_request_get_no_raise_on_failure(caplog):
    """
    Checks that a default-constructed fetcher logs a message instead of raising when requests.get throws.
    """
    caplog.set_level(logging.WARNING)
    url = "https://www.example.com"
    fetcher = LinkContentFetcher()

    failing_get = patch("haystack.nodes.retriever.link_content.requests.get", side_effect=requests.RequestException())
    with failing_get:
        fetcher.fetch(url=url)

    assert f"Couldn't retrieve content from {url}" in caplog.text
@pytest.mark.unit
def test_fetch_exception_during_request_get_raise_on_failure(caplog):
    """
    Checks that a RequestException from requests.get propagates out of fetch when raise_on_failure is True.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher(raise_on_failure=True)

    failing_get = patch("haystack.nodes.retriever.link_content.requests.get", side_effect=requests.RequestException())
    with failing_get, pytest.raises(requests.RequestException):
        fetcher.fetch(url="https://www.example.com")
@pytest.mark.unit
@pytest.mark.parametrize("error_code", [403, 404, 500])
def test_handle_various_response_errors(caplog, mocked_requests, error_code: int):
    """
    Checks that an HTTP error response results in a logged message and an empty document list, not an exception.
    """
    caplog.set_level(logging.WARNING)
    url = "https://some-problematic-url.com"

    # Use a real Response carrying the error status as the result of the mocked requests.get
    failing_response = Response()
    failing_response.status_code = error_code
    mocked_requests.get.return_value = failing_response

    # With a default-constructed fetcher the failure is expected to show up in the captured log
    # (capture level is WARNING) while fetch itself returns normally
    fetcher = LinkContentFetcher()
    documents = fetcher.fetch(url=url)

    assert f"Couldn't retrieve content from {url}" in caplog.text
    assert documents == []
@pytest.mark.unit
@pytest.mark.parametrize("error_code", [403, 404, 500])
def test_handle_http_error(mocked_requests, error_code: int):
    """
    Checks that fetch raises requests.HTTPError for an HTTP error response when raise_on_failure is True.
    """
    # Use a real Response carrying the error status as the result of the mocked requests.get
    failing_response = Response()
    failing_response.status_code = error_code
    mocked_requests.get.return_value = failing_response

    fetcher = LinkContentFetcher(raise_on_failure=True)

    # Unlike the default configuration, raise_on_failure=True is expected to surface the error to the caller
    with pytest.raises(requests.HTTPError):
        fetcher.fetch(url="https://some-problematic-url.com")
@pytest.mark.unit
def test_is_valid_url():
    """
    Checks that _is_valid_url accepts a set of well-formed http and https URLs.
    """
    fetcher = LinkContentFetcher()

    # Everything after "scheme://"; each entry is checked with both http and https
    remainders = (
        "www.google.com",
        "google.com",
        "localhost",
        "127.0.0.1",
        "[::1]",
        "example.com/path/to/page?name=value",
        "example.com:8000",
    )
    valid_urls = [f"{scheme}://{remainder}" for remainder in remainders for scheme in ("http", "https")]

    for url in valid_urls:
        is_valid = fetcher._is_valid_url(url)
        assert is_valid, f"Expected {url} to be valid"
@pytest.mark.unit
def test_is_invalid_url():
    """
    Checks that _is_valid_url rejects a set of malformed, incomplete or missing URLs.
    """
    fetcher = LinkContentFetcher()

    invalid_urls = (
        # scheme without a host
        "http://",
        "https://",
        "http:",
        "https:",
        # host without a scheme
        "www.google.com",
        "google.com",
        "localhost",
        "127.0.0.1",
        "[::1]",
        # path, port or bare domain only
        "/path/to/page",
        "/path/to/page?name=value",
        ":8000",
        "example.com",
        # three slashes after the scheme
        "http:///example.com",
        "https:///example.com",
        # empty or missing value
        "",
        None,
    )

    for url in invalid_urls:
        is_valid = fetcher._is_valid_url(url)
        assert not is_valid, f"Expected {url} to be invalid"
@pytest.mark.integration
def test_call_with_valid_url_on_live_web():
    """
    Test that LinkContentFetcher.fetch returns at least one Document mentioning "Haystack" for a live URL
    """
    fetcher = LinkContentFetcher()

    documents = fetcher.fetch(url="https://docs.haystack.deepset.ai/", timeout=2)

    assert len(documents) >= 1
    first_document = documents[0]
    assert isinstance(first_document, Document)
    assert "Haystack" in first_document.content
@pytest.mark.integration
def test_retrieve_with_valid_url_on_live_web():
    """
    Test that LinkContentFetcher.run returns at least one Document mentioning "Haystack" for a live URL
    """
    fetcher = LinkContentFetcher()

    # run returns a pair; only the first element, which holds the "documents" entry, is inspected here
    output, _ = fetcher.run(query="https://docs.haystack.deepset.ai/")
    documents = output["documents"]

    assert len(documents) >= 1
    first_document = documents[0]
    assert isinstance(first_document, Document)
    assert "Haystack" in first_document.content
@pytest.mark.integration
def test_retrieve_with_invalid_url():
    """
    Test that LinkContentFetcher.run raises ValueError when the query is an empty string
    """
    fetcher = LinkContentFetcher()

    with pytest.raises(ValueError):
        fetcher.run(query="")
@pytest.mark.integration
def test_retrieve_batch():
    """
    Test that LinkContentFetcher can fetch content for several valid URLs using the run_batch method
    """
    fetcher = LinkContentFetcher()
    queries = ["https://docs.haystack.deepset.ai/", "https://deepset.ai/"]

    output, _ = fetcher.run_batch(queries=queries)
    assert output
    per_query_documents = output["documents"]

    # no processor is applied, so each query should return a list of documents with one entry
    assert len(per_query_documents) == 2
    for query_documents in per_query_documents:
        assert len(query_documents) == 1

    # each query should return a list of documents
    for query_documents in per_query_documents:
        assert isinstance(query_documents, list)
        assert isinstance(query_documents[0], Document)