# haystack/test/nodes/test_link_content_fetcher.py
#
# 369 lines
# 11 KiB
# Python
# Raw Normal View History

from unittest.mock import Mock, patch
import logging
import pytest
import requests
from requests import Response
from haystack import Document
from haystack.nodes import LinkContentFetcher
@pytest.fixture
def mocked_requests():
    """
    Swaps the ``requests`` module referenced by the link content fetcher for a mock.

    The mocked ``get`` returns a response with status code 200 and a fixed sample text.
    The patched module mock is yielded so tests can inspect or reconfigure it.
    """
    canned_response = Mock(status_code=200, text="Sample content from webpage")
    with patch("haystack.nodes.retriever.link_content.requests") as requests_double:
        requests_double.get.return_value = canned_response
        yield requests_double
@pytest.fixture
def mocked_article_extractor():
    """
    Patches boilerpy3's ``ArticleExtractor.get_content`` so it returns a fixed sample text
    for as long as the requesting test runs.
    """
    target = "boilerpy3.extractors.ArticleExtractor.get_content"
    with patch(target, return_value="Sample content from webpage"):
        yield
@pytest.mark.unit
def test_init():
    """
    A LinkContentFetcher built with no arguments has no processor and exposes a
    handlers dict that contains an "html" entry.
    """
    fetcher = LinkContentFetcher()

    assert fetcher.processor is None
    assert isinstance(fetcher.handlers, dict)
    assert "html" in fetcher.handlers
@pytest.mark.unit
def test_init_with_preprocessor():
    """
    A LinkContentFetcher built with a processor keeps that processor and still exposes
    a handlers dict that contains an "html" entry.
    """
    processor_double = Mock()
    fetcher = LinkContentFetcher(processor=processor_double)

    assert fetcher.processor == processor_double
    assert isinstance(fetcher.handlers, dict)
    assert "html" in fetcher.handlers
@pytest.mark.unit
def test_fetch(mocked_requests, mocked_article_extractor):
    """
    Fetching a URL with a (mocked) processor yields exactly one Document carrying the sample text.
    """
    expected_text = "Sample content from webpage"

    processor_double = Mock()
    processor_double.process.return_value = [Document(expected_text)]
    fetcher = LinkContentFetcher(processor_double)

    documents = fetcher.fetch(url="https://haystack.deepset.ai/", doc_kwargs={"text": expected_text})

    assert len(documents) == 1
    only_document = documents[0]
    assert isinstance(only_document, Document)
    assert only_document.content == expected_text
@pytest.mark.unit
def test_fetch_no_url(mocked_requests, mocked_article_extractor):
    """
    Fetching with an empty URL must raise InvalidURL with an "Invalid or missing URL" message.
    """
    processor_double = Mock()
    processor_double.process.return_value = [Document("Sample content from webpage")]
    fetcher = LinkContentFetcher(processor=processor_double)

    expected_error = pytest.raises(requests.exceptions.InvalidURL, match="Invalid or missing URL")
    with expected_error:
        fetcher.fetch(url="")
@pytest.mark.unit
def test_fetch_invalid_url(caplog, mocked_requests, mocked_article_extractor):
    """
    Fetching a string that is not a well-formed URL must raise InvalidURL.
    """
    fetcher = LinkContentFetcher()

    with pytest.raises(requests.exceptions.InvalidURL):
        fetcher.fetch(url="this-is-invalid-url")
@pytest.mark.unit
def test_fetch_no_preprocessor(mocked_requests, mocked_article_extractor):
    """
    Without a processor, fetching still yields exactly one Document carrying the sample text.
    """
    fetcher = LinkContentFetcher()

    documents = fetcher.fetch(url="https://www.example.com")

    assert len(documents) == 1
    only_document = documents[0]
    assert isinstance(only_document, Document)
    assert only_document.content == "Sample content from webpage"
@pytest.mark.unit
def test_fetch_correct_arguments(mocked_requests, mocked_article_extractor):
    """
    Verifies the arguments forwarded to requests.get: the URL comes first, the headers are
    the fetcher's REQUEST_HEADERS, and the timeout is 3 when none is given or the value passed in.
    """
    fetcher = LinkContentFetcher()

    # (url, extra keyword arguments for fetch, timeout expected to reach requests.get)
    scenarios = [
        ("https://www.example.com", {}, 3),
        ("https://deepset.ai", {"timeout": 10}, 10),
    ]

    for target_url, extra_kwargs, expected_timeout in scenarios:
        fetcher.fetch(url=target_url, **extra_kwargs)

        # call_args holds the most recent call only, so it reflects the fetch just made
        positional, keyword = mocked_requests.get.call_args
        assert positional[0] == target_url
        assert keyword["timeout"] == expected_timeout
        assert keyword["headers"] == fetcher.REQUEST_HEADERS
@pytest.mark.unit
def test_fetch_default_empty_content(mocked_requests):
    """
    When content extraction returns an empty string, fetch is expected to return an empty list.
    """
    fetcher = LinkContentFetcher()

    with patch("boilerpy3.extractors.ArticleExtractor.get_content", return_value=""):
        documents = fetcher.fetch(url="https://www.example.com", timeout=10)

    assert "text" not in documents
    assert isinstance(documents, list)
    assert len(documents) == 0
@pytest.mark.unit
def test_fetch_exception_during_content_extraction_no_raise_on_failure(caplog, mocked_requests):
    """
    With the default raise_on_failure, an exception raised during content extraction is not
    propagated; a "failed to extract content from" message is expected in the captured log.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher()
    extraction_error = Exception("Could not extract content")

    with patch("boilerpy3.extractors.ArticleExtractor.get_content", side_effect=extraction_error):
        outcome = fetcher.fetch(url="https://www.example.com")

    assert "text" not in outcome
    assert "failed to extract content from" in caplog.text
@pytest.mark.unit
def test_fetch_exception_during_content_extraction_raise_on_failure(caplog, mocked_requests):
    """
    With raise_on_failure=True, an exception raised during content extraction propagates to the caller.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher(raise_on_failure=True)
    extraction_error = Exception("Could not extract content")

    with patch(
        "boilerpy3.extractors.ArticleExtractor.get_content", side_effect=extraction_error
    ), pytest.raises(Exception, match="Could not extract content"):
        fetcher.fetch(url="https://www.example.com")
@pytest.mark.unit
def test_fetch_exception_during_request_get_no_raise_on_failure(caplog):
    """
    With the default raise_on_failure, a RequestException raised by requests.get is not
    propagated; a "Couldn't retrieve content from <url>" message is expected in the captured log.
    """
    caplog.set_level(logging.WARNING)
    target_url = "https://www.example.com"
    fetcher = LinkContentFetcher()

    failing_get = patch(
        "haystack.nodes.retriever.link_content.requests.get", side_effect=requests.RequestException()
    )
    with failing_get:
        fetcher.fetch(url=target_url)

    assert f"Couldn't retrieve content from {target_url}" in caplog.text
@pytest.mark.unit
def test_fetch_exception_during_request_get_raise_on_failure(caplog):
    """
    With raise_on_failure=True, a RequestException raised by requests.get propagates to the caller.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher(raise_on_failure=True)

    with patch(
        "haystack.nodes.retriever.link_content.requests.get", side_effect=requests.RequestException()
    ), pytest.raises(requests.RequestException):
        fetcher.fetch(url="https://www.example.com")
@pytest.mark.unit
@pytest.mark.parametrize("error_code", [403, 404, 500])
def test_handle_various_response_errors(caplog, mocked_requests, error_code: int):
    """
    With the default raise_on_failure, an HTTP error status produces no documents and a
    "Couldn't retrieve content from <url>" message in the captured log.
    """
    caplog.set_level(logging.WARNING)
    target_url = "https://some-problematic-url.com"

    # A real Response (not a Mock) is used so that only its status code is customised.
    failing_response = Response()
    failing_response.status_code = error_code
    mocked_requests.get.return_value = failing_response

    # No exception is expected here: the failure should only surface in the log output.
    fetcher = LinkContentFetcher()
    documents = fetcher.fetch(url=target_url)

    assert f"Couldn't retrieve content from {target_url}" in caplog.text
    assert documents == []
@pytest.mark.unit
@pytest.mark.parametrize("error_code", [403, 404, 500])
def test_handle_http_error(mocked_requests, error_code: int):
    """
    With raise_on_failure=True, an HTTP error status must surface as requests.HTTPError.
    """
    # A real Response (not a Mock) is used so that only its status code is customised.
    failing_response = Response()
    failing_response.status_code = error_code
    mocked_requests.get.return_value = failing_response

    fetcher = LinkContentFetcher(raise_on_failure=True)

    # Unlike the default configuration, the error is expected to be raised to the caller.
    with pytest.raises(requests.HTTPError):
        fetcher.fetch(url="https://some-problematic-url.com")
@pytest.mark.unit
def test_is_valid_url():
    """
    _is_valid_url must accept http and https URLs for hostnames, localhost, IPv4 and IPv6
    literals, URLs with a path and query string, and URLs with an explicit port.
    """
    retriever = LinkContentFetcher()

    # Everything that follows "<scheme>://"; each entry is checked with both schemes.
    remainders = (
        "www.google.com",
        "google.com",
        "localhost",
        "127.0.0.1",
        "[::1]",
        "example.com/path/to/page?name=value",
        "example.com:8000",
    )
    valid_urls = [f"{scheme}://{remainder}" for remainder in remainders for scheme in ("http", "https")]

    for url in valid_urls:
        assert retriever._is_valid_url(url), f"Expected {url} to be valid"
@pytest.mark.unit
def test_is_invalid_url():
    """
    _is_valid_url must reject scheme-only strings, scheme-less hosts and paths,
    URLs with an empty host, the empty string, and None.
    """
    retriever = LinkContentFetcher()

    scheme_only = ("http://", "https://", "http:", "https:")
    missing_scheme = (
        "www.google.com",
        "google.com",
        "localhost",
        "127.0.0.1",
        "[::1]",
        "/path/to/page",
        "/path/to/page?name=value",
        ":8000",
        "example.com",
    )
    empty_host = ("http:///example.com", "https:///example.com")
    no_value = ("", None)

    for url in (*scheme_only, *missing_scheme, *empty_host, *no_value):
        assert not retriever._is_valid_url(url), f"Expected {url} to be invalid"
@pytest.mark.integration
def test_call_with_valid_url_on_live_web():
    """
    Fetches the Haystack documentation site without any mocking and expects at least one
    Document whose content mentions "Haystack".
    """
    fetcher = LinkContentFetcher()

    documents = fetcher.fetch(url="https://docs.haystack.deepset.ai/", timeout=2)

    assert len(documents) >= 1
    first_document = documents[0]
    assert isinstance(first_document, Document)
    assert "Haystack" in first_document.content
@pytest.mark.integration
def test_retrieve_with_valid_url_on_live_web():
    """
    Same live-web check as the fetch test, but going through the run method with the URL
    passed as the query; the documents are read from the "documents" key of the output.
    """
    fetcher = LinkContentFetcher()

    output, _ = fetcher.run(query="https://docs.haystack.deepset.ai/")
    documents = output["documents"]

    assert len(documents) >= 1
    first_document = documents[0]
    assert isinstance(first_document, Document)
    assert "Haystack" in first_document.content
@pytest.mark.integration
def test_retrieve_with_invalid_url():
    """
    Calling run with an empty query string must raise ValueError.
    """
    fetcher = LinkContentFetcher()

    with pytest.raises(ValueError):
        fetcher.run(query="")
@pytest.mark.integration
def test_retrieve_batch():
    """
    Fetches two live URLs through the run_batch method and expects one single-Document list per query.
    """
    fetcher = LinkContentFetcher()
    queries = ["https://docs.haystack.deepset.ai/", "https://deepset.ai/"]

    output, _ = fetcher.run_batch(queries=queries)
    assert output

    per_query_documents = output["documents"]

    # no processor is applied, so each query should return a list of documents with one entry
    assert len(per_query_documents) == 2
    assert len(per_query_documents[0]) == 1
    assert len(per_query_documents[1]) == 1

    # each query should return a list of documents
    for documents in per_query_documents:
        assert isinstance(documents, list) and isinstance(documents[0], Document)