haystack/test/nodes/test_link_content_fetcher.py

import logging
from typing import Optional
from unittest.mock import Mock, patch

import pytest
import requests
from requests import Response

from haystack import Document
from haystack.nodes import LinkContentFetcher


@pytest.fixture
def mocked_requests():
    with patch("haystack.nodes.retriever.link_content.requests") as mock_requests:
        mock_response = Mock()
        mock_requests.get.return_value = mock_response
        mock_response.status_code = 200
        mock_response.text = "Sample content from webpage"
        mock_response.headers = {"Content-Type": "text/html"}
        yield mock_requests


@pytest.fixture
def mocked_article_extractor():
    with patch("boilerpy3.extractors.ArticleExtractor.get_content", return_value="Sample content from webpage"):
        yield
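
# Both fixtures patch the dependencies at the paths used by the module under test
# (`haystack.nodes.retriever.link_content` for `requests`, and boilerpy3's
# ArticleExtractor for content extraction), so the unit tests below run without
# network access.
#
# A minimal usage sketch of the API these tests exercise (kept as a comment so it
# doesn't run at import time; the URL is only illustrative):
#
#     fetcher = LinkContentFetcher()
#     docs = fetcher.fetch(url="https://docs.haystack.deepset.ai/")  # returns a list of Document objects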


@pytest.mark.unit
def test_init():
    """
    Checks the initialization of the LinkContentFetcher without a preprocessor.
    """
    r = LinkContentFetcher()
    assert r.processor is None
    assert isinstance(r.handlers, dict)
    assert "text/html" in r.handlers
    assert "application/pdf" in r.handlers


@pytest.mark.unit
def test_init_with_preprocessor():
    """
    Checks the initialization of the LinkContentFetcher with a preprocessor.
    """
    pre_processor_mock = Mock()
    r = LinkContentFetcher(processor=pre_processor_mock)
    assert r.processor == pre_processor_mock
    assert isinstance(r.handlers, dict)
    assert "text/html" in r.handlers
    assert "application/pdf" in r.handlers


@pytest.mark.unit
def test_init_with_content_handlers():
    """
    Checks the initialization of the LinkContentFetcher content handlers.
    """

    def fake_but_valid_video_content_handler(response: Response) -> Optional[str]:
        pass

    r = LinkContentFetcher(content_handlers={"video/mp4": fake_but_valid_video_content_handler})
    assert isinstance(r.handlers, dict)
    assert "text/html" in r.handlers
    assert "application/pdf" in r.handlers
    assert "video/mp4" in r.handlers


@pytest.mark.unit
def test_init_with_content_handlers_override():
    """
    Checks the initialization of the LinkContentFetcher content handlers but with the pdf handler overridden.
    """

    def new_pdf_content_handler(response: Response) -> Optional[str]:
        pass

    r = LinkContentFetcher(content_handlers={"application/pdf": new_pdf_content_handler})
    assert isinstance(r.handlers, dict)
    assert "text/html" in r.handlers
    assert "application/pdf" in r.handlers
    assert r.handlers["application/pdf"] == new_pdf_content_handler


@pytest.mark.unit
def test_init_with_invalid_content_handlers():
    """
    Checks that the initialization of the LinkContentFetcher fails with invalid content handlers.
    """

    # invalid because it does not have the correct signature
    def invalid_video_content_handler() -> Optional[str]:
        pass

    with pytest.raises(ValueError, match="handler must accept"):
        LinkContentFetcher(content_handlers={"video/mp4": invalid_video_content_handler})
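
# Per the tests above, a custom content handler takes a `requests.Response` and returns
# `Optional[str]`. Registering it under a new MIME type adds it to `handlers`, while
# registering it under an existing type (e.g. "application/pdf") replaces the default
# handler. An illustrative (hypothetical) handler for an extra type might look like:
#
#     def plain_text_handler(response: Response) -> Optional[str]:
#         return response.text
#
#     LinkContentFetcher(content_handlers={"text/plain": plain_text_handler})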


@pytest.mark.unit
def test_fetch(mocked_requests, mocked_article_extractor):
    """
    Checks if the LinkContentFetcher is able to fetch content.
    """
    url = "https://haystack.deepset.ai/"
    pre_processor_mock = Mock()
    pre_processor_mock.process.return_value = [Document("Sample content from webpage")]

    r = LinkContentFetcher(processor=pre_processor_mock)
    result = r.fetch(url=url, doc_kwargs={"text": "Sample content from webpage"})

    assert len(result) == 1
    assert isinstance(result[0], Document)
    assert result[0].content == "Sample content from webpage"


@pytest.mark.unit
def test_fetch_no_url(mocked_requests, mocked_article_extractor):
    """
    Ensures an InvalidURL exception is raised when URL is missing.
    """
    pre_processor_mock = Mock()
    pre_processor_mock.process.return_value = [Document("Sample content from webpage")]

    retriever_no_url = LinkContentFetcher(processor=pre_processor_mock)
    with pytest.raises(requests.exceptions.InvalidURL, match="Invalid or missing URL"):
        retriever_no_url.fetch(url="")


@pytest.mark.unit
def test_fetch_invalid_url(caplog, mocked_requests, mocked_article_extractor):
    """
    Ensures an InvalidURL exception is raised when the URL is invalid.
    """
    url = "this-is-invalid-url"
    r = LinkContentFetcher()
    with pytest.raises(requests.exceptions.InvalidURL):
        r.fetch(url=url)


@pytest.mark.unit
def test_fetch_no_preprocessor(mocked_requests, mocked_article_extractor):
    """
    Checks if the LinkContentFetcher can fetch content without a preprocessor.
    """
    url = "https://www.example.com"
    r = LinkContentFetcher()
    result_no_preprocessor = r.fetch(url=url)

    assert len(result_no_preprocessor) == 1
    assert isinstance(result_no_preprocessor[0], Document)
    assert result_no_preprocessor[0].content == "Sample content from webpage"


@pytest.mark.unit
def test_fetch_correct_arguments(mocked_requests, mocked_article_extractor):
    """
    Ensures that requests.get is called with correct arguments.
    """
    url = "https://www.example.com"
    r = LinkContentFetcher()
    r.fetch(url=url)

    # Check the arguments that requests.get was called with
    args, kwargs = mocked_requests.get.call_args
    assert args[0] == url
    assert kwargs["timeout"] == 3
    assert kwargs["headers"] == r._REQUEST_HEADERS

    # another variant
    url = "https://deepset.ai"
    r.fetch(url=url, timeout=10)

    # Check the arguments that requests.get was called with
    args, kwargs = mocked_requests.get.call_args
    assert args[0] == url
    assert kwargs["timeout"] == 10
    assert kwargs["headers"] == r._REQUEST_HEADERS
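
# As asserted above, fetch() defaults to a 3-second timeout and sends the class-level
# LinkContentFetcher._REQUEST_HEADERS, while an explicit `timeout` argument is passed
# straight through to requests.get.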


@pytest.mark.unit
def test_fetch_default_empty_content(mocked_requests):
    """
    Checks handling of content extraction returning empty content.
    """
    url = "https://www.example.com"
    timeout = 10
    content_text = ""
    r = LinkContentFetcher()

    with patch("boilerpy3.extractors.ArticleExtractor.get_content", return_value=content_text):
        result = r.fetch(url=url, timeout=timeout)

    assert "text" not in result
    assert isinstance(result, list) and len(result) == 0


@pytest.mark.unit
def test_fetch_exception_during_content_extraction_no_raise_on_failure(caplog, mocked_requests):
    """
    Checks the behavior when there's an exception during content extraction, and raise_on_failure is set to False.
    """
    caplog.set_level(logging.WARNING)
    url = "https://www.example.com"
    r = LinkContentFetcher()

    with patch("boilerpy3.extractors.ArticleExtractor.get_content", side_effect=Exception("Could not extract content")):
        result = r.fetch(url=url)

    assert "text" not in result
    assert "failed to extract content from" in caplog.text


@pytest.mark.unit
def test_fetch_exception_during_content_extraction_raise_on_failure(caplog, mocked_requests):
    """
    Checks the behavior when there's an exception during content extraction, and raise_on_failure is set to True.
    """
    caplog.set_level(logging.WARNING)
    url = "https://www.example.com"
    r = LinkContentFetcher(raise_on_failure=True)

    with patch(
        "boilerpy3.extractors.ArticleExtractor.get_content", side_effect=Exception("Could not extract content")
    ), pytest.raises(Exception, match="Could not extract content"):
        r.fetch(url=url)


@pytest.mark.unit
def test_fetch_exception_during_request_get_no_raise_on_failure(caplog):
    """
    Checks the behavior when there's an exception during requests.get, and raise_on_failure is set to False.
    """
    caplog.set_level(logging.WARNING)
    url = "https://www.example.com"
    r = LinkContentFetcher()

    with patch("haystack.nodes.retriever.link_content.requests.get", side_effect=requests.RequestException()):
        r.fetch(url=url)

    assert f"Couldn't retrieve content from {url}" in caplog.text


@pytest.mark.unit
def test_fetch_exception_during_request_get_raise_on_failure(caplog):
    """
    Checks the behavior when there's an exception during requests.get, and raise_on_failure is set to True.
    """
    caplog.set_level(logging.WARNING)
    url = "https://www.example.com"
    r = LinkContentFetcher(raise_on_failure=True)

    with patch(
        "haystack.nodes.retriever.link_content.requests.get", side_effect=requests.RequestException()
    ), pytest.raises(requests.RequestException):
        r.fetch(url=url)


@pytest.mark.unit
@pytest.mark.parametrize("error_code", [403, 404, 500])
def test_handle_various_response_errors(caplog, mocked_requests, error_code: int):
    """
    Tests the handling of various HTTP error responses.
    """
    caplog.set_level(logging.WARNING)
    url = "https://some-problematic-url.com"

    # with the default raise_on_failure=False we don't raise on HTTP errors,
    # there might be many of them; we only log a warning
    mock_response = Response()
    mock_response.status_code = error_code
    mocked_requests.get.return_value = mock_response

    r = LinkContentFetcher()
    docs = r.fetch(url=url)

    assert f"Couldn't retrieve content from {url}" in caplog.text
    assert len(docs) == 1
    assert isinstance(docs[0], Document)
    assert docs[0].content == ""
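
# With raise_on_failure left at its default, an HTTP error response is logged as a
# warning and fetch() still returns a single Document with empty content, as the
# assertions above show.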


@pytest.mark.unit
@pytest.mark.parametrize("error_code", [403, 404, 500])
def test_handle_http_error(mocked_requests, error_code: int):
    """
    Checks the behavior when an HTTPError is raised, and raise_on_failure is set to True.
    """
    url = "https://some-problematic-url.com"

    # with raise_on_failure=True the HTTP error is propagated instead of being logged
    mock_response = Response()
    mock_response.status_code = error_code
    mocked_requests.get.return_value = mock_response

    r = LinkContentFetcher(raise_on_failure=True)
    with pytest.raises(requests.HTTPError):
        r.fetch(url=url)


@pytest.mark.unit
def test_is_valid_url():
    """
    Checks the _is_valid_url function with a set of valid URLs.
    """
    retriever = LinkContentFetcher()
    valid_urls = [
        "http://www.google.com",
        "https://www.google.com",
        "http://google.com",
        "https://google.com",
        "http://localhost",
        "https://localhost",
        "http://127.0.0.1",
        "https://127.0.0.1",
        "http://[::1]",
        "https://[::1]",
        "http://example.com/path/to/page?name=value",
        "https://example.com/path/to/page?name=value",
        "http://example.com:8000",
        "https://example.com:8000",
    ]

    for url in valid_urls:
        assert retriever._is_valid_url(url), f"Expected {url} to be valid"


@pytest.mark.unit
def test_is_invalid_url():
    """
    Checks the _is_valid_url function with a set of invalid URLs.
    """
    retriever = LinkContentFetcher()
    invalid_urls = [
        "http://",
        "https://",
        "http:",
        "https:",
        "www.google.com",
        "google.com",
        "localhost",
        "127.0.0.1",
        "[::1]",
        "/path/to/page",
        "/path/to/page?name=value",
        ":8000",
        "example.com",
        "http:///example.com",
        "https:///example.com",
        "",
        None,
    ]

    for url in invalid_urls:
        assert not retriever._is_valid_url(url), f"Expected {url} to be invalid"


@pytest.mark.unit
def test_switch_user_agent_on_failed_request():
    """
    Test that LinkContentFetcher switches user agents on failed requests
    """
    url = "http://fakeurl.com"
    retry_attempts = 2
    lc = LinkContentFetcher(user_agents=["ua1", "ua2"], retry_attempts=retry_attempts)

    with patch("haystack.nodes.retriever.link_content.requests.get") as mocked_get:
        mocked_get.return_value.raise_for_status.side_effect = requests.HTTPError()
        lc._get_response(url)

    assert mocked_get.call_count == retry_attempts
    assert mocked_get.call_args_list[0][1]["headers"]["User-Agent"] == "ua1"
    assert mocked_get.call_args_list[1][1]["headers"]["User-Agent"] == "ua2"
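
# The test above relies on _get_response retrying up to `retry_attempts` times and,
# per the recorded call args, rotating to the next configured user agent on each
# failed attempt.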


@pytest.mark.unit
def test_valid_requests_dont_switch_agent(mocked_requests):
    """
    Test that LinkContentFetcher doesn't switch user agents on valid requests
    """
    lcf = LinkContentFetcher()

    # Make first valid request
    lcf._get_response("http://example.com")

    # Make second valid request
    lcf._get_response("http://example.com")

    # Assert that requests.get was called twice with the same default user agents
    assert mocked_requests.get.call_count == 2
    assert (
        mocked_requests.get.call_args_list[0][1]["headers"]["User-Agent"]
        == mocked_requests.get.call_args_list[1][1]["headers"]["User-Agent"]
    )


@pytest.mark.integration
def test_call_with_valid_url_on_live_web():
    """
    Test that LinkContentFetcher can fetch content from a valid URL
    """
    retriever = LinkContentFetcher()
    docs = retriever.fetch(url="https://docs.haystack.deepset.ai/", timeout=2)

    assert len(docs) >= 1
    assert isinstance(docs[0], Document)
    assert "Haystack" in docs[0].content


@pytest.mark.integration
def test_retrieve_with_valid_url_on_live_web():
    """
    Test that LinkContentFetcher can fetch content from a valid URL using the run method
    """
    retriever = LinkContentFetcher()
    docs, _ = retriever.run(query="https://docs.haystack.deepset.ai/")
    docs = docs["documents"]

    assert len(docs) >= 1
    assert isinstance(docs[0], Document)
    assert "Haystack" in docs[0].content


@pytest.mark.integration
def test_retrieve_with_invalid_url():
    """
    Test that LinkContentFetcher raises ValueError when trying to fetch content from an invalid URL
    """
    retriever = LinkContentFetcher()
    with pytest.raises(ValueError):
        retriever.run(query="")


@pytest.mark.integration
def test_retrieve_batch():
    """
    Test that LinkContentFetcher can fetch content from valid URLs using the run_batch method
    """
    retriever = LinkContentFetcher()
    docs, _ = retriever.run_batch(queries=["https://docs.haystack.deepset.ai/", "https://deepset.ai/"])
    assert docs
    docs = docs["documents"]

    # no processor is applied, so each query should return a list of documents with one entry
    assert len(docs) == 2 and len(docs[0]) == 1 and len(docs[1]) == 1

    # each query should return a list of documents
    assert isinstance(docs[0], list) and isinstance(docs[0][0], Document)
    assert isinstance(docs[1], list) and isinstance(docs[1][0], Document)