# Tests for haystack's LinkContentFetcher.
# Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-08-03 14:18:46 +00:00).
# Upstream commit: "ci: Simplify Python code with ruff rules SIM; Revert #5828;
# ruff --select=I --fix haystack/modeling/infer.py"
# Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
from typing import Optional
|
|
from unittest.mock import Mock, patch
|
|
import logging
|
|
import pytest
|
|
import requests
|
|
from requests import Response
|
|
|
|
from haystack import Document
|
|
from haystack.nodes import LinkContentFetcher
|
|
|
|
|
|
@pytest.fixture
def mocked_requests():
    """Patch the `requests` module used by link_content and yield the mock.

    The mocked `requests.get` returns a 200 text/html response with fixed body text.
    """
    with patch("haystack.nodes.retriever.link_content.requests") as mock_requests:
        fake_response = Mock()
        fake_response.status_code = 200
        fake_response.text = "Sample content from webpage"
        fake_response.headers = {"Content-Type": "text/html"}
        mock_requests.get.return_value = fake_response
        yield mock_requests
|
|
|
|
|
|
@pytest.fixture
def mocked_article_extractor():
    """Make boilerpy3's ArticleExtractor always yield a fixed snippet of text."""
    extraction_patch = patch(
        "boilerpy3.extractors.ArticleExtractor.get_content", return_value="Sample content from webpage"
    )
    with extraction_patch:
        yield
|
|
|
|
|
|
@pytest.mark.unit
def test_init():
    """
    A LinkContentFetcher created without a preprocessor has no processor and
    registers the built-in content handlers.
    """
    fetcher = LinkContentFetcher()

    assert fetcher.processor is None
    assert isinstance(fetcher.handlers, dict)
    for content_type in ("text/html", "application/pdf"):
        assert content_type in fetcher.handlers
|
|
|
|
|
|
@pytest.mark.unit
def test_init_with_preprocessor():
    """
    A LinkContentFetcher created with a preprocessor stores it and still
    registers the built-in content handlers.
    """
    processor_stub = Mock()
    fetcher = LinkContentFetcher(processor=processor_stub)

    assert fetcher.processor == processor_stub
    assert isinstance(fetcher.handlers, dict)
    for content_type in ("text/html", "application/pdf"):
        assert content_type in fetcher.handlers
|
|
|
|
|
|
@pytest.mark.unit
def test_init_with_content_handlers():
    """
    A custom content handler with a valid signature is registered alongside
    the built-in handlers.
    """

    # Correct signature: one Response parameter, optional string result.
    def noop_video_handler(response: Response) -> Optional[str]:
        pass

    fetcher = LinkContentFetcher(content_handlers={"video/mp4": noop_video_handler})

    assert isinstance(fetcher.handlers, dict)
    for content_type in ("text/html", "application/pdf", "video/mp4"):
        assert content_type in fetcher.handlers
|
|
|
|
|
|
@pytest.mark.unit
def test_init_with_content_handlers_override():
    """
    Passing a handler for an already-registered content type replaces the
    built-in handler for that type.
    """

    def new_pdf_content_handler(response: Response) -> Optional[str]:
        pass

    fetcher = LinkContentFetcher(content_handlers={"application/pdf": new_pdf_content_handler})

    assert isinstance(fetcher.handlers, dict)
    assert "text/html" in fetcher.handlers
    assert "application/pdf" in fetcher.handlers
    # The default PDF handler has been swapped for ours.
    assert fetcher.handlers["application/pdf"] == new_pdf_content_handler
|
|
|
|
|
|
@pytest.mark.unit
def test_init_with_invalid_content_handlers():
    """
    Construction fails with ValueError when a content handler does not have
    the expected (response) -> Optional[str] signature.
    """

    # Invalid: takes no arguments, so it cannot accept a response.
    def handler_without_response_arg() -> Optional[str]:
        pass

    with pytest.raises(ValueError, match="handler must accept"):
        LinkContentFetcher(content_handlers={"video/mp4": handler_without_response_arg})
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch(mocked_requests, mocked_article_extractor):
    """
    Fetching a URL with a preprocessor returns the documents the preprocessor
    produces from the extracted page content.
    """
    target_url = "https://haystack.deepset.ai/"

    processor_stub = Mock()
    processor_stub.process.return_value = [Document("Sample content from webpage")]

    fetcher = LinkContentFetcher(processor=processor_stub)
    docs = fetcher.fetch(url=target_url, doc_kwargs={"text": "Sample content from webpage"})

    assert len(docs) == 1
    assert isinstance(docs[0], Document)
    assert docs[0].content == "Sample content from webpage"
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_no_url(mocked_requests, mocked_article_extractor):
    """
    Fetching with an empty URL raises requests.exceptions.InvalidURL.
    """
    processor_stub = Mock()
    processor_stub.process.return_value = [Document("Sample content from webpage")]
    fetcher = LinkContentFetcher(processor=processor_stub)

    with pytest.raises(requests.exceptions.InvalidURL, match="Invalid or missing URL"):
        fetcher.fetch(url="")
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_invalid_url(caplog, mocked_requests, mocked_article_extractor):
    """
    Fetching a malformed URL raises requests.exceptions.InvalidURL.
    """
    fetcher = LinkContentFetcher()

    with pytest.raises(requests.exceptions.InvalidURL):
        fetcher.fetch(url="this-is-invalid-url")
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_no_preprocessor(mocked_requests, mocked_article_extractor):
    """
    Without a preprocessor, fetch returns a single Document holding the raw
    extracted page content.
    """
    fetcher = LinkContentFetcher()

    docs = fetcher.fetch(url="https://www.example.com")

    assert len(docs) == 1
    assert isinstance(docs[0], Document)
    assert docs[0].content == "Sample content from webpage"
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_correct_arguments(mocked_requests, mocked_article_extractor):
    """
    requests.get is invoked with the target URL, the default (or explicit)
    timeout, and the fetcher's standard request headers.
    """
    first_url = "https://www.example.com"
    fetcher = LinkContentFetcher()
    fetcher.fetch(url=first_url)

    # Default timeout is 3 seconds and the standard headers are attached.
    positional, keyword = mocked_requests.get.call_args
    assert positional[0] == first_url
    assert keyword["timeout"] == 3
    assert keyword["headers"] == fetcher._REQUEST_HEADERS

    # An explicit timeout is forwarded unchanged.
    second_url = "https://deepset.ai"
    fetcher.fetch(url=second_url, timeout=10)
    positional, keyword = mocked_requests.get.call_args
    assert positional[0] == second_url
    assert keyword["timeout"] == 10
    assert keyword["headers"] == fetcher._REQUEST_HEADERS
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_default_empty_content(mocked_requests):
    """
    When content extraction yields an empty string, fetch returns an empty
    document list.
    """
    fetcher = LinkContentFetcher()

    with patch("boilerpy3.extractors.ArticleExtractor.get_content", return_value=""):
        docs = fetcher.fetch(url="https://www.example.com", timeout=10)

    assert "text" not in docs
    assert isinstance(docs, list) and len(docs) == 0
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_exception_during_content_extraction_no_raise_on_failure(caplog, mocked_requests):
    """
    With raise_on_failure left at its default (False), an extraction error is
    swallowed and logged as a warning.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher()

    extraction_failure = patch(
        "boilerpy3.extractors.ArticleExtractor.get_content", side_effect=Exception("Could not extract content")
    )
    with extraction_failure:
        docs = fetcher.fetch(url="https://www.example.com")

    assert "text" not in docs
    assert "failed to extract content from" in caplog.text
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_exception_during_content_extraction_raise_on_failure(caplog, mocked_requests):
    """
    With raise_on_failure=True, an extraction error propagates to the caller.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher(raise_on_failure=True)

    extraction_failure = patch(
        "boilerpy3.extractors.ArticleExtractor.get_content", side_effect=Exception("Could not extract content")
    )
    with extraction_failure, pytest.raises(Exception, match="Could not extract content"):
        fetcher.fetch(url="https://www.example.com")
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_exception_during_request_get_no_raise_on_failure(caplog):
    """
    With raise_on_failure left at its default (False), a failing requests.get
    is swallowed and logged as a warning.
    """
    caplog.set_level(logging.WARNING)
    target_url = "https://www.example.com"
    fetcher = LinkContentFetcher()

    with patch("haystack.nodes.retriever.link_content.requests.get", side_effect=requests.RequestException()):
        fetcher.fetch(url=target_url)

    assert f"Couldn't retrieve content from {target_url}" in caplog.text
|
|
|
|
|
|
@pytest.mark.unit
def test_fetch_exception_during_request_get_raise_on_failure(caplog):
    """
    With raise_on_failure=True, a failing requests.get propagates to the caller.
    """
    caplog.set_level(logging.WARNING)
    fetcher = LinkContentFetcher(raise_on_failure=True)

    request_failure = patch(
        "haystack.nodes.retriever.link_content.requests.get", side_effect=requests.RequestException()
    )
    with request_failure, pytest.raises(requests.RequestException):
        fetcher.fetch(url="https://www.example.com")
|
|
|
|
|
|
@pytest.mark.unit
@pytest.mark.parametrize("error_code", [403, 404, 500])
def test_handle_various_response_errors(caplog, mocked_requests, error_code: int):
    """
    HTTP error responses are not raised by default; they are logged and an
    empty-content Document is returned.
    """
    caplog.set_level(logging.WARNING)
    target_url = "https://some-problematic-url.com"

    # Failures are expected to be frequent, so they are only logged,
    # never raised, in the default configuration.
    error_response = Response()
    error_response.status_code = error_code
    mocked_requests.get.return_value = error_response

    fetcher = LinkContentFetcher()
    docs = fetcher.fetch(url=target_url)

    assert f"Couldn't retrieve content from {target_url}" in caplog.text
    assert len(docs) == 1
    assert isinstance(docs[0], Document)
    assert docs[0].content == ""
|
|
|
|
|
|
@pytest.mark.unit
@pytest.mark.parametrize("error_code", [403, 404, 500])
def test_handle_http_error(mocked_requests, error_code: int):
    """
    With raise_on_failure=True, an HTTP error response surfaces as
    requests.HTTPError.
    """
    target_url = "https://some-problematic-url.com"

    # Build a real Response carrying the parametrized error status.
    error_response = Response()
    error_response.status_code = error_code
    mocked_requests.get.return_value = error_response

    fetcher = LinkContentFetcher(raise_on_failure=True)
    with pytest.raises(requests.HTTPError):
        fetcher.fetch(url=target_url)
|
|
|
|
|
|
@pytest.mark.unit
def test_is_valid_url():
    """
    _is_valid_url accepts well-formed http/https URLs: host names, localhost,
    IPv4/IPv6 literals, paths with query strings, and explicit ports.
    """
    fetcher = LinkContentFetcher()

    valid_urls = (
        "http://www.google.com",
        "https://www.google.com",
        "http://google.com",
        "https://google.com",
        "http://localhost",
        "https://localhost",
        "http://127.0.0.1",
        "https://127.0.0.1",
        "http://[::1]",
        "https://[::1]",
        "http://example.com/path/to/page?name=value",
        "https://example.com/path/to/page?name=value",
        "http://example.com:8000",
        "https://example.com:8000",
    )

    for candidate in valid_urls:
        assert fetcher._is_valid_url(candidate), f"Expected {candidate} to be valid"
|
|
|
|
|
|
@pytest.mark.unit
def test_is_invalid_url():
    """
    _is_valid_url rejects scheme-less, host-less, relative, empty, and None
    inputs.
    """
    fetcher = LinkContentFetcher()

    invalid_urls = (
        "http://",
        "https://",
        "http:",
        "https:",
        "www.google.com",
        "google.com",
        "localhost",
        "127.0.0.1",
        "[::1]",
        "/path/to/page",
        "/path/to/page?name=value",
        ":8000",
        "example.com",
        "http:///example.com",
        "https:///example.com",
        "",
        None,
    )

    for candidate in invalid_urls:
        assert not fetcher._is_valid_url(candidate), f"Expected {candidate} to be invalid"
|
|
|
|
|
|
@pytest.mark.unit
def test_switch_user_agent_on_failed_request():
    """
    On repeated request failures, the fetcher rotates through its configured
    user agents, one per retry attempt.
    """
    target_url = "http://fakeurl.com"
    retry_attempts = 2
    fetcher = LinkContentFetcher(user_agents=["ua1", "ua2"], retry_attempts=retry_attempts)

    with patch("haystack.nodes.retriever.link_content.requests.get") as mocked_get:
        # Every attempt fails its status check, forcing a retry with the next agent.
        mocked_get.return_value.raise_for_status.side_effect = requests.HTTPError()
        fetcher._get_response(target_url)

    assert mocked_get.call_count == retry_attempts
    assert mocked_get.call_args_list[0][1]["headers"]["User-Agent"] == "ua1"
    assert mocked_get.call_args_list[1][1]["headers"]["User-Agent"] == "ua2"
|
|
|
|
|
|
@pytest.mark.unit
def test_valid_requests_dont_switch_agent(mocked_requests):
    """
    Successful requests keep using the same default user agent — no rotation.
    """
    fetcher = LinkContentFetcher()

    # Two consecutive successful requests.
    fetcher._get_response("http://example.com")
    fetcher._get_response("http://example.com")

    assert mocked_requests.get.call_count == 2
    first_agent = mocked_requests.get.call_args_list[0][1]["headers"]["User-Agent"]
    second_agent = mocked_requests.get.call_args_list[1][1]["headers"]["User-Agent"]
    assert first_agent == second_agent
|
|
|
|
|
|
@pytest.mark.integration
def test_call_with_valid_url_on_live_web():
    """
    fetch retrieves real content from a live, valid URL.
    """
    fetcher = LinkContentFetcher()

    docs = fetcher.fetch(url="https://docs.haystack.deepset.ai/", timeout=2)

    assert len(docs) >= 1
    assert isinstance(docs[0], Document)
    assert "Haystack" in docs[0].content
|
|
|
|
|
|
@pytest.mark.integration
def test_retrieve_with_valid_url_on_live_web():
    """
    run() retrieves real content from a live, valid URL.
    """
    fetcher = LinkContentFetcher()

    output, _ = fetcher.run(query="https://docs.haystack.deepset.ai/")
    docs = output["documents"]

    assert len(docs) >= 1
    assert isinstance(docs[0], Document)
    assert "Haystack" in docs[0].content
|
|
|
|
|
|
@pytest.mark.integration
def test_retrieve_with_invalid_url():
    """
    run() raises ValueError when given an empty query instead of a URL.
    """
    fetcher = LinkContentFetcher()

    with pytest.raises(ValueError):
        fetcher.run(query="")
|
|
|
|
|
|
@pytest.mark.integration
def test_retrieve_batch():
    """
    run_batch fetches content for multiple URLs, returning one single-document
    list per query when no processor is configured.
    """
    fetcher = LinkContentFetcher()

    output, _ = fetcher.run_batch(queries=["https://docs.haystack.deepset.ai/", "https://deepset.ai/"])
    assert output

    docs = output["documents"]
    # Without a processor, each query yields a one-element document list.
    assert len(docs) == 2 and len(docs[0]) == 1 and len(docs[1]) == 1

    assert isinstance(docs[0], list) and isinstance(docs[0][0], Document)
    assert isinstance(docs[1], list) and isinstance(docs[1][0], Document)
|