feat: Add LinkContentFetcher Haystack 2.0 component (#5724)

* Add LinkContentFetcher * Add release note * Small fixes * Fix pydocs * PR feedback * Remove handlers registration * PR feedback * adjustments * improve tests * initial draft * tests * add proposal * proposal number * reno * fix tests and usage of content and content_type * update branch & fix more tests * mypy * use the new document * add docstring * fix more tests * mypy * fix tests * add e2e * review feedback * improve __str__ * Apply suggestions from code review Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/preview/dataclasses/document.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * improve __str__ * fix tests * fix more tests * fix test * Fix end-of-file-fixer * Post merge fixes * Move e2e tests back into component --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
2025-12-16 09:38:07 +00:00 · 2023-09-20 11:03:52 +02:00 · 2023-09-20 11:03:52 +02:00 · 0983fb656a
commit 0983fb656a
parent bf6d306d68
9 changed files with 348 additions and 4 deletions
--- a/haystack/preview/components/fetchers/__init__.py
+++ b/haystack/preview/components/fetchers/__init__.py
@ -0,0 +1 @@
 from haystack.preview.components.fetchers.link_content import LinkContentFetcher
--- a/haystack/preview/components/fetchers/link_content.py
+++ b/haystack/preview/components/fetchers/link_content.py
@ -0,0 +1,163 @@
 import io
 import logging
 from collections import defaultdict
 from datetime import datetime
 from typing import Optional, Dict, List, Callable, Any, IO
 import requests
 from requests import Response
 from requests.exceptions import HTTPError
 from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState
 from haystack.preview import component, default_from_dict, default_to_dict
 from haystack import __version__
 from haystack.preview import Document
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # User agent used by LinkContentFetcher when the caller does not supply a list of user agents.
 DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"
 # Base headers for outgoing requests. LinkContentFetcher copies this dict for each request and
 # overwrites the "User-Agent" entry with the currently selected user agent.
 REQUEST_HEADERS = {
    "accept": "*/*",
    "User-Agent": DEFAULT_USER_AGENT,
    "Accept-Language": "en-US,en;q=0.9,it;q=0.8,es;q=0.7",
    "referer": "https://www.google.com/",
 }
 def text_content_handler(response: Response) -> Dict[str, str]:
    """
    Extract the decoded body of a response as text.

    :param response: Response object from the request.
    :return: A dictionary with a single "text" key holding the response body.
    """
    body_text = response.text
    return dict(text=body_text)
 def binary_content_handler(response: Response) -> Dict[str, IO[bytes]]:
    """
    Wrap the raw bytes of a response in an in-memory binary stream.

    :param response: Response object from the request.
    :return: A dictionary with a single "blob" key holding a file-like object over the response bytes.
    """
    stream = io.BytesIO(response.content)
    return {"blob": stream}
@component
class LinkContentFetcher:
    """
    LinkContentFetcher fetches content from a URL link and converts it to a Document object.

    The response's content type selects a handler that extracts either text or a binary blob.
    Content types without a registered handler are treated as text.
    """

    def __init__(
        self,
        raise_on_failure: bool = True,
        user_agents: Optional[List[str]] = None,
        retry_attempts: int = 2,
        timeout: int = 3,
    ):
        """
        Creates a LinkContentFetcher instance.

        :param raise_on_failure: A boolean indicating whether to raise an exception when a failure occurs
            during content extraction. If False, the error is simply logged and the program continues.
            Defaults to True.
        :param user_agents: A list of user agents to use when fetching content. Defaults to None, in which case a
            default user agent is used.
        :param retry_attempts: The number of times to retry fetching content. Defaults to 2.
        :param timeout: The timeout in seconds for the request. Defaults to 3.
        """
        self.raise_on_failure = raise_on_failure
        self.user_agents = user_agents or [DEFAULT_USER_AGENT]
        self.current_user_agent_idx: int = 0
        self.retry_attempts = retry_attempts
        self.timeout = timeout

        # register default content handlers that extract data from the response;
        # unknown content types fall back to the text handler
        self.handlers: Dict[str, Callable[[Response], Dict[str, Any]]] = defaultdict(lambda: text_content_handler)
        self.handlers["text/html"] = text_content_handler
        self.handlers["text/plain"] = text_content_handler
        self.handlers["application/pdf"] = binary_content_handler
        self.handlers["application/octet-stream"] = binary_content_handler

        # The retry policy depends on instance configuration, so the decorated function is
        # built here as a closure rather than declared as a regular method.
        @retry(
            reraise=True,
            stop=stop_after_attempt(self.retry_attempts),
            wait=wait_exponential(multiplier=1, min=2, max=10),
            retry=(retry_if_exception_type((HTTPError, requests.RequestException))),
            # This method is invoked only after failed requests (exception raised)
            after=self._switch_user_agent,
        )
        def get_response(url):
            # we need to copy because we modify the headers
            headers = REQUEST_HEADERS.copy()
            headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
            response = requests.get(url, headers=headers, timeout=timeout or 3)
            # turn 4xx/5xx responses into exceptions so that the retry policy applies to them
            response.raise_for_status()
            return response

        self._get_response: Callable = get_response

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(
            self,
            raise_on_failure=self.raise_on_failure,
            user_agents=self.user_agents,
            retry_attempts=self.retry_attempts,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    # The declared output name must match the key of the dictionary returned by run().
    @component.output_types(document=Optional[Document])
    def run(self, url: str):
        """
        Fetches content from a URL and converts it to a Document object.

        :param url: URL to fetch content from.
        :return: A dictionary with a single "document" key. Its value is the Document built from the
            fetched content, or None if fetching failed and raise_on_failure is False.
        :raises Exception: Any error raised while fetching or converting the content is re-raised
            if raise_on_failure is True.
        """
        # Local import keeps this method self-contained; the module only imports `datetime`.
        from datetime import timezone

        # A timezone-aware "now" is required: calling timestamp() on the naive value returned by
        # utcnow() interprets it as local time and skews the result by the host's UTC offset.
        fetched_at = int(datetime.now(tz=timezone.utc).timestamp())
        document_data: Dict[str, Any] = {"metadata": {"url": url, "timestamp": fetched_at}}
        try:
            response = self._get_response(url)
            content_type = self._get_content_type(response)
            document_data["mime_type"] = content_type
            handler: Callable = self.handlers[content_type]
            document_data.update(handler(response))
            return {"document": Document(**document_data)}
        except Exception:
            if self.raise_on_failure:
                # bare raise keeps the original traceback intact
                raise
            logger.debug("Couldn't retrieve content from %s", url)
            return {"document": None}
        finally:
            # always start the next run with the first user agent
            self.current_user_agent_idx = 0

    def _get_content_type(self, response: Response):
        """
        Get the content type of the response.

        Parameters such as the charset are dropped. The value is stripped and lower-cased because
        media types are case-insensitive and the handler registry uses lower-case keys.

        :param response: The response object.
        :return: The content type of the response, or an empty string if the header is missing.
        """
        content_type = response.headers.get("Content-Type", "")
        return content_type.split(";")[0].strip().lower()

    def _switch_user_agent(self, retry_state: RetryCallState) -> None:
        """
        Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.
        Used by tenacity to retry the requests with a different user agent.

        :param retry_state: The retry state (unused, required by tenacity).
        """
        self.current_user_agent_idx = (self.current_user_agent_idx + 1) % len(self.user_agents)
        logger.debug("Switched user agent to %s", self.user_agents[self.current_user_agent_idx])
--- a/releasenotes/notes/add-link-content-fetcher-145915976f38e1e0.yaml
+++ b/releasenotes/notes/add-link-content-fetcher-145915976f38e1e0.yaml
@ -0,0 +1,5 @@
 ---
 preview:
  - |
    Adds LinkContentFetcher component to Haystack 2.0. LinkContentFetcher fetches content from a given URL and
    converts it into a Document object, which can then be used within the Haystack 2.0 pipeline.
--- a/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml
+++ b/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml
@ -2,4 +2,3 @@
 fixes:
  - |
    gpt-35-turbo-16k model from Azure can integrate correctly
--- a/test/preview/components/fetchers/__init__.py
+++ b/test/preview/components/fetchers/__init__.py
--- a/test/preview/components/fetchers/test_link_content_fetcher.py
+++ b/test/preview/components/fetchers/test_link_content_fetcher.py
@ -0,0 +1,170 @@
 import io
 from unittest.mock import patch, Mock
 import pytest
 from haystack.preview.components.fetchers.link_content import (
    LinkContentFetcher,
    text_content_handler,
    binary_content_handler,
    DEFAULT_USER_AGENT,
 )
 # Public URLs fetched by the tests marked as integration in this module.
 HTML_URL = "https://docs.haystack.deepset.ai/docs"
 TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"
 PDF_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf"
@pytest.fixture
def mock_get_link_text_content():
    """Patch `requests` in the fetcher module so that every GET returns a plain-text response."""
    fake_response = Mock(status_code=200, text="Example test response", headers={"Content-Type": "text/plain"})
    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = fake_response
        yield mock_run
@pytest.fixture
def mock_get_link_content(test_files_path):
    """Patch `requests` in the fetcher module so that every GET returns the sample PDF as binary content."""
    # read_bytes() opens and closes the file itself, so no file handle is leaked
    pdf_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200,
            content=pdf_bytes,
            headers={"Content-Type": "application/pdf"},
        )
        yield mock_run
 class TestLinkContentFetcher:
    """Unit and integration tests for the LinkContentFetcher component."""

    @pytest.mark.unit
    def test_init(self):
        fetcher = LinkContentFetcher()
        assert fetcher.raise_on_failure is True
        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
        assert fetcher.retry_attempts == 2
        assert fetcher.timeout == 3
        assert fetcher.handlers == {
            "text/html": text_content_handler,
            "text/plain": text_content_handler,
            "application/pdf": binary_content_handler,
            "application/octet-stream": binary_content_handler,
        }
        assert hasattr(fetcher, "_get_response")

    @pytest.mark.unit
    def test_init_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_to_dict(self):
        fetcher = LinkContentFetcher()
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {
                "raise_on_failure": True,
                "user_agents": [DEFAULT_USER_AGENT],
                "retry_attempts": 2,
                "timeout": 3,
            },
        }

    @pytest.mark.unit
    def test_to_dict_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {"raise_on_failure": False, "user_agents": ["test"], "retry_attempts": 1, "timeout": 2},
        }

    @pytest.mark.unit
    def test_from_dict(self):
        fetcher = LinkContentFetcher.from_dict(
            {
                "type": "LinkContentFetcher",
                "init_parameters": {
                    "raise_on_failure": False,
                    "user_agents": ["test"],
                    "retry_attempts": 1,
                    "timeout": 2,
                },
            }
        )
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        # every init parameter must survive the round trip, including the timeout
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_run_text(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "Example test response"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_html(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "<h1>Example test response</h1>"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_binary(self, test_files_path):
        # read_bytes() opens and closes the file itself, so no file handle is leaked
        file_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, content=file_bytes, headers={"Content-Type": "application/pdf"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            # casting to list to make the blobs comparable
            assert list(document.blob) == list(io.BytesIO(file_bytes))
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_bad_status_code(self):
        from requests.exceptions import HTTPError

        # A single attempt keeps the test fast: tenacity stops before sleeping for a retry.
        fetcher = LinkContentFetcher(raise_on_failure=False, retry_attempts=1)
        mock_response = Mock(status_code=403)
        # A plain Mock never raises from raise_for_status(); make the failure explicit so the
        # test exercises the HTTP error path instead of an incidental error further down.
        mock_response.raise_for_status.side_effect = HTTPError("403 Client Error")
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = mock_response
            document = fetcher.run("https://www.example.com")["document"]
        assert document is None

    @pytest.mark.integration
    def test_link_content_fetcher_html(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(HTML_URL)["document"]
        assert document.mime_type == "text/html"
        assert "Introduction to Haystack" in document.text
        assert document.metadata["url"] == HTML_URL

    @pytest.mark.integration
    def test_link_content_fetcher_text(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(TEXT_URL)["document"]
        assert document.mime_type == "text/plain"
        assert "Haystack" in document.text
        assert document.metadata["url"] == TEXT_URL

    @pytest.mark.integration
    def test_link_content_fetcher_pdf(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(PDF_URL)["document"]
        assert document.mime_type == "application/octet-stream"  # FIXME Should be "application/pdf"?
        assert document.text is None
        assert document.blob is not None
        assert document.metadata["url"] == PDF_URL
--- a/test/preview/conftest.py
+++ b/test/preview/conftest.py
@ -1,3 +1,4 @@
 from pathlib import Path
 from unittest.mock import Mock, patch
 import pytest
@ -11,3 +12,8 @@ def mock_tokenizer():
    tokenizer.encode = lambda text: text.split()
    tokenizer.decode = lambda tokens: " ".join(tokens)
    return tokenizer
@pytest.fixture()
def test_files_path():
    """Return the path of the "test_files" directory that sits next to this conftest module."""
    conftest_dir = Path(__file__).parent
    return conftest_dir / "test_files"
--- a/test/preview/test_files/pdf/sample_pdf_1.pdf
+++ b/test/preview/test_files/pdf/sample_pdf_1.pdf
		`@ -0,0 +1 @@`
							`from haystack.preview.components.fetchers.link_content import LinkContentFetcher`