feat: Add LinkContentFetcher Haystack 2.0 component (#5724)

* Add LinkContentFetcher * Add release note * Small fixes * Fix pydocs * PR feedback * Remove handlers registration * PR feedback * adjustments * improve tests * initial draft * tests * add proposal * proposal number * reno * fix tests and usage of content and content_type * update branch & fix more tests * mypy * use the new document * add docstring * fix more tests * mypy * fix tests * add e2e * review feedback * improve __str__ * Apply suggestions from code review Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/preview/dataclasses/document.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * improve __str__ * fix tests * fix more tests * fix test * Fix end-of-file-fixer * Post merge fixes * Move e2e tests back into component --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
2025-12-30 16:47:19 +00:00 · 2023-09-20 11:03:52 +02:00 · 2023-09-20 11:03:52 +02:00 · 0983fb656a
commit 0983fb656a
parent bf6d306d68
9 changed files with 348 additions and 4 deletions
--- a/haystack/preview/components/fetchers/init.py
+++ b/haystack/preview/components/fetchers/init.py
@ -0,0 +1 @@
+from haystack.preview.components.fetchers.link_content import LinkContentFetcher
--- a/haystack/preview/components/fetchers/link_content.py
+++ b/haystack/preview/components/fetchers/link_content.py
@ -0,0 +1,163 @@
+import io
+import logging
+from collections import defaultdict
+from datetime import datetime
+from typing import Optional, Dict, List, Callable, Any, IO
+
+import requests
+from requests import Response
+from requests.exceptions import HTTPError
+from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState
+from haystack.preview import component, default_from_dict, default_to_dict
+
+from haystack import __version__
+from haystack.preview import Document
+
+logger = logging.getLogger(__name__)
+
+
+# User agent used when the caller does not pass its own list of user agents.
+DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"
+
+# Base headers for outgoing requests. The fetcher copies this dict for every request and overwrites
+# "User-Agent" with the currently selected user agent, so this module-level dict is never mutated.
+REQUEST_HEADERS = {
+    "accept": "*/*",
+    "User-Agent": DEFAULT_USER_AGENT,
+    "Accept-Language": "en-US,en;q=0.9,it;q=0.8,es;q=0.7",
+    "referer": "https://www.google.com/",
+}
+
+
+def text_content_handler(response: Response) -> Dict[str, str]:
+    """
+    Extract the body of a response as text.
+
+    :param response: Response object from the request.
+    :return: A dictionary with a single "text" key holding the response body as a string.
+    """
+    body_text = response.text
+    return {"text": body_text}
+
+
+def binary_content_handler(response: Response) -> Dict[str, IO[bytes]]:
+    """
+    Wrap the raw body of a response in an in-memory binary stream.
+
+    :param response: Response object from the request.
+    :return: A dictionary with a single "blob" key holding a file-like object over the response bytes.
+    """
+    raw_stream = io.BytesIO(response.content)
+    return {"blob": raw_stream}
+
+
+@component
+class LinkContentFetcher:
+    """
+    LinkContentFetcher fetches content from a URL link and converts it to a Document object.
+
+    The Content-Type of the response selects a handler from `self.handlers`: the text handler fills the
+    Document's `text`, the binary handler fills its `blob`. Content types without a registered handler
+    fall back to the text handler.
+    """
+
+    def __init__(
+        self,
+        raise_on_failure: bool = True,
+        user_agents: Optional[List[str]] = None,
+        retry_attempts: int = 2,
+        timeout: int = 3,
+    ):
+        """
+        Creates a LinkContentFetcher instance.
+
+        :param raise_on_failure: A boolean indicating whether to raise an exception when a failure occurs
+            during content extraction. If False, the error is simply logged and the program continues.
+            Defaults to True.
+        :param user_agents: A list of user agents to use when fetching content. Defaults to None, in which case a
+            default user agent is used.
+        :param retry_attempts: The maximum number of attempts made to fetch the content (the value is passed
+            to tenacity's `stop_after_attempt`). Defaults to 2.
+        :param timeout: The timeout in seconds for the request. Defaults to 3.
+        """
+        self.raise_on_failure = raise_on_failure
+        self.user_agents = user_agents or [DEFAULT_USER_AGENT]
+        self.current_user_agent_idx: int = 0
+        self.retry_attempts = retry_attempts
+        self.timeout = timeout
+
+        # register default content handlers that extract data from the response;
+        # unknown content types fall back to the text handler
+        self.handlers: Dict[str, Callable[[Response], Dict[str, Any]]] = defaultdict(lambda: text_content_handler)
+        self.handlers["text/html"] = text_content_handler
+        self.handlers["text/plain"] = text_content_handler
+        self.handlers["application/pdf"] = binary_content_handler
+        self.handlers["application/octet-stream"] = binary_content_handler
+
+        # The retrying function is built per instance because the retry policy depends on `retry_attempts`.
+        @retry(
+            reraise=True,
+            stop=stop_after_attempt(self.retry_attempts),
+            wait=wait_exponential(multiplier=1, min=2, max=10),
+            retry=(retry_if_exception_type((HTTPError, requests.RequestException))),
+            # This method is invoked only after failed requests (exception raised)
+            after=self._switch_user_agent,
+        )
+        def get_response(url):
+            # we need to copy because we modify the headers
+            headers = REQUEST_HEADERS.copy()
+            headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
+            # read the timeout from the instance so it always matches the value reported by `to_dict`
+            response = requests.get(url, headers=headers, timeout=self.timeout or 3)
+            response.raise_for_status()
+            return response
+
+        self._get_response: Callable = get_response
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(
+            self,
+            raise_on_failure=self.raise_on_failure,
+            user_agents=self.user_agents,
+            retry_attempts=self.retry_attempts,
+            timeout=self.timeout,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    @component.output_types(document=Optional[Document])
+    def run(self, url: str):
+        """
+        Fetches content from a URL and converts it to a Document object.
+
+        :param url: URL to fetch content from.
+        :return: A dictionary with a single `document` key holding the Document built from the response,
+            or None if fetching or conversion failed and `raise_on_failure` is False.
+        :raises Exception: Whatever fetching or conversion raised, when `raise_on_failure` is True.
+        """
+        # `datetime.now().timestamp()` yields the real Unix epoch. `utcnow()` returns a naive datetime that
+        # `timestamp()` interprets as local time, which shifts the value by the local UTC offset.
+        document_data: Dict[str, Any] = {"metadata": {"url": url, "timestamp": int(datetime.now().timestamp())}}
+        try:
+            response = self._get_response(url)
+            content_type = self._get_content_type(response)
+            document_data["mime_type"] = content_type
+            handler: Callable = self.handlers[content_type]
+            document_data.update(handler(response))
+            return {"document": Document(**document_data)}
+
+        except Exception:
+            if self.raise_on_failure:
+                # bare raise keeps the original traceback intact
+                raise
+            logger.debug("Couldn't retrieve content from %s", url)
+            return {"document": None}
+
+        finally:
+            # retries may have rotated the user agent; start the next run from the first one again
+            self.current_user_agent_idx = 0
+
+    def _get_content_type(self, response: Response):
+        """
+        Get the content type of the response.
+        :param response: The response object.
+        :return: The media type of the response without parameters (such as charset), stripped of
+            surrounding whitespace and lower-cased; an empty string if the header is missing.
+        """
+        content_type = response.headers.get("Content-Type", "")
+        # media types are case-insensitive and may be followed by " ;charset=..." style parameters
+        return content_type.split(";")[0].strip().lower()
+
+    def _switch_user_agent(self, retry_state: RetryCallState) -> None:
+        """
+        Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.
+        Used by tenacity to retry the requests with a different user agent.
+        :param retry_state: The retry state (unused, required by tenacity).
+        """
+        # wrap around so that any number of retries works with any number of user agents
+        self.current_user_agent_idx = (self.current_user_agent_idx + 1) % len(self.user_agents)
+        logger.debug("Switched user agent to %s", self.user_agents[self.current_user_agent_idx])
--- a/releasenotes/notes/add-link-content-fetcher-145915976f38e1e0.yaml
+++ b/releasenotes/notes/add-link-content-fetcher-145915976f38e1e0.yaml
@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    Adds LinkContentFetcher component to Haystack 2.0. LinkContentFetcher fetches content from a given URL and
+    converts it into a Document object, which can then be used within the Haystack 2.0 pipeline.
--- a/releasenotes/notes/refactor-pinecone-document-store.yaml
+++ b/releasenotes/notes/refactor-pinecone-document-store.yaml
@ -1,6 +1,6 @@
 ---
 enhancements:
  - |
-    Refactor PineconeDocumentStore to use metadata instead of namespaces 
-    for distinction between documents with embeddings, documents without 
-    embeddings and labels
+    Refactor PineconeDocumentStore to use metadata instead of namespaces
+    for distinction between documents with embeddings, documents without
+    embeddings and labels
--- a/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml
+++ b/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml
@ -2,4 +2,3 @@
 fixes:
  - |
    gpt-35-turbo-16k model from Azure can integrate correctly
-
--- a/test/preview/components/fetchers/init.py
+++ b/test/preview/components/fetchers/init.py
--- a/test/preview/components/fetchers/test_link_content_fetcher.py
+++ b/test/preview/components/fetchers/test_link_content_fetcher.py
@ -0,0 +1,170 @@
+import io
+from unittest.mock import patch, Mock
+
+import pytest
+
+from haystack.preview.components.fetchers.link_content import (
+    LinkContentFetcher,
+    text_content_handler,
+    binary_content_handler,
+    DEFAULT_USER_AGENT,
+)
+
+# Live URLs fetched by the integration tests in this module (one per content kind).
+HTML_URL = "https://docs.haystack.deepset.ai/docs"
+TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"
+PDF_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf"
+
+
+@pytest.fixture
+def mock_get_link_text_content():
+    """Patch `requests` in the fetcher module so that every GET returns a canned plain-text response."""
+    canned_response = Mock(status_code=200, text="Example test response", headers={"Content-Type": "text/plain"})
+    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+        mock_run.get.return_value = canned_response
+        yield mock_run
+
+
+@pytest.fixture
+def mock_get_link_content(test_files_path):
+    """Patch `requests` in the fetcher module so that every GET returns the sample PDF as a binary response."""
+    # read_bytes() opens and closes the file itself, so no file handle is left dangling
+    pdf_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
+    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+        mock_run.get.return_value = Mock(
+            status_code=200,
+            content=pdf_bytes,
+            headers={"Content-Type": "application/pdf"},
+        )
+        yield mock_run
+
+
+class TestLinkContentFetcher:
+    """Unit tests (with `requests` patched) and integration tests (live URLs) for LinkContentFetcher."""
+
+    @pytest.mark.unit
+    def test_init(self):
+        fetcher = LinkContentFetcher()
+        assert fetcher.raise_on_failure is True
+        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
+        assert fetcher.retry_attempts == 2
+        assert fetcher.timeout == 3
+        assert fetcher.handlers == {
+            "text/html": text_content_handler,
+            "text/plain": text_content_handler,
+            "application/pdf": binary_content_handler,
+            "application/octet-stream": binary_content_handler,
+        }
+        assert hasattr(fetcher, "_get_response")
+
+    @pytest.mark.unit
+    def test_init_with_params(self):
+        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
+        assert fetcher.raise_on_failure is False
+        assert fetcher.user_agents == ["test"]
+        assert fetcher.retry_attempts == 1
+        assert fetcher.timeout == 2
+
+    @pytest.mark.unit
+    def test_to_dict(self):
+        fetcher = LinkContentFetcher()
+        assert fetcher.to_dict() == {
+            "type": "LinkContentFetcher",
+            "init_parameters": {
+                "raise_on_failure": True,
+                "user_agents": [DEFAULT_USER_AGENT],
+                "retry_attempts": 2,
+                "timeout": 3,
+            },
+        }
+
+    @pytest.mark.unit
+    def test_to_dict_with_params(self):
+        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
+        assert fetcher.to_dict() == {
+            "type": "LinkContentFetcher",
+            "init_parameters": {"raise_on_failure": False, "user_agents": ["test"], "retry_attempts": 1, "timeout": 2},
+        }
+
+    @pytest.mark.unit
+    def test_from_dict(self):
+        fetcher = LinkContentFetcher.from_dict(
+            {
+                "type": "LinkContentFetcher",
+                "init_parameters": {
+                    "raise_on_failure": False,
+                    "user_agents": ["test"],
+                    "retry_attempts": 1,
+                    "timeout": 2,
+                },
+            }
+        )
+        assert fetcher.raise_on_failure is False
+        assert fetcher.user_agents == ["test"]
+        assert fetcher.retry_attempts == 1
+        # every init parameter must survive deserialization, the timeout included
+        assert fetcher.timeout == 2
+
+    @pytest.mark.unit
+    def test_run_text(self):
+        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+            mock_run.get.return_value = Mock(
+                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
+            )
+            fetcher = LinkContentFetcher()
+            document = fetcher.run("https://www.example.com")["document"]
+            assert document.text == "Example test response"
+            assert document.metadata["url"] == "https://www.example.com"
+            assert "timestamp" in document.metadata
+
+    @pytest.mark.unit
+    def test_run_html(self):
+        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+            mock_run.get.return_value = Mock(
+                status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
+            )
+            fetcher = LinkContentFetcher()
+            document = fetcher.run("https://www.example.com")["document"]
+            assert document.text == "<h1>Example test response</h1>"
+            assert document.metadata["url"] == "https://www.example.com"
+            assert "timestamp" in document.metadata
+
+    @pytest.mark.unit
+    def test_run_binary(self, test_files_path):
+        # read_bytes() opens and closes the file itself, so no file handle is left dangling
+        file_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
+        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+            mock_run.get.return_value = Mock(
+                status_code=200, content=file_bytes, headers={"Content-Type": "application/pdf"}
+            )
+            fetcher = LinkContentFetcher()
+            document = fetcher.run("https://www.example.com")["document"]
+            # casting to list to make the blobs comparable
+            assert list(document.blob) == list(io.BytesIO(file_bytes))
+            assert document.metadata["url"] == "https://www.example.com"
+            assert "timestamp" in document.metadata
+
+    @pytest.mark.unit
+    def test_run_bad_status_code(self):
+        # with raise_on_failure=False a failed fetch must yield None instead of raising
+        fetcher = LinkContentFetcher(raise_on_failure=False)
+        mock_response = Mock(status_code=403)
+        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+            mock_run.get.return_value = mock_response
+            document = fetcher.run("https://www.example.com")["document"]
+        assert document is None
+
+    @pytest.mark.integration
+    def test_link_content_fetcher_html(self):
+        fetcher = LinkContentFetcher()
+        document = fetcher.run(HTML_URL)["document"]
+        assert document.mime_type == "text/html"
+        assert "Introduction to Haystack" in document.text
+        assert document.metadata["url"] == HTML_URL
+
+    @pytest.mark.integration
+    def test_link_content_fetcher_text(self):
+        fetcher = LinkContentFetcher()
+        document = fetcher.run(TEXT_URL)["document"]
+        assert document.mime_type == "text/plain"
+        assert "Haystack" in document.text
+        assert document.metadata["url"] == TEXT_URL
+
+    @pytest.mark.integration
+    def test_link_content_fetcher_pdf(self):
+        fetcher = LinkContentFetcher()
+        document = fetcher.run(PDF_URL)["document"]
+        assert document.mime_type == "application/octet-stream"  # FIXME Should be "application/pdf"?
+        assert document.text is None
+        assert document.blob is not None
+        assert document.metadata["url"] == PDF_URL
--- a/test/preview/conftest.py
+++ b/test/preview/conftest.py
@ -1,3 +1,4 @@
+from pathlib import Path
 from unittest.mock import Mock, patch
 import pytest

@ -11,3 +12,8 @@ def mock_tokenizer():
    tokenizer.encode = lambda text: text.split()
    tokenizer.decode = lambda tokens: " ".join(tokens)
    return tokenizer
+
+
+@pytest.fixture()
+def test_files_path():
+    """Return the `test_files` directory located next to this conftest module."""
+    conftest_dir = Path(__file__).parent
+    return conftest_dir / "test_files"
--- a/test/preview/test_files/pdf/sample_pdf_1.pdf
+++ b/test/preview/test_files/pdf/sample_pdf_1.pdf
				`@ -0,0 +1 @@`
				`from haystack.preview.components.fetchers.link_content import LinkContentFetcher`