diff --git a/haystack/preview/components/fetchers/__init__.py b/haystack/preview/components/fetchers/__init__.py
new file mode 100644
index 000000000..7e39e4a56
--- /dev/null
+++ b/haystack/preview/components/fetchers/__init__.py
@@ -0,0 +1 @@
+from haystack.preview.components.fetchers.link_content import LinkContentFetcher
diff --git a/haystack/preview/components/fetchers/link_content.py b/haystack/preview/components/fetchers/link_content.py
new file mode 100644
index 000000000..4412da0f8
--- /dev/null
+++ b/haystack/preview/components/fetchers/link_content.py
@@ -0,0 +1,163 @@
+import io
+import logging
+from collections import defaultdict
+from datetime import datetime
+from typing import Optional, Dict, List, Callable, Any, IO
+
+import requests
+from requests import Response
+from requests.exceptions import HTTPError
+from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState
+from haystack.preview import component, default_from_dict, default_to_dict
+
+from haystack import __version__
+from haystack.preview import Document
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"
+
+REQUEST_HEADERS = {
+    "accept": "*/*",
+    "User-Agent": DEFAULT_USER_AGENT,
+    "Accept-Language": "en-US,en;q=0.9,it;q=0.8,es;q=0.7",
+    "referer": "https://www.google.com/",
+}
+
+
+def text_content_handler(response: Response) -> Dict[str, str]:
+    """
+    :param response: Response object from the request.
+    :return: The extracted text.
+    """
+    return {"text": response.text}
+
+
+def binary_content_handler(response: Response) -> Dict[str, IO[bytes]]:
+    """
+    :param response: Response object from the request.
+    :return: The extracted binary file-like object.
+    """
+    return {"blob": io.BytesIO(response.content)}
+
+
+@component
+class LinkContentFetcher:
+    """
+    LinkContentFetcher fetches content from a URL link and converts it to a Document object.
+    """
+
+    def __init__(
+        self,
+        raise_on_failure: bool = True,
+        user_agents: Optional[List[str]] = None,
+        retry_attempts: int = 2,
+        timeout: int = 3,
+    ):
+        """
+        Creates a LinkContentFetcher instance.
+
+        :param raise_on_failure: A boolean indicating whether to raise an exception when a failure occurs
+            during content extraction. If False, the error is simply logged and the program continues.
+            Defaults to True.
+        :param user_agents: A list of user agents to use when fetching content. Defaults to None, in which case a
+            default user agent is used.
+        :param retry_attempts: The number of times to retry fetching content. Defaults to 2.
+        :param timeout: The timeout in seconds for the request. Defaults to 3.
+        """
+        self.raise_on_failure = raise_on_failure
+        self.user_agents = user_agents or [DEFAULT_USER_AGENT]
+        self.current_user_agent_idx: int = 0
+        self.retry_attempts = retry_attempts
+        self.timeout = timeout
+
+        # register default content handlers that extract data from the response
+        self.handlers: Dict[str, Callable[[Response], Dict[str, Any]]] = defaultdict(lambda: text_content_handler)
+        self.handlers["text/html"] = text_content_handler
+        self.handlers["text/plain"] = text_content_handler
+        self.handlers["application/pdf"] = binary_content_handler
+        self.handlers["application/octet-stream"] = binary_content_handler
+
+        @retry(
+            reraise=True,
+            stop=stop_after_attempt(self.retry_attempts),
+            wait=wait_exponential(multiplier=1, min=2, max=10),
+            retry=(retry_if_exception_type((HTTPError, requests.RequestException))),
+            # This method is invoked only after failed requests (exception raised)
+            after=self._switch_user_agent,
+        )
+        def get_response(url):
+            # we need to copy because we modify the headers
+            headers = REQUEST_HEADERS.copy()
+            headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
+            response = requests.get(url, headers=headers, timeout=timeout or 3)
+            response.raise_for_status()
+            return response
+
+        self._get_response: Callable = get_response
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(
+            self,
+            raise_on_failure=self.raise_on_failure,
+            user_agents=self.user_agents,
+            retry_attempts=self.retry_attempts,
+            timeout=self.timeout,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    @component.output_types(document=Optional[Document])
+    def run(self, url: str):
+        """
+        Fetches content from a URL and converts it to a Document object. If the content cannot be retrieved
+        and raise_on_failure is False, a None document is returned.
+
+        :param url: URL to fetch content from.
+        :return: A dictionary with a single key "document", holding the fetched Document,
+            or None if the content could not be fetched.
+        """
+        document_data: Dict[str, Any] = {"metadata": {"url": url, "timestamp": int(datetime.utcnow().timestamp())}}
+        try:
+            response = self._get_response(url)
+            content_type = self._get_content_type(response)
+            document_data["mime_type"] = content_type
+            handler: Callable = self.handlers[content_type]
+            document_data.update(handler(response))
+            return {"document": Document(**document_data)}
+
+        except Exception as e:
+            if self.raise_on_failure:
+                raise e
+            logger.debug("Couldn't retrieve content from %s", url)
+            return {"document": None}
+
+        finally:
+            self.current_user_agent_idx = 0
+
+    def _get_content_type(self, response: Response):
+        """
+        Get the content type of the response.
+        :param response: The response object.
+        :return: The content type of the response.
+        """
+        content_type = response.headers.get("Content-Type", "")
+        return content_type.split(";")[0]
+
+    def _switch_user_agent(self, retry_state: RetryCallState) -> None:
+        """
+        Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.
+        Used by tenacity to retry the requests with a different user agent.
+        :param retry_state: The retry state (unused, required by tenacity).
+        """
+        self.current_user_agent_idx = (self.current_user_agent_idx + 1) % len(self.user_agents)
+        logger.debug("Switched user agent to %s", self.user_agents[self.current_user_agent_idx])
diff --git a/releasenotes/notes/add-link-content-fetcher-145915976f38e1e0.yaml b/releasenotes/notes/add-link-content-fetcher-145915976f38e1e0.yaml
new file mode 100644
index 000000000..bd4c8610d
--- /dev/null
+++ b/releasenotes/notes/add-link-content-fetcher-145915976f38e1e0.yaml
@@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    Adds LinkContentFetcher component to Haystack 2.0. LinkContentFetcher fetches content from a given URL and
+    converts it into a Document object, which can then be used within the Haystack 2.0 pipeline.
diff --git a/releasenotes/notes/refactor-pinecone-document-store.yaml b/releasenotes/notes/refactor-pinecone-document-store.yaml
index d67d134a3..b8145ac50 100644
--- a/releasenotes/notes/refactor-pinecone-document-store.yaml
+++ b/releasenotes/notes/refactor-pinecone-document-store.yaml
@@ -1,6 +1,6 @@
 ---
 enhancements:
   - |
-    Refactor PineconeDocumentStore to use metadata instead of namespaces
-    for distinction between documents with embeddings, documents without
-    embeddings and labels
\ No newline at end of file
+    Refactor PineconeDocumentStore to use metadata instead of namespaces
+    for distinction between documents with embeddings, documents without
+    embeddings and labels
diff --git a/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml b/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml
index 642831b75..9fc499418 100644
--- a/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml
+++ b/releasenotes/notes/support-azure-3.5-gpt-16k-model-ece0cfe03260748c.yaml
@@ -2,4 +2,3 @@
 fixes:
   - |
     gpt-35-turbo-16k model from Azure can integrate correctly
-
diff --git a/test/preview/components/fetchers/__init__.py b/test/preview/components/fetchers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/preview/components/fetchers/test_link_content_fetcher.py b/test/preview/components/fetchers/test_link_content_fetcher.py
new file mode 100644
index 000000000..a8be562cd
--- /dev/null
+++ b/test/preview/components/fetchers/test_link_content_fetcher.py
@@ -0,0 +1,170 @@
+import io
+from unittest.mock import patch, Mock
+
+import pytest
+
+from haystack.preview.components.fetchers.link_content import (
+    LinkContentFetcher,
+    text_content_handler,
+    binary_content_handler,
+    DEFAULT_USER_AGENT,
+)
+
+HTML_URL = "https://docs.haystack.deepset.ai/docs"
+TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"
+PDF_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf"
+
+
+@pytest.fixture
+def mock_get_link_text_content():
+    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+        mock_run.get.return_value = Mock(
+            status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
+        )
+        yield mock_run
+
+
+@pytest.fixture
+def mock_get_link_content(test_files_path):
+    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+        mock_run.get.return_value = Mock(
+            status_code=200,
+            content=open(test_files_path / "pdf" / "sample_pdf_1.pdf", "rb").read(),
+            headers={"Content-Type": "application/pdf"},
+        )
+        yield mock_run
+
+
+class TestLinkContentFetcher:
+    @pytest.mark.unit
+    def test_init(self):
+        fetcher = LinkContentFetcher()
+        assert fetcher.raise_on_failure is True
+        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
+        assert fetcher.retry_attempts == 2
+        assert fetcher.timeout == 3
+        assert fetcher.handlers == {
+            "text/html": text_content_handler,
+            "text/plain": text_content_handler,
+            "application/pdf": binary_content_handler,
+            "application/octet-stream": binary_content_handler,
+        }
+        assert hasattr(fetcher, "_get_response")
+
+    @pytest.mark.unit
+    def test_init_with_params(self):
+        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
+        assert fetcher.raise_on_failure is False
+        assert fetcher.user_agents == ["test"]
+        assert fetcher.retry_attempts == 1
+        assert fetcher.timeout == 2
+
+    @pytest.mark.unit
+    def test_to_dict(self):
+        fetcher = LinkContentFetcher()
+        assert fetcher.to_dict() == {
+            "type": "LinkContentFetcher",
+            "init_parameters": {
+                "raise_on_failure": True,
+                "user_agents": [DEFAULT_USER_AGENT],
+                "retry_attempts": 2,
+                "timeout": 3,
+            },
+        }
+
+    @pytest.mark.unit
+    def test_to_dict_with_params(self):
+        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
+        assert fetcher.to_dict() == {
+            "type": "LinkContentFetcher",
+            "init_parameters": {"raise_on_failure": False, "user_agents": ["test"], "retry_attempts": 1, "timeout": 2},
+        }
+
+    @pytest.mark.unit
+    def test_from_dict(self):
+        fetcher = LinkContentFetcher.from_dict(
+            {
+                "type": "LinkContentFetcher",
+                "init_parameters": {
+                    "raise_on_failure": False,
+                    "user_agents": ["test"],
+                    "retry_attempts": 1,
+                    "timeout": 2,
+                },
+            }
+        )
+        assert fetcher.raise_on_failure is False
+        assert fetcher.user_agents == ["test"]
+        assert fetcher.retry_attempts == 1
+
+    @pytest.mark.unit
+    def test_run_text(self):
+        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+            mock_run.get.return_value = Mock(
+                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
+            )
+            fetcher = LinkContentFetcher()
+            document = fetcher.run("https://www.example.com")["document"]
+            assert document.text == "Example test response"
+            assert document.metadata["url"] == "https://www.example.com"
+            assert "timestamp" in document.metadata
+
+    @pytest.mark.unit
+    def test_run_html(self):
+        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
+            mock_run.get.return_value = Mock(
+                status_code=200, text="