mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-28 10:26:27 +00:00
feat: Add LinkContentFetcher
Haystack 2.0 component (#5724)
* Add LinkContentFetcher * Add release note * Small fixes * Fix pydocs * PR feedback * Remove handlers registration * PR feedback * adjustments * improve tests * initial draft * tests * add proposal * proposal number * reno * fix tests and usage of content and content_type * update branch & fix more tests * mypy * use the new document * add docstring * fix more tests * mypy * fix tests * add e2e * review feedback * improve __str__ * Apply suggestions from code review Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/preview/dataclasses/document.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * improve __str__ * fix tests * fix more tests * fix test * Fix end-of-file-fixer * Post merge fixes * Move e2e tests back into component --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
parent
bf6d306d68
commit
0983fb656a
1
haystack/preview/components/fetchers/__init__.py
Normal file
1
haystack/preview/components/fetchers/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
from haystack.preview.components.fetchers.link_content import LinkContentFetcher
|
163
haystack/preview/components/fetchers/link_content.py
Normal file
163
haystack/preview/components/fetchers/link_content.py
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
import io
|
||||||
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Dict, List, Callable, Any, IO
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from requests import Response
|
||||||
|
from requests.exceptions import HTTPError
|
||||||
|
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState
|
||||||
|
from haystack.preview import component, default_from_dict, default_to_dict
|
||||||
|
|
||||||
|
from haystack import __version__
|
||||||
|
from haystack.preview import Document
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# User-Agent used when the caller supplies no custom list; embeds the haystack version.
DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"

# Base headers for outgoing requests. The fetcher copies this dict per request
# and overrides "User-Agent" with the currently selected agent.
REQUEST_HEADERS = {
    "accept": "*/*",
    "User-Agent": DEFAULT_USER_AGENT,
    "Accept-Language": "en-US,en;q=0.9,it;q=0.8,es;q=0.7",
    "referer": "https://www.google.com/",
}
|
||||||
|
|
||||||
|
|
||||||
|
def text_content_handler(response: Response) -> Dict[str, str]:
    """
    Extracts the textual body of an HTTP response.

    :param response: Response object from the request.
    :return: A dictionary with a single "text" key holding the response body as text.
    """
    body_text = response.text
    return dict(text=body_text)
|
||||||
|
|
||||||
|
|
||||||
|
def binary_content_handler(response: Response) -> Dict[str, IO[bytes]]:
    """
    Wraps the raw bytes of an HTTP response in an in-memory binary stream.

    :param response: Response object from the request.
    :return: A dictionary with a single "blob" key holding a file-like object over the response bytes.
    """
    raw_bytes = response.content
    stream = io.BytesIO(raw_bytes)
    return dict(blob=stream)
|
||||||
|
|
||||||
|
|
||||||
|
@component
class LinkContentFetcher:
    """
    LinkContentFetcher fetches content from a URL link and converts it to a Document object.

    The response's content type selects the handler that extracts the payload: registered text
    types populate the Document's `text`, registered binary types populate its `blob`, and any
    unregistered content type falls back to the text handler.
    """

    def __init__(
        self,
        raise_on_failure: bool = True,
        user_agents: Optional[List[str]] = None,
        retry_attempts: int = 2,
        timeout: int = 3,
    ):
        """
        Creates a LinkContentFetcher instance.

        :param raise_on_failure: A boolean indicating whether to raise an exception when a failure occurs
                                 during content extraction. If False, the error is simply logged and the
                                 program continues. Defaults to True.
        :param user_agents: A list of user agents to use when fetching content. Defaults to None, in which case a
                            default user agent is used.
        :param retry_attempts: The number of attempts made to fetch the content (passed to tenacity's
                               `stop_after_attempt`, so it bounds the total number of attempts). Defaults to 2.
        :param timeout: The timeout in seconds for the request. Defaults to 3.
        """
        self.raise_on_failure = raise_on_failure
        self.user_agents = user_agents or [DEFAULT_USER_AGENT]
        self.current_user_agent_idx: int = 0
        self.retry_attempts = retry_attempts
        self.timeout = timeout

        # register default content handlers that extract data from the response;
        # unknown content types fall back to the text handler
        self.handlers: Dict[str, Callable[[Response], Dict[str, Any]]] = defaultdict(lambda: text_content_handler)
        self.handlers["text/html"] = text_content_handler
        self.handlers["text/plain"] = text_content_handler
        self.handlers["application/pdf"] = binary_content_handler
        self.handlers["application/octet-stream"] = binary_content_handler

        # The request function is built here (rather than as a method) because the retry policy
        # depends on per-instance settings that are only known at construction time.
        @retry(
            reraise=True,
            stop=stop_after_attempt(self.retry_attempts),
            wait=wait_exponential(multiplier=1, min=2, max=10),
            retry=(retry_if_exception_type((HTTPError, requests.RequestException))),
            # This method is invoked only after failed requests (exception raised)
            after=self._switch_user_agent,
        )
        def get_response(url):
            # we need to copy because we modify the headers
            headers = REQUEST_HEADERS.copy()
            headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
            response = requests.get(url, headers=headers, timeout=timeout or 3)
            response.raise_for_status()
            return response

        self._get_response: Callable = get_response

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(
            self,
            raise_on_failure=self.raise_on_failure,
            user_agents=self.user_agents,
            retry_attempts=self.retry_attempts,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    # The declared output name must match the key of the dictionary returned by `run`.
    @component.output_types(document=Optional[Document])
    def run(self, url: str):
        """
        Fetches content from a URL and converts it to a Document object.

        :param url: URL to fetch content from.
        :return: A dictionary with a single "document" key. Its value is the Document built from the
                 fetched content, or None if fetching failed and `raise_on_failure` is False.
        :raises Exception: Re-raises any error hit while fetching or converting the content
                           if `raise_on_failure` is True.
        """
        # datetime.now().timestamp() yields the real Unix epoch; calling .timestamp() on a naive
        # utcnow() value would be interpreted as local time and be skewed by the UTC offset.
        document_data: Dict[str, Any] = {"metadata": {"url": url, "timestamp": int(datetime.now().timestamp())}}
        try:
            response = self._get_response(url)
            content_type = self._get_content_type(response)
            document_data["mime_type"] = content_type
            handler: Callable = self.handlers[content_type]
            document_data.update(handler(response))
            return {"document": Document(**document_data)}

        except Exception:
            if self.raise_on_failure:
                # bare raise keeps the original traceback intact
                raise
            logger.debug("Couldn't retrieve content from %s", url)
            return {"document": None}

        finally:
            # retries may have rotated the user agent; always start the next run from the first one
            self.current_user_agent_idx = 0

    def _get_content_type(self, response: Response):
        """
        Get the content type of the response.

        :param response: The response object.
        :return: The media type from the Content-Type header without parameters (such as charset),
                 stripped of surrounding whitespace and lower-cased; an empty string if the header is missing.
        """
        content_type = response.headers.get("Content-Type", "")
        # media types are case-insensitive and may carry whitespace before the ";" parameters
        return content_type.split(";")[0].strip().lower()

    def _switch_user_agent(self, retry_state: RetryCallState) -> None:
        """
        Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.
        Used by tenacity to retry the requests with a different user agent.

        :param retry_state: The retry state (unused, required by tenacity).
        """
        self.current_user_agent_idx = (self.current_user_agent_idx + 1) % len(self.user_agents)
        logger.debug("Switched user agent to %s", self.user_agents[self.current_user_agent_idx])
|
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
preview:
|
||||||
|
- |
|
||||||
|
Adds LinkContentFetcher component to Haystack 2.0. LinkContentFetcher fetches content from a given URL and
|
||||||
|
converts it into a Document object, which can then be used within the Haystack 2.0 pipeline.
|
@ -2,4 +2,3 @@
|
|||||||
fixes:
|
fixes:
|
||||||
- |
|
- |
|
||||||
gpt-35-turbo-16k model from Azure can integrate correctly
|
gpt-35-turbo-16k model from Azure can integrate correctly
|
||||||
|
|
||||||
|
0
test/preview/components/fetchers/__init__.py
Normal file
0
test/preview/components/fetchers/__init__.py
Normal file
170
test/preview/components/fetchers/test_link_content_fetcher.py
Normal file
170
test/preview/components/fetchers/test_link_content_fetcher.py
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
import io
|
||||||
|
from unittest.mock import patch, Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from haystack.preview.components.fetchers.link_content import (
|
||||||
|
LinkContentFetcher,
|
||||||
|
text_content_handler,
|
||||||
|
binary_content_handler,
|
||||||
|
DEFAULT_USER_AGENT,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remote resources fetched by the integration tests in this module (one per content type).
HTML_URL = "https://docs.haystack.deepset.ai/docs"
TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"
PDF_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_get_link_text_content():
    """
    Patches `requests` inside the link_content module so that `requests.get` returns
    a canned plain-text response, and yields the patched module mock.
    """
    canned_response = Mock(status_code=200, text="Example test response", headers={"Content-Type": "text/plain"})
    requests_patcher = patch("haystack.preview.components.fetchers.link_content.requests")
    with requests_patcher as mock_run:
        mock_run.get.return_value = canned_response
        yield mock_run
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_get_link_content(test_files_path):
    """
    Patches `requests` inside the link_content module so that `requests.get` returns
    a canned PDF response, and yields the patched module mock.

    :param test_files_path: Directory holding the test files (provided by the fixture of the same name).
    """
    # read_bytes() opens and closes the file itself, so no file handle is left dangling
    pdf_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200,
            content=pdf_bytes,
            headers={"Content-Type": "application/pdf"},
        )
        yield mock_run
|
||||||
|
|
||||||
|
|
||||||
|
class TestLinkContentFetcher:
    """Unit and integration tests for the LinkContentFetcher component."""

    @pytest.mark.unit
    def test_init(self):
        fetcher = LinkContentFetcher()
        assert fetcher.raise_on_failure is True
        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
        assert fetcher.retry_attempts == 2
        assert fetcher.timeout == 3
        assert fetcher.handlers == {
            "text/html": text_content_handler,
            "text/plain": text_content_handler,
            "application/pdf": binary_content_handler,
            "application/octet-stream": binary_content_handler,
        }
        assert hasattr(fetcher, "_get_response")

    @pytest.mark.unit
    def test_init_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_to_dict(self):
        fetcher = LinkContentFetcher()
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {
                "raise_on_failure": True,
                "user_agents": [DEFAULT_USER_AGENT],
                "retry_attempts": 2,
                "timeout": 3,
            },
        }

    @pytest.mark.unit
    def test_to_dict_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {"raise_on_failure": False, "user_agents": ["test"], "retry_attempts": 1, "timeout": 2},
        }

    @pytest.mark.unit
    def test_from_dict(self):
        fetcher = LinkContentFetcher.from_dict(
            {
                "type": "LinkContentFetcher",
                "init_parameters": {
                    "raise_on_failure": False,
                    "user_agents": ["test"],
                    "retry_attempts": 1,
                    "timeout": 2,
                },
            }
        )
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        # every serialized init parameter must survive the round trip
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_run_text(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "Example test response"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_html(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "<h1>Example test response</h1>"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_binary(self, test_files_path):
        # read_bytes() closes the file for us; a bare open(...).read() leaks the handle
        file_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, content=file_bytes, headers={"Content-Type": "application/pdf"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            # casting to list to make the blobs comparable
            assert list(document.blob) == list(io.BytesIO(file_bytes))
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_bad_status_code(self):
        # local import: only this test needs the exception type
        from requests.exceptions import HTTPError

        # a single attempt means tenacity gives up right away, without its back-off sleep
        fetcher = LinkContentFetcher(raise_on_failure=False, retry_attempts=1)
        mock_response = Mock(status_code=403)
        # a plain Mock never raises from raise_for_status(); simulate what a real 403 response does
        mock_response.raise_for_status.side_effect = HTTPError("403 Client Error: Forbidden")
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = mock_response
            document = fetcher.run("https://www.example.com")["document"]
        assert document is None

    @pytest.mark.integration
    def test_link_content_fetcher_html(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(HTML_URL)["document"]
        assert document.mime_type == "text/html"
        assert "Introduction to Haystack" in document.text
        assert document.metadata["url"] == HTML_URL

    @pytest.mark.integration
    def test_link_content_fetcher_text(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(TEXT_URL)["document"]
        assert document.mime_type == "text/plain"
        assert "Haystack" in document.text
        assert document.metadata["url"] == TEXT_URL

    @pytest.mark.integration
    def test_link_content_fetcher_pdf(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(PDF_URL)["document"]
        assert document.mime_type == "application/octet-stream"  # FIXME Should be "application/pdf"?
        assert document.text is None
        assert document.blob is not None
        assert document.metadata["url"] == PDF_URL
|
@ -1,3 +1,4 @@
|
|||||||
|
from pathlib import Path
|
||||||
from unittest.mock import Mock, patch
|
from unittest.mock import Mock, patch
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -11,3 +12,8 @@ def mock_tokenizer():
|
|||||||
tokenizer.encode = lambda text: text.split()
|
tokenizer.encode = lambda text: text.split()
|
||||||
tokenizer.decode = lambda tokens: " ".join(tokens)
|
tokenizer.decode = lambda tokens: " ".join(tokens)
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def test_files_path():
    """Returns the path of the `test_files` directory located next to this module."""
    module_dir = Path(__file__).parent
    return module_dir.joinpath("test_files")
|
||||||
|
BIN
test/preview/test_files/pdf/sample_pdf_1.pdf
Normal file
BIN
test/preview/test_files/pdf/sample_pdf_1.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user