mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-27 18:06:17 +00:00
feat: Add LinkContentFetcher
Haystack 2.0 component (#5724)
* Add LinkContentFetcher * Add release note * Small fixes * Fix pydocs * PR feedback * Remove handlers registration * PR feedback * adjustments * improve tests * initial draft * tests * add proposal * proposal number * reno * fix tests and usage of content and content_type * update branch & fix more tests * mypy * use the new document * add docstring * fix more tests * mypy * fix tests * add e2e * review feedback * improve __str__ * Apply suggestions from code review Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update haystack/preview/dataclasses/document.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * improve __str__ * fix tests * fix more tests * fix test * Fix end-of-file-fixer * Post merge fixes * Move e2e tests back into component --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
parent
bf6d306d68
commit
0983fb656a
1
haystack/preview/components/fetchers/__init__.py
Normal file
1
haystack/preview/components/fetchers/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from haystack.preview.components.fetchers.link_content import LinkContentFetcher
|
163
haystack/preview/components/fetchers/link_content.py
Normal file
163
haystack/preview/components/fetchers/link_content.py
Normal file
@ -0,0 +1,163 @@
|
||||
import io
import logging
from collections import defaultdict
from datetime import datetime, timezone
from typing import Optional, Dict, List, Callable, Any, IO

import requests
from requests import Response
from requests.exceptions import HTTPError
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryCallState

from haystack import __version__
from haystack.preview import component, default_from_dict, default_to_dict
from haystack.preview import Document
|
||||
|
||||
logger = logging.getLogger(__name__)


# User-Agent used when the caller does not provide its own list of user agents.
DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"

# Base headers for outgoing requests. The fetcher copies this dict for each request
# and overrides "User-Agent" with the currently selected user agent.
REQUEST_HEADERS = {
    "accept": "*/*",
    "User-Agent": DEFAULT_USER_AGENT,
    "Accept-Language": "en-US,en;q=0.9,it;q=0.8,es;q=0.7",
    "referer": "https://www.google.com/",
}
|
||||
|
||||
|
||||
def text_content_handler(response: Response) -> Dict[str, str]:
    """
    Extract the text body of a response.

    :param response: Response object from the request.
    :return: A dictionary holding the response text under the "text" key.
    """
    extracted_text = response.text
    return {"text": extracted_text}
|
||||
|
||||
|
||||
def binary_content_handler(response: Response) -> Dict[str, IO[bytes]]:
    """
    Wrap the raw body of a response in an in-memory binary stream.

    :param response: Response object from the request.
    :return: A dictionary holding a file-like object with the response bytes under the "blob" key.
    """
    stream = io.BytesIO(response.content)
    return {"blob": stream}
|
||||
|
||||
|
||||
@component
class LinkContentFetcher:
    """
    LinkContentFetcher fetches content from a URL link and converts it to a Document object.

    Responses are dispatched on their media type: "text/html" and "text/plain" are stored as text,
    "application/pdf" and "application/octet-stream" are stored as a binary blob, and any other
    media type falls back to the text handler.
    """

    def __init__(
        self,
        raise_on_failure: bool = True,
        user_agents: Optional[List[str]] = None,
        retry_attempts: int = 2,
        timeout: int = 3,
    ):
        """
        Creates a LinkContentFetcher instance.

        :param raise_on_failure: A boolean indicating whether to raise an exception when a failure occurs
                                 during content extraction. If False, the error is simply logged and the
                                 program continues. Defaults to True.
        :param user_agents: A list of user agents to use when fetching content. Defaults to None, in which case a
                            default user agent is used.
        :param retry_attempts: The maximum number of attempts made to fetch a URL (it is passed to tenacity's
                               `stop_after_attempt`). Defaults to 2.
        :param timeout: The timeout in seconds for the request. Defaults to 3.
        """
        self.raise_on_failure = raise_on_failure
        self.user_agents = user_agents or [DEFAULT_USER_AGENT]
        self.current_user_agent_idx: int = 0
        self.retry_attempts = retry_attempts
        self.timeout = timeout

        # register default content handlers that extract data from the response;
        # media types without an explicit entry fall back to the text handler
        self.handlers: Dict[str, Callable[[Response], Dict[str, Any]]] = defaultdict(lambda: text_content_handler)
        self.handlers["text/html"] = text_content_handler
        self.handlers["text/plain"] = text_content_handler
        self.handlers["application/pdf"] = binary_content_handler
        self.handlers["application/octet-stream"] = binary_content_handler

        # The retrying function is built here, as a closure, because the retry policy depends on
        # per-instance settings (number of attempts, user-agent rotation callback).
        @retry(
            reraise=True,
            stop=stop_after_attempt(self.retry_attempts),
            wait=wait_exponential(multiplier=1, min=2, max=10),
            retry=(retry_if_exception_type((HTTPError, requests.RequestException))),
            # This method is invoked only after failed requests (exception raised)
            after=self._switch_user_agent,
        )
        def get_response(url):
            # we need to copy because we modify the headers
            headers = REQUEST_HEADERS.copy()
            headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
            # read the timeout from the instance so it always matches what to_dict() serializes;
            # a falsy value (None, 0) falls back to 3 seconds
            response = requests.get(url, headers=headers, timeout=self.timeout or 3)
            response.raise_for_status()
            return response

        self._get_response: Callable = get_response

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(
            self,
            raise_on_failure=self.raise_on_failure,
            user_agents=self.user_agents,
            retry_attempts=self.retry_attempts,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LinkContentFetcher":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    # The declared output name must match the key of the dictionary returned by run().
    @component.output_types(document=Optional[Document])
    def run(self, url: str):
        """
        Fetches content from a URL and converts it to a Document object.

        The Document metadata contains the requested URL and the fetch time as a Unix timestamp (seconds).

        :param url: URL to fetch content from.
        :return: A dictionary with a single "document" key. Its value is the fetched Document, or None if
                 fetching failed and raise_on_failure is False.
        :raises Exception: Any error raised while fetching or converting the content, if raise_on_failure is True.
        """
        # Use an aware UTC datetime: calling timestamp() on the naive result of utcnow() would
        # interpret it as local time and shift the value by the local UTC offset.
        fetched_at = int(datetime.now(timezone.utc).timestamp())
        document_data: Dict[str, Any] = {"metadata": {"url": url, "timestamp": fetched_at}}
        try:
            response = self._get_response(url)
            content_type = self._get_content_type(response)
            document_data["mime_type"] = content_type
            handler: Callable = self.handlers[content_type]
            document_data.update(handler(response))
            return {"document": Document(**document_data)}

        except Exception:
            if self.raise_on_failure:
                raise
            # keep the failure reason available to whoever enables debug logging
            logger.debug("Couldn't retrieve content from %s", url, exc_info=True)
            return {"document": None}

        finally:
            # every run starts again from the first user agent
            self.current_user_agent_idx = 0

    def _get_content_type(self, response: Response) -> str:
        """
        Get the content type of the response.

        Parameters such as "; charset=utf-8" are dropped, and the media type is stripped of surrounding
        whitespace and lowercased so that it matches the keys of the handler table regardless of how the
        server formats the header.

        :param response: The response object.
        :return: The normalized media type, or an empty string if the header is missing.
        """
        content_type = response.headers.get("Content-Type", "")
        return content_type.split(";")[0].strip().lower()

    def _switch_user_agent(self, retry_state: RetryCallState) -> None:
        """
        Switches the User-Agent for this LinkContentFetcher to the next one in the list of user agents.
        Used by tenacity to retry the requests with a different user agent.

        :param retry_state: The retry state (unused, required by tenacity).
        """
        self.current_user_agent_idx = (self.current_user_agent_idx + 1) % len(self.user_agents)
        logger.debug("Switched user agent to %s", self.user_agents[self.current_user_agent_idx])
|
@ -0,0 +1,5 @@
|
||||
---
|
||||
preview:
|
||||
- |
|
||||
Adds LinkContentFetcher component to Haystack 2.0. LinkContentFetcher fetches content from a given URL and
|
||||
converts it into a Document object, which can then be used within the Haystack 2.0 pipeline.
|
@ -1,6 +1,6 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Refactor PineconeDocumentStore to use metadata instead of namespaces
|
||||
for distinction between documents with embeddings, documents without
|
||||
embeddings and labels
|
||||
Refactor PineconeDocumentStore to use metadata instead of namespaces
|
||||
for distinction between documents with embeddings, documents without
|
||||
embeddings and labels
|
||||
|
@ -2,4 +2,3 @@
|
||||
fixes:
|
||||
- |
|
||||
gpt-35-turbo-16k model from Azure can integrate correctly
|
||||
|
||||
|
0
test/preview/components/fetchers/__init__.py
Normal file
0
test/preview/components/fetchers/__init__.py
Normal file
170
test/preview/components/fetchers/test_link_content_fetcher.py
Normal file
170
test/preview/components/fetchers/test_link_content_fetcher.py
Normal file
@ -0,0 +1,170 @@
|
||||
import io
|
||||
from unittest.mock import patch, Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack.preview.components.fetchers.link_content import (
|
||||
LinkContentFetcher,
|
||||
text_content_handler,
|
||||
binary_content_handler,
|
||||
DEFAULT_USER_AGENT,
|
||||
)
|
||||
|
||||
# Live URLs fetched by the integration tests at the bottom of this module.
HTML_URL = "https://docs.haystack.deepset.ai/docs"
TEXT_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"
PDF_URL = "https://raw.githubusercontent.com/deepset-ai/haystack/b5987a6d8d0714eb2f3011183ab40093d2e4a41a/e2e/samples/pipelines/sample_pdf_1.pdf"
|
||||
|
||||
|
||||
@pytest.fixture
def mock_get_link_text_content():
    """
    Patch `requests` inside the link_content module so that `get` returns a plain-text response.

    Yields the mock that replaces the `requests` module.
    """
    fake_response = Mock(status_code=200, text="Example test response", headers={"Content-Type": "text/plain"})
    with patch("haystack.preview.components.fetchers.link_content.requests") as patched_requests:
        patched_requests.get.return_value = fake_response
        yield patched_requests
|
||||
|
||||
|
||||
@pytest.fixture
def mock_get_link_content(test_files_path):
    """
    Patch `requests` inside the link_content module so that `get` returns the sample PDF as a binary response.

    :param test_files_path: Path of the directory holding the test files (fixture).

    Yields the mock that replaces the `requests` module.
    """
    # read_bytes() opens and closes the file itself, so no file handle is leaked
    pdf_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
    with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
        mock_run.get.return_value = Mock(
            status_code=200,
            content=pdf_bytes,
            headers={"Content-Type": "application/pdf"},
        )
        yield mock_run
|
||||
|
||||
|
||||
class TestLinkContentFetcher:
    """
    Tests for LinkContentFetcher.

    Unit tests replace the `requests` module used by the component with a mock;
    integration tests fetch the live URLs defined at the top of this module.
    """

    @pytest.mark.unit
    def test_init(self):
        fetcher = LinkContentFetcher()
        assert fetcher.raise_on_failure is True
        assert fetcher.user_agents == [DEFAULT_USER_AGENT]
        assert fetcher.retry_attempts == 2
        assert fetcher.timeout == 3
        assert fetcher.handlers == {
            "text/html": text_content_handler,
            "text/plain": text_content_handler,
            "application/pdf": binary_content_handler,
            "application/octet-stream": binary_content_handler,
        }
        assert hasattr(fetcher, "_get_response")

    @pytest.mark.unit
    def test_init_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_to_dict(self):
        fetcher = LinkContentFetcher()
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {
                "raise_on_failure": True,
                "user_agents": [DEFAULT_USER_AGENT],
                "retry_attempts": 2,
                "timeout": 3,
            },
        }

    @pytest.mark.unit
    def test_to_dict_with_params(self):
        fetcher = LinkContentFetcher(raise_on_failure=False, user_agents=["test"], retry_attempts=1, timeout=2)
        assert fetcher.to_dict() == {
            "type": "LinkContentFetcher",
            "init_parameters": {"raise_on_failure": False, "user_agents": ["test"], "retry_attempts": 1, "timeout": 2},
        }

    @pytest.mark.unit
    def test_from_dict(self):
        fetcher = LinkContentFetcher.from_dict(
            {
                "type": "LinkContentFetcher",
                "init_parameters": {
                    "raise_on_failure": False,
                    "user_agents": ["test"],
                    "retry_attempts": 1,
                    "timeout": 2,
                },
            }
        )
        assert fetcher.raise_on_failure is False
        assert fetcher.user_agents == ["test"]
        assert fetcher.retry_attempts == 1
        # every serialized init parameter must survive the round trip
        assert fetcher.timeout == 2

    @pytest.mark.unit
    def test_run_text(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="Example test response", headers={"Content-Type": "text/plain"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "Example test response"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_html(self):
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, text="<h1>Example test response</h1>", headers={"Content-Type": "text/html"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            assert document.text == "<h1>Example test response</h1>"
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_binary(self, test_files_path):
        # read_bytes() opens and closes the file itself, so no file handle is leaked
        file_bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = Mock(
                status_code=200, content=file_bytes, headers={"Content-Type": "application/pdf"}
            )
            fetcher = LinkContentFetcher()
            document = fetcher.run("https://www.example.com")["document"]
            # casting to list to make the blobs comparable
            assert list(document.blob) == list(io.BytesIO(file_bytes))
            assert document.metadata["url"] == "https://www.example.com"
            assert "timestamp" in document.metadata

    @pytest.mark.unit
    def test_run_bad_status_code(self):
        fetcher = LinkContentFetcher(raise_on_failure=False)
        # NOTE(review): a plain Mock auto-creates `raise_for_status`, so calling it does not raise an
        # HTTPError for the 403 status. The run appears to fail later, while the mocked headers are
        # parsed - consider giving `raise_for_status` a side effect to exercise the HTTP error path.
        mock_response = Mock(status_code=403)
        with patch("haystack.preview.components.fetchers.link_content.requests") as mock_run:
            mock_run.get.return_value = mock_response
            document = fetcher.run("https://www.example.com")["document"]
            assert document is None

    @pytest.mark.integration
    def test_link_content_fetcher_html(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(HTML_URL)["document"]
        assert document.mime_type == "text/html"
        assert "Introduction to Haystack" in document.text
        assert document.metadata["url"] == HTML_URL

    @pytest.mark.integration
    def test_link_content_fetcher_text(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(TEXT_URL)["document"]
        assert document.mime_type == "text/plain"
        assert "Haystack" in document.text
        assert document.metadata["url"] == TEXT_URL

    @pytest.mark.integration
    def test_link_content_fetcher_pdf(self):
        fetcher = LinkContentFetcher()
        document = fetcher.run(PDF_URL)["document"]
        assert document.mime_type == "application/octet-stream"  # FIXME Should be "application/pdf"?
        assert document.text is None
        assert document.blob is not None
        assert document.metadata["url"] == PDF_URL
|
@ -1,3 +1,4 @@
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch
|
||||
import pytest
|
||||
|
||||
@ -11,3 +12,8 @@ def mock_tokenizer():
|
||||
tokenizer.encode = lambda text: text.split()
|
||||
tokenizer.decode = lambda tokens: " ".join(tokens)
|
||||
return tokenizer
|
||||
|
||||
|
||||
@pytest.fixture()
def test_files_path():
    """Return the path of the `test_files` directory located next to this module."""
    module_dir = Path(__file__).parent
    return module_dir.joinpath("test_files")
|
||||
|
BIN
test/preview/test_files/pdf/sample_pdf_1.pdf
Normal file
BIN
test/preview/test_files/pdf/sample_pdf_1.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user