fix: WebRetriever top_k is ignored in a pipeline (#5106)

* Initial changes

* Add WebSearch, WebRetriever top_k unit tests

* Add the exact integration test that failed for Tuana

* PR review
This commit is contained in:
Vladimir Blagojevic 2023-06-09 10:42:37 +02:00 committed by GitHub
parent d8a4f20379
commit 0cc9ce7522
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 320 additions and 155 deletions

View File

@ -198,7 +198,7 @@ class WebRetriever(BaseRetriever):
search_results, _ = self.web_search.run(query=query)
search_results = search_results["documents"]
if self.mode == "snippets":
return search_results # type: ignore
return search_results[:top_k] # type: ignore
links: List[SearchResult] = [
SearchResult(r.meta["link"], r.meta.get("score", None), r.meta.get("position", None))

View File

@ -63,17 +63,25 @@ class WebSearch(BaseComponent):
labels: Optional[MultiLabel] = None,
documents: Optional[List[Document]] = None,
meta: Optional[dict] = None,
top_k: Optional[int] = None,
) -> Tuple[Dict, str]:
"""
Search the search engine for the given query and return the results. Only the query parameter is used.
Search the search engine for the given query and return the results. Only the query parameter and the top_k
parameter are used.
:param query: The query to search for.
:param file_paths: Not used.
:param labels: Not used.
:param documents: Not used.
:param meta: Not used.
:param top_k: return only the top_k results. If None, the top_k value passed to the constructor is used.
:return: List of search results as documents.
"""
# query is a required parameter for search, we need to keep the signature of run() the same as in other nodes
if not query:
raise ValueError("WebSearch run requires the `query` parameter")
return {"documents": self.search_engine.search(query)}, "output_1"
return {"documents": self.search_engine.search(query, top_k=top_k)}, "output_1"
def run_batch(
self,

View File

@ -1,4 +1,5 @@
from typing import List
from unittest.mock import patch, Mock
from uuid import UUID
from numpy import loadtxt
@ -91,3 +92,102 @@ def indexing_document_classifier():
batch_size=16,
classification_field="class_field",
)
# Canned search-API response payload returned by the `mock_web_search` fixture in place of a real
# HTTP call. Presumably a recorded serper.dev response (going by the name) — confirm before editing.
# The "organic" list holds nine hits (positions 1-9); the top_k tests that use the fixture request up
# to six results, so keep at least that many entries here.
example_serperdev_response = {
"searchParameters": {
"q": "Who is the boyfriend of Olivia Wilde?",
"gl": "us",
"hl": "en",
"autocorrect": True,
"type": "search",
},
"organic": [
{
"title": "Olivia Wilde embraces Jason Sudeikis amid custody battle, Harry Styles split - Page Six",
"link": "https://pagesix.com/2023/01/29/olivia-wilde-hugs-it-out-with-jason-sudeikis-after-harry-styles-split/",
"snippet": "Looks like Olivia Wilde and Jason Sudeikis are starting 2023 on good terms. Amid their highly publicized custody battle and the actress' ...",
"date": "Jan 29, 2023",
"position": 1,
},
{
"title": "Olivia Wilde Is 'Quietly Dating' Again Following Harry Styles Split: 'He Makes Her Happy'",
"link": "https://www.yahoo.com/now/olivia-wilde-quietly-dating-again-183844364.html",
"snippet": "Olivia Wilde is “quietly dating again” following her November 2022 split from Harry Styles, a source exclusively tells Life & Style.",
"date": "Feb 10, 2023",
"position": 2,
},
{
"title": "Olivia Wilde and Harry Styles' Relationship Timeline: The Way They Were - Us Weekly",
"link": "https://www.usmagazine.com/celebrity-news/pictures/olivia-wilde-and-harry-styles-relationship-timeline/",
"snippet": "Olivia Wilde started dating Harry Styles after ending her years-long engagement to Jason Sudeikis — see their relationship timeline.",
"date": "Mar 10, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSgTcalNFvptTbYBiDXX55s8yCGfn6F1qbed9DAN16LvynTr9GayK5SPmY&s",
"position": 3,
},
{
"title": "Olivia Wilde Is 'Ready to Date Again' After Harry Styles Split - Us Weekly",
"link": "https://www.usmagazine.com/celebrity-news/news/olivia-wilde-is-ready-to-date-again-after-harry-styles-split/",
"snippet": "Ready for love! Olivia Wilde is officially back on the dating scene following her split from her ex-boyfriend, Harry Styles.",
"date": "Mar 1, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRCRAeRy5sVE631ZctzbzuOF70xkIOHaTvh2K7dYvdiVBwALiKrIjpscok&s",
"position": 4,
},
{
"title": "Harry Styles and Olivia Wilde's Definitive Relationship Timeline - Harper's Bazaar",
"link": "https://www.harpersbazaar.com/celebrity/latest/a35172115/harry-styles-olivia-wilde-relationship-timeline/",
"snippet": "November 2020: News breaks about Olivia splitting from fiancé Jason Sudeikis. ... In mid-November, news breaks of Olivia Wilde's split from Jason ...",
"date": "Feb 23, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRRqw3fvZOIGHEepxCc7yFAWYsS_v_1H6X-4nxyFJxdfRuFQw_BrI6JVzI&s",
"position": 5,
},
{
"title": "Harry Styles and Olivia Wilde's Relationship Timeline - People",
"link": "https://people.com/music/harry-styles-olivia-wilde-relationship-timeline/",
"snippet": "Harry Styles and Olivia Wilde first met on the set of Don't Worry Darling and stepped out as a couple in January 2021. Relive all their biggest relationship ...",
"position": 6,
},
{
"title": "Jason Sudeikis and Olivia Wilde's Relationship Timeline - People",
"link": "https://people.com/movies/jason-sudeikis-olivia-wilde-relationship-timeline/",
"snippet": "Jason Sudeikis and Olivia Wilde ended their engagement of seven years in 2020. Here's a complete timeline of their relationship.",
"date": "Mar 24, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSleZoXusQyJJe2WMgIuck_cVaJ8AE0_hU2QxsXzYvKANi55UQlv82yAVI&s",
"position": 7,
},
{
"title": "Olivia Wilde's anger at ex-boyfriend Harry Styles: She resents him and thinks he was using her | Marca",
"link": "https://www.marca.com/en/lifestyle/celebrities/2023/02/23/63f779a4e2704e8d988b4624.html",
"snippet": "The two started dating after Wilde split up with actor Jason Sudeikisin 2020. However, their relationship came to an end last November.",
"date": "Feb 23, 2023",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQBgJF2mSnIWCvPrqUqM4WTI9xPNWPyLvHuune85swpB1yE_G8cy_7KRh0&s",
"position": 8,
},
{
"title": "Olivia Wilde's dating history: Who has the actress dated? | The US Sun",
"link": "https://www.the-sun.com/entertainment/5221040/olivia-wildes-dating-history/",
"snippet": "AMERICAN actress Olivia Wilde started dating Harry Styles in January 2021 after breaking off her engagement the year prior.",
"date": "Nov 19, 2022",
"imageUrl": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTpm8BToVFHJoH6yRggg0fLocLT9mt6lwsnRxFFDNdDGhDydzQiSKZ9__g&s",
"position": 9,
},
],
"relatedSearches": [
{"query": "Harry Styles girlfriends in order"},
{"query": "Harry Styles and Olivia Wilde engaged"},
{"query": "Harry Styles and Olivia Wilde wedding"},
{"query": "Who is Harry Styles married to"},
{"query": "Jason Sudeikis Olivia Wilde relationship"},
{"query": "Olivia Wilde and Jason Sudeikis kids"},
{"query": "Olivia Wilde children"},
{"query": "Harry Styles and Olivia Wilde age difference"},
{"query": "Jason Sudeikis Olivia Wilde, Harry Styles"},
],
}
@pytest.fixture
def mock_web_search():
    """
    Patch ``requests`` as seen by ``haystack.nodes.search_engine.providers`` for the duration of a test.

    Every ``requests.request(...)`` call made through the patched module returns a stub response with
    status code 200 whose ``json()`` yields ``example_serperdev_response``. The patched module mock is
    yielded so tests can inspect the calls made on it.
    """
    canned_response = Mock(status_code=200, json=lambda: example_serperdev_response)
    with patch("haystack.nodes.search_engine.providers.requests") as patched_requests:
        patched_requests.request.return_value = canned_response
        yield patched_requests

View File

@ -1124,156 +1124,6 @@ def test_multimodal_text_image_retrieval(text_docs: List[Document], image_docs:
assert text_results[0].content == "My name is Christelle and I live in Paris"
@pytest.mark.unit
def test_web_retriever_mode_raw_documents(monkeypatch):
    """
    Check what WebRetriever.retrieve returns in "raw_documents" mode.

    WebSearch.run, ArticleExtractor.get_content and requests.get are replaced with local stand-ins that
    return fixed values, so the assertions below only depend on the retriever's own logic.
    """
    expected_search_results = {
        "documents": [
            Document(
                content="Eddard Stark",
                score=0.9090909090909091,
                meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
                id_hash_keys=["content"],
                id="f408db6de8de0ffad0cb47cf8830dbb8",
            ),
            Document(
                content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
                score=0.09090909090909091,
                meta={
                    "title": "Arya Stark's Father - Crossword Clue Answers",
                    "link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
                    "position": 1,
                    "score": 0.09090909090909091,
                },
                id_hash_keys=["content"],
                id="51779277acf94cf90e7663db137c0732",
            ),
        ]
    }
    # Text the stubbed article extractor hands back; the retrieved document must carry exactly this content.
    extracted_content = "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."

    def mock_web_search_run(self, query: str) -> Tuple[Dict, str]:
        return expected_search_results, "output_1"

    class MockResponse:
        def __init__(self, text, status_code):
            self.text = text
            self.status_code = status_code

    def get(url, headers, timeout):
        return MockResponse("mocked", 200)

    def get_content(self, text: str) -> str:
        return extracted_content

    monkeypatch.setattr(WebSearch, "run", mock_web_search_run)
    monkeypatch.setattr(ArticleExtractor, "get_content", get_content)
    monkeypatch.setattr(requests, "get", get)

    web_retriever = WebRetriever(api_key="", top_search_results=2, mode="raw_documents")
    result = web_retriever.retrieve(query="Who is the father of Arya Stark?")

    # Two search hits go in but only one document comes out — presumably the hit with an empty
    # link is skipped; confirm against WebRetriever.
    assert len(result) == 1
    assert isinstance(result[0], Document)
    assert result[0].content == extracted_content
    # Identity check: comparisons against the None singleton use `is`, not `==`.
    assert result[0].score is None
    assert result[0].meta["url"] == "https://crossword-solver.io/clue/arya-stark%27s-father/"
    # Only preprocessed docs but not raw docs should have the _split_id field
    assert "_split_id" not in result[0].meta
@pytest.mark.unit
def test_web_retriever_mode_preprocessed_documents(monkeypatch):
    """
    Check what WebRetriever.retrieve returns in "preprocessed_documents" mode.

    WebSearch.run, ArticleExtractor.get_content and requests.get are replaced with local stand-ins that
    return fixed values, so the assertions below only depend on the retriever's own logic.
    """
    expected_search_results = {
        "documents": [
            Document(
                content="Eddard Stark",
                score=0.9090909090909091,
                meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
                id_hash_keys=["content"],
                id="f408db6de8de0ffad0cb47cf8830dbb8",
            ),
            Document(
                content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
                score=0.09090909090909091,
                meta={
                    "title": "Arya Stark's Father - Crossword Clue Answers",
                    "link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
                    "position": 1,
                    "score": 0.09090909090909091,
                },
                id_hash_keys=["content"],
                id="51779277acf94cf90e7663db137c0732",
            ),
        ]
    }
    # Text the stubbed article extractor hands back; the retrieved document must carry exactly this content.
    extracted_content = "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."

    def mock_web_search_run(self, query: str) -> Tuple[Dict, str]:
        return expected_search_results, "output_1"

    class MockResponse:
        def __init__(self, text, status_code):
            self.text = text
            self.status_code = status_code

    def get(url, headers, timeout):
        return MockResponse("mocked", 200)

    def get_content(self, text: str) -> str:
        return extracted_content

    monkeypatch.setattr(WebSearch, "run", mock_web_search_run)
    monkeypatch.setattr(ArticleExtractor, "get_content", get_content)
    monkeypatch.setattr(requests, "get", get)

    web_retriever = WebRetriever(api_key="", top_search_results=2, mode="preprocessed_documents")
    result = web_retriever.retrieve(query="Who is the father of Arya Stark?")

    # Two search hits go in but only one document comes out — presumably the hit with an empty
    # link is skipped; confirm against WebRetriever.
    assert len(result) == 1
    assert isinstance(result[0], Document)
    assert result[0].content == extracted_content
    # Identity check: comparisons against the None singleton use `is`, not `==`.
    assert result[0].score is None
    assert result[0].meta["url"] == "https://crossword-solver.io/clue/arya-stark%27s-father/"
    # Unlike raw documents, preprocessed documents are expected to carry a _split_id.
    assert result[0].meta["_split_id"] == 0
@pytest.mark.unit
def test_web_retriever_mode_snippets(monkeypatch):
    """
    A WebRetriever built without an explicit mode returns the documents produced by WebSearch.run.

    WebSearch.run is monkeypatched to return two fixed documents; retrieve() must return a list equal
    to them.
    """
    snippet_documents = [
        Document(
            content="Eddard Stark",
            score=0.9090909090909091,
            meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
            id_hash_keys=["content"],
            id="f408db6de8de0ffad0cb47cf8830dbb8",
        ),
        Document(
            content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
            score=0.09090909090909091,
            meta={
                "title": "Arya Stark's Father - Crossword Clue Answers",
                "link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
                "position": 1,
                "score": 0.09090909090909091,
            },
            id_hash_keys=["content"],
            id="51779277acf94cf90e7663db137c0732",
        ),
    ]
    stubbed_output = {"documents": snippet_documents}

    def fake_run(self, query: str) -> Tuple[Dict, str]:
        return stubbed_output, "output_1"

    monkeypatch.setattr(WebSearch, "run", fake_run)

    retriever = WebRetriever(api_key="", top_search_results=2)
    retrieved = retriever.retrieve(query="Who is the father of Arya Stark?")

    assert retrieved == snippet_documents
@pytest.mark.unit
@patch("haystack.nodes.retriever._openai_encoder.openai_request")
def test_openai_default_api_base(mock_request):

View File

@ -0,0 +1,198 @@
import os
from typing import Dict, Tuple
import pytest
import requests
from boilerpy3.extractors import ArticleExtractor
from haystack import Document, Pipeline
from haystack.nodes import WebSearch, WebRetriever, PromptNode
@pytest.mark.unit
def test_web_retriever_mode_raw_documents(monkeypatch):
    """
    Check what WebRetriever.retrieve returns in "raw_documents" mode.

    WebSearch.run, ArticleExtractor.get_content and requests.get are replaced with local stand-ins that
    return fixed values, so the assertions below only depend on the retriever's own logic.
    """
    expected_search_results = {
        "documents": [
            Document(
                content="Eddard Stark",
                score=0.9090909090909091,
                meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
                id_hash_keys=["content"],
                id="f408db6de8de0ffad0cb47cf8830dbb8",
            ),
            Document(
                content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
                score=0.09090909090909091,
                meta={
                    "title": "Arya Stark's Father - Crossword Clue Answers",
                    "link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
                    "position": 1,
                    "score": 0.09090909090909091,
                },
                id_hash_keys=["content"],
                id="51779277acf94cf90e7663db137c0732",
            ),
        ]
    }
    # Text the stubbed article extractor hands back; the retrieved document must carry exactly this content.
    extracted_content = "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."

    def mock_web_search_run(self, query: str) -> Tuple[Dict, str]:
        return expected_search_results, "output_1"

    class MockResponse:
        def __init__(self, text, status_code):
            self.text = text
            self.status_code = status_code

    def get(url, headers, timeout):
        return MockResponse("mocked", 200)

    def get_content(self, text: str) -> str:
        return extracted_content

    monkeypatch.setattr(WebSearch, "run", mock_web_search_run)
    monkeypatch.setattr(ArticleExtractor, "get_content", get_content)
    monkeypatch.setattr(requests, "get", get)

    web_retriever = WebRetriever(api_key="", top_search_results=2, mode="raw_documents")
    result = web_retriever.retrieve(query="Who is the father of Arya Stark?")

    # Two search hits go in but only one document comes out — presumably the hit with an empty
    # link is skipped; confirm against WebRetriever.
    assert len(result) == 1
    assert isinstance(result[0], Document)
    assert result[0].content == extracted_content
    # Identity check: comparisons against the None singleton use `is`, not `==`.
    assert result[0].score is None
    assert result[0].meta["url"] == "https://crossword-solver.io/clue/arya-stark%27s-father/"
    # Only preprocessed docs but not raw docs should have the _split_id field
    assert "_split_id" not in result[0].meta
@pytest.mark.unit
def test_web_retriever_mode_preprocessed_documents(monkeypatch):
    """
    Check what WebRetriever.retrieve returns in "preprocessed_documents" mode.

    WebSearch.run, ArticleExtractor.get_content and requests.get are replaced with local stand-ins that
    return fixed values, so the assertions below only depend on the retriever's own logic.
    """
    expected_search_results = {
        "documents": [
            Document(
                content="Eddard Stark",
                score=0.9090909090909091,
                meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
                id_hash_keys=["content"],
                id="f408db6de8de0ffad0cb47cf8830dbb8",
            ),
            Document(
                content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
                score=0.09090909090909091,
                meta={
                    "title": "Arya Stark's Father - Crossword Clue Answers",
                    "link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
                    "position": 1,
                    "score": 0.09090909090909091,
                },
                id_hash_keys=["content"],
                id="51779277acf94cf90e7663db137c0732",
            ),
        ]
    }
    # Text the stubbed article extractor hands back; the retrieved document must carry exactly this content.
    extracted_content = "What are the top solutions for\nArya Stark's Father\nWe found 1 solutions for\nArya Stark's Father\n.The top solutions is determined by popularity, ratings and frequency of searches. The most likely answer for the clue is NED..."

    def mock_web_search_run(self, query: str) -> Tuple[Dict, str]:
        return expected_search_results, "output_1"

    class MockResponse:
        def __init__(self, text, status_code):
            self.text = text
            self.status_code = status_code

    def get(url, headers, timeout):
        return MockResponse("mocked", 200)

    def get_content(self, text: str) -> str:
        return extracted_content

    monkeypatch.setattr(WebSearch, "run", mock_web_search_run)
    monkeypatch.setattr(ArticleExtractor, "get_content", get_content)
    monkeypatch.setattr(requests, "get", get)

    web_retriever = WebRetriever(api_key="", top_search_results=2, mode="preprocessed_documents")
    result = web_retriever.retrieve(query="Who is the father of Arya Stark?")

    # Two search hits go in but only one document comes out — presumably the hit with an empty
    # link is skipped; confirm against WebRetriever.
    assert len(result) == 1
    assert isinstance(result[0], Document)
    assert result[0].content == extracted_content
    # Identity check: comparisons against the None singleton use `is`, not `==`.
    assert result[0].score is None
    assert result[0].meta["url"] == "https://crossword-solver.io/clue/arya-stark%27s-father/"
    # Unlike raw documents, preprocessed documents are expected to carry a _split_id.
    assert result[0].meta["_split_id"] == 0
@pytest.mark.unit
def test_web_retriever_mode_snippets(monkeypatch):
    """
    A WebRetriever built without an explicit mode returns the documents produced by WebSearch.run.

    WebSearch.run is monkeypatched to return two fixed documents; retrieve() must return a list equal
    to them.
    """
    snippet_documents = [
        Document(
            content="Eddard Stark",
            score=0.9090909090909091,
            meta={"title": "Eddard Stark", "link": "", "score": 0.9090909090909091},
            id_hash_keys=["content"],
            id="f408db6de8de0ffad0cb47cf8830dbb8",
        ),
        Document(
            content="The most likely answer for the clue is NED. How many solutions does Arya Stark's Father have? With crossword-solver.io you will find 1 solutions. We use ...",
            score=0.09090909090909091,
            meta={
                "title": "Arya Stark's Father - Crossword Clue Answers",
                "link": "https://crossword-solver.io/clue/arya-stark%27s-father/",
                "position": 1,
                "score": 0.09090909090909091,
            },
            id_hash_keys=["content"],
            id="51779277acf94cf90e7663db137c0732",
        ),
    ]
    stubbed_output = {"documents": snippet_documents}

    def fake_run(self, query: str) -> Tuple[Dict, str]:
        return stubbed_output, "output_1"

    monkeypatch.setattr(WebSearch, "run", fake_run)

    retriever = WebRetriever(api_key="", top_search_results=2)
    retrieved = retriever.retrieve(query="Who is the father of Arya Stark?")

    assert retrieved == snippet_documents
@pytest.mark.unit
@pytest.mark.parametrize("top_k", [1, 3, 6])
def test_top_k_parameter(mock_web_search, top_k):
    """retrieve(..., top_k=k) on a "snippets"-mode WebRetriever yields exactly k Document objects."""
    retriever = WebRetriever(api_key="some_invalid_key", mode="snippets")
    documents = retriever.retrieve(query="Who is the boyfriend of Olivia Wilde?", top_k=top_k)

    assert len(documents) == top_k
    for doc in documents:
        assert isinstance(doc, Document)
@pytest.mark.integration
@pytest.mark.skipif(
    not os.environ.get("SERPERDEV_API_KEY"),
    reason="Please export an env var called SERPERDEV_API_KEY containing the serper.dev API key to run this test.",
)
@pytest.mark.skipif(
    not os.environ.get("OPENAI_API_KEY"),
    reason="Please export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
@pytest.mark.parametrize("top_k", [2, 4])
def test_top_k_parameter_in_pipeline(top_k):
    """
    A top_k passed to WebRetriever through pipeline run params must be honoured, not ignored.

    Builds a two-node pipeline (WebRetriever feeding a PromptNode) and checks that the number of
    entries under "results" equals the requested top_k. Skipped unless both API keys are exported.
    """
    qa_prompt_node = PromptNode(
        "gpt-3.5-turbo",
        api_key=os.environ.get("OPENAI_API_KEY"),
        max_length=256,
        default_prompt_template="question-answering-with-document-scores",
    )
    web_retriever = WebRetriever(api_key=os.environ.get("SERPERDEV_API_KEY"))

    pipeline = Pipeline()
    pipeline.add_node(component=web_retriever, name="WebRetriever", inputs=["Query"])
    pipeline.add_node(component=qa_prompt_node, name="QAwithScoresPrompt", inputs=["WebRetriever"])

    # top_k is addressed to the retriever node by its pipeline name.
    node_params = {"WebRetriever": {"top_k": top_k}}
    output = pipeline.run(query="What year was Obama president", params=node_params)

    assert len(output["results"]) == top_k

View File

@ -1,5 +1,4 @@
import os
import unittest
from unittest.mock import MagicMock, patch
import pytest
@ -99,9 +98,19 @@ def test_web_search_with_google_api_client():
search_engine_provider="GoogleAPI",
search_engine_kwargs={"engine_id": SEARCH_ENGINE_ID},
)
result, _ = ws.run(query=query)
_, _ = ws.run(query=query)
mock_build.assert_called_once_with("customsearch", "v1", developerKey=GOOGLE_API_KEY)
mock_service.cse.assert_called_once()
mock_cse.list.assert_called_once_with(q=query, cx=SEARCH_ENGINE_ID, num=10)
mock_list.execute.assert_called_once()
@pytest.mark.unit
@pytest.mark.parametrize("top_k", [1, 3, 6])
def test_web_search_top_k(mock_web_search, top_k):
    """WebSearch.run(..., top_k=k) returns a "documents" entry holding exactly k Document objects."""
    web_search = WebSearch(api_key="some_invalid_key")
    output, _ = web_search.run(query="Who is the boyfriend of Olivia Wilde?", top_k=top_k)

    assert "documents" in output
    documents = output["documents"]
    assert len(documents) == top_k
    for doc in documents:
        assert isinstance(doc, Document)