chore: refactor pipeline tests for e2e testing (#5576)
* enable pipeline folder in e2e
* merge standard pipeline tests with standard pipeline batch tests
* merge summarization tests into standard pipelines tests
* Update test_standard_pipelines.py
* black
This commit is contained in:
parent f13b37db24
commit 5985b6d358
1 .github/workflows/e2e.yml vendored
@@ -17,6 +17,7 @@ jobs:
       matrix:
         folder:
           - "document_search"
+          - "pipelines"
 
     runs-on: ubuntu-latest
 
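The new matrix entry above makes the e2e workflow run the pipeline tests as their own job, alongside document_search. As a minimal local sketch (the "e2e/pipelines" path is an assumption inferred from the matrix folder name, not stated in this diff), the same folder can be exercised through pytest's Python entry point:

# Illustrative sketch only: run the newly enabled e2e folder locally.
# The "e2e/pipelines" path is assumed from the workflow matrix entry above.
import sys
import pytest

if __name__ == "__main__":
    # -x stops at the first failure; these tests download models, so expect them to be slow.
    sys.exit(pytest.main(["e2e/pipelines", "-x"]))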
@@ -10,18 +10,16 @@ from haystack.pipelines import (
     DocumentSearchPipeline,
     MostSimilarDocumentsPipeline,
     WebQAPipeline,
+    SearchSummarizationPipeline,
 )
-from haystack.nodes import EmbeddingRetriever, PromptNode
+from haystack.nodes import EmbeddingRetriever, PromptNode, BM25Retriever, TransformersSummarizer
 from haystack.schema import Document
 
 
 def test_faq_pipeline():
     documents = [
-        {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}},
-        {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}},
-        {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}},
-        {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}},
-        {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}},
+        {"content": f"How to test module-{i}?", "meta": {"source": f"wiki{i}", "answer": f"Using tests for module-{i}"}}
+        for i in range(1, 6)
     ]
     document_store = InMemoryDocumentStore()
     retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert")
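Within the test body, the only change is that the five hand-written FAQ documents are now produced by a comprehension. A quick, self-contained check (plain Python, illustrative only) that the comprehension yields the same documents as the literals it replaces:

# Illustrative only: the comprehension above expands to the same five FAQ documents.
documents = [
    {"content": f"How to test module-{i}?", "meta": {"source": f"wiki{i}", "answer": f"Using tests for module-{i}"}}
    for i in range(1, 6)
]
assert len(documents) == 5
assert documents[0] == {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}}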
@@ -165,3 +163,181 @@ def test_webqa_pipeline():
     assert len(result["results"]) == 1
     answer = result["results"][0]
     assert "Stark" in answer or "NED" in answer
+
+
+def test_faq_pipeline_batch():
+    documents = [
+        {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}},
+        {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}},
+        {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}},
+        {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}},
+        {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}},
+    ]
+    document_store = InMemoryDocumentStore()
+    retriever = EmbeddingRetriever(
+        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    pipeline = FAQPipeline(retriever=retriever)
+
+    output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"Retriever": {"top_k": 3}})
+    assert len(output["answers"]) == 2  # 2 queries
+    assert len(output["answers"][0]) == 3  # 3 answers per query
+    assert output["queries"][0].startswith("How to")
+    assert output["answers"][0][0].answer.startswith("Using tests")
+
+
+def test_document_search_pipeline_batch():
+    documents = [
+        {"content": "Sample text for document-1", "meta": {"source": "wiki1"}},
+        {"content": "Sample text for document-2", "meta": {"source": "wiki2"}},
+        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
+        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
+        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
+    ]
+    document_store = InMemoryDocumentStore()
+    retriever = EmbeddingRetriever(
+        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    pipeline = DocumentSearchPipeline(retriever=retriever)
+    output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"top_k": 4})
+    assert len(output["documents"]) == 2  # 2 queries
+    assert len(output["documents"][0]) == 4  # 4 docs per query
+
+
+def test_most_similar_documents_pipeline_batch():
+    documents = [
+        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
+        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
+        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
+        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
+        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
+    ]
+    document_store = InMemoryDocumentStore()
+    retriever = EmbeddingRetriever(
+        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    docs_id: list = ["a", "b"]
+    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
+    list_of_documents = pipeline.run_batch(document_ids=docs_id)
+
+    assert len(list_of_documents[0]) > 1
+    assert isinstance(list_of_documents, list)
+    assert len(list_of_documents) == len(docs_id)
+
+    for another_list in list_of_documents:
+        assert isinstance(another_list, list)
+        for document in another_list:
+            assert isinstance(document, Document)
+            assert isinstance(document.id, str)
+            assert isinstance(document.content, str)
+
+
+def test_most_similar_documents_pipeline_with_filters_batch():
+    documents = [
+        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
+        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
+        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
+        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
+        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
+    ]
+    document_store = InMemoryDocumentStore()
+    retriever = EmbeddingRetriever(
+        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(documents)
+    document_store.update_embeddings(retriever)
+
+    docs_id: list = ["a", "b"]
+    filters = {"source": ["wiki3", "wiki4", "wiki5"]}
+    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
+    list_of_documents = pipeline.run_batch(document_ids=docs_id, filters=filters)
+
+    assert len(list_of_documents[0]) > 1
+    assert isinstance(list_of_documents, list)
+    assert len(list_of_documents) == len(docs_id)
+
+    for another_list in list_of_documents:
+        assert isinstance(another_list, list)
+        for document in another_list:
+            assert isinstance(document, Document)
+            assert isinstance(document.id, str)
+            assert isinstance(document.content, str)
+            assert document.meta["source"] in ["wiki3", "wiki4", "wiki5"]
+
+
+def test_summarization_pipeline():
+    docs = [
+        Document(
+            content="""
+    PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.
+    The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected
+    by the shutoffs which were expected to last through at least midday tomorrow.
+    """
+        ),
+        Document(
+            content="""
+    The Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest
+    structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction,
+    the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a
+    title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first
+    structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower
+    in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel
+    Tower is the second tallest free-standing structure in France after the Millau Viaduct.
+    """
+        ),
+    ]
+    summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
+
+    ds = InMemoryDocumentStore(use_bm25=True)
+    retriever = BM25Retriever(document_store=ds)
+    ds.write_documents(docs)
+
+    query = "Eiffel Tower"
+    pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer, return_in_answer_format=True)
+    output = pipeline.run(query=query, params={"Retriever": {"top_k": 1}})
+    answers = output["answers"]
+    assert len(answers) == 1
+    assert "The Eiffel Tower is one of the world's tallest structures" == answers[0]["answer"].strip()
+
+
+def test_summarization_pipeline_one_summary():
+    split_docs = [
+        Document(
+            content="""
+    The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris.
+    Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the
+    Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler
+    Building in New York City was finished in 1930.
+    """
+        ),
+        Document(
+            content="""
+    It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the
+    top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters,
+    the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.
+    """
+        ),
+    ]
+    ds = InMemoryDocumentStore(use_bm25=True)
+    retriever = BM25Retriever(document_store=ds)
+    ds.write_documents(split_docs)
+    summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
+
+    query = "Eiffel Tower"
+    pipeline = SearchSummarizationPipeline(
+        retriever=retriever, summarizer=summarizer, generate_single_summary=True, return_in_answer_format=True
+    )
+    output = pipeline.run(query=query, params={"Retriever": {"top_k": 2}})
+    answers = output["answers"]
+    assert len(answers) == 1
+    assert answers[0]["answer"].strip() == "The Eiffel Tower was built in 1924 in Paris, France."
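All of the batch tests moved into this file assert the same run_batch contract: output["queries"] has one entry per input query, and output["answers"] / output["documents"] holds one nested list of results per query. A condensed, self-contained sketch of that contract, mirroring test_faq_pipeline_batch above (illustrative only; it downloads the same sentence-transformers model when run):

# Illustrative sketch of the run_batch output shape the batch tests above rely on.
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import FAQPipeline

document_store = InMemoryDocumentStore()
retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)
document_store.write_documents(
    [{"content": f"How to test module-{i}?", "meta": {"answer": f"Using tests for module-{i}"}} for i in range(1, 6)]
)
document_store.update_embeddings(retriever)

output = FAQPipeline(retriever=retriever).run_batch(
    queries=["How to test this?", "How to test that?"], params={"Retriever": {"top_k": 3}}
)
assert len(output["queries"]) == len(output["answers"]) == 2  # one answer list per query
assert all(len(per_query) == 3 for per_query in output["answers"])  # top_k answers for each query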
@@ -1,114 +0,0 @@
-from haystack.document_stores import InMemoryDocumentStore
-from haystack.pipelines import FAQPipeline, DocumentSearchPipeline, MostSimilarDocumentsPipeline
-from haystack.nodes import EmbeddingRetriever
-from haystack.schema import Document
-
-
-def test_faq_pipeline_batch():
-    documents = [
-        {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}},
-        {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}},
-        {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}},
-        {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}},
-        {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}},
-    ]
-    document_store = InMemoryDocumentStore()
-    retriever = EmbeddingRetriever(
-        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    pipeline = FAQPipeline(retriever=retriever)
-
-    output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"Retriever": {"top_k": 3}})
-    assert len(output["answers"]) == 2  # 2 queries
-    assert len(output["answers"][0]) == 3  # 3 answers per query
-    assert output["queries"][0].startswith("How to")
-    assert output["answers"][0][0].answer.startswith("Using tests")
-
-
-def test_document_search_pipeline_batch():
-    documents = [
-        {"content": "Sample text for document-1", "meta": {"source": "wiki1"}},
-        {"content": "Sample text for document-2", "meta": {"source": "wiki2"}},
-        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
-        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
-        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
-    ]
-    document_store = InMemoryDocumentStore()
-    retriever = EmbeddingRetriever(
-        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    pipeline = DocumentSearchPipeline(retriever=retriever)
-    output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"top_k": 4})
-    assert len(output["documents"]) == 2  # 2 queries
-    assert len(output["documents"][0]) == 4  # 4 docs per query
-
-
-def test_most_similar_documents_pipeline_batch():
-    documents = [
-        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
-        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
-        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
-        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
-        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
-    ]
-    document_store = InMemoryDocumentStore()
-    retriever = EmbeddingRetriever(
-        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    docs_id: list = ["a", "b"]
-    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
-    list_of_documents = pipeline.run_batch(document_ids=docs_id)
-
-    assert len(list_of_documents[0]) > 1
-    assert isinstance(list_of_documents, list)
-    assert len(list_of_documents) == len(docs_id)
-
-    for another_list in list_of_documents:
-        assert isinstance(another_list, list)
-        for document in another_list:
-            assert isinstance(document, Document)
-            assert isinstance(document.id, str)
-            assert isinstance(document.content, str)
-
-
-def test_most_similar_documents_pipeline_with_filters_batch():
-    documents = [
-        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
-        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
-        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
-        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
-        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
-    ]
-    document_store = InMemoryDocumentStore()
-    retriever = EmbeddingRetriever(
-        document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    document_store = InMemoryDocumentStore()
-    document_store.write_documents(documents)
-    document_store.update_embeddings(retriever)
-
-    docs_id: list = ["a", "b"]
-    filters = {"source": ["wiki3", "wiki4", "wiki5"]}
-    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
-    list_of_documents = pipeline.run_batch(document_ids=docs_id, filters=filters)
-
-    assert len(list_of_documents[0]) > 1
-    assert isinstance(list_of_documents, list)
-    assert len(list_of_documents) == len(docs_id)
-
-    for another_list in list_of_documents:
-        assert isinstance(another_list, list)
-        for document in another_list:
-            assert isinstance(document, Document)
-            assert isinstance(document.id, str)
-            assert isinstance(document.content, str)
-            assert document.meta["source"] in ["wiki3", "wiki4", "wiki5"]
@@ -1,72 +0,0 @@
-from haystack import Document
-from haystack.document_stores import InMemoryDocumentStore
-from haystack.pipelines import SearchSummarizationPipeline
-from haystack.nodes import BM25Retriever, TransformersSummarizer
-
-
-def test_summarization_pipeline():
-    docs = [
-        Document(
-            content="""
-    PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.
-    The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected
-    by the shutoffs which were expected to last through at least midday tomorrow.
-    """
-        ),
-        Document(
-            content="""
-    The Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest
-    structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction,
-    the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a
-    title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first
-    structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower
-    in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel
-    Tower is the second tallest free-standing structure in France after the Millau Viaduct.
-    """
-        ),
-    ]
-    summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
-
-    ds = InMemoryDocumentStore(use_bm25=True)
-    retriever = BM25Retriever(document_store=ds)
-    ds.write_documents(docs)
-
-    query = "Eiffel Tower"
-    pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer, return_in_answer_format=True)
-    output = pipeline.run(query=query, params={"Retriever": {"top_k": 1}})
-    answers = output["answers"]
-    assert len(answers) == 1
-    assert "The Eiffel Tower is one of the world's tallest structures" == answers[0]["answer"].strip()
-
-
-def test_summarization_pipeline_one_summary():
-    split_docs = [
-        Document(
-            content="""
-    The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris.
-    Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the
-    Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler
-    Building in New York City was finished in 1930.
-    """
-        ),
-        Document(
-            content="""
-    It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the
-    top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters,
-    the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.
-    """
-        ),
-    ]
-    ds = InMemoryDocumentStore(use_bm25=True)
-    retriever = BM25Retriever(document_store=ds)
-    ds.write_documents(split_docs)
-    summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
-
-    query = "Eiffel Tower"
-    pipeline = SearchSummarizationPipeline(
-        retriever=retriever, summarizer=summarizer, generate_single_summary=True, return_in_answer_format=True
-    )
-    output = pipeline.run(query=query, params={"Retriever": {"top_k": 2}})
-    answers = output["answers"]
-    assert len(answers) == 1
-    assert answers[0]["answer"].strip() == "The Eiffel Tower was built in 1924 in Paris, France."