diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index e9f9f4048..27c3bc007 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -17,6 +17,7 @@ jobs: matrix: folder: - "document_search" + - "pipelines" runs-on: ubuntu-latest diff --git a/e2e/pipelines/test_standard_pipelines.py b/e2e/pipelines/test_standard_pipelines.py index 33c457441..941f56ef5 100644 --- a/e2e/pipelines/test_standard_pipelines.py +++ b/e2e/pipelines/test_standard_pipelines.py @@ -10,18 +10,16 @@ from haystack.pipelines import ( DocumentSearchPipeline, MostSimilarDocumentsPipeline, WebQAPipeline, + SearchSummarizationPipeline, ) -from haystack.nodes import EmbeddingRetriever, PromptNode +from haystack.nodes import EmbeddingRetriever, PromptNode, BM25Retriever, TransformersSummarizer from haystack.schema import Document def test_faq_pipeline(): documents = [ - {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}}, - {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}}, - {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}}, - {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}}, - {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}}, + {"content": f"How to test module-{i}?", "meta": {"source": f"wiki{i}", "answer": f"Using tests for module-{i}"}} + for i in range(1, 6) ] document_store = InMemoryDocumentStore() retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert") @@ -165,3 +163,181 @@ def test_webqa_pipeline(): assert len(result["results"]) == 1 answer = result["results"][0] assert "Stark" in answer or "NED" in answer + + +def test_faq_pipeline_batch(): + documents = [ + {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}}, + {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}}, + {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}}, + {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}}, + {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}}, + ] + document_store = InMemoryDocumentStore() + retriever = EmbeddingRetriever( + document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" + ) + document_store.write_documents(documents) + document_store.update_embeddings(retriever) + + pipeline = FAQPipeline(retriever=retriever) + + output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"Retriever": {"top_k": 3}}) + assert len(output["answers"]) == 2 # 2 queries + assert len(output["answers"][0]) == 3 # 3 answers per query + assert output["queries"][0].startswith("How to") + assert output["answers"][0][0].answer.startswith("Using tests") + + +def test_document_search_pipeline_batch(): + documents = [ + {"content": "Sample text for document-1", "meta": {"source": "wiki1"}}, + {"content": "Sample text for document-2", "meta": {"source": "wiki2"}}, + {"content": "Sample text for document-3", "meta": {"source": "wiki3"}}, + {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, + {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, + ] + document_store = InMemoryDocumentStore() + retriever = EmbeddingRetriever( + document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" + ) + document_store.write_documents(documents) + document_store.update_embeddings(retriever) + + pipeline = DocumentSearchPipeline(retriever=retriever) + output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"top_k": 4}) + assert len(output["documents"]) == 2 # 2 queries + assert len(output["documents"][0]) == 4 # 4 docs per query + + +def test_most_similar_documents_pipeline_batch(): + documents = [ + {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}}, + {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}}, + {"content": "Sample text for document-3", "meta": {"source": "wiki3"}}, + {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, + {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, + ] + document_store = InMemoryDocumentStore() + retriever = EmbeddingRetriever( + document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" + ) + document_store.write_documents(documents) + document_store.update_embeddings(retriever) + + docs_id: list = ["a", "b"] + pipeline = MostSimilarDocumentsPipeline(document_store=document_store) + list_of_documents = pipeline.run_batch(document_ids=docs_id) + + assert len(list_of_documents[0]) > 1 + assert isinstance(list_of_documents, list) + assert len(list_of_documents) == len(docs_id) + + for another_list in list_of_documents: + assert isinstance(another_list, list) + for document in another_list: + assert isinstance(document, Document) + assert isinstance(document.id, str) + assert isinstance(document.content, str) + + +def test_most_similar_documents_pipeline_with_filters_batch(): + documents = [ + {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}}, + {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}}, + {"content": "Sample text for document-3", "meta": {"source": "wiki3"}}, + {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, + {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, + ] + document_store = InMemoryDocumentStore() + retriever = EmbeddingRetriever( + document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" + ) + document_store = InMemoryDocumentStore() + document_store.write_documents(documents) + document_store.update_embeddings(retriever) + + docs_id: list = ["a", "b"] + filters = {"source": ["wiki3", "wiki4", "wiki5"]} + pipeline = MostSimilarDocumentsPipeline(document_store=document_store) + list_of_documents = pipeline.run_batch(document_ids=docs_id, filters=filters) + + assert len(list_of_documents[0]) > 1 + assert isinstance(list_of_documents, list) + assert len(list_of_documents) == len(docs_id) + + for another_list in list_of_documents: + assert isinstance(another_list, list) + for document in another_list: + assert isinstance(document, Document) + assert isinstance(document.id, str) + assert isinstance(document.content, str) + assert document.meta["source"] in ["wiki3", "wiki4", "wiki5"] + + +def test_summarization_pipeline(): + docs = [ + Document( + content=""" + PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. + The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected + by the shutoffs which were expected to last through at least midday tomorrow. + """ + ), + Document( + content=""" + The Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest + structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, + the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a + title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first + structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower + in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel + Tower is the second tallest free-standing structure in France after the Millau Viaduct. + """ + ), + ] + summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False) + + ds = InMemoryDocumentStore(use_bm25=True) + retriever = BM25Retriever(document_store=ds) + ds.write_documents(docs) + + query = "Eiffel Tower" + pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer, return_in_answer_format=True) + output = pipeline.run(query=query, params={"Retriever": {"top_k": 1}}) + answers = output["answers"] + assert len(answers) == 1 + assert "The Eiffel Tower is one of the world's tallest structures" == answers[0]["answer"].strip() + + +def test_summarization_pipeline_one_summary(): + split_docs = [ + Document( + content=""" + The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. + Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the + Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler + Building in New York City was finished in 1930. + """ + ), + Document( + content=""" + It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the + top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, + the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct. + """ + ), + ] + ds = InMemoryDocumentStore(use_bm25=True) + retriever = BM25Retriever(document_store=ds) + ds.write_documents(split_docs) + summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False) + + query = "Eiffel Tower" + pipeline = SearchSummarizationPipeline( + retriever=retriever, summarizer=summarizer, generate_single_summary=True, return_in_answer_format=True + ) + output = pipeline.run(query=query, params={"Retriever": {"top_k": 2}}) + answers = output["answers"] + assert len(answers) == 1 + assert answers[0]["answer"].strip() == "The Eiffel Tower was built in 1924 in Paris, France." diff --git a/e2e/pipelines/test_standard_pipelines_batch.py b/e2e/pipelines/test_standard_pipelines_batch.py deleted file mode 100644 index e64b410c3..000000000 --- a/e2e/pipelines/test_standard_pipelines_batch.py +++ /dev/null @@ -1,114 +0,0 @@ -from haystack.document_stores import InMemoryDocumentStore -from haystack.pipelines import FAQPipeline, DocumentSearchPipeline, MostSimilarDocumentsPipeline -from haystack.nodes import EmbeddingRetriever -from haystack.schema import Document - - -def test_faq_pipeline_batch(): - documents = [ - {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}}, - {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}}, - {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}}, - {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}}, - {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}}, - ] - document_store = InMemoryDocumentStore() - retriever = EmbeddingRetriever( - document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" - ) - document_store.write_documents(documents) - document_store.update_embeddings(retriever) - - pipeline = FAQPipeline(retriever=retriever) - - output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"Retriever": {"top_k": 3}}) - assert len(output["answers"]) == 2 # 2 queries - assert len(output["answers"][0]) == 3 # 3 answers per query - assert output["queries"][0].startswith("How to") - assert output["answers"][0][0].answer.startswith("Using tests") - - -def test_document_search_pipeline_batch(): - documents = [ - {"content": "Sample text for document-1", "meta": {"source": "wiki1"}}, - {"content": "Sample text for document-2", "meta": {"source": "wiki2"}}, - {"content": "Sample text for document-3", "meta": {"source": "wiki3"}}, - {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, - {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, - ] - document_store = InMemoryDocumentStore() - retriever = EmbeddingRetriever( - document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" - ) - document_store.write_documents(documents) - document_store.update_embeddings(retriever) - - pipeline = DocumentSearchPipeline(retriever=retriever) - output = pipeline.run_batch(queries=["How to test this?", "How to test this?"], params={"top_k": 4}) - assert len(output["documents"]) == 2 # 2 queries - assert len(output["documents"][0]) == 4 # 4 docs per query - - -def test_most_similar_documents_pipeline_batch(): - documents = [ - {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}}, - {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}}, - {"content": "Sample text for document-3", "meta": {"source": "wiki3"}}, - {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, - {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, - ] - document_store = InMemoryDocumentStore() - retriever = EmbeddingRetriever( - document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" - ) - document_store.write_documents(documents) - document_store.update_embeddings(retriever) - - docs_id: list = ["a", "b"] - pipeline = MostSimilarDocumentsPipeline(document_store=document_store) - list_of_documents = pipeline.run_batch(document_ids=docs_id) - - assert len(list_of_documents[0]) > 1 - assert isinstance(list_of_documents, list) - assert len(list_of_documents) == len(docs_id) - - for another_list in list_of_documents: - assert isinstance(another_list, list) - for document in another_list: - assert isinstance(document, Document) - assert isinstance(document.id, str) - assert isinstance(document.content, str) - - -def test_most_similar_documents_pipeline_with_filters_batch(): - documents = [ - {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}}, - {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}}, - {"content": "Sample text for document-3", "meta": {"source": "wiki3"}}, - {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, - {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, - ] - document_store = InMemoryDocumentStore() - retriever = EmbeddingRetriever( - document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" - ) - document_store = InMemoryDocumentStore() - document_store.write_documents(documents) - document_store.update_embeddings(retriever) - - docs_id: list = ["a", "b"] - filters = {"source": ["wiki3", "wiki4", "wiki5"]} - pipeline = MostSimilarDocumentsPipeline(document_store=document_store) - list_of_documents = pipeline.run_batch(document_ids=docs_id, filters=filters) - - assert len(list_of_documents[0]) > 1 - assert isinstance(list_of_documents, list) - assert len(list_of_documents) == len(docs_id) - - for another_list in list_of_documents: - assert isinstance(another_list, list) - for document in another_list: - assert isinstance(document, Document) - assert isinstance(document.id, str) - assert isinstance(document.content, str) - assert document.meta["source"] in ["wiki3", "wiki4", "wiki5"] diff --git a/e2e/pipelines/test_summarization_pipelines.py b/e2e/pipelines/test_summarization_pipelines.py deleted file mode 100644 index 245e1c4ff..000000000 --- a/e2e/pipelines/test_summarization_pipelines.py +++ /dev/null @@ -1,72 +0,0 @@ -from haystack import Document -from haystack.document_stores import InMemoryDocumentStore -from haystack.pipelines import SearchSummarizationPipeline -from haystack.nodes import BM25Retriever, TransformersSummarizer - - -def test_summarization_pipeline(): - docs = [ - Document( - content=""" - PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. - The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected - by the shutoffs which were expected to last through at least midday tomorrow. - """ - ), - Document( - content=""" - The Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest - structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, - the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a - title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first - structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower - in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel - Tower is the second tallest free-standing structure in France after the Millau Viaduct. - """ - ), - ] - summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False) - - ds = InMemoryDocumentStore(use_bm25=True) - retriever = BM25Retriever(document_store=ds) - ds.write_documents(docs) - - query = "Eiffel Tower" - pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer, return_in_answer_format=True) - output = pipeline.run(query=query, params={"Retriever": {"top_k": 1}}) - answers = output["answers"] - assert len(answers) == 1 - assert "The Eiffel Tower is one of the world's tallest structures" == answers[0]["answer"].strip() - - -def test_summarization_pipeline_one_summary(): - split_docs = [ - Document( - content=""" - The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. - Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the - Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler - Building in New York City was finished in 1930. - """ - ), - Document( - content=""" - It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the - top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, - the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct. - """ - ), - ] - ds = InMemoryDocumentStore(use_bm25=True) - retriever = BM25Retriever(document_store=ds) - ds.write_documents(split_docs) - summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False) - - query = "Eiffel Tower" - pipeline = SearchSummarizationPipeline( - retriever=retriever, summarizer=summarizer, generate_single_summary=True, return_in_answer_format=True - ) - output = pipeline.run(query=query, params={"Retriever": {"top_k": 2}}) - answers = output["answers"] - assert len(answers) == 1 - assert answers[0]["answer"].strip() == "The Eiffel Tower was built in 1924 in Paris, France."