haystack/e2e/preview/pipelines/test_rag_pipelines.py

import os
import json
import pytest

from haystack.preview import Pipeline, Document
from haystack.preview.document_stores import MemoryDocumentStore
from haystack.preview.components.writers import DocumentWriter
from haystack.preview.components.retrievers import MemoryBM25Retriever, MemoryEmbeddingRetriever
from haystack.preview.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.preview.components.generators.openai.gpt import GPTGenerator
from haystack.preview.components.builders.answer_builder import AnswerBuilder
from haystack.preview.components.builders.prompt_builder import PromptBuilder


@pytest.mark.skipif(
    not os.environ.get("OPENAI_API_KEY"),
    reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
def test_bm25_rag_pipeline(tmp_path):
    """
    End-to-end check of a BM25-based RAG pipeline.

    The pipeline (retriever -> prompt builder -> generator -> answer builder) is
    assembled, drawn to a PNG, round-tripped through a JSON file, fed three
    documents and then queried once per document. Skipped when the
    OPENAI_API_KEY environment variable is unset or empty.

    :param tmp_path: directory (pytest's ``tmp_path`` fixture) that receives the
        drawing and the serialized pipeline.
    """
    prompt_template = """
    Given these documents, answer the question.\nDocuments:
    {% for doc in documents %}
        {{ doc.text }}
    {% endfor %}

    \nQuestion: {{question}}
    \nAnswer:
    """

    # Assemble the pipeline: components are registered first, then wired together.
    rag_pipeline = Pipeline()
    components = [
        ("retriever", MemoryBM25Retriever(document_store=MemoryDocumentStore())),
        ("prompt_builder", PromptBuilder(template=prompt_template)),
        ("llm", GPTGenerator(api_key=os.environ.get("OPENAI_API_KEY"))),
        ("answer_builder", AnswerBuilder()),
    ]
    for component_name, component in components:
        rag_pipeline.add_component(instance=component, name=component_name)

    connections = [
        ("retriever", "prompt_builder.documents"),
        ("prompt_builder", "llm"),
        ("llm.replies", "answer_builder.replies"),
        ("llm.metadata", "answer_builder.metadata"),
        ("retriever", "answer_builder.documents"),
    ]
    for sender, receiver in connections:
        rag_pipeline.connect(sender, receiver)

    # Render the pipeline graph to an image file.
    rag_pipeline.draw(tmp_path / "test_bm25_rag_pipeline.png")

    # Round-trip through JSON: dump the dict form, then rebuild the pipeline from it.
    with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as out_file:
        json.dump(rag_pipeline.to_dict(), out_file)
    with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as in_file:
        serialized = json.load(in_file)
    rag_pipeline = Pipeline.from_dict(serialized)

    # Fill the document store owned by the reloaded retriever.
    texts = (
        "My name is Jean and I live in Paris.",
        "My name is Mark and I live in Berlin.",
        "My name is Giorgio and I live in Rome.",
    )
    documents = [Document(text=text) for text in texts]
    retriever = rag_pipeline.get_component("retriever")
    retriever.document_store.write_documents(documents)

    # Each question is paired with a word the generated answer has to contain.
    expectations = [
        ("Who lives in Paris?", "Jean"),
        ("Who lives in Berlin?", "Mark"),
        ("Who lives in Rome?", "Giorgio"),
    ]
    for question, spyword in expectations:
        run_inputs = {
            "retriever": {"query": question},
            "prompt_builder": {"question": question},
            "answer_builder": {"query": question},
        }
        result = rag_pipeline.run(run_inputs)

        answers = result["answer_builder"]["answers"]
        assert len(answers) == 1
        generated_answer = answers[0]
        assert spyword in generated_answer.data
        assert generated_answer.query == question
        assert hasattr(generated_answer, "documents")
        assert hasattr(generated_answer, "metadata")


@pytest.mark.skipif(
    not os.environ.get("OPENAI_API_KEY", None),
    reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
def test_embedding_retrieval_rag_pipeline(tmp_path):
    """
    End-to-end check of an embedding-retrieval RAG pipeline.

    The query pipeline (text embedder -> retriever -> prompt builder -> generator
    -> answer builder) is assembled, drawn to a PNG and round-tripped through a
    JSON file. A separate indexing pipeline then embeds three documents and
    writes them into the reloaded retriever's document store, after which each
    question is run and the answer is checked. Skipped when the OPENAI_API_KEY
    environment variable is unset or empty.

    :param tmp_path: directory (pytest's ``tmp_path`` fixture) that receives the
        drawing and the serialized pipeline.
    """
    # Query and document embedders must use the same model, so name it once.
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

    # Create the RAG pipeline
    prompt_template = """
    Given these documents, answer the question.\nDocuments:
    {% for doc in documents %}
        {{ doc.text }}
    {% endfor %}

    \nQuestion: {{question}}
    \nAnswer:
    """
    rag_pipeline = Pipeline()
    rag_pipeline.add_component(
        instance=SentenceTransformersTextEmbedder(model_name_or_path=embedding_model),
        name="text_embedder",
    )
    rag_pipeline.add_component(
        instance=MemoryEmbeddingRetriever(document_store=MemoryDocumentStore()), name="retriever"
    )
    rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
    rag_pipeline.add_component(instance=GPTGenerator(api_key=os.environ.get("OPENAI_API_KEY")), name="llm")
    rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")
    rag_pipeline.connect("text_embedder", "retriever")
    rag_pipeline.connect("retriever", "prompt_builder.documents")
    rag_pipeline.connect("prompt_builder", "llm")
    rag_pipeline.connect("llm.replies", "answer_builder.replies")
    rag_pipeline.connect("llm.metadata", "answer_builder.metadata")
    rag_pipeline.connect("retriever", "answer_builder.documents")

    # Draw the pipeline
    rag_pipeline.draw(tmp_path / "test_embedding_rag_pipeline.png")

    # Serialize the pipeline to JSON. The file is named after this test (it used
    # to reuse the BM25 test's file name) and the same path is used for reading
    # it back, so the two cannot diverge.
    serialized_path = tmp_path / "test_embedding_rag_pipeline.json"
    with open(serialized_path, "w") as f:
        json.dump(rag_pipeline.to_dict(), f)

    # Load the pipeline back
    with open(serialized_path, "r") as f:
        rag_pipeline = Pipeline.from_dict(json.load(f))

    # Populate the document store. This happens after the reload so that the
    # documents land in the store held by the deserialized retriever.
    documents = [
        Document(text="My name is Jean and I live in Paris."),
        Document(text="My name is Mark and I live in Berlin."),
        Document(text="My name is Giorgio and I live in Rome."),
    ]
    document_store = rag_pipeline.get_component("retriever").document_store
    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component(
        instance=SentenceTransformersDocumentEmbedder(model_name_or_path=embedding_model),
        name="document_embedder",
    )
    indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer")
    indexing_pipeline.connect("document_embedder", "document_writer")
    indexing_pipeline.run({"document_embedder": {"documents": documents}})

    # Query and assert: every question has a "spyword" that must show up in the answer.
    questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
    answers_spywords = ["Jean", "Mark", "Giorgio"]

    for question, spyword in zip(questions, answers_spywords):
        result = rag_pipeline.run(
            {
                "text_embedder": {"text": question},
                "prompt_builder": {"question": question},
                "answer_builder": {"query": question},
            }
        )

        assert len(result["answer_builder"]["answers"]) == 1
        generated_answer = result["answer_builder"]["answers"][0]
        assert spyword in generated_answer.data
        assert generated_answer.query == question
        assert hasattr(generated_answer, "documents")
        assert hasattr(generated_answer, "metadata")
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`import os`
test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback 2023-10-09 12:54:17 +01:00			`import json`
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`import pytest`

			`from haystack.preview import Pipeline, Document`
			`from haystack.preview.document_stores import MemoryDocumentStore`
			`from haystack.preview.components.writers import DocumentWriter`
			`from haystack.preview.components.retrievers import MemoryBM25Retriever, MemoryEmbeddingRetriever`
			`from haystack.preview.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder`
			`from haystack.preview.components.generators.openai.gpt import GPTGenerator`
			`from haystack.preview.components.builders.answer_builder import AnswerBuilder`
			`from haystack.preview.components.builders.prompt_builder import PromptBuilder`


			`@pytest.mark.skipif(`
			`not os.environ.get("OPENAI_API_KEY", None),`
			`reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",`
			`)`
test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback 2023-10-09 12:54:17 +01:00			`def test_bm25_rag_pipeline(tmp_path):`
			`# Create the RAG pipeline`
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`prompt_template = """`
			`Given these documents, answer the question.\nDocuments:`
			`{% for doc in documents %}`
			`{{ doc.text }}`
			`{% endfor %}`

			`\nQuestion: {{question}}`
			`\nAnswer:`
			`"""`
			`rag_pipeline = Pipeline()`
test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback 2023-10-09 12:54:17 +01:00			`rag_pipeline.add_component(instance=MemoryBM25Retriever(document_store=MemoryDocumentStore()), name="retriever")`
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")`
			`rag_pipeline.add_component(instance=GPTGenerator(api_key=os.environ.get("OPENAI_API_KEY")), name="llm")`
			`rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")`
			`rag_pipeline.connect("retriever", "prompt_builder.documents")`
			`rag_pipeline.connect("prompt_builder", "llm")`
			`rag_pipeline.connect("llm.replies", "answer_builder.replies")`
			`rag_pipeline.connect("llm.metadata", "answer_builder.metadata")`
			`rag_pipeline.connect("retriever", "answer_builder.documents")`

test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback 2023-10-09 12:54:17 +01:00			`# Draw the pipeline`
			`rag_pipeline.draw(tmp_path / "test_bm25_rag_pipeline.png")`

			`# Serialize the pipeline to JSON`
			`with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f:`
			`json.dump(rag_pipeline.to_dict(), f)`

			`# Load the pipeline back`
			`with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f:`
			`rag_pipeline = Pipeline.from_dict(json.load(f))`

			`# Populate the document store`
			`documents = [`
			`Document(text="My name is Jean and I live in Paris."),`
			`Document(text="My name is Mark and I live in Berlin."),`
			`Document(text="My name is Giorgio and I live in Rome."),`
			`]`
			`rag_pipeline.get_component("retriever").document_store.write_documents(documents)`

			`# Query and assert`
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]`
			`answers_spywords = ["Jean", "Mark", "Giorgio"]`

			`for question, spyword in zip(questions, answers_spywords):`
			`result = rag_pipeline.run(`
			`{`
			`"retriever": {"query": question},`
			`"prompt_builder": {"question": question},`
			`"answer_builder": {"query": question},`
			`}`
			`)`

			`assert len(result["answer_builder"]["answers"]) == 1`
			`generated_answer = result["answer_builder"]["answers"][0]`
			`assert spyword in generated_answer.data`
			`assert generated_answer.query == question`
			`assert hasattr(generated_answer, "documents")`
			`assert hasattr(generated_answer, "metadata")`


			`@pytest.mark.skipif(`
			`not os.environ.get("OPENAI_API_KEY", None),`
			`reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",`
			`)`
test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback 2023-10-09 12:54:17 +01:00			`def test_embedding_retrieval_rag_pipeline(tmp_path):`
			`# Create the RAG pipeline`
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`prompt_template = """`
			`Given these documents, answer the question.\nDocuments:`
			`{% for doc in documents %}`
			`{{ doc.text }}`
			`{% endfor %}`

			`\nQuestion: {{question}}`
			`\nAnswer:`
			`"""`
			`rag_pipeline = Pipeline()`
			`rag_pipeline.add_component(`
test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback 2023-10-09 12:54:17 +01:00			`instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),`
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`name="text_embedder",`
			`)`
test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback 2023-10-09 12:54:17 +01:00			`rag_pipeline.add_component(`
			`instance=MemoryEmbeddingRetriever(document_store=MemoryDocumentStore()), name="retriever"`
			`)`
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")`
			`rag_pipeline.add_component(instance=GPTGenerator(api_key=os.environ.get("OPENAI_API_KEY")), name="llm")`
			`rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")`
			`rag_pipeline.connect("text_embedder", "retriever")`
			`rag_pipeline.connect("retriever", "prompt_builder.documents")`
			`rag_pipeline.connect("prompt_builder", "llm")`
			`rag_pipeline.connect("llm.replies", "answer_builder.replies")`
			`rag_pipeline.connect("llm.metadata", "answer_builder.metadata")`
			`rag_pipeline.connect("retriever", "answer_builder.documents")`

test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback 2023-10-09 12:54:17 +01:00			`# Draw the pipeline`
			`rag_pipeline.draw(tmp_path / "test_embedding_rag_pipeline.png")`

			`# Serialize the pipeline to JSON`
			`with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f:`
			`json.dump(rag_pipeline.to_dict(), f)`

			`# Load the pipeline back`
			`with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f:`
			`rag_pipeline = Pipeline.from_dict(json.load(f))`

			`# Populate the document store`
			`documents = [`
			`Document(text="My name is Jean and I live in Paris."),`
			`Document(text="My name is Mark and I live in Berlin."),`
			`Document(text="My name is Giorgio and I live in Rome."),`
			`]`
			`document_store = rag_pipeline.get_component("retriever").document_store`
			`indexing_pipeline = Pipeline()`
			`indexing_pipeline.add_component(`
			`instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),`
			`name="document_embedder",`
			`)`
			`indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer")`
			`indexing_pipeline.connect("document_embedder", "document_writer")`
			`indexing_pipeline.run({"document_embedder": {"documents": documents}})`

			`# Query and assert`
test: e2e tests for RAG Pipelines (#5876) * relax extractive reader integration tests * force reader to CPU * ensure integration tests reproducibility * e2e rag tests * move set_all_seeds to testing package * refine rag tests * Update e2e/preview/pipelines/test_rag_pipelines.py Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> 2023-09-26 11:49:50 +02:00			`questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]`
			`answers_spywords = ["Jean", "Mark", "Giorgio"]`

			`for question, spyword in zip(questions, answers_spywords):`
			`result = rag_pipeline.run(`
			`{`
			`"text_embedder": {"text": question},`
			`"prompt_builder": {"question": question},`
			`"answer_builder": {"query": question},`
			`}`
			`)`

			`assert len(result["answer_builder"]["answers"]) == 1`
			`generated_answer = result["answer_builder"]["answers"][0]`
			`assert spyword in generated_answer.data`
			`assert generated_answer.query == question`
			`assert hasattr(generated_answer, "documents")`
			`assert hasattr(generated_answer, "metadata")`