# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.joiners.document_joiner import DocumentJoiner
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore


def test_hybrid_doc_search_pipeline(tmp_path):
    # Create the pipeline
    document_store = InMemoryDocumentStore()
    hybrid_pipeline = Pipeline()
    hybrid_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=document_store), name="bm25_retriever")
    hybrid_pipeline.add_component(
        instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="text_embedder"
    )
    hybrid_pipeline.add_component(
        instance=InMemoryEmbeddingRetriever(document_store=document_store), name="embedding_retriever"
    )
    hybrid_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
    hybrid_pipeline.add_component(instance=TransformersSimilarityRanker(top_k=20), name="ranker")
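
    # Wire the graph: BM25 and embedding retrieval form two branches whose results
    # are merged by the joiner and re-scored by the similarity ranker.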
    hybrid_pipeline.connect("bm25_retriever", "joiner")
    hybrid_pipeline.connect("text_embedder", "embedding_retriever")
    hybrid_pipeline.connect("embedding_retriever", "joiner")
    hybrid_pipeline.connect("joiner", "ranker")
    # Serialize the pipeline to YAML
    with open(tmp_path / "test_hybrid_doc_search_pipeline.yaml", "w") as f:
        hybrid_pipeline.dump(f)

    # Load the pipeline back
    with open(tmp_path / "test_hybrid_doc_search_pipeline.yaml", "r") as f:
        hybrid_pipeline = Pipeline.load(f)

    # Populate the document store
    documents = [
        Document(content="My name is Jean and I live in Paris."),
        Document(content="My name is Mark and I live in Berlin."),
        Document(content="My name is Mario and I live in the capital of Italy."),
        Document(content="My name is Giorgio and I live in Rome."),
    ]
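    # Embed the documents with the same model used by the query-side text embedder,
    # then write them into the store behind the loaded pipeline's embedding retriever.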
    doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
    doc_embedder.warm_up()
    embedded_documents = doc_embedder.run(documents=documents)["documents"]
    hybrid_pipeline.get_component("embedding_retriever").document_store.write_documents(embedded_documents)
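
    # The same query string feeds the BM25 retriever, the text embedder, and the ranker.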
    query = "Who lives in Rome?"
    result = hybrid_pipeline.run(
        {"bm25_retriever": {"query": query}, "text_embedder": {"text": query}, "ranker": {"query": query}}
    )
    assert result["ranker"]["documents"][0].content == "My name is Giorgio and I live in Rome."
    assert result["ranker"]["documents"][1].content == "My name is Mario and I live in the capital of Italy."